# Imports, install and mount

<!--  -->

In [2]:
# ! pip install cuda
# ! pip install torch_geometric
# ! pip install nxontology
# ! pip install tensordict
# ! pip install pandas
# ! pip install tensorflow
# ! pip install scipy
# ! pip install matplotlib

# ! pip3 install torch==2.0.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch_geometric
from torch_geometric.nn import ComplEx
from torch_geometric.data import Data
import pandas as pd
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import RandomLinkSplit

import wandb

import pickle



# Settings


In [3]:
# ComplEx embedding hyper-parameters
hidden_channels = 220
batch_size = 4096
epochs = 1000

# Destination of the trained model parameters (file name embeds hidden_channels)
params_save_name = f"PARAMS_ComplEx_HC_6_times_{hidden_channels}_on_full_Os_GO"
params_save_path = "/home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/"+params_save_name

# Data: mapped triples table and the pre-split train/val/test datasets
mapped_iric_path = '/home/elliot/Documents/ESL2024/data/mapped_Os_to_GO_iric.tsv'
datasets_save_path = '/home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/dataset_'
val_path = datasets_save_path + 'VAL' +  '.pickle'
test_path = datasets_save_path + 'TEST' +  '.pickle'
train_path = datasets_save_path + 'TRAIN' +  '.pickle'

# Device selection.
# The previous version detected CUDA and then unconditionally overwrote the
# result with 'cpu', which made the detection line dead code. The override is
# now an explicit switch with the same default outcome ('cpu').
# NOTE(review): presumably CPU was forced to sidestep the CUDA launch failure
# recorded in the training cell's output -- confirm before setting this to False.
force_cpu = True
device = 'cuda' if (not force_cpu and torch.cuda.is_available()) else 'cpu'
print(device)

# wandb.init(
#     settings=wandb.Settings(start_method="fork"),
#     # set the wandb project where this run will be logged
#     project="ComplEx on Os_to_GO_iric",
    
#     # track hyperparameters and run metadata
#     config={
#     "architecture": "ComplEx",
#     "dataset": "genes_to_phenotypes_iric.tsv",
#     "epochs": epochs,
#     'hidden_channels' : hidden_channels,
#     'batch_size' : batch_size
#     }
# )

cpu


  return torch._C._cuda_getDeviceCount() > 0


# DATA

In [16]:
# Load the tab-separated table of triples together with their integer ids.
mapped_iric = pd.read_csv(mapped_iric_path, sep = '\t')
display(mapped_iric)

# GO term ('object' column) -> integer id ('mapped_object' column).
# zip over the two columns replaces the much slower row-by-row iterrows();
# as before, a later row wins if the same GO term appears several times.
GO_to_map = dict(zip(mapped_iric['object'], mapped_iric['mapped_object']))

# Integer id -> GO term. The previous comprehension copied GO_to_map verbatim
# (key -> value) instead of inverting it, so lookups by id could not work.
map_to_GO = {mapped_id: go_term for go_term, mapped_id in GO_to_map.items()}

print('Dict looks ok :', bool(GO_to_map[mapped_iric['object'][0]]==mapped_iric['mapped_object'][0]))

Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object
0,OsNippo01g010050,gene ontology,GO:0031267,8201,0,6566
1,OsNippo01g010050,gene ontology,GO:0006886,8201,0,20154
2,OsNippo01g010050,gene ontology,GO:0005622,8201,0,20826
3,OsNippo01g010050,gene ontology,GO:0005623,8201,0,10373
4,OsNippo01g010050,gene ontology,GO:0090630,8201,0,2733
...,...,...,...,...,...,...
169243,OsNippo12g248550,gene ontology,GO:0009409,20245,0,12440
169244,OsNippo12g248550,gene ontology,GO:0001666,20245,0,4625
169245,OsNippo12g250550,gene ontology,GO:0008270,20383,0,15186
169246,OsNippo12g255100,gene ontology,GO:0005576,29052,0,8295


In [15]:
# Load the pre-split datasets.
# SECURITY: torch.load unpickles arbitrary Python objects, so only load files
# you created yourself or otherwise trust.
# weights_only=False is passed explicitly: the files hold full objects (they
# expose .validate() and print as Data(...)), not plain tensors, and newer
# PyTorch releases no longer default to full unpickling.
val_data = torch.load(val_path, weights_only=False)
test_data = torch.load(test_path, weights_only=False)
train_data = torch.load(train_path, weights_only=False)

# The results are listed in the same order as the label (val, train, test);
# previously they were printed as val, test, train, contradicting the label.
print("Datatsets look OK ? (val, train, test) :",
      val_data.validate(),
      train_data.validate(),
      test_data.validate())

print(val_data)
print(test_data)
print(train_data)

Datatsets look OK ? (val, train, test) : True True True
Data(edge_index=[2, 135400], edge_attr=[135400], num_nodes=30396, edge_label=[33848], edge_label_index=[2, 33848])
Data(edge_index=[2, 152324], edge_attr=[152324], num_nodes=30396, edge_label=[33848], edge_label_index=[2, 33848])
Data(edge_index=[2, 135400], edge_attr=[135400], num_nodes=30396, edge_label=[135400], edge_label_index=[2, 135400])


# MODELS


## Initiating models and loaders

In [20]:
# Initiating models

# num_relations must be the number of distinct relation types, not the number
# of edges: the previous value (edge_index.size()[1]) allocated one relation
# embedding per training edge. edge_attr is what this notebook passes as
# rel_type, so the largest id over all splits + 1 bounds the relation ids.
# NOTE(review): this changes the shape of the relation embedding, so parameter
# files saved with the old setting will not load into this model.
complex_model = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations=max(
        int(split.edge_attr.max()) for split in (train_data, val_data, test_data)
    ) + 1,
    hidden_channels=hidden_channels,
).to(device)

# Initiating loader
# The triples are moved to the same device as the model so that batches
# produced by the loader can be fed to the model directly.
head_index = train_data.edge_index[0].to(device)
tail_index = train_data.edge_index[1].to(device)
rel_type = train_data.edge_attr.to(device)

loader = complex_model.loader(
    head_index = head_index,
    tail_index = tail_index,
    rel_type = rel_type,
    batch_size=batch_size,
    shuffle=True,
)

print("Loader type :", type(loader))

# Initiating optimizers (Adam with its default hyper-parameters)
complex_optimizer = optim.Adam(complex_model.parameters())

print(batch_size)

Loader type : <class 'torch_geometric.nn.kge.loader.KGTripletLoader'>
4096


## Train and test functions

In [19]:
@torch.no_grad()
def test(data, model):
    """Evaluate `model` on the triples stored in `data`, with gradients disabled.

    Args:
        data: object exposing `edge_index` (row 0 is used as head indices,
            row 1 as tail indices) and `edge_attr` (used as relation types).
        model: model exposing `eval()`, `parameters()` and `test(...)`.

    Returns:
        Whatever `model.test` returns for these triples, using the
        module-level `batch_size` and k=10.
    """
    model.eval()

    heads = data.edge_index[0]
    tails = data.edge_index[1]
    relations = data.edge_attr

    # Evaluate on the device the model lives on; previously the tensors were
    # passed as stored, which fails when dataset and model are on different
    # devices.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target = first_param.device
        heads = heads.to(target)
        tails = tails.to(target)
        relations = relations.to(target)

    return model.test(
        head_index=heads,
        tail_index=tails,
        rel_type=relations,
        batch_size=batch_size,
        k=10, #The k in Hit@k
    )

def train(loader, model, optimizer):
    """Run one pass over `loader` and return the size-weighted mean loss.

    Args:
        loader: iterable yielding (head_index, rel_type, tail_index) batches.
        model: model exposing `train()` and `loss(head, rel, tail)`.
        optimizer: optimizer exposing `zero_grad()` and `step()`.

    Returns:
        Sum of (batch loss * number of head indices in the batch) divided by
        the total number of head indices seen.
    """
    model.train()
    weighted_loss_sum = 0
    triples_seen = 0
    for heads, relations, tails in loader:
        optimizer.zero_grad()
        batch_loss = model.loss(heads, relations, tails)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch by its size so a smaller final batch does not skew the mean.
        n_triples = heads.numel()
        weighted_loss_sum += float(batch_loss) * n_triples
        triples_seen += n_triples
    return weighted_loss_sum / triples_seen

## Train and test

In [22]:
# Scratch cell: prints the integers 0..19, one per line.
for a in range(20):
    print(a)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [20]:
# Gradients are switched on globally for training and off again at the end.
torch.set_grad_enabled(True)

# Start from freshly initialised embeddings on the selected device.
# Note: the optimizer's internal state is not reset here.
complex_model.reset_parameters()
complex_model.to(device)

# wandb.log raises when no run is active, and the wandb.init call in the
# settings cell is commented out, so only log when a run actually exists.
log_to_wandb = wandb.run is not None

losses = []
for epoch in range(1, epochs+1):
    # train() requires the loader; it was previously omitted (TypeError).
    loss = train(loader=loader, model=complex_model, optimizer=complex_optimizer)
    losses.append(loss)
    if log_to_wandb:
        wandb.log({"loss": loss})

    if epoch%10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    # Validate every 500 epochs and once more after the final epoch. This is
    # exactly what the former `epoch % epochs%500 == 0` evaluated to for
    # epoch in 1..epochs, written out explicitly.
    if epoch % 500 == 0 or epoch == epochs:
        rank, mrr, hits = test(val_data, model=complex_model)
        print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}',
              f'Val MRR: {mrr:.4f}, Val Hits@10: {hits:.4f}')
        if log_to_wandb:
            wandb.log({"Val Mean Rank" : rank, "Val MRR" : mrr, "hits@10": hits})


torch.set_grad_enabled(False)

Epoch: 000, Loss: 0.6931


100%|██████████| 152324/152324 [15:32<00:00, 163.37it/s]


Epoch: 000, Val Mean Rank: 8673.97 Val MRR: 0.0084, Val Hits@10: 0.0152
Epoch: 001, Loss: 0.6931
Epoch: 002, Loss: 0.6919
Epoch: 003, Loss: 0.6790
Epoch: 004, Loss: 0.6337
Epoch: 005, Loss: 0.5650
Epoch: 006, Loss: 0.5067
Epoch: 007, Loss: 0.4716
Epoch: 008, Loss: 0.4505
Epoch: 009, Loss: 0.4338
Epoch: 010, Loss: 0.4214
Epoch: 011, Loss: 0.4078
Epoch: 012, Loss: 0.3963
Epoch: 013, Loss: 0.3798
Epoch: 014, Loss: 0.3634
Epoch: 015, Loss: 0.3454
Epoch: 016, Loss: 0.3260
Epoch: 017, Loss: 0.3041
Epoch: 018, Loss: 0.2823
Epoch: 019, Loss: 0.2609
Epoch: 020, Loss: 0.2403
Epoch: 021, Loss: 0.2203
Epoch: 022, Loss: 0.2039
Epoch: 023, Loss: 0.1869
Epoch: 024, Loss: 0.1727
Epoch: 025, Loss: 0.1618
Epoch: 026, Loss: 0.1496
Epoch: 027, Loss: 0.1387
Epoch: 028, Loss: 0.1305
Epoch: 029, Loss: 0.1216
Epoch: 030, Loss: 0.1159
Epoch: 031, Loss: 0.1104
Epoch: 032, Loss: 0.1032
Epoch: 033, Loss: 0.0997
Epoch: 034, Loss: 0.0942
Epoch: 035, Loss: 0.0915
Epoch: 036, Loss: 0.0873
Epoch: 037, Loss: 0.0825
Epo

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Close the Weights & Biases run, then write the model's state dict to disk.
wandb.finish()
print("WandB finished.")

torch.save(obj=complex_model.state_dict(), f=params_save_path)
print("Model saved at", params_save_path)

0,1
Val MRR,▁█
Val Mean Rank,█▁
hits@10,▁█
loss,█▅▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Val MRR,0.16405
Val Mean Rank,230.30151
hits@10,0.36778
loss,0.06995


WandB finished.
Model saved at /home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/PARAMS_ComplEx_HC_6_times_5_on_full_Os_GO
