# Imports, install and mount

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# import sys
# my_local_drive='/content/gdrive/MyDrive/SL2024/datas'
# # Ajout du path pour les librairies, fonctions et données
# sys.path.append(my_local_drive)
# # Se positionner sur le répertoire associé
# %cd $my_local_drive
# # %pwd
# print("DRIVE MOUNTED")

<!--  -->

In [2]:
# ! pip install cuda
# ! pip install torch_geometric
# ! pip install nxontology
# ! pip install tensordict
# ! pip install pandas
# ! pip install tensorflow
# ! pip install scipy
# ! pip install matplotlib

import matplotlib.pyplot as plt
import torch
import torch.optim as optim
from torch_geometric.nn import ComplEx
from torch_geometric.data import Data
import pandas as pd
from tqdm import tqdm
from torch_geometric.loader import DataLoader

import wandb



# Settings


In [3]:
# ComplEx embeddings :

hidden_channels = 10
batch_size = 1000
epochs = 10

file_path = "/home/elliot/Documents/ESL2024/data/little_genes_to_phenotypes_iric.tsv"

params_save_name = f"ComplEx_HC_{hidden_channels}_BS_{batch_size}_epochs_{epochs}"
params_save_path = "/home/elliot/Documents/ESL2024/code/models_parameters/ComplEx"+params_save_name

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
wandb.init(
    # set the wandb project where this run will be logged
    project="ComplEx on little_Os_to_GO_iric",
    
    # track hyperparameters and run metadata
    config={
    "architecture": "ComplEx",
    "dataset": "head - 10000 genes_to_phenotypes_iric.tsv",
    "epochs": epochs,
    'hidden_channels' : hidden_channels,
    'batch_size' : batch_size
    }
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mbutzelliot[0m ([33mesl2024[0m). Use [1m`wandb login --relogin`[0m to force relogin


Problem at: /tmp/ipykernel_75497/1336610842.py 1 <module>


KeyboardInterrupt: 

# DATAS

## Reading and mapping graph

What we want : Create a Data object with all the properties I want to use later

	- x (tensorised and processed node attributes) (Not for now)
	- edge_index (a tensor of shape (num_edges, 2) indicates the source node index and the destination node index)
	- y (desired edge labels - optional, can be defined as node labels if needed) (Not for now)
	- any other things you want to use later

In [None]:
iric = pd.read_csv(file_path, delimiter='\t', names = ['subject', 'predicate','object'])
display(iric)

# Mapping entities to ids

entity_set = set(iric['object']).union(set(iric['subject']))
entity_to_mapping = {entity: int(i) for i, entity in enumerate(entity_set)}
relation_set = set(iric['predicate'])
relation_to_mapping = {relation: int(i) for i, relation in enumerate(relation_set)}

mapped_iric = iric.copy()
mapped_iric['object'] = mapped_iric['object'].apply(lambda x: entity_to_mapping[x])
mapped_iric['subject'] = mapped_iric['subject'].apply(lambda x: entity_to_mapping[x])
mapped_iric['predicate'] = mapped_iric['predicate'].apply(lambda x: relation_to_mapping[x])

# display(mapped_iric)

# print('OK')
# print("Minima in mappings :")
# print('subject :', min(mapped_iric['subject']))
# print('predicate :',min(mapped_iric['predicate']))
# print('object :',min(mapped_iric['object']))

# print(entity_to_mapping)
# print(relation_to_mapping)

mapping_to_entity = {v: k for k, v in entity_to_mapping.items()}
mapping_to_relation = {v: k for k, v in relation_to_mapping.items()}

# print("Mapping to entity dict :",mapping_to_entity)
# print("Mapping to relation dict",mapping_to_relation)

Unnamed: 0,subject,predicate,object
0,OsNippo01g010050,gene ontology,GO:0031267
1,OsNippo01g010050,gene ontology,GO:0006886
2,OsNippo01g010050,gene ontology,GO:0005622
3,OsNippo01g010050,gene ontology,GO:0005623
4,OsNippo01g010050,gene ontology,GO:0090630
...,...,...,...
9996,OsNippo01g223000,gene ontology,GO:0005784
9997,OsNippo01g223050,gene ontology,GO:0005634
9998,OsNippo01g223050,gene ontology,GO:0005737
9999,OsNippo01g223050,gene ontology,GO:0003676


## Building init vars for Data :

In [None]:
# # Initial nodes states :
# x = torch.ones(len(entity_set), 1)  # Chaque nœud a 1 pour état initial
# print('X : \n',x)

# Edges index
heads = list(mapped_iric['subject'])
tails = list(mapped_iric['object'])
edge_index = torch.tensor([heads,tails], dtype=torch.long)
print('\nEDGE INDEX : \n',edge_index)

# edges states
edge_attributes = torch.tensor(mapped_iric['predicate'])
print('\nEDGES ATTRIBUTES : \n',edge_attributes)

iric_pyg = Data(
                # x = x,
                num_nodes = len(entity_set),
                edge_index = edge_index,
                edge_attr = edge_attributes)
print('\nDATASET :\n',iric_pyg)


print("\nDataset looks valid ? \n",iric_pyg.validate(raise_on_error=True))


EDGE INDEX : 
 tensor([[ 409,  409,  409,  ..., 1982, 1982, 1982],
        [ 779, 1868, 2623,  ..., 3025, 3066, 1578]])

EDGES ATTRIBUTES : 
 tensor([0, 0, 0,  ..., 0, 0, 0])

DATASET :
 Data(edge_index=[2, 10001], edge_attr=[10001], num_nodes=3343)

Dataset looks valid ? 
 True


## Setting up datas and model


## Splitting dataset

In [None]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
                            num_val = 0.1,
                            num_test = 0.1,
                            is_undirected=False,
                            add_negative_train_samples=False,
                            )

train_data, val_data, test_data = transform(iric_pyg)

print(f"Hole Dataset :\n {iric_pyg}\n\nTrain:\n{train_data}\n\nTest :\n{test_data}\n\nValidation :\n{val_data}")
# Il ne faut pas regarder num_edges parce que RLS cache les arêtes mais ne les sort pas du graph.
# print(f"Number of edges in datasets : \n  Hole Dataset : {iric_pyg.num_edges}\n\n  Train: {train_data.num_edges}\n\n  Test : {test_data.num_edges}\n\n  Validation : {val_data.num_edges}")
print(f"Number of edges in datasets : \n  Train: {list(train_data.edge_label.size())[0]}\n\n  Test : {list(test_data.edge_label.size())[0]}\n\n  Validation : {list(val_data.edge_label.size())[0]}")

train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

# print('\n\n',train_data.num_nodes)
# print(train_data.num_edge_types)
# print(train_data.__dict__)
# print(train_data.edge_index[0].size())
# print(train_data.edge_index[1].size())
# print(train_data.edge_attr.size())
# print(train_data.edge_attr)
# print(train_data.num_nodes)
# print(train_data.edge_index.size()[1])

Hole Dataset :
 Data(edge_index=[2, 10001], edge_attr=[10001], num_nodes=3343)

Train:
Data(edge_index=[2, 8001], edge_attr=[8001], num_nodes=3343, edge_label=[8001], edge_label_index=[2, 8001])

Test :
Data(edge_index=[2, 9001], edge_attr=[9001], num_nodes=3343, edge_label=[2000], edge_label_index=[2, 2000])

Validation :
Data(edge_index=[2, 8001], edge_attr=[8001], num_nodes=3343, edge_label=[2000], edge_label_index=[2, 2000])
Number of edges in datasets : 
  Train: 8001

  Test : 2000

  Validation : 2000


# MODELS


## Iniating models and loaders

In [None]:
# Initiating models

complex_model = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations = train_data.edge_index.size()[1],
    hidden_channels=hidden_channels,
).to(device)

# Initiaing loader
head_index = train_data.edge_index[0]
tail_index = train_data.edge_index[1]
rel_type = train_data.edge_attr

loader = complex_model.loader(
    head_index = head_index,
    tail_index = tail_index,
    rel_type = rel_type,
    batch_size=batch_size,
    shuffle=True,
)

# initiating optimizers
complex_optimizer = optim.Adam(complex_model.parameters())

print(batch_size)

1000


## Train and test functions

In [None]:
@torch.no_grad()
def test(data, model):
    model.eval()
    return model.test(
        head_index=test_data.edge_index[0],
        tail_index=test_data.edge_index[1],
        rel_type=test_data.edge_attr,
        batch_size=1000,
        k=10,
    )

def train(model, optimizer):
    model.train()
    total_loss = total_examples = 0
    for head_index, rel_type, tail_index in loader:
        optimizer.zero_grad()
        loss = model.loss(head_index, rel_type, tail_index)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * head_index.numel()
        total_examples += head_index.numel()
    return total_loss / total_examples

def plot_loss(loss_list):
    plt.plot(loss_list, label='Loss')
    plt.title('Evolution des Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def plot_loss_log(loss_list):
    plt.semilogy(loss_list, label='Loss')
    plt.title('Evolution des Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def running_mean(list,
                 half_window: int # Number of elements that the function will consider
                                  # ahead and behind the position X to calculate running mean at X.
                 ):
    running_means = []

    for i in range(0,len(list)):

        left_bound = max(0,i-half_window)
        right_bound = min(len(list)-1, i + half_window)
        sublist = list[left_bound:right_bound+1]
        running_means.append(sum(sublist)/len(sublist))

    return running_means

## Train and test

In [None]:
torch.set_grad_enabled(True)

complex_model.reset_parameters()
complex_model.to(device)

start_rank, start_mrr, start_hits_at_10 = test(test_data, model=complex_model)
print(f'Start Test Mean Rank: {start_rank:.2f}, Start Test MRR: {start_mrr:.4f}, '
      f'Start Test Hits@10: {start_hits_at_10:.4f}')

losses = []
for epoch in range(0, epochs+1):
    loss = train(model=complex_model, optimizer=complex_optimizer)
    losses.append(loss)
    wandb.log({"loss": loss})
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    if epoch % epochs/100 == 0:
        rank, mrr, hits = test(val_data, model=complex_model)
        print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}',
              f'Val MRR: {mrr:.4f}, Val Hits@10: {hits:.4f}')
        wandb.log({"Val Mean Rank" : rank, "Val MRR" : mrr, "hits@10": hits})

rank, mrr, hits_at_10 = test(test_data, model=complex_model)

torch.set_grad_enabled(False)

100%|██████████| 9001/9001 [00:10<00:00, 866.91it/s]


Start Test Mean Rank: 1671.30, Start Test MRR: 0.0029, Start Test Hits@10: 0.0031
Epoch: 001, Loss: 0.6931


100%|██████████| 9001/9001 [00:10<00:00, 897.81it/s]


Epoch: 001, Val Mean Rank: 1492.04 Val MRR: 0.0048, Val Hits@10: 0.0080
Epoch: 002, Loss: 0.6931


100%|██████████| 9001/9001 [00:10<00:00, 882.79it/s]


Epoch: 002, Val Mean Rank: 1323.60 Val MRR: 0.0073, Val Hits@10: 0.0107
Epoch: 003, Loss: 0.6931


100%|██████████| 9001/9001 [00:09<00:00, 914.51it/s]


Epoch: 003, Val Mean Rank: 1151.69 Val MRR: 0.0124, Val Hits@10: 0.0222
Epoch: 004, Loss: 0.6931


100%|██████████| 9001/9001 [00:10<00:00, 892.95it/s]


Epoch: 004, Val Mean Rank: 979.59 Val MRR: 0.0214, Val Hits@10: 0.0422
Epoch: 005, Loss: 0.6931


100%|██████████| 9001/9001 [00:10<00:00, 863.17it/s]


Epoch: 005, Val Mean Rank: 811.16 Val MRR: 0.0394, Val Hits@10: 0.0734
Epoch: 006, Loss: 0.6931


100%|██████████| 9001/9001 [00:10<00:00, 860.13it/s]


Epoch: 006, Val Mean Rank: 656.05 Val MRR: 0.0636, Val Hits@10: 0.1192
Epoch: 007, Loss: 0.6931


  1%|          | 97/9001 [00:00<00:10, 826.69it/s]


KeyboardInterrupt: 

In [None]:
wandb.finish()

In [None]:
torch.save(complex_model.state_dict(), params_save_path)