In [6]:
import pandas as pd
import numpy as np

import os
import sys
sys.path.append('/home/ebutz/ESL2024/code/utils') # Ajouter le dossier contenant constants.py
import constants as c

import torch
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import DataLoader
import torch.optim as optim
from torch_geometric.nn import ComplEx

import random

import wandb
from tqdm import tqdm

In [12]:
# Datas :
iric_csv_path = "/home/ebutz/ESL2024/data/full_iric/iric.csv"
test_ratio = 0.1
val_ratio  = 0.1

# ComplEx embeddings :
hidden_channels = 1
batch_size = 4096
epochs = 3
neg_per_pos = 1 #Number of negatives per positive during training
K = 10 #K from Hit@K

device = 'cuda' if torch.cuda.is_available() else 'cpu'
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
print(device)

cpu


In [13]:
def triples_from_csv(path_to_csv, columns_to_use = c.IricNode.features.value):
    """
    Creates triples from a CSV.

    Parameters:
    - path_to_csv (str): Filepath or buffer to the input CSV file.
    - columns_to_use (list): A list of the columns to consider.

    Returns:
    - triples (pandas.DataFrame): Output DataFrame in triple format.
                                  Subjects are index items, predicates are column names from columns_to_use, objects are non-NaN values in columns.
    """

    df = pd.read_csv(filepath_or_buffer=path_to_csv, sep = ',', index_col = 0)
    df.columns = df.columns.str.lower()
    
    # Create a list of triples
    triples = []
    # Drop feature columns
    columns_to_drop = [col for col in columns_to_use if col in df.columns]
    df.drop(columns=columns_to_drop, inplace=True)
    df = df.replace({np.nan:None})
    
    for index, row in df.iterrows():
        for column in df.columns:
            if row[column] is not None:
                for predicate in row[column].split('|'):
                    triples.append([index, column, predicate])

    # Create a dataframe from the list of triples
    return pd.DataFrame(triples, columns=['subject', 'predicate', 'object'])

# Extracting triples from original csv :
iric_triples = triples_from_csv(path_to_csv = iric_csv_path)

# Mapping entities and relations to integers ids :
entity_set = set(iric_triples['object']).union(set(iric_triples['subject']))
entity_to_mapping = {entity: int(i) for i, entity in enumerate(entity_set)}
relation_set = set(iric_triples['predicate'])
relation_to_mapping = {relation: int(i) for i, relation in enumerate(relation_set)}

# Triples to mapped triples :
iric_triples['mapped_subject'] = iric_triples['subject'].apply(lambda x: entity_to_mapping[x])
iric_triples['mapped_predicate'] = iric_triples['predicate'].apply(lambda x: relation_to_mapping[x])
iric_triples['mapped_object'] = iric_triples['object'].apply(lambda x: entity_to_mapping[x])
display(iric_triples)

  df = pd.read_csv(filepath_or_buffer=path_to_csv, sep = ',', index_col = 0)


Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object
0,GO:0000001,is_a,GO:0048311,14772,0,73157
1,GO:0000001,is_a,GO:0048308,14772,0,53234
2,GO:0000002,is_a,GO:0007005,42827,0,26877
3,GO:0000003,is_a,GO:0008150,34404,0,32918
4,GO:0000006,is_a,GO:0005385,63164,0,68346
...,...,...,...,...,...,...
1452516,OsNippo12g255000,interacts_with,OsNippo07g025800,41355,4,47669
1452517,OsNippo12g255000,interacts_with,OsNippo07g207000,41355,4,18233
1452518,OsNippo12g255000,interacts_with,OsNippo07g207600,41355,4,2548
1452519,OsNippo12g255000,interacts_with,OsNippo10g150350,41355,4,56335


In [14]:
print(min(iric_triples['mapped_subject']))

0


In [28]:
# Shuffling ontology triples only :
onto = iric_triples[iric_triples['mapped_predicate'] == 0]
shuffled_subjects = onto['mapped_subject'].sample(frac=1).values
shuffled_objects = onto['mapped_object'].sample(frac=1).values
iric_triples_with_randomised_GO = iric_triples.copy()
iric_triples_with_randomised_GO.loc[iric_triples['mapped_predicate'] == 0, 'mapped_subject'] = shuffled_subjects
iric_triples_with_randomised_GO.loc[iric_triples['mapped_predicate'] == 0, 'mapped_object'] = shuffled_objects
iric_triples_with_randomised_GO

Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object
0,GO:0000001,is_a,GO:0048311,43379,0,31216
1,GO:0000001,is_a,GO:0048308,53302,0,53679
2,GO:0000002,is_a,GO:0007005,49353,0,528
3,GO:0000003,is_a,GO:0008150,11925,0,79718
4,GO:0000006,is_a,GO:0005385,37469,0,68571
...,...,...,...,...,...,...
71872,TO:0020026,is_a,TO:0020019,15767,0,5008
71873,TO:0020026,is_a,TO:0020028,8118,0,11095
71874,TO:0020027,is_a,TO:0020028,36066,0,16012
71875,TO:0020027,is_a,TO:0020020,26072,0,52851


array([57749, 53531, 11094, ..., 25578, 75564, 24184])

array([35310, 70011, 74058, ..., 34868, 14557, 22309])

Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object
0,GO:0000001,is_a,GO:0048311,57749,0,35310
1,GO:0000001,is_a,GO:0048308,53531,0,70011
2,GO:0000002,is_a,GO:0007005,11094,0,74058
3,GO:0000003,is_a,GO:0008150,24223,0,8468
4,GO:0000006,is_a,GO:0005385,54825,0,25017
...,...,...,...,...,...,...
1452516,OsNippo12g255000,interacts_with,OsNippo07g025800,41355,4,47669
1452517,OsNippo12g255000,interacts_with,OsNippo07g207000,41355,4,18233
1452518,OsNippo12g255000,interacts_with,OsNippo07g207600,41355,4,2548
1452519,OsNippo12g255000,interacts_with,OsNippo10g150350,41355,4,56335


In [19]:
a = [0,1,2,3,4,5,6,7]
random.shuffle(a)
a

[6, 1, 2, 7, 4, 3, 0, 5]

In [29]:
# Triples to pyg framework :

# Edges index :
heads = list(iric_triples['mapped_subject'])
tails = list(iric_triples['mapped_object'])
edge_index = torch.tensor([heads,tails], dtype=torch.long)
edge_attributes = torch.tensor(iric_triples['mapped_predicate'])

iric_pyg = Data(
                num_nodes = len(entity_set),
                edge_index = edge_index,
                edge_attr = edge_attributes
                )

print(iric_pyg)

print("\nDataset looks valid :",iric_pyg.validate(raise_on_error=True))

transform = RandomLinkSplit(
                            num_val = val_ratio,
                            num_test = test_ratio,
                            is_undirected=False,
                            add_negative_train_samples=False,
                            )

train_data, val_data, test_data = transform(iric_pyg)
print("Train, test, val sets look valid :",train_data.validate(raise_on_error=True), test_data.validate(raise_on_error=True), val_data.validate(raise_on_error=True))

Data(edge_index=[2, 1452521], edge_attr=[1452521], num_nodes=82249)

Dataset looks valid : True
Train, test, val sets look valid : True True True


In [30]:
# Initiating model :
complex = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations = train_data.edge_index.size()[1],
    hidden_channels=hidden_channels,
).to(device)
complex.reset_parameters()
complex.to(device)

# Initiaing loader :
head_index = train_data.edge_index[0]
tail_index = train_data.edge_index[1]
rel_type = train_data.edge_attr

loader = complex.loader(
    head_index = head_index,
    tail_index = tail_index,
    rel_type = rel_type,
    batch_size=batch_size,
    shuffle=True,
)
print("Loader type :", type(loader))

# initiating optimizers :
complex_optimizer = optim.Adam(complex.parameters())

# Defining test and train functions :
@torch.no_grad()
def test(data, model):
    model.eval()
    return model.test(
        head_index=data.edge_index[0],
        tail_index=data.edge_index[1],
        rel_type=data.edge_attr,
        batch_size=batch_size, # No need for Tail_Only_ComplEx because one use only 1000 random sample instead of the full dataset.
        k=K, #The k in Hit@k
    )

def train(loader, model, optimizer):
    model.train()
    total_loss = total_examples = 0
    for head_index, rel_type, tail_index in loader:
        optimizer.zero_grad()
        loss = model.loss(head_index, rel_type, tail_index)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * head_index.numel()
        total_examples += head_index.numel()
    return total_loss / total_examples

Loader type : <class 'torch_geometric.nn.kge.loader.KGTripletLoader'>


In [32]:
# Running XP :

wandb.init(
    settings=wandb.Settings(start_method="fork"),
    project="Complex_on_Iric_with_randomized_Ontology",
    
    # track hyperparameters and run metadata
    config={
    "architecture": "ComplEx on Full Iric",
    "dataset": "Iric",
    "epochs": epochs,
    'hidden_channels' : hidden_channels,
    'batch_size' : batch_size
    }
)

losses = []
for epoch in range(1, epochs+1):
    loss = train(model=to_complex, loader = loader, optimizer=complex_optimizer)
    losses.append(loss)
    wandb.log({"loss": loss})

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    rank, mrr, hit = test(val_data, model=to_complex)
    print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}', f'Val MRR: {mrr:.4f}, Val Hits@10: {hit:.4f}')

    wandb.log({"Val Mean Rank" : rank, "Val MRR" : mrr, "hits@10": hit})


wandb.finish()


Epoch: 001, Loss: 0.6931


  1%|          | 7002/1162017 [02:40<5:15:36, 60.99it/s] 

In [None]:
# Triples to pyg framework :

# Edges index :
heads = list(iric_triples_with_randomised_GO['mapped_subject'])
tails = list(iric_triples_with_randomised_GO['mapped_object'])
edge_index = torch.tensor([heads,tails], dtype=torch.long)
edge_attributes = torch.tensor(iric_triples_with_randomised_GO['mapped_predicate'])

iric_pyg = Data(
                num_nodes = len(entity_set),
                edge_index = edge_index,
                edge_attr = edge_attributes
                )

print(iric_pyg)

print("\nDataset looks valid :",iric_pyg.validate(raise_on_error=True))

transform = RandomLinkSplit(
                            num_val = val_ratio,
                            num_test = test_ratio,
                            is_undirected=False,
                            add_negative_train_samples=False,
                            )

train_data, val_data, test_data = transform(iric_pyg)
print("Train, test, val sets look valid :",train_data.validate(raise_on_error=True), test_data.validate(raise_on_error=True), val_data.validate(raise_on_error=True))

# Initiating model :
complex = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations = train_data.edge_index.size()[1],
    hidden_channels=hidden_channels,
).to(device)
complex.reset_parameters()
complex.to(device)

# Initiaing loader :
head_index = train_data.edge_index[0]
tail_index = train_data.edge_index[1]
rel_type = train_data.edge_attr

loader = complex.loader(
    head_index = head_index,
    tail_index = tail_index,
    rel_type = rel_type,
    batch_size=batch_size,
    shuffle=True,
)
print("Loader type :", type(loader))

# initiating optimizers :
complex_optimizer = optim.Adam(complex.parameters())

In [8]:
# To recreate the bug : 
# Modifying complex

def shuffle_tensor(t: torch.Tensor):
    '''
    Shuffles elements of a tensor.
    WARNING :
    shuffle_tensor(torch.tensor([[0,1,2,3,4,5],[6,7,8,9,0,1]])) returns :
    tensor([[0, 1, 2, 3, 4, 5],  OR tensor([[6, 7, 8, 9, 0, 1],
            [6, 7, 8, 9, 0, 1]])            [0, 1, 2, 3, 4, 5]])
    '''
    idx = torch.randperm(t.shape[0])
    return t[idx].view(t.size())

def random_sample(
    head_index: torch.Tensor,
    rel_type: torch.Tensor,
    tail_index: torch.Tensor,
    ):

    """
    Randomly samples negative triplets by replacing the tail.
    Args:
    head_index (torch.Tensor): The head indices.
    rel_type (torch.Tensor): The relation type.
    tail_index (torch.Tensor): The tail indices.
    """

    tail_index = shuffle_tensor(tail_index.clone()).to(device)

    return head_index, rel_type, tail_index

def make_pos_and_neg(head_index, rel_type, tail_index, neg_for_pos=1):
    pos = head_index, rel_type, tail_index
    neg = list(random_sample(*pos))

    for i in range(3):
        neg[i] = torch.cat([pos[i].clone().detach()]*neg_for_pos)

    neg[2] = shuffle_tensor(neg[2])
    
    return tuple(pos), tuple(neg)


class tail_only_ComplEx(ComplEx):
    
    '''
    Overwritting random_sample() to make negative triples by setting a random tail to each triple,
    instead of setting a random head or tail.

    Moreover, test() is made on 1000 random triples instead of all existing triples. The eval is much faster like this on large datasets.

    I add a method to have different numbers of negative per positive when calculating the loss.

    '''

    def __init__(
        self,
        num_nodes: int,
        num_relations: int,
        hidden_channels: int,
        sparse: bool = False,
        negative_per_positive = 1
                ):
        super().__init__(num_nodes, num_relations, hidden_channels)

        self.num_nodes = num_nodes
        self.num_relations = num_relations
        self.hidden_channels = hidden_channels

        self.node_emb = torch.nn.Embedding(num_nodes, hidden_channels, sparse=sparse)
        self.rel_emb = torch.nn.Embedding(num_relations, hidden_channels, sparse=sparse)
        self.neg_for_pos = negative_per_positive

    @torch.no_grad()
    def test(
        self,
        head_index: torch.Tensor,
        rel_type: torch.Tensor,
        tail_index: torch.Tensor,
        k: int = 10,
        log: bool = True,
    ):
        r"""Evaluates the model quality by computing Mean Rank, MRR and
        Hits@:math:`k` across 1000 tail entities.

        Args:
            head_index (torch.Tensor): The head indices.
            rel_type (torch.Tensor): The relation type.
            tail_index (torch.Tensor): The tail indices.
            batch_size (int): The batch size to use for evaluating.
            k (int, optional): The :math:`k` in Hits @ :math:`k`.
                (default: :obj:`10`)
            log (bool, optional): If set to :obj:`False`, will not print a
                progress bar to the console. (default: :obj:`True`)
        """
         
        mean_ranks, reciprocal_ranks, hits_at_k = [], [], []

        random_index = random.sample(range(tail_index.numel()), 1000)
        tested_triples_index = tqdm(random_index,
                                    desc=f"Calculating mean rank; MRR, Hit@10 among 1000 triples") if log else random_index
        
        # List the tails existing in the graph :
        tails = tail_index.unique().tolist()

        for i in tested_triples_index:

            h, r, t = head_index[i], rel_type[i], tail_index[i]

            # Select the index of 999 false tails :
            tail_indices = random.sample(tails, 999)
            # Add the true tail's index :
            tail_indices.append(i)
            # Score each triple (h, r, false_t) :
            ts = torch.tensor(tail_indices).to(device)
            rels = r.expand_as(ts).to(device)
            heads = h.expand_as(ts).to(device)
            self.to(device)
            scores = self(heads, rels, ts)
            # Sort the scores and find the rank of the true triplet score
            sorted_indices = torch.argsort(scores, descending=True)
            rank = (sorted_indices == 999).nonzero().item()

            # Using rank to precalculate metrics :
            mean_ranks.append(rank)
            reciprocal_ranks.append(1 / (rank + 1))
            hits_at_k.append(rank < k)

        # Calculate metrics :
        mean_rank = float(torch.tensor(mean_ranks, dtype=torch.float).mean())
        mrr = float(torch.tensor(reciprocal_ranks, dtype=torch.float).mean())
        hits_at_k = int(torch.tensor(hits_at_k).sum()) / len(hits_at_k)

        return mean_rank, mrr, hits_at_k