# Imports, install and mount

<!--  -->

In [2]:
import torch
from torch_geometric.data import Data
import pandas as pd
import pickle
from torch_geometric.transforms import RandomLinkSplit

# Settings


In [5]:
file_path = "/home/ebutz/ESL2024/data/Os_to_GO_iric/genes_to_phenotypes_iric.tsv"

altail_mapping_save_path = "/home/ebutz/ESL2024/data/Os_to_GO_iric/altailed_Os_to_GO_iric.pickle"
altail_dict_save_path = "/home/ebutz/ESL2024/data/little_iric/DICT_altailed_Os_to_GO_iric.pickle"
datasets_save_path = '/home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/little_dataset_'
val_path = datasets_save_path + 'VAL'
test_path = datasets_save_path + 'TEST'
train_path = datasets_save_path + 'TRAIN'

device = 'cpu'

## Reading and mapping graph

In [11]:
iric = pd.read_csv(file_path, delimiter='\t', names=['subject', 'predicate','object'])
display(iric)

Unnamed: 0,subject,predicate,object
0,OsNippo01g010050,gene ontology,GO:0031267
1,OsNippo01g010050,gene ontology,GO:0006886
2,OsNippo01g010050,gene ontology,GO:0005622
3,OsNippo01g010050,gene ontology,GO:0005623
4,OsNippo01g010050,gene ontology,GO:0090630
...,...,...,...
169243,OsNippo12g248550,gene ontology,GO:0009409
169244,OsNippo12g248550,gene ontology,GO:0001666
169245,OsNippo12g250550,gene ontology,GO:0008270
169246,OsNippo12g255100,gene ontology,GO:0005576


In [12]:
# Mapping entities and relations to ids
entity_set = set(iric['object']).union(set(iric['subject']))
entity_to_mapping = {entity: int(i) for i, entity in enumerate(entity_set)}
relation_set = set(iric['predicate'])
relation_to_mapping = {relation: int(i) for i, relation in enumerate(relation_set)}

iric['mapped_subject'] = iric['subject'].apply(lambda x: entity_to_mapping[x])
iric['mapped_predicate'] = iric['predicate'].apply(lambda x: relation_to_mapping[x])
iric['mapped_object'] = iric['object'].apply(lambda x: entity_to_mapping[x])
print(iric.dtypes)
display(iric)

subject             object
predicate           object
object              object
mapped_subject       int64
mapped_predicate     int64
mapped_object        int64
dtype: object


Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object
0,OsNippo01g010050,gene ontology,GO:0031267,29269,0,8105
1,OsNippo01g010050,gene ontology,GO:0006886,29269,0,20179
2,OsNippo01g010050,gene ontology,GO:0005622,29269,0,25142
3,OsNippo01g010050,gene ontology,GO:0005623,29269,0,11107
4,OsNippo01g010050,gene ontology,GO:0090630,29269,0,11397
...,...,...,...,...,...,...
169243,OsNippo12g248550,gene ontology,GO:0009409,12214,0,1993
169244,OsNippo12g248550,gene ontology,GO:0001666,12214,0,12298
169245,OsNippo12g250550,gene ontology,GO:0008270,24465,0,27212
169246,OsNippo12g255100,gene ontology,GO:0005576,887,0,3993


In [13]:
from tqdm import tqdm
tqdm.pandas()

iric['mapped_alt_tails'] = iric.progress_apply(
                                             lambda row: 
                                             iric.loc[(iric['mapped_subject'] == row['mapped_subject']) & 
                                                      (iric['mapped_predicate'] == row['mapped_predicate']) & 
                                                      (iric['mapped_object'] != row['mapped_object']), 
                                                                                                               'mapped_object'].values, 
                                             axis=1)


display(iric)
iric.to_pickle(altail_mapping_save_path)
print(f"Saved mapping at {altail_mapping_save_path} :")
display(pd.read_pickle(altail_mapping_save_path))

100%|██████████| 169248/169248 [02:13<00:00, 1263.45it/s]


Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object,mapped_alt_tails
0,OsNippo01g010050,gene ontology,GO:0031267,29269,0,8105,"[20179, 25142, 11107, 11397, 25543, 24490]"
1,OsNippo01g010050,gene ontology,GO:0006886,29269,0,20179,"[8105, 25142, 11107, 11397, 25543, 24490]"
2,OsNippo01g010050,gene ontology,GO:0005622,29269,0,25142,"[8105, 20179, 11107, 11397, 25543, 24490]"
3,OsNippo01g010050,gene ontology,GO:0005623,29269,0,11107,"[8105, 20179, 25142, 11397, 25543, 24490]"
4,OsNippo01g010050,gene ontology,GO:0090630,29269,0,11397,"[8105, 20179, 25142, 11107, 25543, 24490]"
...,...,...,...,...,...,...,...
169243,OsNippo12g248550,gene ontology,GO:0009409,12214,0,1993,"[25466, 17386, 6443, 18599, 16454, 12298]"
169244,OsNippo12g248550,gene ontology,GO:0001666,12214,0,12298,"[25466, 17386, 6443, 18599, 16454, 1993]"
169245,OsNippo12g250550,gene ontology,GO:0008270,24465,0,27212,[]
169246,OsNippo12g255100,gene ontology,GO:0005576,887,0,3993,[13967]


Saved mapping at /home/ebutz/ESL2024/data/Os_to_GO_iric/altailed_Os_to_GO_iric.pickle :


Unnamed: 0,subject,predicate,object,mapped_subject,mapped_predicate,mapped_object,mapped_alt_tails
0,OsNippo01g010050,gene ontology,GO:0031267,29269,0,8105,"[20179, 25142, 11107, 11397, 25543, 24490]"
1,OsNippo01g010050,gene ontology,GO:0006886,29269,0,20179,"[8105, 25142, 11107, 11397, 25543, 24490]"
2,OsNippo01g010050,gene ontology,GO:0005622,29269,0,25142,"[8105, 20179, 11107, 11397, 25543, 24490]"
3,OsNippo01g010050,gene ontology,GO:0005623,29269,0,11107,"[8105, 20179, 25142, 11397, 25543, 24490]"
4,OsNippo01g010050,gene ontology,GO:0090630,29269,0,11397,"[8105, 20179, 25142, 11107, 25543, 24490]"
...,...,...,...,...,...,...,...
169243,OsNippo12g248550,gene ontology,GO:0009409,12214,0,1993,"[25466, 17386, 6443, 18599, 16454, 12298]"
169244,OsNippo12g248550,gene ontology,GO:0001666,12214,0,12298,"[25466, 17386, 6443, 18599, 16454, 1993]"
169245,OsNippo12g250550,gene ontology,GO:0008270,24465,0,27212,[]
169246,OsNippo12g255100,gene ontology,GO:0005576,887,0,3993,[13967]


In [14]:
def update_alt_tails(row, mapped_alt_tails):
    key = (row['mapped_subject'], row['mapped_predicate'])
    if key not in mapped_alt_tails:
        try:
            mapped_alt_tails[key] = set(row['mapped_alt_tails'])
        except Exception as e:
            print(f"Problem with {row['mapped_alt_tails']}: {e}")
    else:
        try:
            mapped_alt_tails[key].update(set(row['mapped_alt_tails']))
        except Exception as e:
            print(f"Problem with {row['mapped_alt_tails']}: {e}")

mapped_alt_tails = {}
tqdm.pandas(desc="Making dict of alternatives tails")
iric.progress_apply(lambda row: update_alt_tails(row, mapped_alt_tails), axis=1)

for key, value in mapped_alt_tails.items():
    mapped_alt_tails[key]=list(value)

print('First key-value pair in dict :', list(mapped_alt_tails.items())[0])

Making dict of alternatives tails:   0%|          | 0/169248 [00:00<?, ?it/s]

Making dict of alternatives tails: 100%|██████████| 169248/169248 [00:01<00:00, 85767.72it/s]

First key-value pair in dict : ((29269, 0), [11107, 11397, 25543, 8105, 24490, 20179, 25142])





In [15]:
with open(altail_dict_save_path , 'wb') as handle:
    pickle.dump(mapped_alt_tails, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
with open(altail_dict_save_path, 'rb') as handle:
    unserialized_data = pickle.load(handle)
print(list(unserialized_data.items())[0])

((29269, 0), [11107, 11397, 25543, 8105, 24490, 20179, 25142])


## Building init vars for Data :

In [None]:
# # Edges index
# heads = list(iric['mapped_subject'])
# tails = list(iric['mapped_object'])
# edge_index = torch.tensor([heads,tails], dtype=torch.long)
# # edges states
# edge_attributes = torch.tensor(iric['mapped_predicate'])

# iric_pyg = Data(
#                 num_nodes = len(entity_set),
#                 edge_index = edge_index,
#                 edge_attr = edge_attributes
#                 )

# print(iric_pyg)

# print("\nDataset looks valid ? \n",iric_pyg.validate(raise_on_error=True))

Data(edge_index=[2, 10001], edge_attr=[10001], num_nodes=3343)

Dataset looks valid ? 
 True


## Splitting dataset

In [None]:
# transform = RandomLinkSplit(
#                             num_val = 0.1,
#                             num_test = 0.1,
#                             is_undirected=False,
#                             add_negative_train_samples=False,
#                             )

# train, val, test = transform(iric_pyg)

# torch.save(obj=train, f = train_path)
# torch.save(obj=test, f = test_path)
# torch.save(obj=val, f = val_path)

# print(f'test saved at {test_path}\nval saved at : {val_path}\ntrain saved at : {train_path}\n')

# print('Loaded datasets look valid (val, test, train):',
# torch.load(val_path).validate(raise_on_error=True),
# torch.load(test_path).validate(raise_on_error=True),
# torch.load(train_path).validate(raise_on_error=True),'\n')

# print('Before :', val)
# print(' After :', torch.load(val_path))

test saved at /home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/little_dataset_TEST.pickle
val saved at : /home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/little_dataset_VAL.pickle
train saved at : /home/elliot/Documents/ESL2024/data/mapping_datasets_and_model_for_genes_to_phenotypes_iric/little_dataset_TRAIN.pickle

Loaded datasets look valid (val, test, train): True True True 

Before : Data(edge_index=[2, 8001], edge_attr=[8001], num_nodes=3343, edge_label=[2000], edge_label_index=[2, 2000])
 After : Data(edge_index=[2, 8001], edge_attr=[8001], num_nodes=3343, edge_label=[2000], edge_label_index=[2, 2000])
