In [39]:
import matplotlib.pyplot as plt
import pickle
import numpy as np
import csv
import torch
import pandas as pd
import scipy.sparse as sp

In [40]:
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Map the proteins nodes </h1>

In [41]:
path = "../"

pp_df = pd.read_csv(path + 'datasets/bio-decagon-ppi.csv')  

In [42]:
pp_df

Unnamed: 0,Gene 1,Gene 2
0,114787,375519
1,114787,285613
2,114787,7448
3,114787,4914
4,114787,51343
...,...,...
715607,5634,5636
715608,5635,5636
715609,5635,5631
715610,5636,5631


In [43]:
genes_1 = list(pp_df["Gene 1"])
genes_2 = list(pp_df["Gene 2"])
targets = torch.tensor((genes_1 + genes_2)).unique().tolist()

In [44]:
len(targets)

19081

In [45]:
new_protein_map = {}
index = 0
for gene in targets:
    new_protein_map[gene] = index
    index += 1

In [46]:
pp_df["Gene 1"] = pp_df["Gene 1"].apply(lambda x: new_protein_map[x])
pp_df["Gene 2"] = pp_df["Gene 2"].apply(lambda x: new_protein_map[x])
pp_df

Unnamed: 0,Gene 1,Gene 2
0,14460,17695
1,14460,17140
2,14460,5157
3,14460,3289
4,14460,9828
...,...,...
715607,3826,3828
715608,3827,3828
715609,3827,3825
715610,3828,3825


In [47]:
genes_1 = list(pp_df["Gene 1"])
genes_2 = list(pp_df["Gene 2"])
pp_edge_index = torch.tensor([genes_1, genes_2])
pp_edge_index

tensor([[14460, 14460, 14460,  ...,  3827,  3828,  5630],
        [17695, 17140,  5157,  ...,  3825,  3825,  5635]])

<h1>Full side effect dataset</h1>

In [48]:
df = pd.read_csv(path + 'datasets/bio-decagon-combo.csv')  

In [49]:
df

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,CID000002173,CID000003345,C0151714,hypermagnesemia
1,CID000002173,CID000003345,C0035344,retinopathy of prematurity
2,CID000002173,CID000003345,C0004144,atelectasis
3,CID000002173,CID000003345,C0002063,alkalosis
4,CID000002173,CID000003345,C0004604,Back Ache
...,...,...,...,...
4649436,CID000003461,CID000003954,C0149871,deep vein thromboses
4649437,CID000003461,CID000003954,C0035410,rhabdomyolysis
4649438,CID000003461,CID000003954,C0043096,loss of weight
4649439,CID000003461,CID000003954,C0003962,ascites


In [50]:
from utils.utils import get_side_effect_index_from_text, \
    get_drug_index_from_text

df["STITCH 1"] = df["STITCH 1"].apply(lambda x: get_drug_index_from_text(x))
df["STITCH 2"] = df["STITCH 2"].apply(lambda x: get_drug_index_from_text(x))
df["Polypharmacy Side Effect"] = df["Polypharmacy Side Effect"].apply(lambda x: get_side_effect_index_from_text(x))

In [51]:
df

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,2173,3345,151714,hypermagnesemia
1,2173,3345,35344,retinopathy of prematurity
2,2173,3345,4144,atelectasis
3,2173,3345,2063,alkalosis
4,2173,3345,4604,Back Ache
...,...,...,...,...
4649436,3461,3954,149871,deep vein thromboses
4649437,3461,3954,35410,rhabdomyolysis
4649438,3461,3954,43096,loss of weight
4649439,3461,3954,3962,ascites


In [52]:
ddi_dataset_grouped = df.groupby(["Polypharmacy Side Effect"], dropna=False).size().reset_index()
ddi_dataset_grouped

Unnamed: 0,Polypharmacy Side Effect,0
0,731,12369
1,737,21410
2,768,2862
3,786,1482
4,814,122
...,...,...
1312,1527344,6101
1313,1527383,104
1314,1527407,519
1315,1527411,230


In [53]:
ddi_dataset_grouped = ddi_dataset_grouped[ddi_dataset_grouped.iloc[:, 1] > 250]
permitted_side_effects = ddi_dataset_grouped["Polypharmacy Side Effect"].values.tolist()
ddi_dataset = df[df["Polypharmacy Side Effect"].isin(permitted_side_effects)]
ddi_dataset

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,2173,3345,151714,hypermagnesemia
2,2173,3345,4144,atelectasis
3,2173,3345,2063,alkalosis
4,2173,3345,4604,Back Ache
5,2173,3345,34063,lung edema
...,...,...,...,...
4649436,3461,3954,149871,deep vein thromboses
4649437,3461,3954,35410,rhabdomyolysis
4649438,3461,3954,43096,loss of weight
4649439,3461,3954,3962,ascites


In [54]:
ddi_dataset_check = ddi_dataset.groupby(["Polypharmacy Side Effect"], dropna=False).size().reset_index()
ddi_dataset_check

Unnamed: 0,Polypharmacy Side Effect,0
0,731,12369
1,737,21410
2,768,2862
3,786,1482
4,833,6782
...,...,...
1092,1510475,2763
1093,1527336,1147
1094,1527344,6101
1095,1527407,519


In [55]:
g7 = ddi_dataset_check.iloc[:, 1].sort_values()
g7

895      252
620      253
579      257
891      259
931      259
       ...  
591    24430
514    25190
224    26037
30     27006
405    28568
Name: 0, Length: 1097, dtype: int64

Create new mappings for the dataset

In [56]:
combos = ddi_dataset_check["Polypharmacy Side Effect"].tolist()
len(combos)

1097

In [57]:
new_combo_map = {}
index = 0
for combo in combos:
    new_combo_map[combo] = index
    index += 1

In [58]:
drugs_1 = list(ddi_dataset["STITCH 1"])
drugs_2 = list(ddi_dataset["STITCH 2"])
drugs = torch.tensor((drugs_1 + drugs_2)).unique().tolist()

In [59]:
len(drugs)

645

In [60]:
new_drug_map = {}
index = 0
for drug in drugs:
    new_drug_map[drug] = index
    index += 1

Apply the new mappings to the dataset

In [61]:
ddi_dataset["STITCH 1"] = ddi_dataset["STITCH 1"].apply(lambda x: new_drug_map[x])
ddi_dataset["STITCH 2"] = ddi_dataset["STITCH 2"].apply(lambda x: new_drug_map[x])
ddi_dataset["Polypharmacy Side Effect"] = ddi_dataset["Polypharmacy Side Effect"].apply(lambda x: new_combo_map[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddi_dataset["STITCH 1"] = ddi_dataset["STITCH 1"].apply(lambda x: new_drug_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddi_dataset["STITCH 2"] = ddi_dataset["STITCH 2"].apply(lambda x: new_drug_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddi_dataset["Polypharmacy Side Effect"]

In [62]:
ddi_dataset

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,55,186,865,hypermagnesemia
2,55,186,70,atelectasis
3,55,186,19,alkalosis
4,55,186,76,Back Ache
5,55,186,623,lung edema
...,...,...,...,...
4649436,213,271,851,deep vein thromboses
4649437,213,271,650,rhabdomyolysis
4649438,213,271,783,loss of weight
4649439,213,271,64,ascites


<h1>New dp network</h1>

In [63]:
dp_df = pd.read_csv(path + 'datasets/bio-decagon-targets.csv') 
dp_df["STITCH"] = dp_df["STITCH"].apply(lambda x: get_drug_index_from_text(x))

In [64]:
drugs_list = torch.tensor(dp_df["STITCH"].tolist()).unique().tolist()
len(drugs_list)

284

In [65]:
dp_df

Unnamed: 0,STITCH,Gene
0,3488,1559
1,3488,8647
2,77992,3351
3,77992,3350
4,77992,3352
...,...,...
18685,5152,8484
18686,5152,81491
18687,5152,83551
18688,5152,680


Select only the drugs that exist in the ddi and the proteins that exist in the ppi

In [66]:
dp_df = dp_df[(dp_df['STITCH'].isin(drugs)) & (dp_df['Gene'].isin(targets))]
dp_df

Unnamed: 0,STITCH,Gene
0,3488,1559
1,3488,8647
2,77992,3351
3,77992,3350
4,77992,3352
...,...,...
18685,5152,8484
18686,5152,81491
18687,5152,83551
18688,5152,680


Apply the mappings to the dpi network

In [67]:
dp_df["Gene"] = dp_df["Gene"].apply(lambda x: new_protein_map[x])
dp_df["STITCH"] = dp_df["STITCH"].apply(lambda x: new_drug_map[x])
dp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_df["Gene"] = dp_df["Gene"].apply(lambda x: new_protein_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_df["STITCH"] = dp_df["STITCH"].apply(lambda x: new_drug_map[x])


Unnamed: 0,STITCH,Gene
0,218,1103
1,218,5757
2,554,2282
3,554,2281
4,554,2283
...,...,...
18685,403,5634
18686,403,13075
18687,403,13228
18688,403,474


In [68]:
genes = list(dp_df["Gene"])
drugs_list = list(dp_df["STITCH"])
dp_edge_index = torch.tensor([genes, drugs_list])
dp_edge_index

tensor([[ 1103,  5757,  2282,  ..., 13228,   474,  7795],
        [  218,   218,   554,  ...,   403,   403,   403]])

<h1>Export the dataframe for training</h1>

In [69]:
new_dd_pairs = ddi_dataset[["STITCH 1", "STITCH 2"]]
ddi_dataset["Pair"] = new_dd_pairs.values.tolist()
ddi_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddi_dataset["Pair"] = new_dd_pairs.values.tolist()


Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name,Pair
0,55,186,865,hypermagnesemia,"[55, 186]"
2,55,186,70,atelectasis,"[55, 186]"
3,55,186,19,alkalosis,"[55, 186]"
4,55,186,76,Back Ache,"[55, 186]"
5,55,186,623,lung edema,"[55, 186]"
...,...,...,...,...,...
4649436,213,271,851,deep vein thromboses,"[213, 271]"
4649437,213,271,650,rhabdomyolysis,"[213, 271]"
4649438,213,271,783,loss of weight,"[213, 271]"
4649439,213,271,64,ascites,"[213, 271]"


In [70]:
g4 = ddi_dataset.groupby(["Polypharmacy Side Effect"], dropna=False)["Pair"].apply(list).reset_index()
g4

Unnamed: 0,Polypharmacy Side Effect,Pair
0,0,"[[55, 186], [307, 539], [289, 378], [450, 510]..."
1,1,"[[30, 395], [401, 450], [205, 529], [210, 613]..."
2,2,"[[139, 518], [162, 474], [531, 558], [40, 43],..."
3,3,"[[205, 529], [30, 373], [30, 131], [171, 260],..."
4,4,"[[30, 395], [210, 613], [293, 444], [210, 491]..."
...,...,...
1092,1092,"[[339, 421], [210, 613], [310, 568], [157, 487..."
1093,1093,"[[11, 113], [25, 374], [42, 424], [290, 467], ..."
1094,1094,"[[210, 613], [50, 313], [210, 490], [52, 199],..."
1095,1095,"[[371, 563], [531, 558], [177, 558], [262, 632..."


Save the ddi network

In [71]:
# dd_edge_index_list = []
# for index, row in g4.iterrows():
#     edge_index = torch.tensor(row['Pair'])
#     side_effect_id = row["Polypharmacy Side Effect"]
#     dd_edge_index_list.append(edge_index)
#     torch.save(torch.t(edge_index),
#         f"../index_map/dd_edge_index/edge_index_{side_effect_id}.pt")
# dd_edge_index_list[0]

Save the ppi network

In [72]:
# torch.save(pp_edge_index,
#         '../index_map/pp_edge_index.pt')

Save the dpi network

In [73]:
# torch.save(dp_edge_index,
#         '../index_map/dp_edge_index.pt')

Save the new mapping dicts

In [74]:
# import pickle

# def save_to_pkl(path, obj):
#     with open(path, 'wb') as ff:
#         pickle.dump(obj, ff)

# WRITE_DATA_PATH = '../index_map/'

# save_to_pkl(WRITE_DATA_PATH+"mapping_dicts/new_drug_map.pkl", new_drug_map)
# save_to_pkl(WRITE_DATA_PATH+"mapping_dicts/new_protein_map.pkl", new_protein_map)
# save_to_pkl(WRITE_DATA_PATH+"mapping_dicts/new_combo_map.pkl", new_combo_map)

Save the new mapped dfs

In [75]:
# pp_df.to_pickle('../index_map/mapped_dataframes/pp_df.pkl')
# dp_df.to_pickle('../index_map/mapped_dataframes/dp_df.pkl')
# ddi_dataset.to_pickle('../index_map/mapped_dataframes/ddi_df.pkl')

# to load it ==> df = pd.read_pickle(file_name)

Save the statistics

In [76]:
# combo_num = len(dd_edge_index_list)
# drug_num = len(drugs)
# protein_num = len(targets)

# WRITE_DATA_PATH = '../index_map/'
# save_to_pkl(WRITE_DATA_PATH+"statistics.pkl", (drug_num, protein_num, combo_num))