## Creates the labels that will be used for training from knowledge graphs

In [11]:
import pandas as pd
from random import randint
import random
import json

In [12]:
# Load the JSON mapping that is later applied to the disease column.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('../../data/SND/SNF_diseases_matched.json', 'r', encoding='utf-8') as file:
    snf_disease = json.load(file)

In [6]:
# Read the tab-separated drug-disease interaction network.
# header=None: the first line of the file is treated as data, not column names.
df = pd.read_csv(
    '../../data/SND/drugDiseaseInteractions.txt',
    sep='\t',
    header=None,
)

In [8]:
# Reverse the column order, then give the columns their names
# (disease identifier first, drug identifier second).
reversed_columns = df.columns[::-1]
df = df.loc[:, reversed_columns]
df.columns = ["# Disease(MESH)", "Chemical"]

In [9]:
# Display the edge list after the columns were reordered and named.
df

Unnamed: 0,# Disease(MESH),Chemical
0,C0007102,DB00002
1,C0003872,DB00005
2,C0003873,DB00005
3,C0038013,DB00005
4,C0406317,DB00005
...,...,...
5680,C0220989,DB09046
5681,C0206682,DB09078
5682,C0238463,DB09078
5683,C0220987,DB09144


In [16]:
# Translate the disease identifiers through the snf_disease mapping, then keep
# only the rows whose disease is one of the mapping's target values.
disease_col = '# Disease(MESH)'
valid_diseases = list(snf_disease.values())
df[disease_col] = df[disease_col].replace(snf_disease)
df = df[df[disease_col].isin(valid_diseases)]

In [17]:
# Display the edge list after mapping and filtering the disease identifiers.
df

Unnamed: 0,# Disease(MESH),Chemical
0,D003110,DB00002
1,D015535,DB00005
2,D001172,DB00005
3,D013167,DB00005
5,D001171,DB00005
...,...,...
5670,D020165,DB06819
5671,D020159,DB06819
5676,D009196,DB08877
5678,D052497,DB09046


In [18]:
# Load the drug and disease similarity matrices from which the labels are to
# be obtained; the first CSV column becomes the row index.
drug_df = pd.read_csv('../../data/processed_drug_simmat_SNF19.csv', index_col=0)
disease_df = pd.read_csv('../../data/processed_disease_simmat_SNF14.csv', index_col=0)

# The row labels of each matrix are the drugs / diseases available for labelling.
drug_names, disease_names = drug_df.index, disease_df.index

In [19]:
# Distinct drugs in the edge list vs. drugs present in the similarity matrix.
drugs_in_edges = set(df.iloc[:, 1])
print(len(drugs_in_edges), len(drug_names))

815 705


In [20]:
# Distinct diseases in the edge list vs. diseases present in the similarity matrix.
diseases_in_edges = set(df.iloc[:, 0])
print(len(diseases_in_edges), len(disease_names))

419 436


In [21]:
# Keep only the edges whose drug AND disease both appear in the
# similarity-matrix indices.
known_drug = df['Chemical'].isin(drug_names)
known_disease = df['# Disease(MESH)'].isin(disease_names)
df = df[known_drug & known_disease]
df

Unnamed: 0,# Disease(MESH),Chemical
0,D003110,DB00002
1,D015535,DB00005
2,D001172,DB00005
3,D013167,DB00005
5,D001171,DB00005
...,...,...
5668,D013622,DB06803
5669,D004169,DB06803
5676,D009196,DB08877
5678,D052497,DB09046


In [22]:
# Collect the known interactions as a set of (disease, drug) tuples.
# Zipping the two columns yields the same pairs as iterating with iterrows(),
# without building a Series object for every row.
interaction_set = set(zip(df['# Disease(MESH)'], df['Chemical']))
len(interaction_set)

2597

In [26]:
# Randomly sample (disease, drug) pairs that are NOT known interactions; these
# become the negative (label 0) examples.
non_interaction_set = set()
# NOTE(review): `imbalance` is never read in this cell; the number of negatives
# is fixed at twice the number of positives. Confirm which ratio is intended.
imbalance = 1
target_size = 2 * len(interaction_set)
# `<` rather than `!=` so the loop also terminates if the set is ever at or
# above the target when the condition is checked.
while len(non_interaction_set) < target_size:
    drug = drug_names[randint(0, len(drug_names) - 1)]
    disease = disease_names[randint(0, len(disease_names) - 1)]
    pair = (disease, drug)
    # interaction_set stores (disease, drug) tuples, so the membership test
    # must use the same order. The previous test used (drug, disease), which
    # does not match the stored tuples and let known interactions be sampled
    # as negatives.
    if pair not in interaction_set:
        non_interaction_set.add(pair)

In [27]:
# Build the labelled table: columns 0 and 1 hold the (disease, drug) pair and
# column 2 holds the label (0 = sampled non-interaction, 1 = known interaction).
non_interaction_list = list(non_interaction_set)
interaction_list = list(interaction_set)

frames = []
for label, pairs in ((0, non_interaction_list), (1, interaction_list)):
    frame = pd.DataFrame(pairs)
    frame[2] = [label] * len(frame)
    frames.append(frame)
non_interaction_df, interaction_df = frames

# Negatives first, then positives, stacked row-wise.
df_concatenated = pd.concat(frames, axis=0)
df_concatenated

Unnamed: 0,0,1,2
0,D007969,DB00628,0
1,D019958,DB00798,0
2,D016889,DB01015,0
3,D003456,DB00530,0
4,D010547,DB00091,0
...,...,...,...
2592,D003876,DB01130,1
2593,D007634,DB00860,1
2594,D001759,DB01167,1
2595,D001289,DB01577,1


In [28]:
# Write the full labelled table (sampled negatives followed by positives) to disk.
# NOTE(review): no `index=` argument is passed, so the row index is presumably
# written as the first CSV column; the two stacked frames each keep their own
# 0-based index, so those values repeat. Confirm downstream readers expect that.
df_concatenated.to_csv('../../data/labels_SNFd14_SNFd19_im1_orig.csv')

In [None]:
# Code to get subsets of labels..

In [14]:
# Number of positive (and, further down, negative) pairs to keep in the subset.
interaction_count = 5000

# Sort before shuffling: the iteration order of a set of string tuples changes
# between interpreter sessions (string hash randomization), so seeding the RNG
# only gives a reproducible subset if the list starts from a fixed order.
interaction_list = sorted(interaction_set)
random.seed(42)
random.shuffle(interaction_list)
# Slicing silently returns fewer items if fewer than interaction_count exist.
selected_interactions = interaction_list[:interaction_count]

In [15]:
# Sort before shuffling: the iteration order of a set of string tuples changes
# between interpreter sessions (string hash randomization), so seeding the RNG
# only gives a reproducible subset if the list starts from a fixed order.
non_interaction_list = sorted(non_interaction_set)
random.seed(42)
random.shuffle(non_interaction_list)
# Take as many negatives as positives were requested (interaction_count).
selected_non_interactions = non_interaction_list[:interaction_count]

In [16]:
# Build the labelled table for the selected subset: columns 0 and 1 hold the
# pair, column 2 holds the label (0 = non-interaction, 1 = interaction).
non_interaction_list = selected_non_interactions
interaction_list = selected_interactions

frames = []
for label, pairs in ((0, non_interaction_list), (1, interaction_list)):
    frame = pd.DataFrame(pairs)
    frame[2] = [label] * len(frame)
    frames.append(frame)
non_interaction_df, interaction_df = frames

# Negatives first, then positives, stacked row-wise.
df_concatenated = pd.concat(frames, axis=0)
df_concatenated

Unnamed: 0,0,1,2
0,D000686,DB00472,0
1,D001171,DB00706,0
2,D007500,DB00307,0
3,D020820,DB00637,0
4,D002389,DB01577,0
...,...,...,...
4995,D013345,DB00853,1
4996,D001321,DB00571,1
4997,D002277,DB01242,1
4998,D012208,DB00695,1


In [17]:
# Write the subset to the working directory; the file name embeds the subset size.
df_concatenated.to_csv(f'labelssmall{interaction_count}.csv')