## Code to generate input for the Model

In [1]:
import math

import pandas as pd

In [None]:
# Tag embedded in the names of the output files written later in the notebook.
prefix = "SNF_im1"

# Similarity matrices and the label table; in each CSV the first column is
# used as the row index.
drug_df = pd.read_csv(
    '../../data/processed_drug_simmat_SNF19.csv',
    index_col=0,
)
disease_df = pd.read_csv(
    '../../data/processed_disease_simmat_SNF14.csv',
    index_col=0,
)
labels = pd.read_csv(
    '../../data/labels_SNFd14_SNFd19_im1.csv',
    index_col=0,
)

In [3]:
# Load the protein-protein interaction (PPI) table and turn it into an edge
# list DataFrame.
# NOTE(review): the edges are taken from df.index, not from the columns --
# presumably read_csv parses the leading columns of this file as the index;
# confirm against the file layout.
df = pd.read_csv('../data/HumanNet-PI.tsv', sep='\t')
network = [df.index[i] for i in range(len(df))]
net = pd.DataFrame(network)

In [4]:
import networkx as nx
# Build an undirected graph from the edge list: column 0 holds one endpoint
# of each edge and column 1 the other.
G = nx.from_pandas_edgelist(
    net,
    source=0,
    target=1,
    create_using=nx.Graph(),
)
# Every node present in the graph, as a plain list.
valid_nodes = list(G.nodes)

# Network proximity based on the unweighted shortest-path length (hop count).
def calculate_network_proximity1(G, node1, node2, k=None):
    """Return the inverse shortest-path distance between two nodes of ``G``.

    Parameters
    ----------
    G : graph
        Graph passed straight to ``nx.shortest_path_length``.
    node1, node2 : node
        Source and target nodes.
    k : any, optional
        Unused. Kept (now with a default) so existing four-argument calls
        keep working.

    Returns
    -------
    float or int
        ``1 / d`` where ``d`` is the number of edges on a shortest path from
        ``node1`` to ``node2``. ``0`` is returned when no path exists, when
        either node is missing from the graph, or when ``d`` is zero
        (``node1 == node2``).

    Notes
    -----
    Only the "no path" / "unknown node" conditions are treated as zero
    proximity. Anything else (including ``KeyboardInterrupt``) propagates
    instead of being silently turned into a score of 0.
    """
    try:
        shortest_path = nx.shortest_path_length(G, source=node1, target=node2)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return 0
    if shortest_path == 0:
        # A zero-length path would divide by zero. The previous implementation
        # returned 0 here (via a swallowed ZeroDivisionError); that value is
        # kept so downstream features do not change.
        return 0
    return 1 / shortest_path

# Jaccard similarity
def jaccard_similarity(G, node1, node2):
    """Jaccard coefficient of the neighbour sets of ``node1`` and ``node2``.

    Returns ``|N1 & N2| / |N1 | N2|``, or ``0`` when either node has no
    neighbours in ``G``.
    """
    first = set(G.neighbors(node1))
    second = set(G.neighbors(node2))
    if not first or not second:
        return 0
    shared = first & second
    combined = first | second
    return len(shared) / len(combined)

# Cosine similarity
def cosine_similarity(G, node1, node2):
    """Cosine similarity of the neighbour sets of ``node1`` and ``node2``.

    Computed as the number of shared neighbours divided by the square root
    of the product of the two neighbourhood sizes. Returns ``0`` when either
    node has no neighbours in ``G``.
    """
    adj1 = set(G.neighbors(node1))
    adj2 = set(G.neighbors(node2))
    if not (adj1 and adj2):
        return 0
    overlap = len(adj1 & adj2)
    return overlap / ((len(adj1) * len(adj2)) ** 0.5)

# Adamic-Adar index
def adamic_adar_index(G, node1, node2):
    """Adamic-Adar index of ``node1`` and ``node2`` in ``G``.

    Sums ``1 / log(degree(w))`` over every common neighbour ``w`` of the two
    nodes. The previous implementation weighted by ``1 / sqrt(degree)``,
    which is not the Adamic-Adar weighting.

    Common neighbours whose degree is not greater than 1 are skipped, since
    ``log(1)`` is zero and would cause a division by zero.

    Returns ``0`` when the nodes share no (eligible) neighbours.
    """
    common_neighbors = set(G.neighbors(node1)) & set(G.neighbors(node2))
    score = 0
    for neighbor in common_neighbors:
        degree = G.degree(neighbor)
        if degree > 1:
            score += 1 / math.log(degree)
    return score

# Preferential attachment
def preferential_attachment(G, node1, node2):
    """Preferential-attachment score: the product of the two nodes' degrees."""
    return G.degree(node1) * G.degree(node2)

In [6]:
import json
# Read the two gene dictionaries (one keyed for drugs, one for diseases)
# from their JSON files.
with open('../data/gene_dict_drug.json', 'r') as drug_file:
    gene_dict_drug = json.load(drug_file)

with open('../data/gene_dict_disease.json', 'r') as disease_file:
    gene_dict_disease = json.load(disease_file)

In [7]:
# Row labels of the two similarity matrices; used below to filter the gene
# dictionaries.
drug_names, disease_names = drug_df.index, disease_df.index

In [8]:
# Keep only the drugs that appear in the drug similarity matrix and, for each
# of them, only the genes that are nodes of the PPI graph.
# The node list is converted to a set once: intersecting against the raw list
# would re-scan every graph node for every drug.
valid_node_set = set(valid_nodes)
new_gene_dict_drug = {}
for key, value in gene_dict_drug.items():
    if key in drug_names:
        # Gene ids are converted with int() so they compare equal to the
        # graph's node ids.
        new_gene_dict_drug[key] = list(
            valid_node_set.intersection(int(x) for x in value)
        )

In [9]:
# Keep only the diseases that appear in the disease similarity matrix and,
# for each of them, only the genes that are nodes of the PPI graph.
# The node list is converted to a set once: intersecting against the raw list
# would re-scan every graph node for every disease.
valid_node_set = set(valid_nodes)
new_gene_dict_disease = {}
for key, value in gene_dict_disease.items():
    if key in disease_names:
        # Gene ids are converted with int() so they compare equal to the
        # graph's node ids.
        new_gene_dict_disease[key] = list(
            valid_node_set.intersection(int(x) for x in value)
        )

In [10]:
from tqdm import tqdm

In [12]:
labels

Unnamed: 0,0,1,2
0,D007969,DB00628,0
1,D019958,DB00798,0
2,D016889,DB01015,0
3,D003456,DB00530,0
4,D010547,DB00091,0
...,...,...,...
2592,D003876,DB01130,1
2593,D007634,DB00860,1
2594,D001759,DB01167,1
2595,D001289,DB01577,1


In [14]:
from multiprocessing import Pool

def calculate_proximities(args):
    """Summarise the network proximity between one drug's and one disease's genes.

    Parameters
    ----------
    args : tuple
        ``(index, row, new_gene_dict_drug)``. ``row`` holds the disease id at
        position 0 and the drug id at position 1 (a pandas Series as produced
        by ``DataFrame.iterrows`` or any plain sequence). ``index`` is unused.

    Returns
    -------
    tuple
        ``(mean, median, minimum, maximum)`` of the proximities over all
        (drug gene, disease gene) pairs. ``median`` is the element at
        position ``len // 2`` of the sorted values, i.e. the upper middle
        value for an even count. ``(0, 0, 0, 0)`` is returned when there are
        no gene pairs to score.

    Notes
    -----
    Reads the module-level ``G``, ``new_gene_dict_disease`` and
    ``calculate_network_proximity1``.
    """
    index, row, new_gene_dict_drug = args
    # Series are read by position through .iloc: integer keys on a Series
    # whose labels are not integers rely on a deprecated positional fallback.
    # Plain sequences have no .iloc and are indexed directly.
    positional = getattr(row, "iloc", row)
    # An id without an entry is treated like an id with no genes in the graph.
    drug_genes = new_gene_dict_drug.get(positional[1], [])
    disease_genes = new_gene_dict_disease.get(positional[0], [])

    network_proximities = [
        calculate_network_proximity1(G, drug_gene, disease_gene, 10000)
        for drug_gene in drug_genes
        for disease_gene in disease_genes
    ]

    if not network_proximities:
        # Nothing to average: avoid the ZeroDivisionError / empty min() that
        # would otherwise abort the whole run for a single row.
        return (0, 0, 0, 0)

    mean = sum(network_proximities) / len(network_proximities)
    median = sorted(network_proximities)[len(network_proximities) // 2]
    minimum = min(network_proximities)
    maximum = max(network_proximities)

    return (mean, median, minimum, maximum)

# Score every label row in a worker pool. Results are appended one by one, in
# row order, so whatever was finished is still in `proximities` if the run is
# interrupted.
proximities = []
with Pool() as pool:
    tasks = [
        (index, row, new_gene_dict_drug)
        for index, row in labels.iterrows()
    ]
    ordered_results = pool.imap(calculate_proximities, tasks)
    for result in tqdm(ordered_results, total=len(labels)):
        proximities.append(result)


  1%|▍                                                 | 67/7791 [01:00<1:55:50,  1.11it/s]


KeyboardInterrupt: 

In [33]:
proximities

[(0.285418686002611, 0.25, 0, 1.0),
 (0.2703090149138751, 0.25, 0, 1.0),
 (0.30194518933851006, 0.3333333333333333, 0, 1.0),
 (0.30684475986431975, 0.3333333333333333, 0, 1.0),
 (0.3074615782767849, 0.3333333333333333, 0, 1.0),
 (0.2786309357006147, 0.25, 0, 1.0),
 (0.25871792207250344, 0.25, 0, 1.0),
 (0.25283067460334663, 0.25, 0, 1.0),
 (0.31484801255686196, 0.3333333333333333, 0, 1.0),
 (0.31075268817204277, 0.3333333333333333, 0.16666666666666666, 1.0),
 (0.3086418153749696, 0.3333333333333333, 0, 1.0),
 (0.2782592308129357, 0.25, 0, 1.0),
 (0.2529781491928099, 0.25, 0, 1.0),
 (0.3416666666666662, 0.3333333333333333, 0.2, 0.5),
 (0.28205208150190686, 0.25, 0, 1.0),
 (0.3042204585537687, 0.3333333333333333, 0, 1.0),
 (0.3024624744648667, 0.3333333333333333, 0, 1.0),
 (0.29914282952563565, 0.3333333333333333, 0, 1.0),
 (0.2714009238175921, 0.25, 0, 1.0),
 (0.3027462121212124, 0.3333333333333333, 0.2, 0.5),
 (0.310997425997433, 0.3333333333333333, 0, 1.0),
 (0.29004884004884024, 0.25

In [13]:
import numpy as np
# Per-row outputs:
#   dd_list   -- drug similarity row + disease similarity row + label
#   dds_list  -- the same, with the four proximity statistics before the label
#   label_list / proximity_dict -- labels and proximity statistics on their own
dd_list = []
label_list = []
dds_list = []
proximity_dict = {}

# `proximities` holds one entry per row of `labels`, in row order. Refuse to
# continue with a partial list (e.g. after an interrupted run) rather than
# silently pairing rows with the wrong statistics.
if len(proximities) != len(labels):
    raise ValueError(
        f"expected {len(labels)} proximity tuples, got {len(proximities)}"
    )

# Rows and proximities are paired by position. The DataFrame row label must
# not be used as a list index: it comes from the CSV and is not guaranteed to
# be unique or to equal the row position.
for (_, row), proximity in zip(labels.iterrows(), proximities):
    # Columns by position: 0 = disease id, 1 = drug id, 2 = label.
    disease_name = row.iloc[0]
    drug_name = row.iloc[1]
    label = row.iloc[2]

    drug_vector = list(drug_df.loc[drug_name, :].values)
    disease_vector = list(disease_df.loc[disease_name, :].values)
    proximity_vector = list(proximity)

    proximity_dict[(drug_name, disease_name)] = proximity_vector
    label_list.append(label)
    dd_list.append(np.array(drug_vector + disease_vector + [label]))
    dds_list.append(
        np.array(drug_vector + disease_vector + proximity_vector + [label])
    )

# Write the proximity statistics to disk as JSON.
# JSON object keys must be strings, so each (drug, disease) tuple key of
# `proximity_dict` is flattened to "drug|disease". The mapping is built before
# the file is opened so a failure here does not leave an empty file behind.
serializable_proximities = {
    f"{drug}|{disease}": vector
    for (drug, disease), vector in proximity_dict.items()
}
with open('../../data/prox_dict_'+ prefix +'.json', 'w') as f:
    json.dump(serializable_proximities, f)

NameError: name 'proximities' is not defined

In [45]:
import numpy as np

# Persist the per-row feature vectors (with proximity statistics) as a .npy file.
dds_output_path = '../../data/dds_'+prefix + '.npy'
np.save(dds_output_path, dds_list)
# np.save('dd10000_o.npy', dd_list)