## Code to generate input for the Model

In [15]:
import pandas as pd

In [55]:
prefix = "SNF_im2_orig_adam"
# loading the similarity matrices
drug_df = pd.read_csv('../../data/processed_drug_simmat_SNF19.csv', index_col=0)
disease_df = pd.read_csv('../../data/processed_disease_simmat_SNF14.csv', index_col=0)
labels = pd.read_csv('../../data/labels_SNFd14_SNFd19_im2_orig.csv', index_col=0)

In [56]:
# loading the PPI
df = pd.read_csv('../data/HumanNet-PI.tsv', sep='\t')
network = []
for i in range(len(df)):
    network.append(df.index[i])
# print(network)
net = pd.DataFrame(network)

In [57]:
import networkx as nx
# creating the graph
G = nx.from_pandas_edgelist(net, source=0, target=1, create_using=nx.Graph())
valid_nodes = list(G.nodes)

#network proximity calcuation based on shortest path(Dijkstras)
def calculate_network_proximity1(G, node1, node2, k ):
    try:
        shortest_path = nx.shortest_path_length(G, source=node1, target=node2)
        network_proximity = 1 / shortest_path
        return network_proximity
    except:
        return 0
        print("exception")

# jaccard similarity
def jaccard_similarity(G, node1, node2,k):
    neighbors1 = set(G.neighbors(node1))
    neighbors2 = set(G.neighbors(node2))
    if len(neighbors1) == 0 or len(neighbors2) == 0:
        return 0
    return len(neighbors1.intersection(neighbors2)) / len(neighbors1.union(neighbors2))

# Cosine similarity
def cosine_similarity(G, node1, node2,k):
    neighbors1 = set(G.neighbors(node1))
    neighbors2 = set(G.neighbors(node2))
    if len(neighbors1) == 0 or len(neighbors2) == 0:
        return 0
    intersection = len(neighbors1.intersection(neighbors2))
    norm1 = len(neighbors1)
    norm2 = len(neighbors2)
    return intersection / ((norm1 * norm2) ** 0.5)

# Adamic-Adar index
def adamic_adar_index(G, node1, node2,k):
    neighbors1 = set(G.neighbors(node1))
    neighbors2 = set(G.neighbors(node2))
    common_neighbors = neighbors1.intersection(neighbors2)
    score = 0
    for neighbor in common_neighbors:
        degree = G.degree(neighbor)
        if degree > 1:
            score += 1 / (degree ** 0.5)
    return score

# Preferential attachment
def preferential_attachment(G, node1, node2,k):
    degree1 = G.degree(node1)
    degree2 = G.degree(node2)
    return degree1 * degree2

In [58]:
import json
# loading the known drug-gene interactions
with open('../data/gene_dict_drug.json', 'r') as f:
    gene_dict_drug = json.load(f)
with open('../data/gene_dict_disease.json', 'r') as f:
    gene_dict_disease = json.load(f)

In [59]:
drug_names = drug_df.index
disease_names = disease_df.index

In [60]:
new_gene_dict_drug = {}
for key, value in gene_dict_drug.items():
    if(key in drug_names):
        value_int_list = [int(x) for x in value]
        intersection = list(set(value_int_list).intersection(valid_nodes))
        new_gene_dict_drug[key] = intersection

In [61]:
new_gene_dict_disease = {}
for key, value in gene_dict_disease.items():
    if(key in disease_names):
        value_int_list = [int(x) for x in value]
        intersection = list(set(value_int_list).intersection(valid_nodes))
        new_gene_dict_disease[key] = intersection

In [62]:
from tqdm import tqdm

In [63]:
labels

Unnamed: 0,0,1,2
0,D020521,DB00596,0
1,D017674,DB00770,0
2,D014235,DB00404,0
3,D001249,DB00956,0
4,D015451,DB01263,0
...,...,...,...
2592,D003876,DB01130,1
2593,D007634,DB00860,1
2594,D001759,DB01167,1
2595,D001289,DB01577,1


In [64]:
from multiprocessing import Pool

def calculate_proximities(args):
    index, row, new_gene_dict_drug = args
    drug_genes = new_gene_dict_drug[row[1]]
    disease_genes = new_gene_dict_disease[row[0]]

    network_proximities = []
    for drug_gene in drug_genes:
        for disease_gene in disease_genes:
            network_proximities.append(adamic_adar_index(G, drug_gene, disease_gene,10000))

    mean = sum(network_proximities) / len(network_proximities)
    median = sorted(network_proximities)[len(network_proximities) // 2]
    minimum = min(network_proximities)
    maximum = max(network_proximities)

    return (mean, median, minimum, maximum)

proximities = []
with Pool() as p:
    for result in tqdm(p.imap(calculate_proximities, [(index, row, new_gene_dict_drug) for index, row in labels.iterrows()]), total=len(labels)):
        proximities.append(result)


100%|██████████████████████████████████████████████████| 7791/7791 [01:24<00:00, 92.59it/s]


In [65]:
proximities

[(0.04808063722458072, 0, 0, 20.48721480795754),
 (0.002731072644273163, 0, 0, 0.5773502691896258),
 (0.007727843919309096, 0, 0, 2.8015645320252673),
 (0.007133082301056406, 0, 0, 4.408336801824895),
 (0.005368346205914879, 0, 0, 1.3798817335714197),
 (0.008383432619915565, 0, 0, 7.537143199606266),
 (0.01869722077197283, 0, 0, 13.14199912955752),
 (0.01822394190885901, 0, 0, 1.3798817335714197),
 (0.030674868719593924, 0, 0, 20.48721480795754),
 (0.03760058999684583, 0, 0, 0.5389517388041382),
 (0.005920708150779269, 0, 0, 1.0622084810793673),
 (0.0009633194974401232, 0, 0, 0.38130643285972254),
 (0.002580918875942654, 0, 0, 1.3798817335714197),
 (0.017661608276662657, 0, 0, 6.476986174171758),
 (0.0, 0, 0, 0),
 (0.0024575351323083607, 0, 0, 2.8015645320252673),
 (0.03456935723728683, 0, 0, 26.673090833804743),
 (0.0019427624045450587, 0, 0, 0.8973308969457598),
 (0.01139551753627456, 0, 0, 1.3798817335714197),
 (0.0028638912533977025, 0, 0, 1.3798817335714197),
 (0.01592561861178320

In [66]:
import numpy as np
dd_list = []
label_list = []
dds_list = []
proximity_dict = {}
for index, row in labels.iterrows():
    drug_name = row[1]
    disease_name = row[0]
    label = row[2]
    drug_vector = list(drug_df.loc[drug_name,:].values)
    disease_vector = list(disease_df.loc[disease_name,:].values)
    proximity_vector = list(proximities[index])
    proximity_dict[(drug_name,disease_name)]  = proximity_vector
    label_list.append(label)
    dd_vector = drug_vector + disease_vector + [label]
    dds_vector = drug_vector + disease_vector + proximity_vector + [label]
    # print(len(dd_vector), len(dds_vector))
    # break
    dd_vector = np.array(dd_vector)
    dds_vector = np.array(dds_vector)
    dd_list.append(dd_vector)
    dds_list.append(dds_vector)

proximity_dict_str_keys = {str(k): v for k, v in proximity_dict.items()}

with open('../../data/prox_dict_'+ prefix +'.json', 'w') as f:
    # Write the dictionary with string keys to the file as JSON
    json.dump(json.dumps(proximity_dict_str_keys, default=lambda x: list(x)), f)

  dd_vector = np.array(dd_vector)
  dds_vector = np.array(dds_vector)


In [67]:
import numpy as np

# Save the list of arrays
np.save('../../data/dds_'+prefix + '.npy', dds_list)
# np.save('dd10000_o.npy', dd_list)

  arr = np.asanyarray(arr)
