In [1]:
import math
import statistics

import pandas as pd

# Load the HumanNet-PI edge list (tab-separated).
# NOTE(review): the printed frame reports a single column and df.index[0] is a
# (gene, gene) tuple, i.e. the header line is read as one label and the two
# gene-ID fields end up in a MultiIndex. Later cells rely on that layout.
df = pd.read_csv('HumanNet-PI.tsv', delimiter='\t')
print(df)

                 #EntrezGeneID1 EntrezGeneID2 LLS
283459    55278                          5.300852
5623      64096                          5.227007
56339     57721                          5.206936
100134934 80169                          5.145033
83605     889                            5.131085
...                                           ...
2771      79959                          1.396652
          781                            1.396652
          290                            1.396652
374378    84866                          1.396646
55177     55848                          1.396613

[158499 rows x 1 columns]


In [6]:
# Peek at the first index entry: each entry is a tuple holding the two gene IDs of one edge.
df.index[0]

(283459, 55278)

In [11]:
# Every index entry of `df` is a (gene, gene) tuple; gather them into a list
# and turn that into a two-column frame (default column labels 0 and 1).
network = list(df.index)
net = pd.DataFrame(network)

In [51]:
import networkx as nx

# Undirected graph whose edges are the rows of `net`
# (column 0 -> one endpoint, column 1 -> the other).
G = nx.from_pandas_edgelist(
    net,
    source=0,
    target=1,
    create_using=nx.Graph(),
)
# Snapshot of all node labels present in the graph.
valid_nodes = [node for node in G.nodes]

def calculate_network_proximity(G, node1, node2):
    """Return the inverse shortest-path length between two nodes of ``G``.

    Parameters
    ----------
    G : networkx graph
        Graph in which the path is searched.
    node1, node2 : hashable
        Source and target node labels.

    Returns
    -------
    float or int
        ``1 / d`` where ``d`` is the shortest-path length reported by
        ``nx.shortest_path_length``. ``0`` is returned when there is no path,
        when a node is not in the graph, or when ``d == 0`` (same node).

    Any other exception (e.g. passing something that is not a graph) is
    propagated instead of being silently converted to ``0``.
    """
    try:
        shortest_path = nx.shortest_path_length(G, source=node1, target=node2)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Unreachable or unknown nodes contribute no proximity.
        return 0
    if shortest_path == 0:
        # 1 / 0 is undefined. The former bare ``except`` turned the resulting
        # ZeroDivisionError into 0; keep that value but make it explicit.
        # NOTE(review): identical nodes are arguably the *closest* case --
        # confirm that 0 is the intended score here.
        return 0
    return 1 / shortest_path
def jaccard_similarity(G, node1, node2):
    """Jaccard coefficient of the neighbour sets of ``node1`` and ``node2``.

    Returns ``|N1 & N2| / |N1 | N2|``, or ``0`` when either node has no
    neighbours in ``G``.
    """
    first = set(G.neighbors(node1))
    second = set(G.neighbors(node2))
    if not first or not second:
        return 0
    shared = first & second
    combined = first | second
    return len(shared) / len(combined)

# Cosine similarity
def cosine_similarity(G, node1, node2):
    """Cosine-style similarity of the neighbour sets of two nodes.

    Computes ``|N1 & N2| / sqrt(|N1| * |N2|)`` and returns ``0`` when either
    node has no neighbours in ``G``.
    """
    first = set(G.neighbors(node1))
    second = set(G.neighbors(node2))
    if not (first and second):
        return 0
    overlap = len(first & second)
    return overlap / ((len(first) * len(second)) ** 0.5)

# Adamic-Adar index
def adamic_adar_index(G, node1, node2):
    """Adamic-Adar index of ``node1`` and ``node2`` in ``G``.

    Sums ``1 / log(degree(z))`` over every common neighbour ``z`` of the two
    nodes, so that shared low-degree neighbours count more than hubs.

    Parameters
    ----------
    G : graph-like
        Object exposing ``neighbors(node)`` and ``degree(node)``.
    node1, node2 : hashable
        Node labels present in ``G``.

    Returns
    -------
    float or int
        The index; ``0`` when the nodes share no neighbours.
    """
    neighbors1 = set(G.neighbors(node1))
    neighbors2 = set(G.neighbors(node2))
    common_neighbors = neighbors1.intersection(neighbors2)
    score = 0
    for neighbor in common_neighbors:
        degree = G.degree(neighbor)
        # log(1) == 0 would divide by zero, so degree-1 neighbours are skipped.
        if degree > 1:
            # The Adamic-Adar weight is 1 / log(degree), not 1 / sqrt(degree).
            score += 1 / math.log(degree)
    return score

# Preferential attachment
def preferential_attachment(G, node1, node2):
    """Preferential-attachment score: product of the two nodes' degrees in ``G``."""
    return G.degree(node1) * G.degree(node2)

In [26]:
# Sanity check on the two endpoints of the first edge: they are directly
# connected, so the expected proximity is 1.0. The function takes the graph
# `G`, not the edge-list DataFrame `net`.
calculate_network_proximity(G, 55278, 283459)

1.0

In [30]:
# Read both matrices with their first CSV column as the row index
# (presumably drug/disease similarity matrices, judging by the file names).
drug_df = pd.read_csv('processed_drug_simmat.csv', index_col=0)
disease_df = pd.read_csv('processed_disease_simmat.csv', index_col=0)

# Only the row labels are kept here, for filtering the gene dictionaries.
drug_names, disease_names = drug_df.index, disease_df.index

Index(['DB00007', 'DB00010', 'DB00014', 'DB00017', 'DB00035', 'DB00067',
       'DB00091', 'DB00104', 'DB00114', 'DB00115',
       ...
       'DB01623', 'DB01677', 'DB02546', 'DB03796', 'DB03904', 'DB04272',
       'DB04844', 'DB04861', 'DB06285', 'DB06287'],
      dtype='object', length=587)

In [None]:
# Display the row labels of disease_df.
disease_names

In [32]:
import json

# Identifier -> list of gene IDs, one JSON file for drugs and one for diseases.
with open('gene_dict_drug.json', 'r') as drug_file:
    gene_dict_drug = json.load(drug_file)

with open('gene_dict_disease.json', 'r') as disease_file:
    gene_dict_disease = json.load(disease_file)

# Show the drug mapping as the cell output.
gene_dict_drug

{'DB00001': [2147.0],
 'DB00002': [2215.0,
  713.0,
  716.0,
  712.0,
  2213.0,
  2212.0,
  9103.0,
  2209.0,
  715.0,
  2214.0,
  1956.0,
  714.0],
 'DB00004': [3561.0, 3559.0, 3560.0],
 'DB00005': [2214.0,
  9103.0,
  4049.0,
  2212.0,
  7133.0,
  2215.0,
  714.0,
  713.0,
  2213.0,
  7124.0,
  716.0,
  712.0,
  2209.0,
  715.0,
  5743.0],
 'DB00006': [4353.0, 2147.0],
 'DB00007': [2798.0],
 'DB00008': [3454.0, 3455.0],
 'DB00009': [2243.0, 5054.0, 5340.0, 5329.0],
 'DB00010': [2692.0],
 'DB00011': [3454.0, 3455.0],
 'DB00012': [2057.0],
 'DB00013': [5329.0,
  5055.0,
  6768.0,
  5054.0,
  4811.0,
  5104.0,
  5327.0,
  4036.0,
  5340.0,
  5328.0],
 'DB00014': [3973.0, 2798.0],
 'DB00015': [5329.0, 5340.0, 2243.0, 5054.0],
 'DB00016': [2057.0],
 'DB00017': [799.0],
 'DB00018': [3454.0, 3455.0],
 'DB00019': [1991.0, 1441.0],
 'DB00020': [1439.0, 3563.0, 5553.0, 6383.0, 1438.0],
 'DB00021': [6344.0],
 'DB00022': [3454.0, 3455.0],
 'DB00024': [7253.0],
 'DB00025': [433.0,
  5264.0,
  399

In [46]:
# Keep only drugs listed in `drug_names`; cast their gene IDs to int and keep
# the ones that are nodes of the network.
new_gene_dict_drug = {
    drug_id: list({int(gene) for gene in genes}.intersection(valid_nodes))
    for drug_id, genes in gene_dict_drug.items()
    if drug_id in drug_names
}

In [48]:
# Keep only diseases listed in `disease_names`; cast their gene IDs to int and
# keep the ones that are nodes of the network.
new_gene_dict_disease = {
    disease_id: list({int(gene) for gene in genes}.intersection(valid_nodes))
    for disease_id, genes in gene_dict_disease.items()
    if disease_id in disease_names
}

In [None]:
from tqdm import tqdm

In [None]:
# For every (drug, disease) pair, summarise the proximity between each drug
# gene and each disease gene as (mean, median, minimum, maximum).
# `indices[k]` is the (drug, disease) pair described by `proximities[k]`.
indices = []
proximities = []

# Placeholder for pairs with nothing to aggregate; keeps both lists aligned.
nan = float('nan')

for drug, drug_genes in tqdm(new_gene_dict_drug.items()):
    for disease, disease_genes in new_gene_dict_disease.items():
        network_proximities = [
            calculate_network_proximity(G, drug_gene, disease_gene)
            for drug_gene in drug_genes
            for disease_gene in disease_genes
        ]

        if network_proximities:
            summary = (
                sum(network_proximities) / len(network_proximities),
                # True median: averages the two middle values for even counts.
                statistics.median(network_proximities),
                min(network_proximities),
                max(network_proximities),
            )
        else:
            # A drug or disease can have no genes left after filtering against
            # the network; avoid ZeroDivisionError / ValueError on empty input.
            summary = (nan, nan, nan, nan)

        proximities.append(summary)
        indices.append((drug, disease))