In [1]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn.functional as F

In [2]:
x_dict = torch.load('x_dict.pth')
len(x_dict['perturbation'])

5528

In [3]:
therapeutic_node_embedding = x_dict["perturbation"][-1]
therapeutic_node_embedding = torch.unsqueeze(therapeutic_node_embedding, 0)
therapeutic_node_embedding.shape

torch.Size([1, 128])

In [4]:
gene_embeddings =  x_dict["gene"]
gene_embeddings.shape

torch.Size([18315, 128])

In [5]:
edge_index = np.zeros((2, 18315), dtype=int)
#edge_index[0, -1] = 1.0
edge_index[1,:] = np.arange(0, 18315)
#edge_index[1,18315:] = np.arange(0, 3)
edge_label_index = torch.tensor(edge_index)
edge_label_index

tensor([[    0,     0,     0,  ...,     0,     0,     0],
        [    0,     1,     2,  ..., 18312, 18313, 18314]])

In [6]:
from model_definition import Classifier

classifier = Classifier()
classifier.load_state_dict(torch.load('classifier_trained_model.pth'))


<All keys matched successfully>

In [7]:
classifier.eval()
with torch.no_grad():
    dot_products = classifier(therapeutic_node_embedding, gene_embeddings, edge_label_index)

dot_products = dot_products.to('cpu')

In [8]:
edge_labels = torch.tensor(np.ones(18315))
edge_labels

tensor([1., 1., 1.,  ..., 1., 1., 1.], dtype=torch.float64)

In [9]:
edge_losses = []
for i in range(len(edge_labels)):
    edge_label = edge_labels[i]
    dot_product = dot_products[i]
    edge_loss = F.binary_cross_entropy_with_logits(dot_product, edge_label)
    edge_losses.append(float(edge_loss))
edge_losses = torch.tensor(edge_losses)
edge_losses

tensor([3.4744e-01, 2.6226e-06, 1.1156e-01,  ..., 4.0135e+00, 2.9541e+00,
        9.0785e-01])

In [10]:
predicted_edge_scores = 1 - edge_losses

In [11]:

k = 20
# Use torch.topk to get the indices and values of the k smallest elements
top_edge_scores, top_indices = torch.topk(predicted_edge_scores, k)
top_edge_scores = top_edge_scores.tolist()
top_indices = top_indices.tolist()
    
# Negate the top smallest values back to original values
#top_edge_scores = -top_smallest_losses

print("Top predicted targets:", top_edge_scores)
print("Indices of targets:", top_indices)


Top predicted targets: [0.9999998807907104, 0.9999997615814209, 0.9999992847442627, 0.9999991655349731, 0.999998927116394, 0.9999988079071045, 0.9999988079071045, 0.9999986886978149, 0.9999984502792358, 0.9999974966049194, 0.9999973773956299, 0.9999960660934448, 0.999995231628418, 0.9999943971633911, 0.9999943971633911, 0.9999936819076538, 0.999992847442627, 0.9999897480010986, 0.9999876022338867, 0.9999865293502808]
Indices of targets: [14, 501, 19, 843, 584, 106, 77, 176, 11, 7, 1, 25, 1083, 1252, 583, 427, 1753, 472, 307, 108]


In [12]:
# gene_mapping_dictionary from json file, consider datatypes in the dictionary!
import json
file_path = 'mapping_dicts/genes_dict.json'
with open(file_path, 'r') as json_file:
    genes_dict_json = json.load(json_file)

#convert back strings into integers as they got mutated in json conversion! 
    genes_dict = {value: int(key) for key, value in genes_dict_json.items()}
    
print(len(genes_dict))

18315


In [58]:
gene_names = pd.read_table("gene_names.txt")
gene_names = gene_names.dropna()
gene_names["entrezId"] = gene_names["entrezId"].astype("int")
gene_names = gene_names.set_index("entrezId")

In [14]:
print("Gene_Id".ljust(10), "Gene_symbol".ljust(20), "Score")
gene_ids = []
gene_symbols =[]
scores = []
for i in range(len(top_indices)):
    gene_id = genes_dict[top_indices[i]]
    gene_ids.append(gene_id)
    score = round(top_edge_scores[i], 5)
    scores.append(score)
    if gene_id in gene_names["geneSymbol"]:
        gene_symbol = gene_names.loc[gene_id, "geneSymbol"]
    else:
        gene_symbol = "Unknown"
    gene_symbols.append(gene_symbol)

for i in range(len(gene_ids)):
    print(str(gene_ids[i]).ljust(10), str(gene_symbols[i]).ljust(20), str(scores[i]))

Gene_Id    Gene_symbol          Score
23         ABCF1                1.0
728        C5AR1                1.0
28         ABO                  1.0
1188       Unknown              1.0
836        CASP3                1.0
154        ADRB2                1.0
116        ADCYAP1              1.0
258        AMBN                 1.0
20         ABCA2                1.0
15         AANAT                1.0
2          A2M                  1.0
34         ACADM                1.0
1544       CYP1A2               1.0
1810       DR1                  0.99999
835        CASP2                0.99999
634        CEACAM1              0.99999
2658       GDF2                 0.99999
684        BST2                 0.99999
429        ASCL1                0.99999
156        ADRBK1               0.99999


In [44]:
mean_values = df.groupby('id')['score'].mean().reset_index()
mean_values

Unnamed: 0,id,score
0,1,0.953167
1,2,0.890750
2,9,0.942900
3,10,0.971429
4,12,0.969500
...,...,...
1049,127343,0.763000
1050,127829,0.731000
1051,143098,0.691000
1052,153768,0.667000


In [45]:
# Sort by counts in descending order
counts = df.groupby('id').size().reset_index(name='Count')
counts

Unnamed: 0,id,Count
0,1,12
1,2,8
2,9,10
3,10,7
4,12,4
...,...,...
1049,127343,1
1050,127829,1
1051,143098,1
1052,153768,1


In [56]:
# Merge counts back into the original DataFrame
df_merged = pd.merge(mean_values, counts, on='id')
df_merged

Unnamed: 0,id,score,Count
0,1,0.953167,12
1,2,0.890750,8
2,9,0.942900,10
3,10,0.971429,7
4,12,0.969500,4
...,...,...,...
1049,127343,0.763000,1
1050,127829,0.731000,1
1051,143098,0.691000,1
1052,153768,0.667000,1


In [57]:
import numpy as np

df_merged['adj_score'] = np.log(df_merged['Count']) * df_merged['score']
df_merged = df_merged.sort_values(by='adj_score', ascending=False).reset_index(drop=True)

df_merged.head(20)

Unnamed: 0,id,score,Count,adj_score
0,22,0.929,13,2.382838
1,1,0.953167,12,2.36853
2,13,0.952545,11,2.284104
3,30,0.9701,10,2.233738
4,36,0.9533,10,2.195054
5,9,0.9429,10,2.171107
6,19,0.9358,10,2.154759
7,18,0.978778,9,2.150595
8,21,0.957778,9,2.104453
9,28,0.967,8,2.01082


In [61]:
symbols = []
for i in range(len(counts_df)):
    gene_id = df_merged.loc[i,'id'] 
    if gene_id in gene_names["geneSymbol"]:
        gene_symbol = gene_names.loc[gene_id, "geneSymbol"]
    else:
        gene_symbol = "Unknown"
    symbols.append(symbol)
len(symbols)

1054

In [80]:
#symbols

In [62]:
df_merged['symbols'] = symbols
df_merged

Unnamed: 0,id,score,Count,adj_score,symbols
0,22,0.929000,13,2.382838,ACO2
1,1,0.953167,12,2.368530,ACO2
2,13,0.952545,11,2.284104,ACO2
3,30,0.970100,10,2.233738,ACO2
4,36,0.953300,10,2.195054,ACO2
...,...,...,...,...,...
1049,1075,0.873000,1,0.000000,ACO2
1050,1081,0.997000,1,0.000000,ACO2
1051,1087,0.970000,1,0.000000,ACO2
1052,1088,0.997000,1,0.000000,ACO2


In [None]:
print("Gene_Id".ljust(10), "Gene_symbol".ljust(20), "Score")
gene_ids = []
gene_symbols =[]
scores = []
for i in range(len(counts_df)):
    gene_id = genes_dict[counts_df.loc[i,'id']]
    gene_ids.append(gene_id)
    score = round(top_edge_scores[i], 5)
    scores.append(score)
    if gene_id in gene_names["geneSymbol"]:
        gene_symbol = gene_names.loc[gene_id, "geneSymbol"]
    else:
        gene_symbol = "Unknown"
    gene_symbols.append(gene_symbol)

for i in range(len(gene_ids)):
    print(str(gene_ids[i]).ljust(10), str(gene_symbols[i]).ljust(20), str(scores[i]))

Unnamed: 0,id,symbol,score,Count
0,580,BARD1,0.999,3
1,580,BARD1,0.972,3
2,580,BARD1,0.995,3
3,417,ART1,0.999,3
4,417,ART1,0.995,3
...,...,...,...,...
1995,1488,CTBP2,0.986,1
1996,367,AR,0.946,1
1997,1588,CYP19A1,0.923,1
1998,1018,CDK3,0.906,1


In [8]:
sorted_df = df_merged.sort_values(by='Count', ascending=False)
sorted_df

Unnamed: 0,id,symbol,score,Count
54,22,ABCB7,0.909,13
55,22,ABCB7,0.972,13
53,22,ABCB7,0.996,13
52,22,ABCB7,0.975,13
51,22,ABCB7,0.991,13
...,...,...,...,...
564,831,CAST,0.944,1
1518,629,CFB,0.954,1
563,343,AQP8,0.945,1
562,2066,ERBB4,0.947,1
