In [2]:
import pandas as pd
from bicm import BipartiteGraph
import numpy as np
from tqdm import tqdm
import csv
import itertools 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [3]:
train = pd.read_csv('../data/DeepPurpose_train.csv')
test = pd.read_csv('../data/DeepPurpose_test.csv')

# Reading the Ligand and Target Lists

In [4]:
text_file = open("ligands.txt", "r") # Rows of the adjacency matrix in order
ligands = text_file.readlines()

text_file = open("targets.txt", "r") # Columns of the adjacency matrix in order 
targets = text_file.readlines()

In [5]:
ligands = [j.replace('\n','') for j in tqdm(ligands)]
targets = [j.replace('\n','') for j in tqdm(targets)]

100%|██████████| 10416/10416 [00:00<00:00, 967209.16it/s]
100%|██████████| 1391/1391 [00:00<00:00, 624454.34it/s]


In [6]:
number_ligands = len(ligands)
number_targets = len(targets)
print('Ligands: ', number_ligands)
print('Targets: ',number_targets)

Ligands:  10416
Targets:  1391


# Preparing Degree Ratio Dictionaries from Training Data

In [7]:
train_pos = train[train['Label'] == 1]
train_neg = train[train['Label'] == 0]

In [8]:
pos_deg_0_ligands = []
pos_deg_0_targets = []
neg_deg_0_ligands = []
neg_deg_0_targets = []

In [9]:
ligand_degree_ratio = dict()
ligand_all_average = []

for ligand in tqdm(ligands):
    pos_deg = len(train_pos[train_pos['SMILES'] == ligand])
    neg_deg = len(train_neg[train_neg['SMILES'] == ligand])
    ligand_degree_ratio[ligand] = dict()
    ligand_degree_ratio[ligand]['deg_ratio'] = pos_deg / (pos_deg + neg_deg)
    ligand_degree_ratio[ligand]['deg_avg'] = pos_deg / number_targets 
    ligand_all_average.append(pos_deg / number_targets)
    if pos_deg == 0:
        pos_deg_0_ligands.append(ligand)
    if neg_deg == 0:
        neg_deg_0_ligands.append(ligand)
    
ligands_all_avg = sum(ligand_all_average) / number_ligands

100%|██████████| 10416/10416 [01:03<00:00, 164.83it/s]


In [10]:
targets_degree_ratio = dict()
target_all_average = []

for target in tqdm(targets):
    pos_deg = len(train_pos[train_pos['Target Sequence'] == target])
    neg_deg = len(train_neg[train_neg['Target Sequence'] == target])
    targets_degree_ratio[target] = dict()
    targets_degree_ratio[target]['deg_ratio'] = pos_deg / (pos_deg + neg_deg)
    targets_degree_ratio[target]['deg_avg'] = pos_deg / number_ligands
    target_all_average.append(pos_deg / number_ligands)
    if pos_deg == 0:
        pos_deg_0_targets.append(target)
    if neg_deg == 0:
        neg_deg_0_targets.append(target)
    
targets_all_avg = sum(target_all_average) / number_targets

100%|██████████| 1391/1391 [00:08<00:00, 166.94it/s]


In [11]:
print('Ligands with positive degree 0: ',len(pos_deg_0_ligands))
print('Ligands with negative degree 0: ',len(neg_deg_0_ligands))
print('Targets with positive degree 0: ',len(pos_deg_0_targets))
print('Targets with negative degree 0: ',len(neg_deg_0_targets))

Ligands with positive degree 0:  6539
Ligands with negative degree 0:  3084
Targets with positive degree 0:  556
Targets with negative degree 0:  168


# Naive Model

In [12]:
test_probabilty_predicted_naive = []

for index, row in tqdm(test.iterrows()):
    
    if row['SMILES'] in ligands and row['Target Sequence'] in targets:
        p_naive = ligand_degree_ratio[row['SMILES']]['deg_ratio'] * targets_degree_ratio[row['Target Sequence']]['deg_ratio']
        test_probabilty_predicted_naive.append(p_naive) 
    elif row['SMILES'] in ligands and row['Target Sequence'] not in targets:
        p_naive = ligand_degree_ratio[row['SMILES']]['deg_ratio']  
        test_probabilty_predicted_naive.append(p_naive) 
    elif row['SMILES'] not in ligands and row['Target Sequence'] in targets:
        p_naive = targets_degree_ratio[row['Target Sequence']]['deg_ratio'] 
        test_probabilty_predicted_naive.append(p_naive) 
    else:
        test_probabilty_predicted_naive.append(1)
        

14150it [00:04, 3210.13it/s]


In [13]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), test_probabilty_predicted_naive))
print('AUP: ', average_precision_score(test['Label'].tolist(), test_probabilty_predicted_naive))

AUC:  0.8491380951089191
AUP:  0.6402460532069669


# Configuration Model - Single Layer

In [14]:
## Annotated means the node has at least 1 binding information

pos_annotated_ligands = list(set(ligands)-set(pos_deg_0_ligands))
pos_annotated_targets = list(set(targets)-set(pos_deg_0_targets))

neg_annotated_ligands = list(set(ligands)-set(neg_deg_0_ligands))
neg_annotated_targets = list(set(targets)-set(neg_deg_0_targets))

In [15]:
P = np.loadtxt(open("P.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [16]:
predicted_probability_test_single_layer = []

for index, row in tqdm(test.iterrows()):
    if row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p = P[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
    elif row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] not in pos_annotated_targets:
        p = sum(P[ligands.index(row['SMILES']),:]) / len(targets) # Average binding probaility of the ligand
    elif row['SMILES'] not in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p = sum(P[:,targets.index(row['Target Sequence'])]) / len(ligands) # Average binding probability of the target
    else:
        p = np.sum(P) / (len(ligands)*len(targets)) # Both unseen - average over all probabilties 
                  
    predicted_probability_test_single_layer.append(p)

14150it [00:25, 557.07it/s]


In [17]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), predicted_probability_test_single_layer))
print('AUP: ', average_precision_score(test['Label'].tolist(), predicted_probability_test_single_layer))

AUC:  0.6491480927112449
AUP:  0.35039460079161866


# Configuration Model - Duplex - Unconditioned

In [18]:
summat10 = np.loadtxt(open("summat10.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [19]:
summat01 = np.loadtxt(open("summat01.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [20]:
predicted_probability_test_unconditioned = []

for index, row in tqdm(test.iterrows()):
    if row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p10 = summat10[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
    elif row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] not in pos_annotated_targets:
        p10 = sum(summat10[ligands.index(row['SMILES']),:]) / len(targets) # Average binding probbaility of the ligand
    elif row['SMILES'] not in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p10 = sum(summat10[:,targets.index(row['Target Sequence'])]) / len(ligands) # Average binding probability of the target
    else:
        p10 = np.sum(summat10) / (len(ligands)*len(targets)) # Both unseen - average over all probabilties 
                  
    predicted_probability_test_unconditioned.append(p10)

14150it [00:25, 558.82it/s]


In [21]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), predicted_probability_test_unconditioned))
print('AUP: ', average_precision_score(test['Label'].tolist(), predicted_probability_test_unconditioned))

AUC:  0.6491480927112449
AUP:  0.35039460079161866


# Configuration Model - Duplex - Conditioned

## Positively Annotated Seen Nodes

In [24]:
test_probabilty_predicted_conditioned = []

## Average conditional probability
conditoned_summat = np.divide(summat10,np.add(summat10,summat01)) # Elementwise pos_deg / (pos_deg + neg_deg)
conditoned_summat = conditoned_summat[~np.isnan(conditoned_summat)] 
average_conditional_probability = sum(conditoned_summat) / len(conditoned_summat) # Average over valid conditional probabilities

for index, row in tqdm(test.iterrows()):      
    
    if row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p10 = summat10[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p01 = summat01[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p10_conditioned = p10 / (p10 + p01)
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        
    elif row['SMILES'] in pos_annotated_ligands and row['Target Sequence'] not in pos_annotated_targets:
        p10_conditioned = ligand_degree_ratio[row['SMILES']]['deg_ratio']  ## k_+ / (k_+ + k_-)
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        
    elif row['SMILES'] not in pos_annotated_ligands and row['Target Sequence'] in pos_annotated_targets:
        p10_conditioned = targets_degree_ratio[row['Target Sequence']]['deg_ratio'] ## k_+ / (k_+ + k_-)
        test_probabilty_predicted_conditioned.append(p10_conditioned)

    else:
        p10_conditioned = average_conditional_probability
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        

  after removing the cwd from sys.path.
14150it [00:05, 2645.87it/s]


In [25]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))
print('AUP: ', average_precision_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))

AUC:  0.8567519511002759
AUP:  0.6238655911703138


## Both Annotated Seen Nodes

In [22]:
test_probabilty_predicted_conditioned = []

## Average conditional probability
conditoned_summat = np.divide(summat10,np.add(summat10,summat01)) # Elementwise pos_deg / (pos_deg + neg_deg)
conditoned_summat = conditoned_summat[~np.isnan(conditoned_summat)] 
average_conditional_probability = sum(conditoned_summat) / len(conditoned_summat) # Average over valid conditional probabilities

for index, row in tqdm(test.iterrows()):      
    
    if row['SMILES'] in pos_annotated_ligands and row['SMILES'] in neg_annotated_ligands and row['Target Sequence'] in pos_annotated_targets and row['Target Sequence'] in neg_annotated_targets:
        p10 = summat10[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p01 = summat01[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p10_conditioned = p10 / (p10 + p01)
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        
    elif row['SMILES'] in pos_annotated_ligands and row['SMILES'] in neg_annotated_ligands:
        p10_conditioned = ligand_degree_ratio[row['SMILES']]['deg_ratio']  ## k_+ / (k_+ + k_-)
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        
    elif row['Target Sequence'] in pos_annotated_targets and row['Target Sequence'] in neg_annotated_targets:
        p10_conditioned = targets_degree_ratio[row['Target Sequence']]['deg_ratio'] ## k_+ / (k_+ + k_-)
        test_probabilty_predicted_conditioned.append(p10_conditioned)

    else:
        p10_conditioned = average_conditional_probability
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        

  after removing the cwd from sys.path.
14150it [00:06, 2248.48it/s]


In [23]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))
print('AUP: ', average_precision_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))

AUC:  0.8290993013491732
AUP:  0.46688101903990786


# Performances for Unseen Edges, Unseen Targets and Unseen Nodes

In [20]:
test['test_probabilty_predicted_conditioned'] = test_probabilty_predicted_conditioned

In [21]:
train_ligands = list(set(train['SMILES'].tolist()))
train_targets = list(set(train['Target Sequence'].tolist()))

test_minus_train_ligands = list(set(test['SMILES'].tolist()).difference(train_ligands))
test_minus_train_targets = list(set(test['Target Sequence'].tolist()).difference(train_targets))

In [23]:
print('Seen ligands: ', len(train_ligands))
print('Seen targets: ', len(train_targets))
print('Unseen ligands: ', len(test_minus_train_ligands))
print('Unseen targets: ', len(test_minus_train_targets))

Seen ligands:  10416
Seen targets:  1391
Unseen ligands:  2051
Unseen targets:  75


In [30]:
# Unseen Edges

test_unseen_edges = test[test['SMILES'].isin(train_ligands) & test['Target Sequence'].isin(train_targets)]
print('Unseen edges: ', len(test_unseen_edges))

print('AUC: ', roc_auc_score(test_unseen_edges['Label'].tolist(), test_unseen_edges['test_probabilty_predicted_conditioned']))
print('AUP: ', average_precision_score(test_unseen_edges['Label'].tolist(), test_unseen_edges['test_probabilty_predicted_conditioned']))

Unseen edges:  11927
AUC:  0.8407253806394404
AUP:  0.5139294916148266


In [34]:
# Unseen Targets

test_unseen_targets = test[test['Target Sequence'].isin(test_minus_train_targets)]
print('Unseen targets: ', len(test_unseen_targets))

print('AUC: ', roc_auc_score(test_unseen_targets['Label'].tolist(), test_unseen_targets['test_probabilty_predicted_conditioned']))
print('AUP: ', average_precision_score(test_unseen_targets['Label'].tolist(), test_unseen_targets['test_probabilty_predicted_conditioned']))

Unseen targets:  80
AUC:  0.7823984526112185
AUP:  0.7404484670034909


In [36]:
# Unseen Nodes

test_unseen_nodes = test[test['SMILES'].isin(test_minus_train_ligands) & test['Target Sequence'].isin(test_minus_train_targets)]
print('Unseen nodes: ', len(test_unseen_nodes))

print('AUC: ', roc_auc_score(test_unseen_nodes['Label'].tolist(), test_unseen_nodes['test_probabilty_predicted_conditioned']))
print('AUP: ', average_precision_score(test_unseen_nodes['Label'].tolist(), test_unseen_nodes['test_probabilty_predicted_conditioned']))

Unseen nodes:  34
AUC:  0.5
AUP:  0.35294117647058826
