In [1]:
import pandas as pd
from bicm import BipartiteGraph
import numpy as np
from tqdm import tqdm
import csv
import itertools 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [16]:
train = pd.read_csv('DeepPurpose_train.csv')
test = pd.read_csv('DeepPurpose_test.csv')

# Reading the Ligand and Target Lists

In [3]:
text_file = open("ligands.txt", "r") # Rows of the adjacency matrix in order
ligands = text_file.readlines()

text_file = open("targets.txt", "r") # Columns of the adjacency matrix in order 
targets = text_file.readlines()

In [27]:
ligands = [j.replace('\n','') for j in tqdm(ligands)]
targets = [j.replace('\n','') for j in tqdm(targets)]

100%|██████████| 10416/10416 [00:00<00:00, 1077515.61it/s]
100%|██████████| 1391/1391 [00:00<00:00, 745118.37it/s]


In [28]:
number_ligands = len(ligands)
number_targets = len(targets)
print('Ligands: ', number_ligands)
print('Targets: ',number_targets)

Ligands:  10416
Targets:  1391


# Preparing Degree Ratio Dictionaries from Training Data

In [17]:
train_pos = train[train['Label'] == 1]
train_neg = train[train['Label'] == 0]

In [37]:
ligand_degree_ratio = dict()
ligand_all_average = []

for ligand in tqdm(ligands):
    pos_deg = len(train_pos[train_pos['SMILES'] == ligand])
    neg_deg = len(train_neg[train_neg['SMILES'] == ligand])
    ligand_degree_ratio[ligand] = dict()
    ligand_degree_ratio[ligand]['deg_ratio'] = pos_deg / (pos_deg + neg_deg)
    ligand_degree_ratio[ligand]['deg_avg'] = pos_deg / number_targets 
    ligand_all_average.append(pos_deg / number_targets)
    
ligands_all_avg = sum(ligand_all_average) / number_ligands

100%|██████████| 10416/10416 [01:01<00:00, 168.67it/s]


In [40]:
targets_degree_ratio = dict()
target_all_average = []

for target in tqdm(targets):
    pos_deg = len(train_pos[train_pos['Target Sequence'] == target])
    neg_deg = len(train_neg[train_neg['Target Sequence'] == target])
    targets_degree_ratio[target] = dict()
    targets_degree_ratio[target]['deg_ratio'] = pos_deg / (pos_deg + neg_deg)
    targets_degree_ratio[target]['deg_avg'] = pos_deg / number_ligands
    target_all_average.append(pos_deg / number_ligands)
    
targets_all_avg = sum(target_all_average) / number_targets

100%|██████████| 1391/1391 [00:08<00:00, 169.54it/s]


# Naive Model

In [86]:
test_probabilty_predicted_naive = []

for index, row in tqdm(test.iterrows()):
    
    if row['SMILES'] in ligands and row['Target Sequence'] in targets:
        p_naive = ligand_degree_ratio[row['SMILES']]['deg_ratio'] * targets_degree_ratio[row['Target Sequence']]['deg_ratio']
        test_probabilty_predicted_naive.append(min(1,p_naive))
    elif row['SMILES'] in ligands and row['Target Sequence'] not in targets:
        p_naive = ligand_degree_ratio[row['SMILES']]['deg_ratio']  
        test_probabilty_predicted_naive.append(min(1,p_naive))
    elif row['SMILES'] not in ligands and row['Target Sequence'] in targets:
        p_naive = targets_degree_ratio[row['Target Sequence']]['deg_ratio'] 
        test_probabilty_predicted_naive.append(min(1,p_naive))
    else:
        test_probabilty_predicted_naive.append(1)
        

14150it [00:04, 3288.28it/s]


In [87]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), test_probabilty_predicted_naive))
print('AUP: ', average_precision_score(test['Label'].tolist(), test_probabilty_predicted_naive))

AUC:  0.8491380951089191
AUP:  0.6402460532069669


# Configuration Model - Single Layer

In [5]:
P = np.loadtxt(open("P.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [30]:
predicted_probability_test_single_layer = []

for index, row in tqdm(test.iterrows()):
    if row['SMILES'] in ligands and row['Target Sequence'] in targets:
        p = P[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
    elif row['SMILES'] in ligands and row['Target Sequence'] not in targets:
        p = sum(P[ligands.index(row['SMILES']),:]) / len(targets) # Average binding probaility of the ligand
    elif row['SMILES'] not in ligands and row['Target Sequence'] in targets:
        p = sum(P[:,targets.index(row['Target Sequence'])]) / len(ligands) # Average binding probability of the target
    else:
        p = np.sum(P) / (len(ligands)*len(targets)) # Both unseen - average over all probabilties 
                  
    predicted_probability_test_single_layer.append(p)

14150it [00:13, 1011.22it/s]


In [38]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), predicted_probability_test_single_layer))
print('AUP: ', average_precision_score(test['Label'].tolist(), predicted_probability_test_single_layer))

AUC:  0.6718648547306816
AUP:  0.3579757116887733


# Configuration Model - Duplex - Unconditioned

In [32]:
summat10 = np.loadtxt(open("summat10.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [33]:
summat01 = np.loadtxt(open("summat01.csv", "rb"), delimiter=",", skiprows=0) # Output of MATLAB run 

In [34]:
predicted_probability_test_unconditioned = []

for index, row in tqdm(test.iterrows()):
    if row['SMILES'] in ligands and row['Target Sequence'] in targets:
        p10 = summat10[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
    elif row['SMILES'] in ligands and row['Target Sequence'] not in targets:
        p10 = sum(summat10[ligands.index(row['SMILES']),:]) / len(targets) # Average binding probbaility of the ligand
    elif row['SMILES'] not in ligands and row['Target Sequence'] in targets:
        p10 = sum(summat10[:,targets.index(row['Target Sequence'])]) / len(ligands) # Average binding probability of the target
    else:
        p10 = np.sum(summat10) / (len(ligands)*len(targets)) # Both unseen - average over all probabilties 
                  
    predicted_probability_test_unconditioned.append(p10)

14150it [00:13, 1027.07it/s]


In [39]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), predicted_probability_test_unconditioned))
print('AUP: ', average_precision_score(test['Label'].tolist(), predicted_probability_test_unconditioned))

AUC:  0.6718648547306816
AUP:  0.3579757116887733


# Configuration Model - Duplex - Conditioned

In [84]:
test_probabilty_predicted_conditioned = []

## Average conditional probability
conditoned_summat = np.divide(summat10,np.add(summat10,summat01)) # Elementwise pos_deg / (pos_deg + neg_deg)
conditoned_summat = np.nan_to_num(conditoned_summat)
conditoned_summat = np.minimum(conditoned_summat,1) # Take minimum of 1 and the computed conditional probability 
average_conditional_probability = np.sum(conditoned_summat) / (number_ligands * number_targets)

for index, row in tqdm(test.iterrows()):
    
    if row['SMILES'] in ligands and row['Target Sequence'] in targets:
        p10 = summat10[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p01 = summat01[ligands.index(row['SMILES']),targets.index(row['Target Sequence'])]
        p10_conditioned = p10 / (p10 + p01)
        test_probabilty_predicted_conditioned.append(min(1,p10_conditioned))
    elif row['SMILES'] in ligands and row['Target Sequence'] not in targets:
        p10_conditioned = ligand_degree_ratio[row['SMILES']]['deg_ratio']  
        test_probabilty_predicted_conditioned.append(min(1,p10_conditioned))
    elif row['SMILES'] not in ligands and row['Target Sequence'] in targets:
        p10_conditioned = targets_degree_ratio[row['Target Sequence']]['deg_ratio'] 
        test_probabilty_predicted_conditioned.append(min(1,p10_conditioned))
    else:
        p10_conditioned = average_conditional_probability
        test_probabilty_predicted_conditioned.append(p10_conditioned)
        

  after removing the cwd from sys.path.
  
14150it [00:06, 2123.63it/s]


In [85]:
## Performance on the test dataset

print('AUC: ', roc_auc_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))
print('AUP: ', average_precision_score(test['Label'].tolist(), test_probabilty_predicted_conditioned))

AUC:  0.8543987955332084
AUP:  0.6452633855586876
