# Performance Analysis

This notebook is used to build the performance analysis of the predictors on the main and UEP datasets. 

These datasets are the output of the "creating_SARS-CoV-2-RBD_ACE2_benchmarking_datasets.ipynb" notebook.

In [1]:
import pathlib
import pandas as pd
import numpy as np

In [2]:
# Absolute path of the current working directory; strict=True makes resolve()
# raise if the path does not exist.
rootdir = pathlib.Path('.').resolve(strict=True)
# The datasets live in files/output_files, two directory levels above rootdir.
outputdir = rootdir.parents[1] / 'files' / 'output_files'

In [3]:
# Load the main and UEP benchmarking datasets (comma-separated files in outputdir).
main_dataset_file = outputdir / 'SARS-CoV-2-RBD_ACE2_benchmarking_dataset.csv'
UEP_dataset_file = outputdir / 'UEP_SARS-CoV-2-RBD_ACE2_benchmarking_dataset.csv'

main_dataset = pd.read_csv(main_dataset_file, sep=',')
UEP_dataset = pd.read_csv(UEP_dataset_file, sep=',')

In [4]:
def performance_calculation(df):
    """
    Calculate the success rate of every predictor found in the input dataset.

    Predictors are discovered through the columns whose label contains
    'prediction' (the <xx>-prediction columns). Each of these columns holds the
    binary prediction status of one predictor for every case: success or failure.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with one row per case and one <xx>-prediction column per predictor.

    Returns
    -------
    pandas.DataFrame
        One row per prediction column, with two columns labelled 0 and 1:
        column 0 is the prediction column label, column 1 is the percentage of
        rows marked as success, rounded to two decimals. The percentage is taken
        over all rows of ``df``; a missing prediction counts as a non-success.
        An empty ``df`` yields a rate of 0.0 for every predictor.
    """
    # get <xx>-prediction columns
    prediction_columns = df.columns[df.columns.str.contains('prediction')]
    # The denominator is the number of cases (rows); no specific id column is required.
    n_cases = len(df)

    performances = []
    for column in prediction_columns:
        if n_cases == 0:
            # Nothing to evaluate (e.g. an empty subset): report 0 rather than dividing by zero.
            success_rate = 0.0
        else:
            # A row counts at most once; na=False keeps missing predictions from
            # turning the whole rate into NaN.
            is_success = df[column].str.contains('success', regex=False, na=False)
            success_rate = round(int(is_success.sum()) / n_cases * 100, 2)
        performances.append([column, success_rate])

    # Explicit labels keep result[0] / result[1] valid even when no prediction column exists.
    return pd.DataFrame(performances, columns=[0, 1])

## Performances of our Predictors on Main Dataset

In [5]:
# Success rates of the predictors on the main dataset: overall, by mutation
# type (Enriched: exp_binding > 0, Depleted: exp_binding < 0), by protein
# (ACE2, RBD) and for every protein / mutation-type combination.
# Rows with exp_binding == 0 belong to neither Enriched nor Depleted.
main_enriched_mask = main_dataset['exp_binding'] > 0
main_depleted_mask = main_dataset['exp_binding'] < 0
main_ACE2_mask = main_dataset.protein == 'ACE2'
main_RBD_mask = main_dataset.protein == 'RBD'

Total = performance_calculation(main_dataset)
Enriched = performance_calculation(main_dataset[main_enriched_mask])
Depleted = performance_calculation(main_dataset[main_depleted_mask])
ACE2 = performance_calculation(main_dataset[main_ACE2_mask])
ACE2_Enriched = performance_calculation(main_dataset[main_ACE2_mask & main_enriched_mask])
ACE2_Depleted = performance_calculation(main_dataset[main_ACE2_mask & main_depleted_mask])
RBD = performance_calculation(main_dataset[main_RBD_mask])
RBD_Enriched = performance_calculation(main_dataset[main_RBD_mask & main_enriched_mask])
RBD_Depleted = performance_calculation(main_dataset[main_RBD_mask & main_depleted_mask])

# Total contributes both the predictor label column and its rate column;
# every subset contributes only its rate column (column 1).
main_subset_rates = [
    Enriched[1], Depleted[1],
    ACE2[1], ACE2_Enriched[1], ACE2_Depleted[1],
    RBD[1], RBD_Enriched[1], RBD_Depleted[1],
]
main_performance_table = pd.concat([Total] + main_subset_rates, axis=1)
main_performance_table.columns = [
    'Predictors', 'Total', 'Enriched', 'Depleted',
    'ACE2', 'ACE2 Enriched', 'ACE2 Depleted',
    'RBD', 'RBD Enriched', 'RBD Depleted',
]
# Display names are assigned by position, so they must follow the order of the
# prediction columns in the dataset.
main_performance_table['Predictors'] = ['HADDOCK', 'FoldX', 'FoldXwater', 'EvoEF1', 'MutaBind2', 'SSIPe']
main_performance_table.set_index('Predictors')

Unnamed: 0_level_0,Total,Enriched,Depleted,ACE2,ACE2 Enriched,ACE2 Depleted,RBD,RBD Enriched,RBD Depleted
Predictors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HADDOCK,53.99,51.91,56.06,55.87,46.07,65.56,50.0,64.29,35.71
FoldX,63.88,51.15,76.52,62.01,47.19,76.67,67.86,59.52,76.19
FoldXwater,62.74,51.15,74.24,60.89,47.19,74.44,66.67,59.52,73.81
EvoEF1,57.41,48.85,65.91,57.54,48.31,66.67,57.14,50.0,64.29
MutaBind2,55.51,15.27,95.45,53.07,7.87,97.78,60.71,30.95,90.48
SSIPe,55.51,21.37,89.39,51.4,14.61,87.78,64.29,35.71,92.86


## Performances of our Predictors on UEP Dataset

In [6]:
# Success rates of the predictors on the UEP dataset: overall, by mutation
# type (Enriched: exp_binding > 0, Depleted: exp_binding < 0), by protein
# (ACE2, RBD) and for every protein / mutation-type combination.
# Rows with exp_binding == 0 belong to neither Enriched nor Depleted.
UEP_enriched_mask = UEP_dataset['exp_binding'] > 0
UEP_depleted_mask = UEP_dataset['exp_binding'] < 0
UEP_ACE2_mask = UEP_dataset.protein == 'ACE2'
UEP_RBD_mask = UEP_dataset.protein == 'RBD'

Total = performance_calculation(UEP_dataset)
Enriched = performance_calculation(UEP_dataset[UEP_enriched_mask])
Depleted = performance_calculation(UEP_dataset[UEP_depleted_mask])
ACE2 = performance_calculation(UEP_dataset[UEP_ACE2_mask])
ACE2_Enriched = performance_calculation(UEP_dataset[UEP_ACE2_mask & UEP_enriched_mask])
ACE2_Depleted = performance_calculation(UEP_dataset[UEP_ACE2_mask & UEP_depleted_mask])
RBD = performance_calculation(UEP_dataset[UEP_RBD_mask])
RBD_Enriched = performance_calculation(UEP_dataset[UEP_RBD_mask & UEP_enriched_mask])
RBD_Depleted = performance_calculation(UEP_dataset[UEP_RBD_mask & UEP_depleted_mask])

# Total contributes both the predictor label column and its rate column;
# every subset contributes only its rate column (column 1).
UEP_subset_rates = [
    Enriched[1], Depleted[1],
    ACE2[1], ACE2_Enriched[1], ACE2_Depleted[1],
    RBD[1], RBD_Enriched[1], RBD_Depleted[1],
]
UEP_performance_table = pd.concat([Total] + UEP_subset_rates, axis=1)
UEP_performance_table.columns = [
    'Predictors', 'Total', 'Enriched', 'Depleted',
    'ACE2', 'ACE2 Enriched', 'ACE2 Depleted',
    'RBD', 'RBD Enriched', 'RBD Depleted',
]
# Display names are assigned by position, so they must follow the order of the
# prediction columns in the dataset.
UEP_performance_table['Predictors'] = ['HADDOCK', 'FoldX', 'FoldXwater', 'EvoEF1', 'MutaBind2', 'SSIPe', 'UEP']
UEP_performance_table.set_index('Predictors')

Unnamed: 0_level_0,Total,Enriched,Depleted,ACE2,ACE2 Enriched,ACE2 Depleted,RBD,RBD Enriched,RBD Depleted
Predictors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HADDOCK,48.84,43.1,53.52,47.56,33.33,60.47,51.06,63.16,42.86
FoldX,68.99,51.72,83.1,67.07,48.72,83.72,72.34,57.89,82.14
FoldXwater,65.89,46.55,81.69,62.2,38.46,83.72,72.34,63.16,78.57
EvoEF1,58.91,37.93,76.06,57.32,33.33,79.07,61.7,47.37,71.43
MutaBind2,58.91,13.79,95.77,52.44,2.56,97.67,70.21,36.84,92.86
SSIPe,63.57,32.76,88.73,51.22,15.38,83.72,85.11,68.42,96.43
UEP,51.94,53.45,50.7,54.88,56.41,53.49,46.81,47.37,46.43
