In [12]:
import pathlib
import pandas as pd
import numpy as np

# Project directories, resolved relative to the directory the notebook runs in.
rootdir = pathlib.Path('.').resolve(strict=True)
datadir = rootdir.parents[1] / 'ace2-rbd-point-mutation-benchmark/files/input_files'  # input files published on GitHub that the analyses were run on
outputdir = rootdir.parents[1] / 'ace2-rbd-point-mutation-benchmark/files/output_files'
newfiles = rootdir.parents[1] / 'ace2-rbd-point-mutation-benchmark/run_scripts_errors/files'  # the newly generated scores

# Import the experimental datasets.
ACE2 = pd.read_csv(datadir / 'ACE2_Experimental_dataset.csv', delimiter=',')
# Columns are relabelled by position, so the ACE2 file must hold exactly two columns.
ACE2.columns = ['#case_id', 'exp_binding']
# Add a 'protein' column recording which protein each case belongs to.
ACE2['protein'] = "ACE2"

RBD = pd.read_csv(datadir / 'RBD_Experimental_dataset.csv', delimiter=',')
# Take an explicit copy of the two required columns: assigning a new column to a
# bare column subset is the pattern pandas reports as SettingWithCopyWarning.
RBD = RBD[['#case_id', 'RBD_bind_avg']].copy()
RBD.columns = ['#case_id', 'exp_binding']
RBD['protein'] = "RBD"

# Concatenate the ACE2 and RBD cases into the experimental dataset of the study.
Experimental_dataset = pd.concat([ACE2, RBD])
# The case id becomes the index so that later joins align rows by case.
Experimental_dataset = Experimental_dataset.set_index('#case_id')

In [5]:
# Read the FoldX scores stored in FoldX_originalpdb.csv.
FoldX_originalpdb = pd.read_csv(newfiles / 'FoldX_originalpdb.csv', delimiter=',')

# ∆∆G = mutant score - wild-type score, stored as a new column on the loaded frame.
FoldX_originalpdb['foldx-ddg'] = FoldX_originalpdb['mutant_originalpdb'].sub(
    FoldX_originalpdb['wt_originalpdb']
)

# Keep only the ∆∆G (the raw mutant / wild-type scores are dropped) and index by case id.
FoldX_scores = (
    FoldX_originalpdb
    .drop(labels=['mutant_originalpdb', 'wt_originalpdb'], axis=1)
    .set_index('#case_id')
)

# Inner join on the case id: only cases present in both tables are kept.
predictors = [Experimental_dataset, FoldX_scores]
contol_dataset = pd.concat(predictors, axis=1, join="inner")


In [113]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
pd.set_option('mode.chained_assignment', None)
def prediction_status(df):
    """
    Label each ∆∆G prediction as 'success' or 'failure' against the experimental binding.

    For every score column whose name contains 'ddg', a '<column>-prediction'
    column is added:

    * enriched cases (``exp_binding > 0``): 'success' when ddg < 0, otherwise 'failure';
    * depleted cases (``exp_binding < 0``): 'success' when ddg > 0, otherwise 'failure'.

    A ddg of exactly 0 is therefore always a 'failure'. Rows whose ddg is missing
    receive the placeholder label '0'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'exp_binding' column and string column labels.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the enriched rows followed by the depleted rows
        (original order kept inside each group) plus the label columns. Rows
        whose 'exp_binding' is 0 or missing are not included. The input frame
        is not modified. When there is no ddg score column, `df` itself is
        returned unchanged.
    """
    columns = df.columns
    # Score columns only: an existing '<xx>-prediction' label column also contains
    # 'ddg' and must not be scored again (it holds strings, not numbers).
    ddg_columns = columns[columns.str.contains('ddg') & ~columns.str.endswith('-prediction')]
    if len(ddg_columns) == 0:
        return df

    # The split depends only on 'exp_binding', so it is done once, outside the loop.
    enriched = df[df['exp_binding'] > 0]
    depleted = df[df['exp_binding'] < 0]
    labelled = pd.concat([enriched, depleted])
    is_enriched = (labelled['exp_binding'] > 0).to_numpy()

    for ddg_column in ddg_columns:
        ddg = labelled[ddg_column]
        # A prediction is correct when the ddg sign is opposite to the binding sign.
        correct_sign = np.where(is_enriched, ddg < 0, ddg > 0)
        # Every choice and the default are strings: np.select's implicit default
        # is the integer 0, which cannot be promoted together with string
        # choices on recent NumPy releases.
        labelled[f'{ddg_column}-prediction'] = np.select(
            [ddg.isna().to_numpy(), correct_sign],
            ['0', 'success'],
            default='failure',
        )
    return labelled


In [122]:
# Label the control dataset, then relabel its four columns by position so the
# FoldX columns carry the 'originalpdb_' prefix.
contol = prediction_status(contol_dataset).set_axis(
    [
        'exp_binding',
        'protein',
        'originalpdb_foldx-ddg',
        'originalpdb_foldx-ddg-prediction',
    ],
    axis=1,
)


In [152]:
# Load the previously published benchmarking table and keep its FoldX columns.
RBD_ACE2_benchmarking_dataset = pd.read_csv(outputdir / 'SARS-CoV-2-RBD_ACE2_benchmarking_dataset.csv', delimiter=',')
data = RBD_ACE2_benchmarking_dataset[['#case_id', 'foldx-ddg', 'foldx-ddg-prediction']]
data = data.set_index('#case_id')

# Put the control scores and the published scores side by side, aligned on case id.
main_dataset = pd.concat([contol, data], axis=1)
# Later cells read the case id from a column literally named 'index'.
# reset_index() only produces that name when the index is unnamed, and whether
# the '#case_id' name survives the concat above may depend on the pandas
# version, so the name is set explicitly first.
main_dataset = main_dataset.rename_axis('index').reset_index()


In [156]:
# Flatten the table into plain row lists. In each row, position 0 is the case
# id and positions 4 and 6 are the two prediction labels being compared.
main_dataset_list = main_dataset.values.tolist()

# Collect the ids of the cases whose two prediction labels disagree.
diff_cases = [row[0] for row in main_dataset_list if row[4] != row[6]]
print(diff_cases)
len(diff_cases)


['A386L', 'D30T', 'M82R', 'N330H', 'Q325T', 'Q325Y', 'Q42C', 'Q42H', 'Q42K', 'Q42L', 'Q42R', 'Q42T', 'Q42V', 'R393K', 'T27H', 'T324Q', 'L455M', 'V503L', 'D30P', 'G354Q', 'H34K', 'K353R', 'L45K', 'Q24K', 'Y41M', 'R403W']


26

In [226]:
# Gather the rows of every differing case, in the order of `diff_cases`.
# The pieces are collected first and concatenated once: calling pd.concat inside
# the loop re-copies all rows gathered so far on every iteration.
matching_rows = [main_dataset[main_dataset['index'] == case_id] for case_id in diff_cases]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df = pd.concat(matching_rows, axis=0) if matching_rows else pd.DataFrame()
df

Unnamed: 0,index,exp_binding,protein,originalpdb_foldx-ddg,originalpdb_foldx-ddg-prediction,foldx-ddg,foldx-ddg-prediction
1,A386L,2.6783,ACE2,-0.32,success,0.12,failure
5,D30T,0.537893,ACE2,0.06,failure,-0.67,success
34,M82R,1.08794,ACE2,-0.01,success,0.46,failure
36,N330H,2.64117,ACE2,0.01,failure,-0.05,success
45,Q325T,0.382183,ACE2,0.01,failure,-0.01,success
46,Q325Y,0.365574,ACE2,0.01,failure,-0.97,success
47,Q42C,2.85429,ACE2,0.0,failure,-0.01,success
48,Q42H,1.39636,ACE2,-0.46,success,0.03,failure
50,Q42K,1.72924,ACE2,0.02,failure,-0.05,success
51,Q42L,2.74213,ACE2,-0.31,success,0.01,failure


## Performance

In [143]:
def performance_calculation(df):
    """
    Calculate the success rate of every predictor in the input dataset.

    Each column whose name contains 'prediction' is taken to hold the binary
    prediction status of one predictor ('success' or 'failure'). For each such
    column the function counts the 'success' rows and expresses them as a
    percentage of all rows of `df`, rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with string column labels and one or more '<xx>-prediction'
        columns. No other column is required.

    Returns
    -------
    pandas.DataFrame
        One row per prediction column; column 0 holds the prediction column
        name and column 1 its success rate in percent. Rows without a label
        (missing values) count as not successful. An empty `df` yields a rate
        of 0.0 for every predictor.
    """
    columns = df.columns
    # get <xx>-prediction columns
    prediction_columns = columns[columns.str.contains('prediction')]
    # The denominator is the number of rows; it does not rely on an 'index' column.
    n_cases = len(df)
    performances = []
    for column in prediction_columns:
        n_success = int(df[column].eq("success").sum())
        # Guard against dividing by zero when a subset contains no rows.
        success_rate = round(n_success / n_cases * 100, 2) if n_cases else 0.0
        performances.append([f'{column}', success_rate])
    performances = pd.DataFrame(performances)

    return performances

In [147]:
# Overall performance of every predictor, followed by breakdowns by mutation
# type (enriched / depleted) and by protein (ACE2 / RBD).
is_enriched = main_dataset['exp_binding'] > 0
is_depleted = main_dataset['exp_binding'] < 0
is_ace2 = main_dataset.protein == 'ACE2'
is_rbd = main_dataset.protein == 'RBD'

Total = performance_calculation(main_dataset)
Enriched = performance_calculation(main_dataset[is_enriched])
Depleted = performance_calculation(main_dataset[is_depleted])
# NOTE: from here on the names ACE2 and RBD refer to performance tables, not to
# the experimental datasets loaded at the top of the notebook.
ACE2 = performance_calculation(main_dataset[is_ace2])
ACE2_Enriched = performance_calculation(main_dataset[is_ace2 & is_enriched])
ACE2_Depleted = performance_calculation(main_dataset[is_ace2 & is_depleted])
RBD = performance_calculation(main_dataset[is_rbd])
RBD_Enriched = performance_calculation(main_dataset[is_rbd & is_enriched])
RBD_Depleted = performance_calculation(main_dataset[is_rbd & is_depleted])

# Column 1 of each result holds the success rates; Total also supplies the
# predictor-name column (column 0).
subset_rates = [
    Enriched[1],
    Depleted[1],
    ACE2[1],
    ACE2_Enriched[1],
    ACE2_Depleted[1],
    RBD[1],
    RBD_Enriched[1],
    RBD_Depleted[1],
]
main_performance_table = pd.concat([Total, *subset_rates], axis=1)
main_performance_table.columns = [
    'Predictors',
    'Total',
    'Enriched',
    'Depleted',
    'ACE2',
    'ACE2 Enriched',
    'ACE2 Depleted',
    'RBD',
    'RBD Enriched',
    'RBD Depleted',
]
# Replace the raw prediction column names with display names (one per row, in row order).
main_performance_table['Predictors'] = ['FoldX original-pdb', 'FoldX-github']
main_performance_table.set_index('Predictors')

Unnamed: 0_level_0,Total,Enriched,Depleted,ACE2,ACE2 Enriched,ACE2 Depleted,RBD,RBD Enriched,RBD Depleted
Predictors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
FoldX original-pdb,63.12,49.62,76.52,60.34,44.94,75.56,69.05,59.52,78.57
FoldX-github,63.88,51.15,76.52,62.01,47.19,76.67,67.86,59.52,76.19
