In [85]:
import pandas as pd
# ignore warning
import warnings
warnings.filterwarnings('ignore')


In [86]:
def filter_shared_ids(*dfs):
    """Restrict every dataframe to the rows whose 'id' occurs in all of them.

    Args:
        *dfs: One or more pandas DataFrames, each with an 'id' column.

    Returns:
        tuple: One filtered DataFrame per input, in the same order as given.

    Raises:
        ValueError: If no dataframe is passed.
    """
    if len(dfs) == 0:
        raise ValueError("At least one dataframe must be provided.")

    # Seed with the first frame's ids, then narrow with each remaining frame.
    first, *rest = dfs
    common = set(first['id'])
    for other in rest:
        common &= set(other['id'])

    return tuple(frame[frame['id'].isin(common)] for frame in dfs)

def pesto_add_id_col(pesto_df):
    """Add an 'id' column built from the '/'-separated 'PDBID' strings.

    For each 'PDBID' value the id is: the second '/'-segment with every
    character lower-cased, an underscore, then the first character of the
    fourth '/'-segment.

    The column is written onto the dataframe that was passed in, and that
    same dataframe object is returned.
    """
    def _to_id(pdbid):
        segments = pdbid.split('/')
        # Lower-casing char by char leaves digits untouched.
        lowered = "".join(ch.lower() for ch in segments[1])
        return lowered + "_" + segments[3][0]

    pesto_df['id'] = pesto_df['PDBID'].map(_to_id)
    return pesto_df

def calculate_mean_metrics(dfs, methods):
    """Build a table with one row of column means per dataframe.

    Args:
        dfs: Iterable of pandas DataFrames to summarise.
        methods: Row labels for the result, one per dataframe and in the
            same order as `dfs`.

    Returns:
        pandas.DataFrame: Indexed by `methods`; each row holds the means of
        the numeric columns of the corresponding input dataframe.
    """
    # numeric_only=True skips string columns such as 'id'. Without it,
    # pandas >= 2.0 raises TypeError on non-numeric columns instead of
    # silently dropping them.
    mean_metrics = [df.mean(numeric_only=True).to_dict() for df in dfs]

    mean_metrics_df = pd.DataFrame(mean_metrics)
    mean_metrics_df.index = methods

    return mean_metrics_df



### Performance Comparison on DNA Binding Site prediction

- GraphBind > CLAPE > PESTO
- The reason is that PeSTo is a general binding site prediction model, while GraphBind and CLAPE are trained specifically on DNA-binding proteins. These two models have an advantage here because the test set contains only DNA-binding proteins.
- General binding site models are more useful in practice since the input proteins are not always DNA binding proteins.
- However, we can always add guidance to a general model to make it more specific to proteins that bind a particular ligand.


In [87]:
# Load the DNA evaluation CSV for each method; the PeSTo table gets its 'id'
# column built from 'PDBID' by pesto_add_id_col.
clape_dna = pd.read_csv('CLAPE/DNA_eval_results.csv')
graphbind_dna = pd.read_csv('GraphBind/DNA_binding_result.csv')
pesto_dna = pesto_add_id_col(pd.read_csv('PeSTo/eval_result/NA_test.csv'))
# Keep only the rows whose id is present in all three tables.
clape_dna, graphbind_dna, pesto_dna = filter_shared_ids(clape_dna, graphbind_dna, pesto_dna)
# One row of column means per method.
dna_metrics_df = calculate_mean_metrics([clape_dna, graphbind_dna, pesto_dna], ['CLAPE', 'GraphBind', 'PeSTo'])

dna_metrics_df


Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
CLAPE,0.886279,0.649945,0.904117,0.483027,0.963744,0.491756,0.723385,0.292555,0.443303,0.532026,0.036256
GraphBind,0.89343,0.719578,0.905806,0.477135,0.969913,0.515963,0.917192,0.218446,0.703906,0.542736,0.030087
PeSTo,0.816705,0.447695,0.854641,0.248527,0.93264,0.21953,0.729337,0.21028,0.422027,0.280175,0.06736


### Performance Comparison on RNA Binding Site prediction

- GraphBind ~ PESTO > CLAPE
- CLAPE performs unsatisfactorily on RNA binding site prediction. The reason might be that it over-confidently predicts almost all residues as non-binding (in the table below its NPV is 0.91 and its STD is 0.03).

In [88]:
# Load the RNA evaluation CSV for CLAPE and GraphBind.
clape = pd.read_csv('CLAPE/RNA_eval_results.csv')
graphbind = pd.read_csv('GraphBind/RNA_binding_result.csv')
# NOTE(review): PeSTo is read from the same NA_test.csv as in the DNA cell —
# confirm this file is the intended RNA test set.
pesto = pesto_add_id_col(pd.read_csv('PeSTo/eval_result/NA_test.csv'))
# Keep only ids present in all three tables, then average each column per method.
clape, graphbind, pesto = filter_shared_ids(clape, graphbind, pesto)
rna_metrics_df = calculate_mean_metrics([clape, graphbind, pesto], ['CLAPE', 'GraphBind', 'PeSTo'])

rna_metrics_df


Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
CLAPE,0.909212,0.104994,0.913685,0.036058,0.993161,0.038582,0.514609,0.029473,0.10413,0.034718,0.006839
GraphBind,0.88767,0.36891,0.918957,0.282221,0.956585,0.23334,0.83174,0.18165,0.417895,0.246877,0.043415
PeSTo,0.822267,0.284933,0.875395,0.210856,0.927503,0.147944,0.674084,0.207429,0.30762,0.207842,0.072497


### Protein Binding Site Prediction
-  ScanNet-MSA > sScanNet-no-MSA > Pesto
- Model performance is consistently better than on the other binding site prediction tasks, as data for the protein binding task are more abundant.

In [89]:
# Load the two ScanNet metric tables (with and without MSA) and the PeSTo protein table.
scan_net_MSA = pd.read_csv('ScanNet/scan_net_MSA_metrics.csv')
scan_net_no_MSA = pd.read_csv('ScanNet/scan_net_noMSA_metrics.csv')
pesto = pesto_add_id_col(pd.read_csv('PeSTo/eval_result/Protein_test.csv'))
# Keep only the rows whose id is present in all three tables.
scan_net_MSA, scan_net_no_MSA, pesto = filter_shared_ids(scan_net_MSA, scan_net_no_MSA, pesto)

# NOTE(review): the row label 'sScanNet-no-MSA' looks like a typo for
# 'ScanNet-no-MSA' — confirm before renaming, since the saved output uses it.
metrics_df = calculate_mean_metrics([scan_net_MSA, scan_net_no_MSA, pesto], ['ScanNet-MSA', 'sScanNet-no-MSA', 'PeSTo'])
# 'predicted_binding_sites' is removed from the averaged table before display.
metrics_df = metrics_df.drop(columns=['predicted_binding_sites'])
metrics_df

Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
ScanNet-MSA,0.841457,0.625981,0.883196,0.552547,0.885764,0.455531,0.855044,0.23585,0.657742,0.539219,0.095039
sScanNet-no-MSA,0.839239,0.62853,0.865552,0.47173,0.90294,0.414089,0.845996,0.210564,0.634878,0.491383,0.077863
PeSTo,0.76944,0.448222,0.851509,0.483402,0.820948,0.289128,0.757734,0.262007,0.476759,0.42799,0.161452


In [97]:
def add_id_col(df):
    """Copy the 'pdbid' column into an 'id' column on `df` and return `df`.

    The column is added to the dataframe that was passed in (no copy is made).
    """
    df['id'] = df['pdbid']
    return df


# GraphBind results for three metal ions (Ca2+, Mg2+, Mn2+); no Zn2+ file is loaded here.
graphbind_ca = pd.read_csv('GraphBind/Ca2+_binding_result.csv')
graphbind_mg = pd.read_csv('GraphBind/Mg2+_binding_result.csv')
graphbind_mn = pd.read_csv('GraphBind/Mn2+_binding_result.csv')

# LMetalSite tables carry a 'pdbid' column; add_id_col copies it to 'id',
# the column filter_shared_ids matches on.
lmetalsite_ca = add_id_col(pd.read_csv('LMetalSite/CA_pred_metrics.csv'))
lmetalsite_mg = add_id_col(pd.read_csv('LMetalSite/MG_pred_metrics.csv'))
lmetalsite_mn = add_id_col(pd.read_csv('LMetalSite/MN_pred_metrics.csv'))
lmetalsite_zn = add_id_col(pd.read_csv('LMetalSite/ZN_pred_metrics.csv'))

# A single PeSTo ion table is compared against each metal-specific table in the cells below.
pesto = pesto_add_id_col(pd.read_csv('PeSTo/eval_result/ion_test.csv'))




## Metal Binding Site Prediction

In [98]:
# Ca2+: restrict the three tables to their shared ids, then take per-method column means.
metrics_df = calculate_mean_metrics(
    filter_shared_ids(graphbind_ca, lmetalsite_ca, pesto),
    ['GraphBind-Ca2+', 'LMetalSite-Ca2+', 'PeSTo']
)
metrics_df


Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
GraphBind-Ca2+,0.949611,0.249306,0.952448,0.143028,0.99621,0.160819,0.797399,0.076221,0.377568,0.148481,0.00379
LMetalSite-Ca2+,0.953099,0.556348,0.957361,0.219291,0.994061,0.295401,0.606676,0.105883,0.209777,0.259628,0.005939
PeSTo,0.921284,0.003759,0.92161,0.000548,0.999577,0.000311,0.635709,0.042203,0.162432,0.000957,0.000423


In [91]:
# Mg2+: restrict the three tables to their shared ids, then take per-method column means.
metrics_df = calculate_mean_metrics(
    filter_shared_ids(graphbind_mg, lmetalsite_mg, pesto),
    ['GraphBind-Mg2+', 'LMetalSite-Mg2+', 'PeSTo']
)
metrics_df

Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
GraphBind-Mg2+,0.975983,0.4375,0.977197,0.270833,0.998708,0.334235,0.804529,0.068019,0.36714,0.32197,0.001292
LMetalSite-Mg2+,0.974068,0.4375,0.97638,0.229167,0.997536,0.298573,0.613351,0.0694,0.221435,0.281548,0.002464
PeSTo,0.922282,0.0,0.923726,0.0,0.998162,-0.006583,0.650342,0.045243,0.158038,0.0,0.001838


In [92]:
# Mn2+: restrict the three tables to their shared ids, then take per-method column means.
metrics_df = calculate_mean_metrics(
    filter_shared_ids(graphbind_mn, lmetalsite_mn, pesto),
    ['GraphBind-Mn2+', 'LMetalSite-Mn2+', 'PeSTo']
)
metrics_df

Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
GraphBind-Mn2+,0.942375,0.333333,0.94237,0.25,1.0,0.288336,0.964878,0.043117,0.842999,0.285714,0.0
LMetalSite-Mn2+,0.967438,1.0,0.966181,0.469697,1.0,0.673607,0.734848,0.151916,0.502259,0.638889,0.0
PeSTo,0.940044,0.0,0.940044,0.0,1.0,0.0,0.695647,0.035628,0.342492,0.0,0.0


In [93]:

# Zn2+: only LMetalSite and PeSTo are compared here (no GraphBind table is passed).
metrics_df = calculate_mean_metrics(
    filter_shared_ids(lmetalsite_zn, pesto),
    ['LMetalSite-Zn2+', 'PeSTo']
)
metrics_df

Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
LMetalSite-Zn2+,0.961431,0.70231,0.966387,0.419659,0.992148,0.516807,0.705904,0.161157,0.402769,0.507449,0.007852
PeSTo,0.918554,0.219295,0.919714,0.052642,0.997661,0.094044,0.707557,0.063038,0.328255,0.079195,0.002339


## Ligand Binding Site Prediction

In [94]:
# Load the GraphBind ATP results and the PeSTo ligand table.
graphbind_atp = pd.read_csv('GraphBind/ATP_binding_result.csv')
# NOTE: this rebinds `pesto`, which the metal-ion cells also read. Re-running
# those cells after this one, without reloading ion_test.csv, would use the ligand table.
pesto = pesto_add_id_col(pd.read_csv('PeSTo/eval_result/Ligand_test.csv') )

# Restrict both tables to their shared ids, then take per-method column means.
metrics_df = calculate_mean_metrics(
    filter_shared_ids(graphbind_atp, pesto),
    ['GraphBind-ATP', 'PeSTo']
)
metrics_df

Unnamed: 0,Acc,PPV(Precision),NPV,TPR(Recall),TNR,MCC,ROC AUC,STD,PR AUC,F1,FPR
GraphBind-ATP,0.951029,0.569815,0.969721,0.538332,0.979204,0.512932,0.899108,0.184741,0.567494,0.517843,0.020796
PeSTo,0.895088,0.188235,0.895378,0.041468,0.999541,0.079902,0.750933,0.054941,0.366378,0.063925,0.000459
