# Evaluation of the Prediction servers NSitePred and ATPbind

In [2]:
import os
import json
from Bio.PDB import *
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Preparation of the Pandas DataFrames

PDB files to dataframes, including missing residues

In [3]:
# Function to parse PDB file and convert to DataFrame
def pdb_to_dataframe(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)
    data = []
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    data.append([atom.serial_number, atom.name, residue.resname, chain.id, residue.id[1], 
                                 atom.coord[0], atom.coord[1], atom.coord[2], atom.occupancy, atom.bfactor, atom.element])
    columns = ['Atom_ID', 'Atom_Name', 'Residue_Name', 'Chain_ID', 'Residue_ID', 
               'X', 'Y', 'Z', 'Occupancy', 'Bfactor', 'Element']
    return pd.DataFrame(data, columns=columns)

# Directory containing the PDB files
pdb_directory = '../Files/pdb'
# Directory containing the true binding CSV files
true_binding_directory = '../Files/true_bindings'

# Dictionary to store dataframes
pdb_dataframes = {}

# Mapping of three-letter amino acid codes to one-letter codes
three_to_one = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}

# Loop through all files in the directory
for pdb_file in os.listdir(pdb_directory):
    if not pdb_file.endswith('.pdb'):
        continue
    pdb_id, chain = pdb_file.split("_")[0], pdb_file.split("_")[1].split(".")[0]
    
    filepath = os.path.join(pdb_directory, pdb_file)
    pdb_df = pdb_to_dataframe(filepath)
    
    
    # Filter rows for CA atoms
    pdb_df = pdb_df[pdb_df['Atom_Name'] == 'CA']
    # Filter rows for specific chain
    pdb_df = pdb_df[pdb_df['Chain_ID'] == chain]
    # Keep only specific columns
    pdb_df = pdb_df[['Residue_Name', 'Residue_ID']]
    
    # add missing residues to the dataframe 
    h = parse_pdb_header(filepath)
    # Go through missing residues and add to list
    for residue in h['missing_residues']:
        if residue["chain"] == chain:
            pdb_df = pd.concat([pdb_df, 
                                pd.DataFrame.from_records([{
                                    'Residue_Name': residue["res_name"], 
                                    'Residue_ID': residue["ssseq"]}])])
            
    # Map three-letter amino acid codes to one-letter codes
    pdb_df['Residue_Name'] = pdb_df['Residue_Name'].map(three_to_one)
    # Remove duplicate rows
    pdb_df = pdb_df.drop_duplicates()
    # Sort by Residue_ID
    pdb_df = pdb_df.sort_values(by='Residue_ID')
    
    # Read the corresponding true binding CSV file
    true_binding_file = os.path.join(true_binding_directory, f"{pdb_id}_true.csv")
    if os.path.exists(true_binding_file):
        true_binding_df = pd.read_csv(true_binding_file)
        true_binding_set = set(zip(true_binding_df['Residue_Name'], true_binding_df['Residue_ID']))
        
        # Add the 'binding' column
        pdb_df['binding'] = pdb_df.apply(lambda row: 1 if (row['Residue_Name'], row['Residue_ID']) in true_binding_set else 0, axis=1)
    
    
    # Reset index
    pdb_df.reset_index(drop=True, inplace=True)

    pdb_dataframes[pdb_id] = pdb_df

# Display the keys of the dictionary to show the imported dataframes
print(pdb_dataframes.keys())
# Display the 3f5m.pdb dataframe as a table to test
pdb_dataframes['3f5m'].head()
pdb_dataframes['3f5m'].iloc[100:110]

dict_keys(['2j9c', '2x14', '3f5m', '4cta', '6ksh'])


Unnamed: 0,Residue_Name,Residue_ID,binding
100,G,101,0
101,I,102,0
102,V,103,0
103,T,104,0
104,C,105,1
105,G,106,1
106,G,107,1
107,I,108,0
108,C,109,0
109,P,110,0


NSite predictions to dataframes

In [4]:
# Directory containing the NSite predictions
directory = '../Files/NSite_predictions'

# Dictionary to store dataframes
ns_predictions = {}

# Loop through all files in the directory
for nsite_input_file in os.listdir(directory):
    if nsite_input_file.endswith('.csv'):
        filepath = os.path.join(directory, nsite_input_file)
        nsite_df = pd.read_csv(filepath)
        # Filter for relevant information
        nsite_df = nsite_df[['AA', 'ATP binding res.', ' ATP prob.']]
        nsite_df.rename(columns={'AA': 'Residue_Name',
                                 'ATP binding res.': 'binding',
                                 ' ATP prob.': 'probability'}, inplace=True)
        # Map binding column to 0 and 1
        nsite_df['binding'] = nsite_df['binding'].map({'B': 1, 'N': 0})
        
        # Get the starting Residue_ID from the corresponding PDB dataframe
        pdb_key = nsite_input_file.split("_")[0]
        if pdb_key in pdb_dataframes:
            start_residue_id = pdb_dataframes[pdb_key]['Residue_ID'].iloc[0]
            # start_residue_id = start_residue_id < 0 and start_residue_id or 1
            nsite_df.insert(1, 'Residue_ID', range(start_residue_id, start_residue_id + len(nsite_df)))
            ns_predictions[pdb_key] = nsite_df

# Display the keys of the dictionary to show the imported dataframes
print(ns_predictions.keys())
ns_predictions['2x14'].head()
ns_predictions['3f5m'].iloc[100:110]

dict_keys(['2j9c', '2x14', '3f5m', '4cta', '6ksh'])


Unnamed: 0,Residue_Name,Residue_ID,binding,probability
100,G,101,0,0.019
101,I,102,0,0.016
102,V,103,0,0.02
103,T,104,0,0.029
104,C,105,1,0.772
105,G,106,1,0.978
106,G,107,1,0.993
107,I,108,0,0.029
108,C,109,0,0.038
109,P,110,0,0.032


# Evaluation of NSitePred web server

In [5]:
# Check that the prediction and PDB dataframes have the same residue name for each residue ID
results = {} # Dict of merged dataframes

for pdb_id, pdb_df in pdb_dataframes.items():
    nsite_df = ns_predictions[pdb_id]    
        
    # Assuming pdb_df and nsite_df are your dataframes
    result = pd.merge(pdb_df, nsite_df, on='Residue_ID', how='left', suffixes=('_true', '_pred'))
    results[pdb_id]= result 
    
    # Print the pdb file where the Residue_Name columns do not match at some Residue_ID
    mismatch = result[result['Residue_Name_true'] != result['Residue_Name_pred']]
    if not mismatch.empty:
        print(pdb_id)
        print(mismatch)

results['3f5m'].iloc[340:346]
# results['3f5m'].tail()

Unnamed: 0,Residue_Name_true,Residue_ID,binding_true,Residue_Name_pred,binding_pred,probability
340,S,341,1,S,1,0.274
341,G,342,0,G,0,0.029
342,N,343,1,N,1,0.319
343,K,344,0,K,0,0.025
344,K,345,0,K,0,0.037
345,L,346,0,L,0,0.038


### Calculation of metrics for the NSitePred web server

Accuracy = $ \frac{TP + TN}{TP + TN + FP + FN} $ \
\
Precision = $ \frac{TP}{TP + FP} $ \
\
Recall = $ \frac{TP}{TP + FN} $ \
\
F1 = $ 2\times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $


In [7]:
# Calculate the metrics for each PDB file
results_directory = '../Files/results'
os.makedirs(results_directory, exist_ok=True)

for pdb_id, result in results.items():
    
    # Compute the number of true positives, false positives, true negatives, and false negatives
    result['TP'] = ((result['binding_true'] == 1) & (result['binding_pred'] == 1)).astype(int)
    result['FP'] = ((result['binding_true'] == 0) & (result['binding_pred'] == 1)).astype(int)
    result['TN'] = ((result['binding_true'] == 0) & (result['binding_pred'] == 0)).astype(int)
    result['FN'] = ((result['binding_true'] == 1) & (result['binding_pred'] == 0)).astype(int)
    
    # print(f"Results for {pdb_id}")
    # print(f"True Positives: {result['TP'].sum()}")
    # print(f"False Positives: {result['FP'].sum()}")
    # print(f"True Negatives: {result['TN'].sum()}")
    # print(f"False Negatives: {result['FN'].sum()}")
    # print()
    

    # Compute Metrics
    accuracy = accuracy_score(result["binding_true"], result["binding_pred"])
    precision = precision_score(result["binding_true"], result["binding_pred"])
    recall = recall_score(result["binding_true"], result["binding_pred"])
    f1 = f1_score(result["binding_true"], result["binding_pred"])
    auc = roc_auc_score(result["binding_true"], result["probability"])  # If probabilities are available
    if np.isnan(auc):
        auc = 0.0

    # Store results in a dictionary
    metrics = {
        'True Positives': int(result['TP'].sum()),
        'False Positives': int(result['FP'].sum()),
        'True Negatives': int(result['TN'].sum()),
        'False Negatives': int(result['FN'].sum()),
        'Accuracy': round(float(accuracy), 3),
        'Precision': round(float(precision), 3),
        'Recall': round(float(recall), 3),
        'F1-Score': round(float(f1), 3),
        'AUC': round(float(auc), 3)  # Only if probabilities exist
    }
    
    # Save the dictionary to a JSON file
    result_file = os.path.join(results_directory, f"{pdb_id}_results.json")
    with open(result_file, 'w') as f:
        json.dump(metrics, f, indent=4)

    # Print results
    # print(f"Results for {pdb_id}")
    # print(f"Accuracy: {accuracy:.3f}")
    # print(f"Precision: {precision:.3f}")
    # print(f"Recall: {recall:.3f}")
    # print(f"F1-Score: {f1:.3f}")
    # print(f"AUC: {auc:.3f}")  # Only if probabilities exist
    # print()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
