# Evaluation of the web-server NSitePred

In [11]:
import os
from Bio.PDB import *
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Preparation of the Pandas DataFrames

### PDB files to dataframes, including missing residues

In [12]:
# Parse a PDB file into a flat, per-atom pandas DataFrame
def pdb_to_dataframe(pdb_file):
    """Read a PDB file and return one row per atom.

    Parameters
    ----------
    pdb_file
        Path (or handle) handed unchanged to ``PDBParser.get_structure``.

    Returns
    -------
    pandas.DataFrame
        One row per atom with the columns ``Atom_ID``, ``Atom_Name``,
        ``Residue_Name``, ``Chain_ID``, ``Residue_ID``, ``X``, ``Y``, ``Z``,
        ``Occupancy``, ``Bfactor`` and ``Element``. Atoms of every model in
        the structure are included, in iteration order.
    """
    column_names = ['Atom_ID', 'Atom_Name', 'Residue_Name', 'Chain_ID', 'Residue_ID',
                    'X', 'Y', 'Z', 'Occupancy', 'Bfactor', 'Element']

    structure = PDBParser(QUIET=True).get_structure('structure', pdb_file)

    # Walk the model -> chain -> residue -> atom hierarchy in a single pass.
    # Residue_ID is the second element of the residue id tuple.
    atom_rows = [
        [atom.serial_number, atom.name, residue.resname, chain.id, residue.id[1],
         atom.coord[0], atom.coord[1], atom.coord[2], atom.occupancy,
         atom.bfactor, atom.element]
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]
    return pd.DataFrame(atom_rows, columns=column_names)

# Directory containing the PDB files
pdb_directory = '../Files/pdb'
# Directory containing the true binding CSV files
true_binding_directory = '../Files/true_bindings'

# Dictionary to store dataframes: pdb_id -> per-residue DataFrame with the
# columns Residue_Name (one-letter code), Residue_ID and binding (0/1)
pdb_dataframes = {}

# Mapping of three-letter amino acid codes to one-letter codes.
# Residue names that are not listed here are mapped to NaN further below.
three_to_one = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}

# Loop through all files in the directory. sorted() makes the processing
# order (and therefore the dictionary order and the printed keys) independent
# of the order in which the file system happens to list the files.
for pdb_file in sorted(os.listdir(pdb_directory)):
    if not pdb_file.endswith('.pdb'):
        continue

    # File names are expected to look like "<pdb_id>_<chain>.pdb"
    name_parts = pdb_file.split("_")
    if len(name_parts) < 2:
        print(f"Skipping {pdb_file}: file name does not match <pdb_id>_<chain>.pdb")
        continue
    pdb_id, chain = name_parts[0], name_parts[1].split(".")[0]

    # Read the corresponding true binding CSV file
    true_binding_file = os.path.join(true_binding_directory, f"{pdb_id}_true.csv")
    true_binding_df = pd.read_csv(true_binding_file)

    # Skip this iteration if there is no ATP binding
    if true_binding_df.empty:
        continue

    # Create a set of tuples of (Residue_Name, Residue_ID) for the true binding residues.
    # NOTE(review): the lookup below happens after the one-letter mapping, so the
    # CSV presumably stores one-letter residue names - confirm against the files.
    true_binding_set = set(zip(true_binding_df['Residue_Name'], true_binding_df['Residue_ID']))

    filepath = os.path.join(pdb_directory, pdb_file)
    atoms_df = pdb_to_dataframe(filepath)

    # Keep one row per CA atom of the requested chain, with only the residue
    # columns. The explicit copy decouples the result from atoms_df so the
    # column assignments below never operate on a slice of another frame.
    is_chain_ca = (atoms_df['Atom_Name'] == 'CA') & (atoms_df['Chain_ID'] == chain)
    pdb_df = atoms_df.loc[is_chain_ca, ['Residue_Name', 'Residue_ID']].copy()

    # Add the residues that the PDB header lists as missing for this chain.
    # They are collected first and appended with a single concat instead of
    # one concat per residue.
    header = parse_pdb_header(filepath)
    missing_records = [
        {'Residue_Name': residue["res_name"], 'Residue_ID': residue["ssseq"]}
        for residue in header['missing_residues']
        if residue["chain"] == chain
    ]
    if missing_records:
        pdb_df = pd.concat([pdb_df, pd.DataFrame.from_records(missing_records)])

    # Without any residue there is nothing to evaluate, and later cells read
    # the first Residue_ID of every stored dataframe.
    if pdb_df.empty:
        print(f"Skipping {pdb_file}: no residues found for chain {chain}")
        continue

    # Map three-letter amino acid codes to one-letter codes
    pdb_df['Residue_Name'] = pdb_df['Residue_Name'].map(three_to_one)
    # Remove duplicate rows, sort by Residue_ID and renumber the index
    pdb_df = (
        pdb_df
        .drop_duplicates()
        .sort_values(by='Residue_ID')
        .reset_index(drop=True)
    )
    # Add the 'binding' column: 1 if (Residue_Name, Residue_ID) is a true
    # binding residue, else 0
    pdb_df['binding'] = [
        int(residue_key in true_binding_set)
        for residue_key in zip(pdb_df['Residue_Name'], pdb_df['Residue_ID'])
    ]

    pdb_dataframes[pdb_id] = pdb_df

# Display the first 10 keys to show the imported proteins
print(list(pdb_dataframes.keys())[:10])

['1d4x', '1j09', '1mb9', '1xdn', '1xdp', '1z0s', '2aqx', '2j9c', '2py7', '3amt']


### NSite predictions to dataframes

In [13]:
# Directory containing the NSite predictions
# (alternative location: '../Files/NSite_predictions')
directory = '../Bowen/NSitePred results'

# Dictionary to store dataframes: pdb_id -> NSitePred prediction DataFrame with
# the columns Residue_Name, Residue_ID, binding (0/1) and probability
ns_predictions = {}

# Column names as they appear in the NSitePred CSV files (note the leading
# space in ' ATP prob.'), mapped to the names used in this notebook
nsite_column_names = {
    'AA': 'Residue_Name',
    'ATP binding res.': 'binding',
    ' ATP prob.': 'probability',
}

# Loop through all files in the directory, in a deterministic order
for nsite_input_file in sorted(os.listdir(directory)):
    if not nsite_input_file.endswith('.csv'):
        continue
    pdb_id = nsite_input_file.split("_")[0]

    # Predictions can only be numbered and evaluated for proteins that have a
    # non-empty reference dataframe, so check this before reading the CSV.
    if pdb_id not in pdb_dataframes or pdb_dataframes[pdb_id].empty:
        continue

    filepath = os.path.join(directory, nsite_input_file)
    # Filter for relevant information, rename the columns and map the
    # binding flag to 1 ('B') / 0 ('N'); any other value becomes NaN.
    # Chaining returns fresh frames, so nothing is modified in place on a
    # column subset of the raw CSV frame.
    nsite_df = (
        pd.read_csv(filepath)[list(nsite_column_names)]
        .rename(columns=nsite_column_names)
        .assign(binding=lambda frame: frame['binding'].map({'B': 1, 'N': 0}))
    )

    # Number the predicted residues consecutively, starting at the first
    # Residue_ID of the corresponding PDB dataframe.
    # NOTE(review): this assumes the prediction covers a gap-free run of
    # residues beginning at that ID; numbering gaps in the PDB file would
    # shift the alignment - confirm with the residue-name mismatch check
    # performed after merging.
    start_residue_id = pdb_dataframes[pdb_id]['Residue_ID'].iloc[0]
    nsite_df.insert(1, 'Residue_ID', range(start_residue_id, start_residue_id + len(nsite_df)))
    ns_predictions[pdb_id] = nsite_df

# Display the first 10 keys of the dictionary to show the imported dataframes
print(list(ns_predictions.keys())[:10])

['1d4x', '1j09', '1mb9', '1xdn', '1xdp', '1z0s', '2aqx', '2j9c', '2py7', '3amt']


## Evaluation of NSitePred web server

In [14]:
# Merged per-residue tables (ground truth + prediction), keyed by PDB ID
results = {}

for pdb_id, pdb_df in pdb_dataframes.items():
    # Only proteins that also have an NSitePred prediction are evaluated
    if pdb_id not in ns_predictions:
        continue
    nsite_df = ns_predictions[pdb_id]

    # Inner join on Residue_ID; columns present in both frames receive the
    # suffixes '_true' (PDB side) and '_pred' (NSitePred side). Rows that
    # contain any missing value are discarded afterwards.
    result = pd.merge(
        pdb_df,
        nsite_df,
        on='Residue_ID',
        how='inner',
        suffixes=('_true', '_pred'),
    ).dropna()
    results[pdb_id] = result

    # Debugging aid: report every protein for which the residue names of the
    # two sources disagree at some Residue_ID
    name_conflicts = result.loc[result['Residue_Name_true'].ne(result['Residue_Name_pred'])]
    if len(name_conflicts) > 0:
        print(pdb_id)
        print(name_conflicts)

### Calculation of metrics for the NSitePred web server

Accuracy = $ \frac{TP + TN}{TP + TN + FP + FN} $ \
\
Precision = $ \frac{TP}{TP + FP} $ \
\
Recall = $ \frac{TP}{TP + FN} $ \
\
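F1 = $ 2\times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $ \
\
AUC = area under the ROC curve, computed from the predicted ATP-binding probability of each residue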


In [None]:
# Calculate the metrics for each PDB file
results_directory = '../Files/results'
os.makedirs(results_directory, exist_ok=True)

# Column order of the summary table
metric_columns = ['PDB_ID', 'TP', 'FP', 'TN', 'FN',
                  'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']

# One dict per protein; the summary dataframe is built once after the loop
# instead of being grown row by row with pd.concat.
metric_rows = []

for pdb_id, result in results.items():
    # The metrics are undefined without any residue
    if result.empty:
        print(f"Skipping {pdb_id}: no residues left after merging")
        continue

    y_true = result['binding_true']
    y_pred = result['binding_pred']

    # Per-residue indicator columns for true/false positives/negatives.
    # They are stored on the merged dataframe itself (i.e. inside `results`).
    result['TP'] = ((y_true == 1) & (y_pred == 1)).astype(int)
    result['FP'] = ((y_true == 0) & (y_pred == 1)).astype(int)
    result['TN'] = ((y_true == 0) & (y_pred == 0)).astype(int)
    result['FN'] = ((y_true == 1) & (y_pred == 0)).astype(int)

    # Compute metrics. zero_division=0 yields 0 for an undefined ratio (no
    # predicted or no true positives) without emitting a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # ROC AUC is undefined when the ground truth holds a single class.
    # Depending on the scikit-learn version roc_auc_score then raises or
    # returns NaN, so the case is checked explicitly. The 0.0 fallback is
    # kept for compatibility with the existing summary table.
    if y_true.nunique() < 2:
        auc = 0.0
    else:
        auc = roc_auc_score(y_true, result['probability'])
        if np.isnan(auc):
            auc = 0.0

    metric_rows.append({
        'PDB_ID': pdb_id,
        'TP': int(result['TP'].sum()),
        'FP': int(result['FP'].sum()),
        'TN': int(result['TN'].sum()),
        'FN': int(result['FN'].sum()),
        'Accuracy': round(float(accuracy), 3),
        'Precision': round(float(precision), 3),
        'Recall': round(float(recall), 3),
        'F1-Score': round(float(f1), 3),
        'AUC': round(float(auc), 3)
    })

# Summary table with one row per evaluated protein
results_df = pd.DataFrame(metric_rows, columns=metric_columns)

print(results_df.head())

  PDB_ID  TP  FP   TN  FN  Accuracy  Precision  Recall  F1-Score    AUC
0   1d4x  14  11  343   7     0.952      0.560   0.667     0.609  0.913
1   1j09  12   0  454   2     0.996      1.000   0.857     0.923  0.961
2   1mb9  17   2  492   2     0.992      0.895   0.895     0.895  0.955
3   1xdn  14   0  259   1     0.996      1.000   0.933     0.966  1.000
4   1xdp  12   2  671   2     0.994      0.857   0.857     0.857  0.924
