## Evaluation of the prediction servers NSitePred and ATPbind

In [5]:
import os
from Bio.PDB import *
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [15]:
# Read the header of the PDB file only
h = parse_pdb_header('x1/2x14.pdb')
# Chain whose missing residues should be reported
chain_of_interest = "A"
# Walk the header's 'missing_residues' entries and print the sequence number
# and residue name of every entry that belongs to the chain of interest
for entry in h['missing_residues']:
    if entry["chain"] not in chain_of_interest:
        continue
    print(entry["ssseq"], entry["res_name"])
        

2 SER
133 LYS
134 ASP
135 ALA
136 SER
137 GLY
138 ASN
139 LYS
140 VAL
141 LYS
142 ALA


In [3]:
# Function to parse PDB file and convert to DataFrame
def pdb_to_dataframe(pdb_file):
    """Parse a PDB file into a DataFrame with one row per atom.

    Every model, chain and residue of the parsed structure is visited, so an
    atom that appears in several models yields one row per model.

    Args:
        pdb_file: PDB file to read; passed unchanged to
            ``PDBParser.get_structure``.

    Returns:
        pandas.DataFrame with the columns Atom_ID (atom serial number),
        Atom_Name, Residue_Name, Chain_ID, Residue_ID (``residue.id[1]``),
        X, Y, Z, Occupancy, Bfactor and Element.
    """
    column_names = ['Atom_ID', 'Atom_Name', 'Residue_Name', 'Chain_ID', 'Residue_ID',
                    'X', 'Y', 'Z', 'Occupancy', 'Bfactor', 'Element']
    # QUIET=True is forwarded to Biopython's parser
    parsed = PDBParser(QUIET=True).get_structure('structure', pdb_file)
    atom_rows = [
        [
            atm.serial_number, atm.name, res.resname, chn.id, res.id[1],
            atm.coord[0], atm.coord[1], atm.coord[2],
            atm.occupancy, atm.bfactor, atm.element,
        ]
        for mdl in parsed
        for chn in mdl
        for res in chn
        for atm in res
    ]
    return pd.DataFrame(atom_rows, columns=column_names)

# Directory containing the PDB files
pdb_directory = '../Files/pdb'

# Per-structure residue tables, keyed by the part of the file name that
# precedes the first underscore (with the file extension removed)
pdb_dataframes = {}

# Mapping of three-letter amino acid codes to one-letter codes
three_to_one = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}

# Loop through all PDB files in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to keep the dictionary order reproducible.
for filename in sorted(os.listdir(pdb_directory)):
    if not filename.endswith('.pdb'):
        continue
    filepath = os.path.join(pdb_directory, filename)
    pdb_df = pdb_to_dataframe(filepath)
    # Keep the rows whose atom name is 'CA' and only the columns needed later.
    # NOTE(review): the atom name 'CA' presumably also matches calcium ions;
    # such rows would get a NaN Residue_Name below because their residue name
    # is not in three_to_one - confirm whether they should be filtered out.
    pdb_df = pdb_df.loc[pdb_df['Atom_Name'] == 'CA',
                        ['Atom_Name', 'Residue_Name', 'Residue_ID']]
    # Chain_ID is not kept, so identical rows are collapsed into one
    pdb_df = pdb_df.drop_duplicates()
    # Map three-letter amino acid codes to one-letter codes. assign() builds a
    # new frame instead of writing into a filtered selection, which avoids
    # pandas' chained-assignment warning.
    pdb_df = pdb_df.assign(Residue_Name=pdb_df['Residue_Name'].map(three_to_one))
    # Renumber the rows from 0
    pdb_df = pdb_df.reset_index(drop=True)

    # Strip the extension before splitting so that both "3f5m.pdb" and
    # "3f5m_<suffix>.pdb" are stored under the key "3f5m"
    key = os.path.splitext(filename)[0].split("_")[0]
    pdb_dataframes[key] = pdb_df

# Display the keys of the dictionary to show the imported dataframes
print(pdb_dataframes.keys())
# Show the first rows of the 3f5m dataframe as a quick check
print(pdb_dataframes['3f5m'].head())

NameError: name 'os' is not defined

In [None]:
# Directory containing the NSite predictions
directory = '../Files/NSite_predictions'

# Prediction tables, keyed like pdb_dataframes (file name up to the first
# underscore, extension removed)
ns_predictions = {}

# Loop through all CSV files in the directory; sorted() keeps the dictionary
# order reproducible because os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)
    # Keep only the relevant columns (the probability column header really
    # starts with a space) and name the amino-acid column like the PDB tables.
    # rename() without inplace returns a new frame, so nothing is written into
    # a column selection of the CSV data.
    df = df[['AA', 'ATP binding res.', ' ATP prob.']].rename(columns={'AA': 'Residue_Name'})
    # Strip the extension before splitting so "3f5m.csv" and
    # "3f5m_<suffix>.csv" both resolve to the key "3f5m"
    pdb_key = os.path.splitext(filename)[0].split("_")[0]
    # Predictions without a matching PDB table are skipped
    if pdb_key not in pdb_dataframes:
        continue
    first_pdb_residue_id = pdb_dataframes[pdb_key]['Residue_ID'].iloc[0]
    # Numbering rule: start at the first PDB residue ID when it is negative,
    # otherwise start at 1.
    # NOTE(review): this presumably assumes the prediction covers the full
    # sequence from residue 1 even when the first PDB residue has a higher
    # ID - confirm against the prediction server input.
    start_residue_id = first_pdb_residue_id if first_pdb_residue_id < 0 else 1
    # Number the prediction rows consecutively from the chosen start
    df.insert(1, 'Residue_ID', range(start_residue_id, start_residue_id + len(df)))
    ns_predictions[pdb_key] = df

# Display the keys of the dictionary to show the imported dataframes
print(ns_predictions.keys())
print(ns_predictions['3f5m'].head())

dict_keys(['2j9c', '2x14', '3f5m', '4cta', '6ksh'])
  Residue_Name  Residue_ID ATP binding res.   ATP prob.
0            M           1                N       0.023
1            A           2                N       0.027
2            V           3                N       0.027
3            E           4                N       0.031
4            S           5                N       0.027


In [None]:
# Write every table in pdb_dataframes to "<key>_pdb.csv"; the path is relative,
# so the files are created in the current working directory. The row index is
# not written.
for pdb_id, residue_table in pdb_dataframes.items():
    residue_table.to_csv(f"{pdb_id}_pdb.csv", index=False)

In [None]:
# Check that the prediction and PDB dataframes have the same residue name for each residue ID
for filename, pdb_df in pdb_dataframes.items():
    nsite_df = ns_predictions[filename]    
        
    # Assuming pdb_df and nsite_df are your dataframes
    result = pd.merge(pdb_df, nsite_df, on='Residue_ID', how='left', suffixes=('_pdb', '_nsite'))
    
    # Display the Residue Id for which the Residue Name is different
