In [1]:
#Determine residues of interest by finding set of all binding site and second shell residues in ancestral FDMO library
import os
import sys
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import pickle

#Set the working directory; every relative path used in this notebook
#('../si_data/...', 'resis_of_interest/...') is resolved against it.
#NOTE(review): hard-coded absolute path - adjust when running on another machine.
os.chdir('/home/azamh/demo/seq_struct_func/msa')

In [2]:
#Get list of enzymes in ancestral FDMO library (enzymes with experimental annotations)
#Annotation table: column names come from the first sheet row, row labels from the first column
asr_seq_annotations = pd.read_excel(
    '../si_data/asr_seq_annotations.xlsx',
    header=0,
    index_col=0,
)
#Discard every row that has at least one missing value
ancestral_fdmo_library = asr_seq_annotations.dropna(axis='index', how='any')
ancestral_fdmo_library

Unnamed: 0,Sequence,2_exp_stereo,2_exp_conversion,2_pred_stereo,2_pred_reactivity,3_exp_stereo,3_exp_conversion,3_pred_stereo,3_pred_reactivity,4_exp_stereo,...,4_pred_stereo,4_pred_reactivity,5_exp_stereo,5_exp_conversion,5_pred_stereo,5_pred_reactivity,average_exp_stereo,average_exp_conversion,average_pred_stereo,average_pred_reactivity
280,NGNSRSPLEVAIVGGGITGLALAVGLLKRNVNFTIYERAASFGELG...,-,0.0,1,0.0,-,0.0,-1,0.0,-,...,1,0.0,-,0.0,1,0.0,-,0.000,1,0.0
280a,NGNSRAPLQVAIVGGGLTGLALALGLLRRNINFTIYERAASFGELG...,-,0.0,1,0.0,-,0.0,1,0.0,-,...,1,0.0,-,0.0,1,0.0,-,0.000,1,0.0
284,TSTDEAPLHVAIVGGGITGLTLALGLLSRNINFTIYERARSFREIG...,1,62.0,1,1.0,1,66.9,-1,0.0,1,...,-1,0.0,1,60.3,-1,0.0,1,49.400,-1,1.0
284a,MTSEESPFHVAIVGGGITGLSLALGLLKRGISFTIYERARSFGEIG...,1,17.0,1,1.0,1,50.5,-1,1.0,-,...,-1,0.0,1,5.8,-1,1.0,1,18.325,-1,1.0
304,TTEEEPLEIAIVGGGIAGLTLALGLLKRNIKVTIYEQARSFREIGA...,-,0.0,1,1.0,-,0.0,-1,0.0,-,...,-1,0.0,-,0.0,-1,0.0,-,0.000,-1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,MAKTKNFEIAIVGGGIAGLTLAIALHHRNIPVTIYEQAPQFGEIGA...,-,0.0,1,0.0,-,0.0,-1,0.0,-,...,-1,0.0,-,0.0,-1,0.0,-,0.000,-1,0.0
488a,MSPTKNFEIAIIGGGIAGLTLAIALHHRNIPVTIYEQAPQFGEIGA...,-,0.0,1,0.0,-,0.0,-1,0.0,-,...,-1,0.0,-,0.0,-1,0.0,-,0.000,-1,0.0
afod,MADHEQEQEPLSIAIIGGGIIGLMTALGLLHRNIGKVTIYERASAW...,-1,35.4,-1,0.0,-1,47.9,1,0.0,-1,...,-1,0.0,-1,23.4,-1,0.0,-1,33.425,-1,0.0
azah,MSTDSIEVAIIGAGITGITLALGLLSRGIPVRVYERARDFHEIGAG...,1,54.1,1,0.0,1,59.3,-1,0.0,1,...,1,0.0,1,13.0,-1,0.0,1,39.075,0,0.0


In [3]:
#Arguments
#Protein IDs: row labels of the annotation table after rows with missing values were dropped
proteins = ancestral_fdmo_library.index
#Ligand identifiers; used below to build the pose file names '{protein}_{ligand}.pdb'
ligands = (2, 3, 4, 5)
#Directory holding the ligand pose PDB files (relative to the working directory)
posedir = '../si_data/top_dock_pose'
#Directory holding the protein PDB files named '{protein}_fad.pdb'
pdbdir = '../si_data/pdb_with_fad'

In [4]:
#Functions for manipulating PDB with pandas dataframes (same functions for prediction of stereochemistry)
#Read in pdb as a dataframe
def pdb_as_df(pdb_file):
    """Parse the ATOM/HETATM records of a PDB file into a DataFrame.

    Each field is cut from a fixed character range of the record line and
    stripped of surrounding whitespace. All other record types are ignored.

    Parameters
    ----------
    pdb_file : str or path-like
        Path of the PDB file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ATOM/HETATM record, in file order, indexed 0..n-1, with
        columns atom_num, atom_name, resi_name, chain, resi_num, atom_x,
        atom_y, atom_z, atom_occ, atom_b and atom_segid. atom_num and
        resi_num are cast to int, the coordinates, occupancy and B-factor
        to float; the remaining columns stay strings. A file without any
        ATOM/HETATM record yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a numeric field of a record is blank or not a number.
    """
    #field name -> (start, stop) character slice within a record line
    field_slices = {
        "atom_num": (6, 11),
        "atom_name": (12, 16),
        "resi_name": (17, 21),
        "chain": (21, 22),
        "resi_num": (22, 26),
        "atom_x": (30, 38),
        "atom_y": (38, 46),
        "atom_z": (46, 54),
        "atom_occ": (54, 60),
        "atom_b": (60, 66),
        "atom_segid": (72, 76),
    }
    numeric_dtypes = {
        "atom_num": 'int',
        'resi_num': 'int',
        'atom_x': 'float',
        'atom_y': 'float',
        'atom_z': 'float',
        'atom_occ': 'float',
        'atom_b': 'float',
    }

    #context manager guarantees the file handle is closed, even on error
    with open(pdb_file, 'r') as handle:
        pdb_txt = handle.read().splitlines()

    records = [
        {field: line[start:stop].strip()
         for field, (start, stop) in field_slices.items()}
        for line in pdb_txt
        if line.startswith(('ATOM', 'HETATM'))
    ]

    #passing the column list keeps the schema stable when no record was found,
    #so the dtype cast below also works on an empty frame
    pdb_df = pd.DataFrame(records, columns=list(field_slices))
    pdb_df = pdb_df.astype(numeric_dtypes)

    return pdb_df

#Selections
def get_atoms(df, property, value):
    """Select the rows of ``df`` whose ``property`` column matches ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Atom table to filter.
    property : str
        Name of the column to test.
    value : scalar or list/tuple/set/frozenset
        A collection keeps rows whose column value is a member of it;
        anything else keeps rows whose column value equals it.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, re-indexed from 0.
    """
    #treat every plain collection as a membership test, not only lists:
    #comparing a column with == against a tuple/set does not test membership
    if isinstance(value, (list, tuple, set, frozenset)):
        mask = df[property].isin(value)
    else:
        mask = df[property] == value
    return df.loc[mask].reset_index(drop = True)

def get_heavy_atoms(df):
    """Drop every row whose atom_name starts with 'H' and re-index from 0."""
    name_starts_with_h = df['atom_name'].str.startswith('H')
    kept_rows = df.loc[~name_starts_with_h]
    return kept_rows.reset_index(drop = True)

def get_prot_heavy_atoms(df, pseg = 'PROA'):
    """Return the atoms of segment ``pseg`` whose atom_name does not start with 'H'.

    The segment is selected on the atom_segid column first; the result is
    then passed through get_heavy_atoms.
    """
    segment_atoms = get_atoms(df, 'atom_segid', pseg)
    return get_heavy_atoms(segment_atoms)

def get_atom_coor(df, atom_name):
    """Return the (x, y, z) coordinates of the first atom named ``atom_name``.

    Raises IndexError (from ``.iloc[0]``) when no atom has that name.
    """
    first_match = get_atoms(df, 'atom_name', atom_name).iloc[0]
    coords = first_match[['atom_x', 'atom_y', 'atom_z']].values
    return coords[0], coords[1], coords[2]

In [5]:
#Functions for finding contacts:
#pairwise distances
def pairwise_dist(df1, df2):
    """Euclidean distance between every atom of ``df1`` and every atom of ``df2``.

    Both frames must provide atom_x, atom_y and atom_z columns. The result
    is a DataFrame with one row per row of ``df1`` and one column per row
    of ``df2``, both labelled positionally from 0.
    """
    coord_cols = ['atom_x', 'atom_y', 'atom_z']
    xyz_rows = df1[coord_cols]
    xyz_cols = df2[coord_cols]
    return pd.DataFrame(euclidean_distances(xyz_rows, xyz_cols))

#Get list of residues within cutoff in distance matrix
def find_prot_lig_contacts(prot_df, lig_dist_df, cutoff=4.5):
    """Collect the residue numbers of protein atoms in contact with the ligand.

    Parameters
    ----------
    prot_df : pandas.DataFrame
        Protein atom table with a resi_num column; its index labels must
        match the row labels of ``lig_dist_df``.
    lig_dist_df : pandas.DataFrame
        Distance matrix with one row per protein atom and one column per
        ligand atom.
    cutoff : float, default 4.5
        An atom is in contact when at least one distance in its row is
        less than or equal to this value.

    Returns
    -------
    set
        Residue numbers (resi_num values) owning at least one contact atom;
        empty when nothing is within the cutoff.
    """
    #compare first, then reduce: masking the frame and calling .any() on the
    #surviving distances would test their truthiness and miss a distance of 0.0
    atom_has_contact = (lig_dist_df <= cutoff).any(axis = 1)
    contact_atom_labels = atom_has_contact[atom_has_contact].index
    return set(prot_df['resi_num'].loc[contact_atom_labels].to_numpy())

#Transform atom distances to resi distances
def get_resi_distances(prot_df, dist_df):
    dist_resi_df = dist_df.copy()
    dist_resi_df['resi_num'] = prot_df['resi_num'].copy()

    row_wise_groupby = dist_resi_df.groupby(
        ['resi_num'], axis=0, as_index=False).min().drop('resi_num', axis=1).T
    row_wise_groupby['resi_num'] = prot_df['resi_num']
    resi_dist_df = row_wise_groupby.groupby(['resi_num'], axis=0,
                                            as_index=False).min().drop('resi_num', axis=1).T

    return resi_dist_df

#get second shell residues based on distance to binding site residues
def get_second_shell(prot_df, prot_dist_df, binding_site_resis, cutoff=4.5):
    """Find residues near the binding site that are not part of it.

    Parameters
    ----------
    prot_df : pandas.DataFrame
        Protein atom table with a resi_num column.
    prot_dist_df : pandas.DataFrame
        Atom-by-atom distance matrix of the protein, labelled like the
        index of ``prot_df``.
    binding_site_resis : set
        Residue numbers (resi_num values) of the binding site.
    cutoff : float, default 4.5
        A residue qualifies when its minimum atom-atom distance to at least
        one binding site residue is less than or equal to this value.

    Returns
    -------
    set
        Residue numbers within the cutoff of the binding site, excluding
        the binding site residues themselves.
    """
    #get distance matrix in terms of residues not atoms
    resi_dist_df = get_resi_distances(prot_df, prot_dist_df)

    #get_resi_distances labels rows/columns 0..R-1 in ascending residue order;
    #relabel with the real residue numbers so they match binding_site_resis
    #even when numbering does not start at 1 or has gaps (for contiguous
    #1..R numbering this equals shifting the labels by one)
    resi_labels = sorted(prot_df['resi_num'].unique())
    resi_dist_df.index = resi_labels
    resi_dist_df.columns = resi_labels

    #reduce distance matrix to binding site residues
    binding_site_df = resi_dist_df.loc[resi_dist_df.index.isin(
        binding_site_resis)]

    #compare first, then reduce over the binding site rows; calling .any() on
    #masked distances would test their truthiness and miss a distance of 0.0
    resi_within_cutoff = (binding_site_df <= cutoff).any(axis = 0)
    nearby_resis = resi_within_cutoff[resi_within_cutoff].index

    #binding site residues are always near themselves, so leave them out
    return {resi for resi in nearby_resis if resi not in binding_site_resis}

In [6]:
#Dictionaries for binding site and second shell residues
#Both map a protein ID to a set of residue numbers and are filled by the loop below
bs_dict = {}
ss_dict = {}
for protein in proteins:
    print(protein)
    
    #Read in pdb as dataframe
    pdb_prot_file = os.path.join(pdbdir, f'{protein}_fad.pdb')
    print('READING PROTEIN PDB')
    prot_df = pdb_as_df(pdb_prot_file)
    #keep only atoms of segment 'PROA' (default of get_prot_heavy_atoms) whose name does not start with 'H'
    prot_df = get_prot_heavy_atoms(prot_df)
    
    #create distance matrix for protein for finding second shell residues
    print('CALCULATING PROTEIN DISTANCE MATRIX')
    prot_dist_df = pairwise_dist(prot_df, prot_df)
    
    #get set of binding site residues from ligand contacts
    #(union over all ligands, accumulated inside the inner loop)
    binding_site_resis = set()
    
    for ligand in ligands:
        print('FINDING CONTACTS FOR', ligand)
        
        #pose file of this protein/ligand pair
        pdb_lig_file = os.path.join(
                posedir, f'{protein}_{ligand}.pdb')
        
        #read ligand pdb as df
        #(all records of the pose file except atoms whose name starts with 'H')
        lig_df = pdb_as_df(pdb_lig_file)
        lig_df = get_heavy_atoms(lig_df)
        
        #create distance df for ligand
        #(rows: protein atoms, columns: ligand atoms)
        lig_dist_df = pairwise_dist(prot_df, lig_df)
        
        #Find contacts
        #(default cutoff of find_prot_lig_contacts, 4.5)
        resis_w_contact = find_prot_lig_contacts(prot_df, lig_dist_df)
        print('\t',resis_w_contact)
        binding_site_resis.update(resis_w_contact)
    
    print('ALL CONTACTS:', binding_site_resis)
    bs_dict[protein] = binding_site_resis
    
    #Find second shell residues from binding site ones
    #(default cutoff of get_second_shell, 4.5; binding site residues are excluded from the result)
    second_shell_resis = get_second_shell(prot_df, prot_dist_df, binding_site_resis)
    ss_dict[protein] = second_shell_resis
    
    

280
READING PROTEIN PDB
CALCULATING PROTEIN DISTANCE MATRIX
FINDING CONTACTS FOR 2
	 {320, 225, 319, 106, 234, 236, 238, 48, 50, 82, 84, 381, 317, 318, 223}
FINDING CONTACTS FOR 3
	 {320, 225, 106, 234, 236, 238, 48, 50, 82, 84, 381, 317, 221, 318, 223}
FINDING CONTACTS FOR 4
	 {320, 225, 319, 106, 234, 236, 238, 48, 49, 50, 82, 405, 317, 318, 223}
FINDING CONTACTS FOR 5
	 {320, 225, 319, 106, 236, 238, 50, 82, 84, 381, 317, 221, 318, 223}
ALL CONTACTS: {320, 82, 84, 405, 221, 223, 225, 106, 234, 236, 238, 48, 49, 50, 317, 381, 318, 319}
280a
READING PROTEIN PDB
CALCULATING PROTEIN DISTANCE MATRIX
FINDING CONTACTS FOR 2
	 {320, 225, 319, 106, 236, 238, 48, 49, 50, 82, 84, 381, 317, 221, 318, 223}
FINDING CONTACTS FOR 3
	 {192, 225, 320, 319, 236, 238, 48, 50, 82, 84, 381, 214, 317, 221, 318, 223}
FINDING CONTACTS FOR 4
	 {192, 225, 320, 319, 106, 234, 236, 48, 49, 50, 82, 405, 317, 318, 223}
FINDING CONTACTS FOR 5
	 {320, 225, 319, 106, 236, 238, 48, 50, 381, 317, 221, 318, 223}
ALL CO

In [7]:
#Save with pickle
#Create the output directory if it is missing, and write through context
#managers so each file is flushed and closed as soon as it has been written
os.makedirs('resis_of_interest', exist_ok=True)
with open('resis_of_interest/bs_dict.pkl', 'wb') as bs_file:
    pickle.dump(bs_dict, bs_file)
with open('resis_of_interest/ss_dict.pkl', 'wb') as ss_file:
    pickle.dump(ss_dict, ss_file)