In [20]:
#Slice MSA by limiting to columns of interest determined by binding site and second shell residues
import os
import pickle
import pandas as pd

#NOTE(review): hard-coded absolute path; every relative path used later in this
#notebook ('alignment/...', 'resis_of_interest/...') resolves against it
os.chdir('/home/azamh/demo/seq_struct_func/msa')

In [21]:
#Load in MSA and dictionaries holding residues of interest
#Both sheets use their first row as column labels and first column as row labels
msa_df = pd.read_excel('alignment/msa_df.xlsx', header = 0, index_col = 0)
resi_df = pd.read_excel('alignment/resi_df.xlsx', header = 0, index_col = 0)

#Open the pickles in context managers so the file handles are closed right
#after loading instead of lingering until garbage collection
#NOTE(review): pickle.load can execute arbitrary code from the file; only load
#pickles written by a trusted step of this pipeline
with open('resis_of_interest/bs_dict.pkl', 'rb') as bs_file:
    bs_dict = pickle.load(bs_file)
with open('resis_of_interest/ss_dict.pkl', 'rb') as ss_file:
    ss_dict = pickle.load(ss_file)
resis_of_interest = set()

In [22]:
#Functions to convert between msa and protein indexing
#convert msa resi to protein resi
def msa_to_protein_numbering(msa_df, resi_df, resi, protein):
    """Read one (column, row) cell from both frames.

    Args:
        msa_df: frame indexed first by column label, then by row label.
        resi_df: frame indexed identically to ``msa_df``.
        resi: column label to read.
        protein: row label to read.

    Returns:
        Tuple ``(msa_df entry, resi_df entry)`` for that column and row,
        in that order.
    """
    msa_entry = msa_df[resi][protein]
    resi_entry = resi_df[resi][protein]
    return msa_entry, resi_entry

#convert prot resi to msa column index
def prot_to_msa_numbering(msa_df, resi_df, resi, protein):
    """Locate a residue within one protein's row of ``resi_df``.

    Args:
        msa_df: not used by this function; kept so the signature matches the
            other numbering helpers.
        resi_df: frame whose row ``protein`` is searched.
        resi: entry to look for in that row.
        protein: row label in ``resi_df``.

    Returns:
        0-based position of the first entry in the row equal to ``resi``.
        This is a position, not a column label; the two coincide only when
        the columns of ``resi_df`` are labelled 0, 1, 2, ...

    Raises:
        KeyError: if ``protein`` is not a row label of ``resi_df``.
        ValueError: if ``resi`` does not occur in the row.
    """
    prot_resi_series = list(resi_df.loc[protein])
    try:
        return prot_resi_series.index(resi)
    except ValueError:
        # list.index only reports "x is not in list"; say which protein failed
        raise ValueError(
            f'residue {resi!r} not found in resi_df row for protein {protein!r}'
        ) from None

#Convert prot resi to another prot resi
def prot_to_prot_numbering(msa_df, resi_df, resi1, protein1, protein2):
    """Map a residue of ``protein1`` onto ``protein2`` through the alignment.

    The residue is first located in ``protein1``'s row with
    ``prot_to_msa_numbering``; the result is then used as the column passed to
    ``msa_to_protein_numbering`` for ``protein2``.

    Returns:
        The ``(msa_df entry, resi_df entry)`` tuple produced by
        ``msa_to_protein_numbering`` for ``protein2``.
    """
    return msa_to_protein_numbering(
        msa_df,
        resi_df,
        prot_to_msa_numbering(msa_df, resi_df, resi1, protein1),
        protein2,
    )

In [23]:
#Combine bs and ss dictionaries: per protein, the union of both residue sets
prot_resi_dict = {
    protein: bs_resis | ss_dict[protein]
    for protein, bs_resis in bs_dict.items()
}
#Sanity check on tropb: the union only has this size if the two sets are disjoint
assert len(bs_dict['tropb']) + len(ss_dict['tropb']) == len(prot_resi_dict['tropb'])
print(prot_resi_dict['tropb'])

{22, 278, 292, 296, 297, 299, 301, 51, 52, 53, 54, 55, 56, 57, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 81, 82, 83, 84, 94, 95, 96, 97, 98, 110, 111, 112, 119, 120, 121, 122, 124, 386, 389, 390, 393, 394, 397, 398, 421, 422, 425, 426, 203, 204, 205, 206, 207, 208, 225, 226, 227, 228, 234, 235, 236, 237, 238, 239, 240, 241, 246, 247, 248, 249, 250, 251, 252, 253}


In [24]:
#Map to MSA columns
#Collect the MSA index of every residue of interest; the set removes indices
#that several proteins share
resis_of_interest = set()
for protein, resis in prot_resi_dict.items():
    resis_of_interest.update(
        prot_to_msa_numbering(msa_df, resi_df, resi, protein) for resi in resis
    )

#Sorted list so the indices can be used for column selection in a stable order
resis_of_interest = sorted(resis_of_interest)
print(resis_of_interest)
print('Number of MSA columns with binding site and second shell residues', len(resis_of_interest))
    

[114, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 166, 167, 181, 182, 184, 185, 187, 188, 189, 190, 191, 192, 193, 196, 199, 200, 201, 210, 211, 212, 213, 214, 215, 216, 217, 218, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 278, 279, 280, 281, 282, 283, 284, 285, 286, 289, 413, 414, 415, 416, 417, 418, 422, 425, 444, 446, 447, 448, 449, 450, 451, 452, 453, 455, 456, 457, 458, 459, 460, 461, 462, 463, 466, 470, 471, 472, 479, 480, 481, 482, 483, 497, 545, 548, 564, 576, 577, 578, 580, 582, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 688, 689, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 704, 705, 729, 730, 731, 732, 733, 734, 735, 736, 737, 754, 755, 756, 757, 758, 760]
Number of MSA columns with binding site and second shell residues 143


In [25]:
#Slice msa df
#Restrict both frames to the columns listed in resis_of_interest
sliced_msa_df = msa_df.loc[:, resis_of_interest]
sliced_resi_df = resi_df.loc[:, resis_of_interest]
sliced_msa_df

Unnamed: 0,114,153,154,155,156,157,158,161,162,163,...,734,735,736,737,754,755,756,757,758,760
278,P,L,G,L,G,R,G,L,E,P,...,K,E,L,L,L,L,L,W,E,Y
278a,P,L,G,L,G,V,A,F,E,P,...,K,E,L,L,L,L,L,W,E,Y
279,M,L,G,V,G,I,H,F,T,P,...,Q,E,R,S,H,K,L,W,N,Y
279a,M,L,G,V,G,I,H,F,T,P,...,Q,E,R,S,H,K,L,W,N,Y
280,I,L,G,V,G,I,H,F,T,P,...,L,E,R,S,H,Q,L,W,D,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xp_659718,I,V,G,A,G,V,S,F,G,P,...,N,R,R,N,A,I,I,T,D,A
xp_660831,I,T,S,A,G,F,S,F,S,K,...,W,Q,R,Y,T,Q,I,W,E,F
xp_660986,I,P,G,A,G,I,A,F,T,A,...,K,E,R,S,H,R,I,W,D,F
xp_681171,I,I,G,A,G,I,A,F,T,A,...,K,D,R,S,Y,K,I,W,H,F


In [27]:
#Drop columns with at least 10% gaps (prune_df drops a column when its gap fraction >= cutoff)
#rename df to tropb columns
def rename_columns(msa_df, resi_df, reference_protein='tropb'):
    """Relabel the columns of both frames after a reference protein.

    For every column of ``msa_df`` the reference protein's entries are looked
    up with ``msa_to_protein_numbering``, which returns
    ``(msa_df entry, resi_df entry)``. The new label is the resi_df entry
    followed by the msa_df entry (e.g. ``'22I'``). When the msa_df entry is
    the gap character ``'-'``, the original column label is used in its place.

    Args:
        msa_df: alignment frame whose columns are relabelled.
        resi_df: frame with the same column labels as ``msa_df``; receives
            the same relabelling.
        reference_protein: row label whose entries define the new names.
            Defaults to ``'tropb'``.

    Returns:
        Tuple ``(msa_df, resi_df)`` of renamed copies; the inputs are not
        modified.
    """
    col_map = dict()
    for col in msa_df.columns:
        ref_letter, ref_number = msa_to_protein_numbering(
            msa_df, resi_df, col, reference_protein
        )
        # Gap in the reference protein: fall back to the original column label
        suffix = col if ref_letter == '-' else ref_letter
        col_map[col] = f'{ref_number}{suffix}'

    return msa_df.rename(columns=col_map), resi_df.rename(columns=col_map)

#remove columns with excess gaps
def prune_df(df, cutoff=.1):
    """Drop gap-heavy columns and cast the remaining ones to ``category``.

    A column is dropped when the fraction of its cells equal to ``'-'`` is
    greater than or equal to ``cutoff``; with the default, a column with
    exactly 10% gaps is dropped.

    Args:
        df: frame to prune; it is not modified.
        cutoff: gap-fraction threshold at or above which a column is dropped.

    Returns:
        New DataFrame holding the kept columns in their original order, each
        cast to the ``category`` dtype. A frame with no rows is treated as
        having a gap fraction of 0 in every column.
    """
    n_rows = len(df)
    gap_counts = df.eq('-').sum()

    # A frame with no rows would otherwise divide by zero
    if n_rows:
        gap_fraction = gap_counts / n_rows
    else:
        gap_fraction = gap_counts.astype(float)

    # Positional mask, so selection does not depend on column labels being unique
    keep = (gap_fraction < cutoff).to_numpy()

    # One selection plus one cast replaces a per-column drop, which copied the
    # whole frame for every removed column
    return df.loc[:, keep].astype('category')

#Rename columns (rename_columns returns renamed copies of both frames)
sliced_msa_df, sliced_resi_df = rename_columns(sliced_msa_df, sliced_resi_df)

#Prune columns whose gap fraction is >= 10% (prune_df's default cutoff of .1)
#NOTE(review): the two frames are pruned independently, each on its own '-'
#counts, so they keep the same columns only if their gaps coincide - confirm
pruned_msa_df = prune_df(sliced_msa_df)
pruned_resi_df = prune_df(sliced_resi_df)
pruned_msa_df

Unnamed: 0,22I,50I,51G,52A,53G,54M,55A,56F,57T,58A,...,419K,420D,421R,422S,423H,424K,425I,426W,427H,428F
278,P,L,G,L,G,R,G,L,E,P,...,K,E,L,L,L,L,L,W,E,Y
278a,P,L,G,L,G,V,A,F,E,P,...,K,E,L,L,L,L,L,W,E,Y
279,M,L,G,V,G,I,H,F,T,P,...,Q,E,R,S,H,K,L,W,N,Y
279a,M,L,G,V,G,I,H,F,T,P,...,Q,E,R,S,H,K,L,W,N,Y
280,I,L,G,V,G,I,H,F,T,P,...,L,E,R,S,H,Q,L,W,D,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xp_659718,I,V,G,A,G,V,S,F,G,P,...,N,R,R,N,A,I,I,T,D,A
xp_660831,I,T,S,A,G,F,S,F,S,K,...,W,Q,R,Y,T,Q,I,W,E,F
xp_660986,I,P,G,A,G,I,A,F,T,A,...,K,E,R,S,H,R,I,W,D,F
xp_681171,I,I,G,A,G,I,A,F,T,A,...,K,D,R,S,Y,K,I,W,H,F


In [28]:
#Save to excel
#Paths are relative to the working directory set by os.chdir at the top of the notebook
pruned_msa_df.to_excel('alignment/pruned_msa_df.xlsx')
pruned_resi_df.to_excel('alignment/pruned_resi_df.xlsx')