# This notebook maps the isolated pocket points to their MaSIF input features

## 1.0 Libraries

In [1]:
import os, glob, datetime
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## 2.0 Functions

## 3.0 I/O

In [2]:
# Make the dated output dir: <cwd>/<YYYY-MM-DD>_sites_parsed_info/
# exist_ok=True keeps this cell idempotent: re-running the notebook on the same
# day reuses the existing directory instead of raising FileExistsError.
output = os.path.join(os.path.abspath("."), str(datetime.date.today()) + "_sites_parsed_info/")
os.makedirs(output, exist_ok=True)

In [3]:
# Root folder of the precomputed input features, plus every entry found two
# levels below it (pattern: <root>/*/*). The number of entries is printed.
input_feats_src = os.path.abspath("./data_preparation/04b-precomputation_12A/precomputation/")
input_feats_paths = list(map(str.strip, glob.iglob(os.path.join(input_feats_src, "*", "*"))))
print(len(input_feats_paths))

18


## 4.0 Execution

### 4.1 Parse the isolated pocket points

In [4]:
# Parse the isolated pockets
#
# Every CSV under ./sel_fibril_sites/*/ whose path contains 'refined' or
# 'isolate' is loaded. Its file name (without '.csv') is split on '_':
#   token 1          -> 'fibril'
#   tokens 2 .. -2   -> 'pocket_id' (re-joined with '_')
#   last token       -> 'isolation'

site_csv_pattern = os.path.join(os.path.abspath("./sel_fibril_sites"), "*", "*.csv")

# container for storing the parsed sites
vessel01 = []

for site_csv in tqdm(sorted(glob.iglob(site_csv_pattern)), desc='parsing defined sites'):
    # NOTE: the keyword test looks at the whole path, not only the file name
    if not ('refined' in site_csv or 'isolate' in site_csv):
        continue

    site_table = pd.read_csv(site_csv, index_col=0)
    name_tokens = os.path.basename(site_csv).replace('.csv', '').split('_')

    # prepend the identifiers parsed from the file name
    site_table.insert(0, 'fibril', name_tokens[1])
    site_table.insert(1, 'pocket_id', "_".join(name_tokens[2:-1]))
    site_table.insert(2, 'isolation', name_tokens[-1])
    vessel01.append(site_table)

# add all these parsed pockets to a dataframe
df_pockets_crude = pd.concat(vessel01).reset_index(drop=True)

# number of distinct pockets, then the table dimensions
print(df_pockets_crude['pocket_id'].nunique(dropna=False))
print(df_pockets_crude.shape)
df_pockets_crude.head()

parsing defined sites:   0%|          | 0/599 [00:00<?, ?it/s]

61
(28364, 17)


Unnamed: 0,fibril,pocket_id,isolation,MaSIF_index,atom_type,chain,coords,point_direction,resid,resname,sasa,surf_charge,surf_coords,surf_hbond,surf_hphob,surf_norm_fibril_dot,surf_normals
0,6NWQ,CTEII_P73,isolate,9478,C,A,[153.771 151.437 160.011],,324.0,SER,0.0,0.08785,[151.993 151.207 159.101],-0.203964,-0.8,0.100411,[-0.814522 -0.507549 -0.280977]
1,6NWQ,CTEII_P73,isolate,9837,CB,A,[151.985 153.109 160.591],,324.0,SER,1.21,0.419086,[150.786 151.785 160.011],-0.274085,-0.8,-0.116151,[-0.784149 -0.617719 -0.0594427]
2,6NWQ,CTEII_P73,isolate,8628,HG,A,[150.656 153.184 159.266],,324.0,SER,22.94,1.27597,[149.75 153.347 159.617],0.818023,-0.8,-0.188231,[-0.996672 0.0813628 0.00494443]
3,6NWQ,CTEII_P73,isolate,6436,N,A,[149.065 145.482 160.634],,327.0,ASN,0.0,1.13595,[148.554 147.148 160.936],0.0,-1.89469,-0.177757,[-0.417711 0.897445 0.14181 ]
4,6NWQ,CTEII_P73,isolate,9413,CA,A,[148.084 144.692 159.884],,327.0,ASN,0.0,0.710355,[147.46 147.241 160.119],0.0,-3.5,-0.130648,[-0.110449 0.981508 0.156342]


In [5]:
# Remove the isolate points for the pockets that underwent a round of refinement
#
# Per pocket_id: when more than one isolation label is present, only the rows
# labelled 'refined' are kept; otherwise all rows of the pocket are kept.
# The result is written to <output>/<date>_all_sites_parsed.csv.

# container to contain the filtered points info
vessel02 = []

for pocket_id, pocket_points in tqdm(df_pockets_crude.groupby(by='pocket_id'), desc='cleaning'):
    if pocket_points['isolation'].nunique(dropna=False) >= 2:
        pocket_points = pocket_points[pocket_points['isolation'] == 'refined']
    vessel02.append(pocket_points)

df_pockets = pd.concat(vessel02).reset_index(drop=True)
df_pockets.to_csv(os.path.join(output, f"{datetime.date.today()}_all_sites_parsed.csv"))

# number of distinct pockets, then the table dimensions
print(df_pockets['pocket_id'].nunique(dropna=False))
print(df_pockets.shape)
df_pockets.head()

cleaning:   0%|          | 0/61 [00:00<?, ?it/s]

61
(13667, 17)


Unnamed: 0,fibril,pocket_id,isolation,MaSIF_index,atom_type,chain,coords,point_direction,resid,resname,sasa,surf_charge,surf_coords,surf_hbond,surf_hphob,surf_norm_fibril_dot,surf_normals
0,6lrq,A53T_P47,refined,4967,N,A,[151.323 154.776 201.084],,37.0,VAL,40.46,5.12869,[150.192 153.956 201.365],0.0,4.2,-0.259657,[-0.824852 -0.508163 0.247769]
1,6lrq,A53T_P47,refined,5080,O,A,[153.933 154.814 202.726],,37.0,VAL,21.43,2.41838,[151.405 156.229 199.358],0.0,4.2,0.254332,[-0.892828 0.269062 -0.361199]
2,6lrq,A53T_P47,refined,2425,CB,A,[153.095 156.308 200.335],,37.0,VAL,0.0,0.68523,[151.304 158.224 201.15 ],0.0,4.2,0.006875,[-0.712537 0.687191 -0.141632]
3,6lrq,A53T_P47,refined,2329,CG1,A,[152.694 157.228 201.469],,37.0,VAL,10.87,0.379075,[156.452 156.79 200.259],0.0,4.2,0.014161,[ 0.604358 0.79516 -0.0497192]
4,6lrq,A53T_P47,refined,3179,HB,A,[152.626 156.586 199.533],,37.0,VAL,3.4,0.466497,[153.279 159.127 200.921],0.0,4.2,0.021477,[ 0.217775 0.969931 -0.108669]


### 4.2 Get the computed surface features

*surface feature parsing* 

MaSIF feature files are organized by MaSIF point index: each index identifies the point at the center of a patch, followed by the 200 points within that patch and their computed surface features. We map the defined site points to their corresponding features using this MaSIF index.

In [6]:
# Attach the MaSIF input features to every site point.
#
# For each fibril the matching p1_input_feat.npy is loaded and only entry 0
# along axis 1 is kept (described in the text above as the patch center).
# Rows are then looked up by MaSIF_index and four feature channels are appended
# as new columns. The result is written to
# <output>/<date>_all_sites_input_feats.csv.

# name of the per-fibril feature file
feat_filename = "p1_input_feat.npy"

# new column -> channel (last axis) of the feature array;
# the dict order is the order in which the columns are appended
feat_columns = {'input_si': 0, 'input_charge': 3, 'input_hphob': 4, 'input_hbonds': 2}

# container for the full info
vessel03 = []

for fibril, fibril_points in tqdm(df_pockets.groupby(by='fibril'), desc='getting feats'):

    # load the input feat files: first path that contains the fibril id and
    # points at the feature file; fail with a readable message when none exists
    matches = [f for f in input_feats_paths if fibril in f and os.path.basename(f) == feat_filename]
    if not matches:
        raise FileNotFoundError(f"no {feat_filename} found for fibril '{fibril}' in input_feats_paths")
    input_feats = np.load(matches[0])[:, 0, :]

    # add the values on an explicit copy, so the groupby slice is never
    # modified in place
    df_temp02 = fibril_points.copy()
    point_idx = df_temp02['MaSIF_index'].to_numpy()
    for col_name, channel in feat_columns.items():
        # one vectorised lookup per channel; tolist() hands pandas plain Python
        # scalars so it infers the column dtype itself
        df_temp02.insert(df_temp02.shape[1], col_name, input_feats[point_idx, channel].tolist())

    vessel03.append(df_temp02)


df_all_feats = pd.concat(vessel03).reset_index(drop=True)
df_all_feats.to_csv(os.path.join(output, str(datetime.date.today())+"_all_sites_input_feats.csv"))

print(df_all_feats.shape)
df_all_feats.head()

getting feats:   0%|          | 0/18 [00:00<?, ?it/s]

(13667, 21)


Unnamed: 0,fibril,pocket_id,isolation,MaSIF_index,atom_type,chain,coords,point_direction,resid,resname,...,surf_charge,surf_coords,surf_hbond,surf_hphob,surf_norm_fibril_dot,surf_normals,input_si,input_charge,input_hphob,input_hbonds
0,6NWQ,CTEII_P73,refined,9478,C,A,[153.771 151.437 160.011],,324.0,SER,...,0.08785,[151.993 151.207 159.101],-0.203964,-0.8,0.100411,[-0.814522 -0.507549 -0.280977],0.113715,0.195311,-0.155556,0.030528
1,6NWQ,CTEII_P73,refined,9837,CB,A,[151.985 153.109 160.591],,324.0,SER,...,0.419086,[150.786 151.785 160.011],-0.274085,-0.8,-0.116151,[-0.784149 -0.617719 -0.0594427],-0.513678,0.249782,0.237698,0.0
2,6NWQ,CTEII_P73,refined,8628,HG,A,[150.656 153.184 159.266],,324.0,SER,...,1.27597,[149.75 153.347 159.617],0.818023,-0.8,-0.188231,[-0.996672 0.0813628 0.00494443],0.29528,0.313374,1.0,0.0
3,6NWQ,CTEII_P73,refined,6436,N,A,[149.065 145.482 160.634],,327.0,ASN,...,1.13595,[148.554 147.148 160.936],0.0,-1.89469,-0.177757,[-0.417711 0.897445 0.14181 ],-0.999883,-0.814627,-0.777778,0.0
4,6NWQ,CTEII_P73,refined,9413,CA,A,[148.084 144.692 159.884],,327.0,ASN,...,0.710355,[147.46 147.241 160.119],0.0,-3.5,-0.130648,[-0.110449 0.981508 0.156342],0.193265,0.236785,-0.777778,0.0
