# This notebook is to map the isolated pockets points to their MaSIF desc ouput and their input features

These feats are gonna be used later in the tSNE clustering and the pocket registeration

## 1.0 Libraries

In [1]:
import os
import glob
import datetime
import numpy as np
import pandas as pd

from tqdm import tqdm

## 2.0 Functions

## 3.0 I/O

In [2]:
output = f'{datetime.date.today()}_parsed_info/'
os.makedirs(output)

In [3]:
# paths to the desc folder
desc_files_paths = [f.strip() for f in glob.iglob('./masif_output_files/desc_out/*/*')]

# paths to the input feats folder
input_feats_paths = [f.strip() for f in glob.iglob('./masif_output_files/input_feats/*/*')]

print(len(desc_files_paths), desc_files_paths[:2])
print(len(input_feats_paths), input_feats_paths[:2])

47 ['./masif_output_files/desc_out/7v48_ABCDEF/p1_desc_straight.npy', './masif_output_files/desc_out/7ozg_ABCDEFGHIJ/p1_desc_straight.npy']
47 ['./masif_output_files/input_feats/7v48_ABCDEF/p1_input_feat.npy', './masif_output_files/input_feats/7ozg_ABCDEFGHIJ/p1_input_feat.npy']


## 4.0 Execution

### 4.1 Parse the isolated pocket points

In [4]:
# parse the isolated pockets
vessel01 = [] # container for storing the parsed pockets


for pocket in tqdm(sorted(glob.iglob('./sel_grooves/*/*.csv')), desc='parsing_pockets'):
    if 'refined' in pocket or 'isolate' in pocket:
        df_temp01 = pd.read_csv(pocket, index_col=0)
        df_temp01.insert(0, 'fibril', os.path.basename(pocket).replace('.csv','').split('_')[1])
        df_temp01.insert(1, 'pocket_id', "_".join(os.path.basename(pocket).replace('.csv','').split('_')[2:-1]))
        df_temp01.insert(2, 'isolation', os.path.basename(pocket).replace('.csv','').split('_')[-1])
        vessel01.append(df_temp01)

# add all these parsed pockets to a dataframe
df_pockets_crude = pd.concat(vessel01).reset_index(drop=1)

print(len(set(df_pockets_crude.pocket_id)))
print(df_pockets_crude.shape)
df_pockets_crude.head()

parsing_pockets: 100%|████████████████████████████████████████████████████████████████████████████████████| 599/599 [00:00<00:00, 2116.20it/s]


61
(28364, 18)


Unnamed: 0,fibril,pocket_id,isolation,chain,resid,resname,atom_type,coords,sasa,MaSIF_index,surf_coords,surf_iface,surf_charge,surf_hbond,surf_hphob,surf_normals,surf_norm_fibril_dot,point_direction
0,6cu7,Pol_1a_P1,isolate,E,85.0,ALA,C,[ -2.723 -30.999 10.871],0.0,4839,[ -0.913 -31.873 11.2098],0.604408,0.294057,-0.005686,1.8,[0.998504 0.0462843 0.0291052],-0.022067,
1,6cu7,Pol_1a_P1,isolate,E,85.0,ALA,O,[ -1.918 -31.114 9.943],5.36,3066,[ -0.597 -30.763 10.244],0.722108,-0.568392,-0.0,1.8,[ 0.955121 -0.274045 0.112439],-0.106723,
2,6cu7,Pol_1a_P1,isolate,E,85.0,ALA,CB,[ -2.238 -33.278 11.794],7.25,9578,[ -0.649375 -33.6351 12.3311 ],0.430415,0.508585,0.0,1.8,[ 0.898844 -0.394701 0.190503],-0.185559,
3,6cu7,Pol_1a_P1,isolate,E,85.0,ALA,HB2,[ -2.634 -34.049 12.23 ],12.74,5811,[ -2.84238 -35.0911 12.0895 ],0.226561,0.493869,0.0,1.8,[-0.0925209 -0.988771 0.117355 ],-0.121154,
4,6cu7,Pol_1a_P1,isolate,E,85.0,ALA,HB3,[ -1.842 -33.552 10.952],7.65,8883,[ -1.60563 -34.725 11.049 ],0.278574,0.54586,0.0,1.8,[ 0.404138 -0.90011 -0.162708],0.162611,


In [5]:
# remove the isolate points for the pockets that underwent a round of refinement 
vessel02 = [] # container to contain the filtered points info

for g01 in tqdm(df_pockets_crude.groupby(by='pocket_id'), desc='cleaning'):
    if len(set(g01[1].isolation)) < 2 :
        vessel02.append(g01[1])
    else:
        vessel02.append(g01[1][g01[1].isolation == 'refined'])

df_pockets = pd.concat(vessel02).reset_index(drop=1)
df_pockets.to_csv(f"{output}/{datetime.date.today()}_all_pockets_parsed.csv")

print(len(set(df_pockets.pocket_id)))
print(df_pockets.shape)
df_pockets.head()

cleaning: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 3237.29it/s]


61
(13667, 18)


Unnamed: 0,fibril,pocket_id,isolation,chain,resid,resname,atom_type,coords,sasa,MaSIF_index,surf_coords,surf_iface,surf_charge,surf_hbond,surf_hphob,surf_normals,surf_norm_fibril_dot,point_direction
0,6lrq,A53T_P47,refined,A,37.0,VAL,N,[151.323 154.776 201.084],40.46,4967,[150.192 153.956 201.365],0.109501,5.12869,0.0,4.2,[-0.824852 -0.508163 0.247769],-0.259657,
1,6lrq,A53T_P47,refined,A,37.0,VAL,O,[153.933 154.814 202.726],21.43,5080,[151.405 156.229 199.358],0.138518,2.41838,0.0,4.2,[-0.892828 0.269062 -0.361199],0.254332,
2,6lrq,A53T_P47,refined,A,37.0,VAL,CB,[153.095 156.308 200.335],0.0,2425,[151.304 158.224 201.15 ],0.13742,0.68523,0.0,4.2,[-0.712537 0.687191 -0.141632],0.006875,
3,6lrq,A53T_P47,refined,A,37.0,VAL,CG1,[152.694 157.228 201.469],10.87,2329,[156.452 156.79 200.259],0.536073,0.379075,0.0,4.2,[ 0.604358 0.79516 -0.0497192],0.014161,
4,6lrq,A53T_P47,refined,A,37.0,VAL,HB,[152.626 156.586 199.533],3.4,3179,[153.279 159.127 200.921],0.2937,0.466497,0.0,4.2,[ 0.217775 0.969931 -0.108669],0.021477,


### 4.2 Get the MaSIF desc and input feats

*input feats parsing* 

MaSIF files are organzed in the manner of MaSIF point index that is the center of the patch, the 200 points within the patch and their descriptors, so we just drop the other points in the patch and just retain the values corresponding to the indexing point

In [6]:
vessel03 = [] # container for the full info

for g02 in tqdm(df_pockets.groupby(by='fibril'), desc='getting feats'):
 
    # load desc and input feat files
    descs = np.load([f for f in desc_files_paths if g02[0] in f][0])
    input_feats = np.load([f for f in input_feats_paths if g02[0] in f][0])[:,0,:] 
    
    # add the values
    df_temp02 = g02[1]
    df_temp02.insert(df_temp02.shape[1], 'straight_desc', [descs[x] for x in df_temp02.MaSIF_index])
    df_temp02.insert(df_temp02.shape[1], 'input_si', [input_feats[x][0] for x in df_temp02.MaSIF_index])
    df_temp02.insert(df_temp02.shape[1], 'input_charge', [input_feats[x][3] for x in df_temp02.MaSIF_index])
    df_temp02.insert(df_temp02.shape[1], 'input_hphob', [input_feats[x][4] for x in df_temp02.MaSIF_index])
    df_temp02.insert(df_temp02.shape[1], 'input_hbonds', [input_feats[x][2] for x in df_temp02.MaSIF_index])
    
    vessel03.append(df_temp02)
    

df_all_feats = pd.concat(vessel03).reset_index(drop=1)
df_all_feats.to_csv(f"{output}/{datetime.date.today()}_all_pockets_feats.csv")

print(df_all_feats.shape)
df_all_feats.head()

getting feats: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 32.54it/s]


(13667, 23)


Unnamed: 0,fibril,pocket_id,isolation,chain,resid,resname,atom_type,coords,sasa,MaSIF_index,...,surf_hbond,surf_hphob,surf_normals,surf_norm_fibril_dot,point_direction,straight_desc,input_si,input_charge,input_hphob,input_hbonds
0,6NWQ,CTEII_P73,refined,A,324.0,SER,C,[153.771 151.437 160.011],0.0,9478,...,-0.203964,-0.8,[-0.814522 -0.507549 -0.280977],0.100411,,"[0.45121828, -0.79679435, -0.36818764, -0.1094...",0.113715,0.195311,-0.155556,0.030528
1,6NWQ,CTEII_P73,refined,A,324.0,SER,CB,[151.985 153.109 160.591],1.21,9837,...,-0.274085,-0.8,[-0.784149 -0.617719 -0.0594427],-0.116151,,"[0.38821366, -0.60027254, -0.32101157, -0.0326...",-0.513678,0.249782,0.237698,0.0
2,6NWQ,CTEII_P73,refined,A,324.0,SER,HG,[150.656 153.184 159.266],22.94,8628,...,0.818023,-0.8,[-0.996672 0.0813628 0.00494443],-0.188231,,"[0.35675552, 0.09994614, -0.3331059, -0.019597...",0.29528,0.313374,1.0,0.0
3,6NWQ,CTEII_P73,refined,A,327.0,ASN,N,[149.065 145.482 160.634],0.0,6436,...,0.0,-1.89469,[-0.417711 0.897445 0.14181 ],-0.177757,,"[0.2246013, 0.19974554, -0.5713173, 0.12002113...",-0.999883,-0.814627,-0.777778,0.0
4,6NWQ,CTEII_P73,refined,A,327.0,ASN,CA,[148.084 144.692 159.884],0.0,9413,...,0.0,-3.5,[-0.110449 0.981508 0.156342],-0.130648,,"[0.42035002, -0.7223128, -0.3325043, -0.048171...",0.193265,0.236785,-0.777778,0.0
