# This notebook is to map the isolated pocket points to input features

## 1.0 Libraries

In [1]:
import os, glob, datetime
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## 2.0 Functions

## 3.0 I/O

In [2]:
# Create the dated output folder "<today>_sites_parsed_info/" in the current working directory.
output = os.path.join(os.path.abspath("."), f"{datetime.date.today()}_sites_parsed_info/")
# exist_ok=False: os.makedirs raises FileExistsError when the folder is already
# there, i.e. when this cell is re-run on the same day.
# NOTE(review): if same-day re-runs should be allowed, switch to exist_ok=True.
os.makedirs(output, exist_ok=False)

In [3]:
# Absolute location of the precomputed input features
input_feats_src = os.path.abspath("../data_preparation/04b-precomputation_12A/precomputation/")

# Every entry found exactly two levels below that folder (<sub-folder>/<entry>),
# with surrounding whitespace stripped from each path
input_feats_paths = list(map(str.strip, glob.iglob(os.path.join(input_feats_src, "*", "*"))))

# how many entries were found
print(len(input_feats_paths))

18


## 4.0 Execution

### 4.1 Parse the isolated pocket points

In [None]:
# Parse the isolated pockets
#
# Every CSV found one folder below ../sel_fibril_sites whose file name contains
# 'refined' or 'isolate' is loaded, and three columns parsed from the file name
# are prepended. The file name (without '.csv') is split on '_':
#   token 1        -> 'fibril'
#   tokens 2..-2   -> 'pocket_id' (re-joined with '_')
#   last token     -> 'isolation'

# container for storing the parsed sites
vessel01 = []

# sorted so the files are always processed in the same order
site_csvs = sorted(glob.iglob(os.path.join(os.path.abspath("../sel_fibril_sites"), "*", "*.csv")))

for pocket in tqdm(site_csvs, desc='parsing defined sites'):
    # file name without the extension; parsed once and reused below
    stem = os.path.basename(pocket).replace('.csv', '')

    # Decide on the file name only: testing the whole absolute path would let
    # every CSV through as soon as a parent folder contains one of the keywords.
    if 'refined' not in stem and 'isolate' not in stem:
        continue

    tokens = stem.split('_')

    df_temp01 = pd.read_csv(pocket, index_col=0)
    df_temp01.insert(0, 'fibril', tokens[1])
    df_temp01.insert(1, 'pocket_id', "_".join(tokens[2:-1]))
    df_temp01.insert(2, 'isolation', tokens[-1])
    vessel01.append(df_temp01)

# fail with an explicit message instead of pandas' "No objects to concatenate"
if not vessel01:
    raise FileNotFoundError("no 'refined' or 'isolate' site CSV files found under ../sel_fibril_sites/*/")

# add all these parsed pockets to a dataframe
df_pockets_crude = pd.concat(vessel01).reset_index(drop=True)

print(len(set(df_pockets_crude.pocket_id)))
print(df_pockets_crude.shape)
df_pockets_crude.head()

parsing defined sites:   0%|          | 0/599 [00:00<?, ?it/s]

In [None]:
# Remove the isolate points for the pockets that underwent a round of refinement
#
# Per pocket_id: when the points carry a single 'isolation' label they are all
# kept; when two or more labels are present only the 'refined' rows are kept.
# NOTE(review): the grouping key is 'pocket_id' alone - this presumes pocket ids
# are unique across fibrils; confirm against the site file naming.
# NOTE(review): a pocket with several labels but no 'refined' rows ends up
# with zero rows - confirm this cannot happen with the current site files.

# container to contain the filtered points info
vessel02 = []

for _pocket_id, pocket_points in tqdm(df_pockets_crude.groupby(by='pocket_id'), desc='cleaning'):
    isolation_labels = set(pocket_points.isolation)
    if len(isolation_labels) >= 2:
        # more than one version of this pocket exists: keep the refined one only
        pocket_points = pocket_points[pocket_points.isolation == 'refined']
    vessel02.append(pocket_points)

# stack the kept points and renumber the rows from 0
df_pockets = pd.concat(vessel02).reset_index(drop=True)

# save next to the other outputs of today's run
parsed_csv = os.path.join(output, str(datetime.date.today()) + "_all_sites_parsed.csv")
df_pockets.to_csv(parsed_csv)

print(len(set(df_pockets.pocket_id)))
print(df_pockets.shape)
df_pockets.head()

### 4.2 Get the computed surface features

*surface feature parsing* 

MaSIF files are organized by MaSIF point index: each entry holds the point that is the center of the patch, the 200 points within the patch, and their computed surface features. We map the defined site points to their corresponding features based on the MaSIF indexing.

In [None]:
# Attach the input surface features to every site point, fibril by fibril.

# container for the full info
vessel03 = []

# output column name -> index of that feature on the last axis of the input
# feature array; the dict order is the order in which the columns are appended
feat_columns = {'input_si': 0, 'input_charge': 3, 'input_hphob': 4, 'input_hbonds': 2}

for fibril, fibril_points in tqdm(df_pockets.groupby(by='fibril'), desc='getting feats'):

    # Locate the input feat file of this fibril. The fibril id is matched against
    # the name of the folder holding the file only, so an id that also occurs
    # somewhere else in the absolute path cannot select an unrelated file.
    matches = [f for f in input_feats_paths
               if os.path.basename(f) == "p1_input_feat.npy"
               and fibril in os.path.basename(os.path.dirname(f))]
    if not matches:
        raise FileNotFoundError(f"no p1_input_feat.npy found for fibril {fibril!r}")

    # load the input feat file and keep entry 0 along the second axis of every patch
    # (presumably the patch center itself - TODO confirm against the MaSIF layout)
    input_feats = np.load(matches[0])[:, 0, :]

    # row of the feature array that belongs to each site point
    masif_idx = fibril_points.MaSIF_index.to_numpy()

    # add the values; assign() builds a new frame, so the group handed out by
    # groupby is never modified in place
    vessel03.append(
        fibril_points.assign(**{col: input_feats[masif_idx, k] for col, k in feat_columns.items()})
    )


df_all_feats = pd.concat(vessel03).reset_index(drop=True)
df_all_feats.to_csv(os.path.join(output, str(datetime.date.today())+"_all_sites_input_feats.csv"))

print(df_all_feats.shape)
df_all_feats.head()