# This notebook is to analyse the fibril site alignments with focus on ex vivo aSyn fibrils
- Find shared features among ex vivo aSyn fibrils
- Find shared features between ex vivo fibrils of aSyn and other amyloid proteins
- Find shared features between ex vivo and in vitro aSyn fibrils

**Open3d output explained**

- *icp_fitness*, which measures the overlapping area (# of correspondences / # of points in source pocket). The higher the better.
        "how much of the source pocket is matched to the target" --> Fitness(source)
        
- *inlier_rmse*, which measures the RMSE of all inlier correspondences. The lower the better.

**target_matched_percent explained**
- *Fitness(Target)* =  # of correspondences / # of points in target pocket

**combined_fitness_score (SSmax) explained**
- Since the matched pockets differ in size, a fitness score (aka query coverage) is calculated relative to both the source and the target pocket sizes. The maximum of the two is taken as the best coverage obtained between the 2 pockets and is used for further downstream analysis.

- fitness score >= 0.5 if source:target ratio is 1.0 or less
- if source:target ratio > 1, use Fitness(Target) >= 0.5

## 1.0 Libraries

In [None]:
import os, glob, shutil, datetime, open3d, copy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from tqdm import tqdm

## 2.0 Functions

In [None]:
def fibril_name_str(x):
    """
    Map a pocket identifier to the protein name of its fibril.

    Only the text before the first underscore of `x` is inspected:
    'PHF' and 'CTEII' map to 'Tau', the listed aSyn tags map to 'aSyn',
    and any other prefix is returned unchanged.
    """
    tau_tags = ['PHF', 'CTEII']
    asyn_tags = ['A53T', 'Pol', 'LF', 'G51D', 'MSA', 'E46K', 'H50Q', 'pY39']

    prefix = x.split('_')[0]
    if prefix in tau_tags:
        return 'Tau'
    if prefix in asyn_tags:
        return 'aSyn'
    return prefix

In [None]:
def add_size_metrics(df:pd.DataFrame):
    """
    Add two size-derived columns to `df` (modified in place) and return it.

    - 'src2target_size_ratio': size_source / size_target, rounded to 2 decimals
    - 'target_matched_percent_icp': icp_nb_corres / size_target, rounded to 2 decimals
    """
    target_size = df['size_target']
    df['src2target_size_ratio'] = (df['size_source'] / target_size).round(2)
    df['target_matched_percent_icp'] = (df['icp_nb_corres'] / target_size).round(2)
    return df

In [4]:
def _gather_corres_feats(df_all_info, pocket_id, corres_idx):
    """ Stack the [charge, hphob, hbonds, si] input features of one pocket, one row per index in `corres_idx` """
    # positions in `corres_idx` refer to the pocket's rows in their order within df_all_info
    df_pocket = df_all_info[df_all_info.pocket_id == pocket_id].reset_index(drop=True)
    feat_columns = ['input_charge', 'input_hphob', 'input_hbonds', 'input_si']
    return np.stack([df_pocket[col].to_numpy()[corres_idx] for col in feat_columns], axis=1)


def get_input_desc_diff(df_all_info:pd.DataFrame, df:pd.DataFrame, idx:int, input_feats_npy:list) -> float:
    """
    Calculate the mean input-feature difference over the ICP correspondences of one pocket pair.

    Parameters
    ----------
    df_all_info : per-point table with the columns 'pocket_id', 'input_si',
        'input_charge', 'input_hphob' and 'input_hbonds'.
    df : pair table; row `idx` provides 'source_pocket' and 'target_pocket'.
    idx : index label of the pair in `df`.
    input_feats_npy : paths of the registration result files (.npy); exactly one
        path must contain both pocket IDs of the pair.

    Returns
    -------
    Mean, over all correspondences, of the Euclidean norm of
    (target features - source features).

    Raises
    ------
    AssertionError if zero or several result files match the pair, or if the
    source and target feature arrays end up with different shapes.
    """
    # look the pocket IDs up once instead of once per candidate path
    source_pocket = df.at[idx, 'source_pocket']
    target_pocket = df.at[idx, 'target_pocket']

    # define the open3d results file path (substring match on the whole path)
    path_ = [p for p in input_feats_npy if source_pocket in p and target_pocket in p]
    assert len(path_) == 1, 'Path finding ERROR'
    path_ = path_[0]

    # load the results file
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you produced yourself
    reg_results = np.load(path_, allow_pickle=True).item()['icp_test']

    # get the correspondences indices: column 0 -> source points, column 1 -> target points
    corres_set = reg_results['corres_set'][0]
    source_corres_set = corres_set[:, 0]
    target_corres_set = corres_set[:, 1]
    assert len(source_corres_set) == len(target_corres_set)

    # get the features of the corresponding points on each side
    feats_source = _gather_corres_feats(df_all_info, source_pocket, source_corres_set)
    feats_target = _gather_corres_feats(df_all_info, target_pocket, target_corres_set)

    assert feats_source.shape == feats_target.shape, 'Feats parsing failure'

    # per-correspondence Euclidean distance in feature space, averaged over the pair
    input_desc_diff = np.mean(np.linalg.norm(feats_target - feats_source, axis = 1))

    return input_desc_diff

In [5]:
def calc_input_feat_diff(df:pd.DataFrame, df_all_info:pd.DataFrame, input_feats_npy:list, output:str, export:bool=False, ident=None ) -> pd.DataFrame :
    """
    Calculate the input feature difference, raw and weighted, for every pair in `df`.

    For each row, `get_input_desc_diff` gives the raw difference; the weighted
    value is the raw difference divided by the row's 'combined_fitness_score'.
    Both are inserted into `df` (modified in place) as 'icp_mean_input_diff'
    and 'icp_mean_input_diff_weighted'.

    Parameters
    ----------
    df : pair table with 'source_pocket', 'target_pocket' and 'combined_fitness_score'.
    df_all_info, input_feats_npy : forwarded to `get_input_desc_diff`.
    output : folder that receives the CSV when exporting.
    export, ident : the table is written to '<output>/<ident>_matches.csv'
        only when `export` is true and `ident` is not None.

    Returns
    -------
    The same `df` object with the two new columns.
    """

    # calculate the input feature difference
    input_feat_diff_vessel = []
    input_feat_diff_weighted_vessel = []

    for idx01 in df.index:
        input_feat_diff_v = get_input_desc_diff(df_all_info=df_all_info, df=df, idx=idx01, input_feats_npy=input_feats_npy)

        # weigh the input feat difference in respect to the pocket overlap
        # NOTE(review): a combined_fitness_score of 0 is not guarded against here
        input_feat_diff_weighted_v = input_feat_diff_v/df.at[idx01, "combined_fitness_score"]

        # append
        input_feat_diff_vessel.append(input_feat_diff_v)
        input_feat_diff_weighted_vessel.append(input_feat_diff_weighted_v)

    # check the lengths before touching the dataframe
    assert len(input_feat_diff_vessel) == df.shape[0]
    assert len(input_feat_diff_weighted_vessel) == df.shape[0]

    # add to the dataframe at positions 9 and 10; frames with fewer than
    # 9 columns get the new columns appended at the end instead of failing
    first_pos = min(9, df.shape[1])
    df.insert(first_pos, 'icp_mean_input_diff', input_feat_diff_vessel)
    df.insert(first_pos + 1, 'icp_mean_input_diff_weighted', input_feat_diff_weighted_vessel)

    if export and ident is not None:
        df.to_csv(os.path.join(output, str(ident) + '_matches.csv'))

    return df

In [6]:
def add_pocket_matches(df:pd.DataFrame) -> pd.DataFrame:
    """
    Insert an order-independent 'pocket_matches' key as the first column of `df`.

    Each entry of `df.pocket_pairs` is indexed at positions 0 and 2; the two
    values are sorted and stored as a tuple. `df` is modified in place and
    returned. An AssertionError is raised if any key occurs more than once.
    """
    match_keys = [tuple(sorted([pair[0], pair[2]])) for pair in df.pocket_pairs]

    # add to Dataframe
    df.insert(0, 'pocket_matches', match_keys)

    # every match must be present only once
    stored_keys = df.pocket_matches.to_list()
    assert len(stored_keys) == len(set(stored_keys)), "Duplicated Matches Found"
    print("Unique Matches")

    return df

## 3.0 I/O

### 3.1 Output dirs

In [11]:
# dated root folder that holds every output of this notebook
main_output = os.path.join(
    os.path.abspath('.'),
    f"{datetime.date.today()}_input_feats_pockets_analysis_umap",
)
os.makedirs(main_output, exist_ok=False)

## ------------------------------------------------------- ##
# one folder per comparison, each with its own 'plots' sub-folder

# all vs all comparison
allvsall_out = os.path.join(main_output, "all_vs_all")
allvsall_plots_out = os.path.join(allvsall_out, 'plots')

# aSyn to aSyn brain comparison
asyn2asyn_out = os.path.join(main_output, "aSyn_to_aSyn")
asyn2asyn_plots_out = os.path.join(asyn2asyn_out, 'plots')

# aSyn brain to other amyloids comparison
asyn2others_out = os.path.join(main_output, "aSyn_to_other_amyloids")
asyn2others_plots_out = os.path.join(asyn2others_out, 'plots')

# aSyn brain to in vitro aSyn comparison
asyn2invitro_out = os.path.join(main_output, "aSyn_to_invitro_aSyn")
asyn2invitro_plots_out = os.path.join(asyn2invitro_out, 'plots')

## ------------------------------------------------------- ##

# create the folders, parents before their 'plots' folder;
# exist_ok=False makes the cell fail if a folder is already there
for new_dir in (
    allvsall_out, allvsall_plots_out,
    asyn2asyn_out, asyn2asyn_plots_out,
    asyn2others_out, asyn2others_plots_out,
    asyn2invitro_out, asyn2invitro_plots_out,
):
    os.makedirs(new_dir, exist_ok=False)

## ------------------------------------------------------- ##

### 3.2 Folders paths

In [None]:
# folder with the alignment (i.e., registration) results;
# '{}' is a placeholder to be replaced with the real path
reg_out_dir = os.path.abspath('{}')
if not os.path.exists(reg_out_dir):
    raise AssertionError("Folder not found")
print("All Good !")

All Good !


In [None]:
# paths to the .npy files with the input feats used in registration
# ('{}' is a placeholder for the folder that contains them)
npy_pattern = os.path.join('{}', "*.npy")
input_feats_npy = [npy_path.strip() for npy_path in glob.iglob(npy_pattern)]

print(len(input_feats_npy))
print(input_feats_npy[:2])

1830
['../230914_run/2023-09-14_registration_outputs/o3d_objects_npy/input_feats_reg/MSA_I_P57_pY39_P40_input-feats.npy', '../230914_run/2023-09-14_registration_outputs/o3d_objects_npy/input_feats_reg/MSA_I_P57_E46K_P55_input-feats.npy']


## 4.0 Execution

### 4.1 Load all the information

In [None]:
# per-point table: one row per pocket point together with its features
all_info_csv = os.path.abspath("{PATH_TO_all_pockets_feats.csv_FILE}")
df_all_info = pd.read_csv(all_info_csv, index_col=0)

print(df_all_info.shape)
df_all_info.head(2)

(13667, 23)


Unnamed: 0,fibril,pocket_id,isolation,chain,resid,resname,atom_type,coords,sasa,MaSIF_index,...,surf_hbond,surf_hphob,surf_normals,surf_norm_fibril_dot,point_direction,straight_desc,input_si,input_charge,input_hphob,input_hbonds
0,6NWQ,CTEII_P73,refined,A,324.0,SER,C,[153.771 151.437 160.011],0.0,9478,...,-0.203964,-0.8,[-0.814522 -0.507549 -0.280977],0.100411,,[ 0.45121828 -0.79679435 -0.36818764 -0.109434...,0.113715,0.195311,-0.155556,0.030528
1,6NWQ,CTEII_P73,refined,A,324.0,SER,CB,[151.985 153.109 160.591],1.21,9837,...,-0.274085,-0.8,[-0.784149 -0.617719 -0.0594427],-0.116151,,[ 0.38821366 -0.60027254 -0.32101157 -0.032672...,-0.513678,0.249782,0.237698,0.0


In [None]:
# From the registration output: one row per aligned (source, target) pocket pair
df_input_all = pd.read_csv(os.path.abspath("{PATH_TO_all_pockets_input_features_registered.csv_FILE}"), index_col=0).reset_index(drop=True)

# add fibril source for source and target pockets
df_input_all['source_pocket_fibril'] = df_input_all['source_pocket'].apply(fibril_name_str)
df_input_all['target_pocket_fibril'] = df_input_all['target_pocket'].apply(fibril_name_str)

# add the alternate metrics from the target pocket side
df_input_all = add_size_metrics(df=df_input_all)

# rename the fitness score columns
df_input_all = df_input_all.rename(columns={"icp_fitness":"icp_fitness_source", "target_matched_percent_icp":"icp_fitness_target"})

# add the combined fitness score: the larger of the source-side and
# target-side fitness, rounded to 2 decimals (computed column-wise)
df_input_all["combined_fitness_score"] = (
    df_input_all[["icp_fitness_source", "icp_fitness_target"]].max(axis=1).round(2)
)

# get rid of the ransac columns (explicit copy so the later inserts act on an independent frame)
df_input_all = df_input_all[[
    'source_pocket', 'target_pocket',
    'icp_rmse', 'icp_fitness_source', 'icp_fitness_target', 'combined_fitness_score',
    'icp_nb_corres', 'size_source', 'size_target', 'src2target_size_ratio',
    'source_pocket_fibril', 'target_pocket_fibril']].copy()

# get the pocket pairs as [source, source fibril, target, target fibril]
df_input_all.insert(0, 'pocket_pairs', [list(quad) for quad in zip(
    df_input_all.source_pocket, df_input_all.source_pocket_fibril,
    df_input_all.target_pocket, df_input_all.target_pocket_fibril)])
df_input_all = df_input_all.sort_values(by='pocket_pairs').reset_index(drop=True)

# calculate the input feat diff
df_input_all = calc_input_feat_diff(df=df_input_all, df_all_info=df_all_info, input_feats_npy=input_feats_npy, output=main_output, export=False)

# add the pocket matches
df_input_all = add_pocket_matches(df=df_input_all)

# export
df_input_all.to_csv(os.path.join(main_output, 'all_input_reg_pockets.csv'))

print(df_input_all.shape)
df_input_all.head(2)

Unique Matches
(1830, 17)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_mean_MaSIF_diff,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,icp_mean_input_diff,icp_mean_input_diff_weighted,size_source,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(A53T_P47, E46K_P53)","[A53T_P47, aSyn, E46K_P53, aSyn]",A53T_P47,E46K_P53,3.569954,0.689122,0.28481,0.25,0.28,45,1.070095,3.821768,158,180,0.88,aSyn,aSyn
1,"(A53T_P47, G51D_P46)","[A53T_P47, aSyn, G51D_P46, aSyn]",A53T_P47,G51D_P46,3.46701,0.709329,0.227848,0.39,0.39,36,0.772345,1.980372,158,92,1.72,aSyn,aSyn


### 4.2 Compare the aSyn Brain structures to themselves

In [None]:
# pockets in this analysis: 61 pocket IDs, one line per fibril structure.
# The text before the first underscore is the fibril tag that the later
# cells use (via p.split("_")[0]) to select subsets of this list.
pockets = [
        'Pol_1a_P1', 'Pol_1a_P2', 'Pol_1a_P3', 'Pol_1a_P4',
        'Pol_1b_P19', 'Pol_1b_P20',
        'Pol_2a_P21', 'Pol_2a_P22', 'Pol_2a_P23', 'Pol_2a_P24', 'Pol_2a_P25', 'Pol_2a_P26',
        'Pol_2b_P32', 'Pol_2b_P33', 'Pol_2b_P34', 'Pol_2b_P35', 'Pol_2b_P36',
        'pY39_P40', 'pY39_P41', 'pY39_P42', 'pY39_P43',
        'G51D_P44', 'G51D_P45', 'G51D_P46',
        'A53T_P47', 'A53T_P48', 'A53T_P49',
        'H50Q_P50', 'H50Q_P51', 'H50Q_P83',
        'E46K_P52', 'E46K_P53', 'E46K_P54', 'E46K_P55',
        'MSA_I_P56', 'MSA_I_P57', 'MSA_I_P58',
        'MSA_IIa_P59', 'MSA_IIa_P60', 'MSA_IIa_P61', 'MSA_IIa_P62','MSA_IIa_P69',
        'MSA_IIb_P63', 'MSA_IIb_P64', 'MSA_IIb_P65', 'MSA_IIb_P66','MSA_IIb_P70',
        'LF_P67', 'LF_P68',
        'TMEM106B_P71', 'TMEM106B_P72',
        'CTEII_P73', 'CTEII_P74',
        'AB42_P75', 'AB42_P76',
        'PHF_P77', 'PHF_P78',
        'Prp_P79', 'Prp_P80', 'Prp_P81', 'Prp_P82']

In [207]:
# brain derived aSyn pockets: IDs whose tag (text before the first "_") is MSA or LF
brain_tags = ["MSA", "LF"]
asyn_brain_pockets = [p for p in pockets if p.split("_")[0] in brain_tags]

# keep the pairs whose source AND target pockets are both brain derived,
# ordered from the highest to the lowest combined fitness score
b2b_mask = (
    df_input_all.source_pocket.isin(asyn_brain_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_pockets)
)
df_input_b2b = (
    df_input_all.loc[b2b_mask]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all aSyn brain pockets matches
df_input_b2b.to_csv(os.path.join(asyn2asyn_out, 'all_asyn_brain_to_asyn_brain_matches.csv'))

print(df_input_all.shape)
print(df_input_b2b.shape)

df_input_b2b.head(2)

(1830, 17)
(105, 17)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_mean_MaSIF_diff,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,icp_mean_input_diff,icp_mean_input_diff_weighted,size_source,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(MSA_IIa_P60, MSA_IIb_P64)","[MSA_IIb_P64, aSyn, MSA_IIa_P60, aSyn]",MSA_IIb_P64,MSA_IIa_P60,1.113781,0.615505,0.877005,0.92,0.92,164,0.319574,0.347363,187,179,1.04,aSyn,aSyn
1,"(MSA_IIa_P62, MSA_I_P58)","[MSA_IIa_P62, aSyn, MSA_I_P58, aSyn]",MSA_IIa_P62,MSA_I_P58,1.116053,0.589014,0.790419,0.87,0.87,132,0.267585,0.307569,167,151,1.11,aSyn,aSyn


#### I need to get the closest matches among the MDS space for plotting later 
- I will go through each pocket, collect its matches, and keep its 5 closest neighbours (the matches with the lowest `icp_mean_input_diff`)

In [214]:
# one top-5 table per brain pocket
b2b_vessel = []

for poc00 in tqdm(asyn_brain_pockets):
    # every pair in which this pocket is either the source or the target
    has_poc00 = (df_input_b2b.source_pocket == poc00) | (df_input_b2b.target_pocket == poc00)

    # the 5 pairs with the smallest mean input feature difference
    df_temp00 = (
        df_input_b2b.loc[has_poc00]
        .sort_values(by="icp_mean_input_diff", ascending=True)
        .head(5)
        .reset_index(drop=True)
    )
    b2b_vessel.append(df_temp00)
    df_temp00.to_csv(os.path.join(asyn2asyn_out, poc00 + "_top5_nghs.csv"))

# put the selected nghs into a df
df_b2b_sel_ngh = pd.concat(b2b_vessel).reset_index(drop=True)
df_b2b_sel_ngh.to_csv(os.path.join(asyn2asyn_out, "asyn_brain_to_brain_top5_nghs.csv"))

100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 370.98it/s]


### 4.3 Compare the aSyn Brain structures to other amyloid fibrils (not aSyn)

In [216]:
# every pocket except those carrying one of the excluded aSyn tags below,
# i.e. the MSA/LF pockets plus the pockets of the other amyloids
excluded_tags = ['A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_amyloids_pockets = [p for p in pockets if p.split("_")[0] not in excluded_tags]

# keep the pairs whose source AND target pockets are both in that selection,
# ordered from the highest to the lowest combined fitness score
b2others_mask = (
    df_input_all.source_pocket.isin(asyn_brain_and_other_amyloids_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_and_other_amyloids_pockets)
)
df_input_b2others = (
    df_input_all.loc[b2others_mask]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches within the aSyn brain + other amyloids selection
df_input_b2others.to_csv(os.path.join(asyn2others_out, 'all_asyn_brain_to_other_amyloids_matches.csv'))

print(df_input_all.shape)
print(df_input_b2others.shape)

df_input_b2others.head(2)

(1830, 17)
(351, 17)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_mean_MaSIF_diff,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,icp_mean_input_diff,icp_mean_input_diff_weighted,size_source,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(MSA_IIa_P60, MSA_IIb_P64)","[MSA_IIb_P64, aSyn, MSA_IIa_P60, aSyn]",MSA_IIb_P64,MSA_IIa_P60,1.113781,0.615505,0.877005,0.92,0.92,164,0.319574,0.347363,187,179,1.04,aSyn,aSyn
1,"(MSA_IIa_P62, MSA_I_P58)","[MSA_IIa_P62, aSyn, MSA_I_P58, aSyn]",MSA_IIa_P62,MSA_I_P58,1.116053,0.589014,0.790419,0.87,0.87,132,0.267585,0.307569,167,151,1.11,aSyn,aSyn


In [226]:
# one top-5 table per selected pocket
b2o_vessel = []

for poc01 in tqdm(asyn_brain_and_other_amyloids_pockets):
    # every pair in which this pocket is either the source or the target
    has_poc01 = (df_input_b2others.source_pocket == poc01) | (df_input_b2others.target_pocket == poc01)

    # the 5 pairs with the smallest mean input feature difference
    df_temp01 = (
        df_input_b2others.loc[has_poc01]
        .sort_values(by="icp_mean_input_diff", ascending=True)
        .head(5)
        .reset_index(drop=True)
    )
    b2o_vessel.append(df_temp01)
    df_temp01.to_csv(os.path.join(asyn2others_out, poc01 + "_top5_nghs.csv"))

# put the selected nghs into a df
df_b2o_sel_ngh = pd.concat(b2o_vessel).reset_index(drop=True)
df_b2o_sel_ngh.to_csv(os.path.join(asyn2others_out, "asyn_brain_to_other_amyloids_top5_nghs.csv"))

100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 258.97it/s]


### 4.4 Compare the aSyn Brain structures to In vitro generated aSyn fibrils

In [228]:
# all pockets carrying one of the aSyn tags below (MSA/LF plus the remaining aSyn tags)
asyn_tags = ['MSA', 'LF', 'A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_invitro_pockets = [p for p in pockets if p.split("_")[0] in asyn_tags]

# keep the pairs whose source AND target pockets are both in that selection,
# ordered from the highest to the lowest combined fitness score
b2invitro_mask = (
    df_input_all.source_pocket.isin(asyn_brain_and_other_invitro_pockets)
    & df_input_all.target_pocket.isin(asyn_brain_and_other_invitro_pockets)
)
df_input_b2invitro = (
    df_input_all.loc[b2invitro_mask]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches within the aSyn selection
df_input_b2invitro.to_csv(os.path.join(asyn2invitro_out, 'all_asyn_brain_to_invitro_matches.csv'))

print(df_input_all.shape)
print(df_input_b2invitro.shape)

df_input_b2invitro.head(2)

(1830, 17)
(1176, 17)


Unnamed: 0,pocket_matches,pocket_pairs,source_pocket,target_pocket,icp_mean_MaSIF_diff,icp_rmse,icp_fitness_source,icp_fitness_target,combined_fitness_score,icp_nb_corres,icp_mean_input_diff,icp_mean_input_diff_weighted,size_source,size_target,src2target_size_ratio,source_pocket_fibril,target_pocket_fibril
0,"(Pol_2a_P22, Pol_2b_P35)","[Pol_2b_P35, aSyn, Pol_2a_P22, aSyn]",Pol_2b_P35,Pol_2a_P22,1.184599,0.648768,0.854015,1.05,1.05,117,0.423559,0.40339,137,111,1.23,aSyn,aSyn
1,"(Pol_2a_P22, Pol_2b_P32)","[Pol_2a_P22, aSyn, Pol_2b_P32, aSyn]",Pol_2a_P22,Pol_2b_P32,2.213575,0.647479,0.918919,0.22,0.92,102,0.576794,0.62695,111,464,0.24,aSyn,aSyn


In [247]:
# one top-5 table per selected aSyn pocket
b2invitro_vessel = []

for poc02 in tqdm(asyn_brain_and_other_invitro_pockets):
    # every pair in which this pocket is either the source or the target
    has_poc02 = (df_input_b2invitro.source_pocket == poc02) | (df_input_b2invitro.target_pocket == poc02)

    # the 5 pairs with the smallest mean input feature difference
    df_temp02 = (
        df_input_b2invitro.loc[has_poc02]
        .sort_values(by="icp_mean_input_diff", ascending=True)
        .head(5)
        .reset_index(drop=True)
    )
    b2invitro_vessel.append(df_temp02)
    df_temp02.to_csv(os.path.join(asyn2invitro_out, poc02 + "_top5_nghs.csv"))

# put the selected nghs into a df
df_b2invitro_sel_ngh = pd.concat(b2invitro_vessel).reset_index(drop=True)
df_b2invitro_sel_ngh.to_csv(os.path.join(asyn2invitro_out, "asyn_brain_to_invitro_top5_nghs.csv"))

100%|██████████████████████████████████████████| 49/49 [00:00<00:00, 284.17it/s]
