# This notebook is to analyse the fibril site alignments with focus on ex vivo aSyn fibrils
- Find shared features among ex vivo aSyn fibrils
- Find shared features between ex vivo fibrils of aSyn and other amyloid proteins
- Find shared features between ex vivo and in vitro aSyn fibrils

**Open3d output explained**

- *icp_fitness*, which measures the overlapping area (# of correspondences / # of points in source pocket). The higher the better.
        "how much of the source pocket is matched to the target" --> Fitness(source)
        
- *inlier_rmse*, which measures the RMSE of all inlier correspondences. The lower the better.

**target_matched_percent explained**
- *Fitness(Target)* =  # of correspondences / # of points in target pocket

**combined_fitness_score (SSmax) explained**
- Because the matched pockets can differ in size, a fitness score (aka query coverage) is calculated relative to both the source pocket size and the target pocket size. The maximum of the two is taken as the best coverage obtained between the 2 pockets and is used for further downstream analysis.

## 1.0 Libraries

In [None]:
import os, glob, shutil, datetime, open3d, copy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from tqdm import tqdm

from fibrilsite import get_input_desc_diff, calc_input_feat_diff

## 2.0 Functions

In [None]:
def fibril_name_str(x):
    """Return the protein label for a pocket name.

    The label is decided by the text before the first underscore in ``x``:
    ``PHF`` and ``CTEII`` give ``'Tau'``; the aSyn prefixes listed below give
    ``'aSyn'``; any other prefix is returned unchanged.
    """
    prefix = x.split('_')[0]
    if prefix in ('PHF', 'CTEII'):
        return 'Tau'
    if prefix in ('A53T', 'Pol', 'LF', 'G51D', 'MSA', 'E46K', 'H50Q', 'pY39'):
        return 'aSyn'
    return prefix

In [None]:
def add_size_metrics(df:pd.DataFrame):
    """Add two size-based columns to ``df`` in place and return the same frame.

    - ``src2target_size_ratio``: ``size_source / size_target``
    - ``target_matched_percent_icp``: ``icp_nb_corres / size_target``

    Both columns are rounded to 2 decimals.
    """
    target_size = df['size_target']
    df['src2target_size_ratio'] = (df['size_source'] / target_size).round(2)
    df['target_matched_percent_icp'] = (df['icp_nb_corres'] / target_size).round(2)
    return df

In [None]:
def add_pocket_matches(df:pd.DataFrame) -> pd.DataFrame:
    """Insert an order-independent match key as the first column of ``df``.

    Each entry of ``df.pocket_pairs`` is read at positions 0 and 2; the two
    values are sorted and stored as a tuple in a new ``pocket_matches`` column
    (inserted at position 0, in place). The function then asserts that every
    key occurs only once, prints a confirmation and returns ``df``.

    Raises
    ------
    AssertionError
        If the same key appears more than once.
    """
    match_keys = [tuple(sorted((pair[0], pair[2]))) for pair in df.pocket_pairs]

    # the column is added before the uniqueness check runs
    df.insert(0, 'pocket_matches', match_keys)

    all_matches = df.pocket_matches.to_list()
    assert len(all_matches) == len(set(all_matches)), "Duplicated Matches Found"
    print("Unique Matches")

    return df

## 3.0 I/O

### 3.1 Output dirs

In [None]:
# Root output folder; its name starts with today's date.
run_date = str(datetime.date.today())
main_output = os.path.join(os.path.abspath('.'), run_date + "_input_feats_pockets_analysis_publish")

# One sub-folder per comparison carried out in section 4.
allvsall_out = os.path.join(main_output, "all_vs_all")                                # all vs all comparison
asyn2asyn_out = os.path.join(main_output, "aSyn_to_aSyn")                             # ex vivo aSyn to aSyn fibril sites
asyn2others_out = os.path.join(main_output, "aSyn_to_other_amyloids")                 # ex vivo aSyn to other amyloid fibril sites
asyn2invitro_out = os.path.join(main_output, "aSyn_to_invitro_aSyn")                  # ex vivo to in vitro aSyn fibril sites
asyninvitro2others_out = os.path.join(main_output, "invitro_aSyn_to_other_amyloids")  # in vitro aSyn to other amyloid fibril sites

# Create the root first, then the sub-folders. With exist_ok=False, os.makedirs
# raises FileExistsError if a folder is already there (e.g. a second run on the same date).
for new_dir in (
    main_output,
    allvsall_out,
    asyn2asyn_out,
    asyn2others_out,
    asyn2invitro_out,
    asyninvitro2others_out,
):
    os.makedirs(new_dir, exist_ok=False)

### 3.2 Folders paths

In [None]:
# Folder with the registration output results; the cell stops here if it is missing.
reg_out_dir = os.path.abspath(
    './2023-11-20_registration_output_no_hphob_frac_analysis_updated/'
)
reg_out_found = os.path.exists(reg_out_dir)
assert reg_out_found, "Folder not found"
print("All Good !")

In [None]:
# Collect the paths of the .npy files saved for the input feats registration.
input_feats_dir = os.path.abspath("../230914_run/2023-09-14_registration_outputs/o3d_objects_npy/input_feats_reg")

input_feats_npy = []
for npy_path in glob.iglob(os.path.join(input_feats_dir, "*.npy")):
    input_feats_npy.append(npy_path.strip())

# quick check: number of files found and the first path
print(len(input_feats_npy))
print(input_feats_npy[0])

In [None]:
# defined pockets in this study
# The text before the first underscore of each name is the prefix that
# fibril_name_str and the section-4 pocket filters key on.
# NOTE(review): the trailing P<number> is presumably the pocket identifier — confirm.
pockets = [
    # aSyn prefixes Pol / pY39 / G51D / A53T / H50Q / E46K
    # (the ones section 4.4 adds to MSA and LF for the ex vivo vs in vitro comparison)
    'Pol_1a_P1', 'Pol_1a_P2', 'Pol_1a_P3', 'Pol_1a_P4',
    'Pol_1b_P19', 'Pol_1b_P20',
    'Pol_2a_P21', 'Pol_2a_P22', 'Pol_2a_P23', 'Pol_2a_P24', 'Pol_2a_P25', 'Pol_2a_P26',
    'Pol_2b_P32', 'Pol_2b_P33', 'Pol_2b_P34', 'Pol_2b_P35', 'Pol_2b_P36',
    'pY39_P40', 'pY39_P41', 'pY39_P42', 'pY39_P43',
    'G51D_P44', 'G51D_P45', 'G51D_P46',
    'A53T_P47', 'A53T_P48', 'A53T_P49',
    'H50Q_P50', 'H50Q_P51', 'H50Q_P83',
    'E46K_P52', 'E46K_P53', 'E46K_P54', 'E46K_P55',
    # aSyn prefixes MSA / LF (selected as the brain-derived set in section 4.2)
    'MSA_I_P56', 'MSA_I_P57', 'MSA_I_P58',
    'MSA_IIa_P59', 'MSA_IIa_P60', 'MSA_IIa_P61', 'MSA_IIa_P62','MSA_IIa_P69',
    'MSA_IIb_P63', 'MSA_IIb_P64', 'MSA_IIb_P65', 'MSA_IIb_P66','MSA_IIb_P70',
    'LF_P67', 'LF_P68',
    # other amyloids (fibril_name_str labels CTEII and PHF as 'Tau'; the rest keep their prefix)
    'TMEM106B_P71', 'TMEM106B_P72',
    'CTEII_P73', 'CTEII_P74',
    'AB42_P75', 'AB42_P76',
    'PHF_P77', 'PHF_P78',
    'Prp_P79', 'Prp_P80', 'Prp_P81', 'Prp_P82']

## 4.0 Execution

### 4.1 Load all the information

In [None]:
# Load the table of points with their features (first CSV column is used as the index).
all_info_csv = os.path.abspath('../2023-09-06_parsed_info/2023-09-06_all_pockets_feats.csv')
df_all_info = pd.read_csv(all_info_csv, index_col=0)

print(df_all_info.shape)
df_all_info.head(2)

In [None]:
# work on the input feats reg
# Load the registered pocket-pair table; the misspelled 'runsac_*' column names
# are renamed to 'ransac_*' and the index is reset to 0..n-1.
df_input_all = pd.read_csv(os.path.abspath('../230914_run/2023-09-14_registration_outputs/2023-09-14_all_pockets_input_features_registered.csv'), index_col=0).rename(columns={'runsac_rmse':'ransac_rmse', 'runsac_fitness':'ransac_fitness'}).reset_index(drop=True)

# add fibril source for source and target pockets
# (label derived from the pocket-name prefix by fibril_name_str)
df_input_all['source_pocket_fibril'] = df_input_all['source_pocket'].apply(lambda x: fibril_name_str(x))
df_input_all['target_pocket_fibril'] = df_input_all['target_pocket'].apply(lambda x: fibril_name_str(x))

# add the alternate metrics from the target pocket side
# (adds the src2target_size_ratio and target_matched_percent_icp columns)
df_input_all = add_size_metrics(df=df_input_all)

# rename the fitness score columns
# icp_fitness -> Fitness(source), target_matched_percent_icp -> Fitness(target)
df_input_all.rename(columns={"icp_fitness":"icp_fitness_source", "target_matched_percent_icp":"icp_fitness_target"}, inplace=True)

# add the combined fitness score
# SSmax: per row, the larger of the source-side and target-side fitness, rounded to 2 decimals
comb_fit_sc_vessel = []
for idx00 in df_input_all.index:
    comb_fit_sc_vessel.append(round(max(df_input_all.at[idx00, "icp_fitness_source"], df_input_all.at[idx00, "icp_fitness_target"]),2))

# appended as the last column
df_input_all.insert(df_input_all.shape[1], "combined_fitness_score", comb_fit_sc_vessel)


# get rid of the ransac columns
# (only the columns listed here are kept, in this order)
df_input_all = df_input_all[[
    'source_pocket', 'target_pocket',
    'icp_rmse', 'icp_fitness_source', 'icp_fitness_target', 'combined_fitness_score',
    'icp_nb_corres', 'size_source', 'size_target', 'src2target_size_ratio',
    'source_pocket_fibril', 'target_pocket_fibril']]

# get the pocket pairs
# each entry is [source pocket, source fibril, target pocket, target fibril];
# add_pocket_matches reads positions 0 and 2 of these lists
df_input_all.insert(0, 'pocket_pairs', [[s,sf,t,tf] for s,sf,t,tf in zip(df_input_all.source_pocket, df_input_all.source_pocket_fibril, df_input_all.target_pocket, df_input_all.target_pocket_fibril)])
# rows are ordered by the pocket_pairs lists, then re-indexed 0..n-1
df_input_all.sort_values(by='pocket_pairs', inplace=True)
df_input_all.reset_index(drop=True, inplace=True)

# calculate the input feat diff
# calc_input_feat_diff is imported from fibrilsite and called with export=False.
# NOTE(review): presumably this adds the icp_mean_input_diff column (Fdiff) that the
# later cells sort and filter on — confirm against the fibrilsite implementation.
df_input_all = calc_input_feat_diff(df=df_input_all, df_all_info=df_all_info, input_feats_npy=input_feats_npy, output=main_output, export=False)

# add the pocket matches
# (inserts 'pocket_matches' as first column and asserts that no match key is duplicated)
df_input_all = add_pocket_matches(df=df_input_all)

# export (file name is prefixed with today's date)
df_input_all.to_csv(os.path.join(main_output, str(datetime.date.today())+'_all_input_reg_pockets.csv'))

print(df_input_all.shape)
df_input_all.head(2)

In [None]:
# Find the all vs all matches
# For every pocket, keep the 5 matches with the lowest ICP mean input feats
# difference (Fdiff, column icp_mean_input_diff) among the rows in which the
# pocket appears as source or as target.

# container for the per-pocket selections
allvsall_vessel = []

for pocket_name in tqdm(pockets):
    # rows in which this pocket takes part, on either side of the match
    involves_pocket = (df_input_all.source_pocket == pocket_name) | (df_input_all.target_pocket == pocket_name)
    top5_nghs = (
        df_input_all.loc[involves_pocket]
        .sort_values(by="icp_mean_input_diff", ascending=True)
        .reset_index(drop=True)
        .head(5)
    )
    allvsall_vessel.append(top5_nghs)
    top5_nghs.to_csv(os.path.join(allvsall_out, pocket_name + "_top5_nghs.csv"))

# stack the per-pocket selections into one table and export it
df_allvsall_sel_ngh = pd.concat(allvsall_vessel).reset_index(drop=True)
df_allvsall_sel_ngh.to_csv(os.path.join(allvsall_out, "allvsall_top5_nghs.csv"))

### 4.2 Compare sites among ex vivo aSyn fibrils

In [None]:
# Brain-derived aSyn pockets: names whose prefix (text before the first "_") is MSA or LF
brain_prefixes = ["MSA", "LF"]
asyn_brain_pockets = [name for name in pockets if name.split("_")[0] in brain_prefixes]

# Keep the matches in which BOTH pockets are brain derived, highest combined fitness first
src_is_brain = df_input_all.source_pocket.isin(asyn_brain_pockets)
tgt_is_brain = df_input_all.target_pocket.isin(asyn_brain_pockets)
df_input_b2b = (
    df_input_all.loc[src_is_brain & tgt_is_brain]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all aSyn brain pockets matches
df_input_b2b.to_csv(os.path.join(asyn2asyn_out, 'all_asyn_brain_to_asyn_brain_matches.csv'))

# size of the full table vs. the brain-to-brain subset
print(df_input_all.shape)
print(df_input_b2b.shape)

df_input_b2b.head(2)

In [None]:
# Select the matches that pass the similarity thresholds applied in this cell:
#   SSmax : combined_fitness_score >= 0.4
#   Fdiff : icp_mean_input_diff    <= 0.6
# NOTE(review): the previous comment stated SSmax >= 0.5 "with exception of MSA-I P56",
# while the code applies 0.4 to every row. Presumably the cut-off was lowered to keep
# an MSA-I P56 match — confirm which threshold is intended.
passes_ssmax = df_input_b2b.combined_fitness_score >= 0.4
passes_fdiff = df_input_b2b.icp_mean_input_diff <= 0.6

df_b2b_sel = (
    df_input_b2b.loc[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# export
df_b2b_sel.to_csv(os.path.join(asyn2asyn_out, "sel_asyn_brain_to_asyn_brain_matches.csv"))

print(df_b2b_sel.shape)
df_b2b_sel

### 4.3 Compare sites between ex vivo fibrils of aSyn and other amyloid proteins

In [None]:
# Pockets of the brain-derived aSyn fibrils and of the other amyloids:
# everything except the aSyn prefixes listed here
excluded_prefixes = ['A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_amyloids_pockets = [name for name in pockets if name.split("_")[0] not in excluded_prefixes]

# Keep the matches in which both pockets belong to that set, highest combined fitness first
src_selected = df_input_all.source_pocket.isin(asyn_brain_and_other_amyloids_pockets)
tgt_selected = df_input_all.target_pocket.isin(asyn_brain_and_other_amyloids_pockets)
df_input_b2others = (
    df_input_all.loc[src_selected & tgt_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches within this pocket set
df_input_b2others.to_csv(os.path.join(asyn2others_out, 'all_asyn_brain_to_other_amyloids_matches.csv'))

# size of the full table vs. the subset
print(df_input_all.shape)
print(df_input_b2others.shape)

df_input_b2others.head(2)

In [None]:
# Select the matches that pass the similarity thresholds
#   SSmax : combined_fitness_score >= 0.5
#   Fdiff : icp_mean_input_diff    <= 0.6
passes_ssmax = df_input_b2others.combined_fitness_score >= 0.5
passes_fdiff = df_input_b2others.icp_mean_input_diff <= 0.6

df_b2o_sel = (
    df_input_b2others.loc[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# Flag the pairs whose source and target carry the same fibril label and drop them,
# so that no match within one protein (e.g. aSyn to aSyn) remains.
# The index is reset after the filter, as in the other selection cells, so the
# exported table is numbered 0..n-1 without gaps.
# NOTE(review): pairs between two different non-aSyn amyloids are NOT removed by
# this filter — confirm whether they should stay in an "aSyn to other amyloids" table.
df_b2o_sel["fibril_src_target_match"] = df_b2o_sel["source_pocket_fibril"] == df_b2o_sel["target_pocket_fibril"]
df_b2o_sel = df_b2o_sel[~df_b2o_sel.fibril_src_target_match].reset_index(drop=True)

# export
df_b2o_sel.to_csv(os.path.join(asyn2others_out, 'sel_asyn_brain_to_other_amyloids_matches.csv'))

df_b2o_sel

### 4.4 Compare sites between ex vivo and in vitro fibrils of aSyn

In [None]:
# Pockets of all aSyn fibrils used here (prefixes MSA and LF plus the six other aSyn prefixes)
asyn_prefixes = ['MSA', 'LF', 'A53T', 'Pol', 'G51D', 'E46K', 'H50Q', 'pY39']
asyn_brain_and_other_invitro_pockets = [name for name in pockets if name.split("_")[0] in asyn_prefixes]

# Keep the matches in which both pockets belong to that set, highest combined fitness first
src_selected = df_input_all.source_pocket.isin(asyn_brain_and_other_invitro_pockets)
tgt_selected = df_input_all.target_pocket.isin(asyn_brain_and_other_invitro_pockets)
df_input_b2invitro = (
    df_input_all.loc[src_selected & tgt_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# export all matches within this pocket set
df_input_b2invitro.to_csv(os.path.join(asyn2invitro_out, 'all_asyn_brain_to_invitro_matches.csv'))

# size of the full table vs. the subset
print(df_input_all.shape)
print(df_input_b2invitro.shape)

df_input_b2invitro.head(2)

In [None]:
# Select the matches that pass the similarity thresholds
#   SSmax : combined_fitness_score >= 0.5
#   Fdiff : icp_mean_input_diff    <= 0.6
meets_thresholds = (df_input_b2invitro.combined_fitness_score >= 0.5) & (df_input_b2invitro.icp_mean_input_diff <= 0.6)
df_b2invitro_sel = (
    df_input_b2invitro.loc[meets_thresholds]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# Step 1: keep the matches in which at least one pocket is an ex vivo aSyn pocket
any_ex_vivo = df_b2invitro_sel.source_pocket.isin(asyn_brain_pockets) | df_b2invitro_sel.target_pocket.isin(asyn_brain_pockets)
df_b2invitro_sel = df_b2invitro_sel[any_ex_vivo].reset_index(drop=True)

# Step 2: collect the matches in which BOTH pockets are ex vivo aSyn pockets and
# remove them (by their pocket_matches key), leaving the ex vivo vs. other-aSyn matches
both_ex_vivo = df_b2invitro_sel.source_pocket.isin(asyn_brain_pockets) & df_b2invitro_sel.target_pocket.isin(asyn_brain_pockets)
df_b2invitro_sel_rej = df_b2invitro_sel[both_ex_vivo].reset_index(drop=True)

rejected_matches = df_b2invitro_sel_rej.pocket_matches.to_list()
df_b2invitro_sel = df_b2invitro_sel[~df_b2invitro_sel.pocket_matches.isin(rejected_matches)].reset_index(drop=True)

# export
df_b2invitro_sel.to_csv(os.path.join(asyn2invitro_out, 'sel_asyn_brain_to_invitro_matches.csv'))

df_b2invitro_sel

### 4.5 Compare sites between in vitro fibrils of aSyn and ex vivo fibrils of other amyloids

In [None]:
# Pockets of every fibril except the brain-derived aSyn ones (prefixes MSA and LF)
brain_prefixes = ['MSA', 'LF']
asyn_invitro_and_other_exvivo_pockets = [name for name in pockets if name.split("_")[0] not in brain_prefixes]

# Keep the matches in which both pockets belong to that set, highest combined fitness first
src_selected = df_input_all.source_pocket.isin(asyn_invitro_and_other_exvivo_pockets)
tgt_selected = df_input_all.target_pocket.isin(asyn_invitro_and_other_exvivo_pockets)
df_input_invitro2others = (
    df_input_all.loc[src_selected & tgt_selected]
    .sort_values(by='combined_fitness_score', ascending=False)
    .reset_index(drop=True)
)

# Flag the pairs whose source and target carry the same fibril label and drop them,
# so that only matches between different proteins remain
df_input_invitro2others["fibril_src_target_match"] = df_input_invitro2others["source_pocket_fibril"] == df_input_invitro2others["target_pocket_fibril"]
df_input_invitro2others = df_input_invitro2others[~df_input_invitro2others.fibril_src_target_match].reset_index(drop=True)

# export
df_input_invitro2others.to_csv(os.path.join(asyninvitro2others_out, 'all_asyn_invitro_to_other_amyloids_matches.csv'))

df_input_invitro2others.head(2)

In [None]:
# Select the matches that pass the similarity thresholds
#   SSmax : combined_fitness_score >= 0.5
#   Fdiff : icp_mean_input_diff    <= 0.6
passes_ssmax = df_input_invitro2others.combined_fitness_score >= 0.5
passes_fdiff = df_input_invitro2others.icp_mean_input_diff <= 0.6

df_invitro2others_sel = (
    df_input_invitro2others.loc[passes_ssmax & passes_fdiff]
    .sort_values(by="combined_fitness_score", ascending=False)
    .reset_index(drop=True)
)

# export
df_invitro2others_sel.to_csv(os.path.join(asyninvitro2others_out, 'sel_asyn_invitro_to_other_amyloids_matches.csv'))

df_invitro2others_sel