## Notebook Initialisation

In [1]:
import os

import numpy as np
import pandas as pd

# Locate the project root: the current working directory truncated right
# after the first occurrence of the marker string in its path.
path = os.getcwd()
project_marker = 'project'
index_project = path.find(project_marker)
# str.find returns -1 when the marker is absent; slicing with that value would
# silently select a meaningless prefix of the path, so fail loudly instead.
if index_project == -1:
    raise RuntimeError(
        f"Cannot locate the project root: '{project_marker}' does not occur in {path!r}"
    )
# keep the path from its start up to and including the marker
project_path = path[:index_project + len(project_marker)]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.feature_selection import f_classif
import numpy as np

# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Plotting 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Global plot look: white grid background with the larger "talk" scaling,
# applied in a single call (set_theme configures the context as well).
sns.set_theme(context="talk", style="whitegrid")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Bring in CCLE data
from PathLoader import PathLoader
from DataLink import DataLink
# Build the path loader from the two environment files, then the data link,
# which receives the path loader and the 'data_codes.csv' file name.
# NOTE(review): the file names are relative — presumably resolved against the
# project root set as the working directory in the first cell; confirm in PathLoader.
path_loader = PathLoader('data_config.env', 'current_user.env')
data_link = DataLink(path_loader, 'data_codes.csv')

In [4]:
# Load the data set registered under a loading code via the data link.
# NOTE(review): the code name suggests raw simulation output of the FGFR4
# model — confirm against the data code table.

loading_code = 'fgfr4_model_raw_simulation'
simulation_data = data_link.get_data_from_code(loading_code)

In [5]:
# Load the dynamic-feature table through the data link.
# `loading_code` is re-assigned here, so after this cell it no longer holds
# the code used for `simulation_data`.
# The analysis cells below treat `dynamic_features` as a DataFrame with a 'Row' column.
loading_code = 'fgfr4_ccle_dynamic_features'
dynamic_features = data_link.get_data_from_code(loading_code)

## Analysis Chain

In [35]:
## CONFIG AREA
### OPTIONS
# aCbl vs. pSPRY2 vs. pMEK vs. SPRY2
# OR
# PTP vs aCbl

# folder output: sub-folder of '<data path>data/results/' that receives all outputs
folder_name = "FGFR4_Dyn_Feats_Sensitivity_Analysis"

# MAIN DEFINITIONS
# proteins whose dynamic features are compared; plot_colors is matched to
# this list one-to-one (checked below)
target_proteins = ['aCbl', 'pSPRY2', 'pMEK', 'SPRY2']
plot_colors = ['red', 'blue', 'green', 'purple']
# file output options: prefix of the files written for this experiment
exp_id = "PTP_control"
# plot options
dynamic_plot_cols = 2
ncluster_plot_cols = 2
normalise_simulation_dynamics = False
save_figure = True
projection_show_legend = True
## Technical parameters
fixed_random_seed = 42  # -1 for no seed, not implemented yet
umap_params = {
    # ADD YOUR OWN HERE
    'random_state': fixed_random_seed
}

hbd_params = {
    # ADD YOUR OWN HERE
    'min_cluster_size': 500,
    'min_samples': 10,
}

# validation scripts: fail early on inconsistent options
assert len(target_proteins) >= dynamic_plot_cols, \
    'Not enough target proteins for the dynamic plot columns'
assert len(target_proteins) >= ncluster_plot_cols, \
    'Not enough target proteins for the cluster plot columns'
assert len(target_proteins) == len(plot_colors), \
    'Need exactly one plot color per target protein'

# Build the output folder path once and reuse it.
# exist_ok=True replaces the separate existence check, so re-running the cell
# is safe whether or not the folder already exists.
file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'
os.makedirs(file_save_path, exist_ok=True)

# save parameters in a text file next to the results, for provenance
with open(f'{file_save_path}{exp_id}_parameters.txt', 'w') as f:
    f.write(f'Fixed random seed: {fixed_random_seed}\n')
    f.write(f'UMAP parameters: {umap_params}\n')
    f.write(f'HDBSCAN parameters: {hbd_params}\n')
    f.write(f'Target proteins: {target_proteins}\n')
    f.write(f'Plot colors: {plot_colors}\n')
    f.write(f'Normalise simulation dynamics: {normalise_simulation_dynamics}\n')
    f.write(f'Projection show legend: {projection_show_legend}\n')
        

In [36]:
# Stack the dynamic features of every target protein into one long table:
# one row per (protein, original row) pair, one column per dynamic feature,
# plus parallel label lists that say which protein each row came from.
indexed_dyn_feats = dynamic_features.set_index('Row')
columns = indexed_dyn_feats.columns

# Column headers of the stacked table.
# NOTE(review): assumes each protein's columns appear in the source table in
# exactly this feature order — TODO confirm against the feature extraction code.
dyn_feat_cols = ['auc', 'median', 'tfc', 'tmax', 'max', 'tmin', 'min', 'ttsv', 'tsv', 'init']

dyn_feat_data = []
for protein in target_proteins:
    # Match on "<protein>_" rather than the bare name so that a protein whose
    # name is a prefix of another column family cannot pick up foreign columns.
    prefix = f'{protein}_'
    filtered_cols = [col for col in columns if col.startswith(prefix)]
    # A wrong column count would otherwise surface later as an opaque
    # DataFrame construction error (or as silently mislabelled features).
    assert len(filtered_cols) == len(dyn_feat_cols), (
        f"Expected {len(dyn_feat_cols)} dynamic feature columns for '{protein}', "
        f"found {len(filtered_cols)}: {filtered_cols}"
    )
    filtered_data = indexed_dyn_feats[filtered_cols]
    dyn_feat_data.append(filtered_data)

joint_dyn_feat_data = []
joint_labels = []
joint_labels_id = []

for label_id, (label, data) in enumerate(zip(target_proteins, dyn_feat_data)):
    # Convert the whole block at once instead of one .iloc lookup per row.
    joint_dyn_feat_data.extend(data.to_numpy().tolist())
    joint_labels.extend([label] * len(data))
    joint_labels_id.extend([label_id] * len(data))

joint_dyn_feat_df = pd.DataFrame(joint_dyn_feat_data, columns=dyn_feat_cols)
        
        
    

In [37]:
# One-way ANOVA F-test per dynamic feature, using the protein labels as classes.
# f_scores is kept as the raw tuple returned by f_classif.
f_scores = f_classif(X=joint_dyn_feat_df, y=joint_labels)

# two rows (one per element of f_scores), one column per dynamic feature
f_score_df = pd.DataFrame(data=f_scores, columns=dyn_feat_cols)

In [38]:
# show the raw f_classif result: a tuple of two arrays, one value per dynamic feature
f_scores

(array([1.16636960e+03, 1.32108504e+03, 4.06942852e-01, 1.11361740e+07,
        2.69640903e+03, 3.28725893e+04, 1.81609869e+01, 6.25433194e+04,
        2.69667837e+03, 1.81244041e+01]),
 array([0.00000000e+00, 0.00000000e+00, 7.48018971e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00467433e-11, 0.00000000e+00,
        0.00000000e+00, 1.05976500e-11]))

In [39]:
# Format the F-test output as a readable table: one row per dynamic feature,
# with its F value and p value, strongest separators first.
# The table is rebuilt from the raw `f_scores` tuple rather than transposing
# the existing `f_score_df` in place, so re-running this cell always gives the
# same result instead of failing on the second run.
f_score_df = pd.DataFrame(
    {'F value': f_scores[0], 'p value': f_scores[1]},
    index=dyn_feat_cols,
)
f_score_df = f_score_df.sort_values(by='F value', ascending=False)
# round to 2 decimal places; note that very small p values then show as 0.0
f_score_df = f_score_df.round(2)
f_score_df.head(10)

Unnamed: 0,F value,p value
tmax,11136173.98,0.0
ttsv,62543.32,0.0
tmin,32872.59,0.0
tsv,2696.68,0.0
max,2696.41,0.0
median,1321.09,0.0
auc,1166.37,0.0
min,18.16,0.0
init,18.12,0.0
tfc,0.41,0.75


Perhaps, when comparing different dynamics, different dynamic features become the better separators of the data.