## Initialise Notebook

In [None]:
import os

import numpy as np
import pandas as pd

# Change the working directory to the project root, identified as the prefix
# of the current path that ends with the first occurrence of 'project'.
# NOTE(review): presumably this is so the relative file names used by later
# cells (e.g. 'data_config.env') resolve no matter which sub-folder the
# notebook was started from — confirm.
project_dir_marker = 'project'
path = os.getcwd()
# str.find returns -1 when the marker is absent; slicing with that value would
# silently produce a meaningless six-character prefix, so fail loudly instead.
index_project = path.find(project_dir_marker)
if index_project == -1:
    raise RuntimeError(
        f"Cannot locate the project root: '{project_dir_marker}' does not occur "
        f"in the current working directory '{path}'"
    )
# keep everything from the start of the path up to and including the marker
project_path = path[:index_project + len(project_dir_marker)]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# Bring in CCLE data
# NOTE(review): PathLoader and DataLink look like project-local modules rather
# than installed packages — confirm they are importable from the project root.
from PathLoader import PathLoader
from DataLink import DataLink
# Both file names are relative.
# NOTE(review): presumably they are resolved against the current working
# directory, and hold shared vs. per-user path settings — confirm in PathLoader.
path_loader = PathLoader('data_config.env', 'current_user.env')
# data_link is the handle the following cells use to fetch datasets by code.
# NOTE(review): 'data_codes.csv' presumably maps loading codes to data files — confirm.
data_link = DataLink(path_loader, 'data_codes.csv')

In [None]:
# Fetch a dataset through data_link using its loading code.
# NOTE(review): the code name suggests raw FGFR4 model simulation output — confirm.
# simulation_data is not referenced by any of the cells visible in this section.

loading_code = 'fgfr4_model_raw_simulation'
simulation_data = data_link.get_data_from_code(loading_code)

In [None]:
# Fetch the dynamic-feature table consumed by the single-protein analysis below.
# `loading_code` is re-bound here, so from this cell on it no longer holds the
# simulation loading code.
loading_code = 'fgfr4_ccle_dynamic_features'
dynamic_features = data_link.get_data_from_code(loading_code)

## Analysing a single protein 

In [None]:
### OPTIONS

# Substring used to select this protein's columns from the dynamic-feature table.
target_protein = "pAkt"
# NOTE(review): not referenced by any cell visible in this section — presumably
# the folder where results are meant to be written; confirm or remove.
output_folder = "testFolder"
# Seed handed to UMAP as random_state so embeddings are repeatable.
# NOTE(review): -1 is meant to disable seeding — confirm every consumer maps -1
# to random_state=None rather than passing it on as a literal seed.
fixed_random_seed = 42 # -1 for no seed

In [None]:
# Index the feature table by its 'Row' column and keep only the columns whose
# name contains the target protein.
indexed_dyn_feats = dynamic_features.set_index('Row')
target_dynamic_features = indexed_dyn_feats.filter(like=target_protein, axis=1)

# Short feature names, applied to the selected columns by position.
# NOTE(review): this assumes the filtered columns arrive in exactly this order — confirm.
dyn_feat_cols = ['auc', 'median', 'tfc', 'tmax', 'max', 'tmin', 'min', 'ttsv', 'tsv', 'init']
# The positional renaming only makes sense with one column per feature name, so
# report a mismatch explicitly (with the offending columns) rather than letting
# the DataFrame constructor fail with a generic shape error.
if target_dynamic_features.shape[1] != len(dyn_feat_cols):
    raise ValueError(
        f"Expected {len(dyn_feat_cols)} columns matching '{target_protein}', "
        f"found {target_dynamic_features.shape[1]}: {list(target_dynamic_features.columns)}"
    )

# Same list of per-row value lists as a Python loop over .iloc would build,
# produced in a single call.
joint_dyn_feat_data = target_dynamic_features.values.tolist()

# Rebuilt with a fresh 0..n-1 index; the 'Row' labels are not carried over.
joint_dynamic_features = pd.DataFrame(joint_dyn_feat_data, columns=dyn_feat_cols)

# The options cell documents -1 as "no seed". UMAP expresses "unseeded" as
# random_state=None; a negative integer is not accepted as a seed.
umap_random_state = None if fixed_random_seed == -1 else fixed_random_seed
standard_embedding = umap.UMAP(random_state=umap_random_state).fit_transform(joint_dynamic_features)

plt.title(f'UMAP embedding of {target_protein} dynamic features')
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], s=0.1)

In [None]:
# Second UMAP embedding, configured for clustering rather than for display
# (n_neighbors=30, min_dist=0.0), followed by HDBSCAN on that embedding.

# Honour the seed chosen in the options cell instead of a hard-coded value.
# -1 is documented there as "no seed", which UMAP expresses as random_state=None.
umap_random_state = None if fixed_random_seed == -1 else fixed_random_seed
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=umap_random_state,
).fit_transform(joint_dynamic_features)

plt.title('UMAP embedding used for clustering')
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1)

# One integer label per row; the next cell treats labels >= 0 as "clustered".
# NOTE(review): min_cluster_size=500 only allows clusters of at least 500
# points — confirm that suits the number of rows in this dataset.
labels = hdbscan.HDBSCAN(
    min_samples=10,
    min_cluster_size=500,
).fit_predict(clusterable_embedding)

In [None]:
# Rows with a non-negative label count as clustered; the rest are drawn as
# faint grey background points.
clustered = labels >= 0
unclustered_points = standard_embedding[~clustered]
clustered_points = standard_embedding[clustered]

# background layer: unclustered rows
plt.scatter(
    unclustered_points[:, 0],
    unclustered_points[:, 1],
    color=(0.5, 0.5, 0.5),
    s=0.1,
    alpha=0.5,
)
# foreground layer: clustered rows, coloured by their cluster label
plt.scatter(
    clustered_points[:, 0],
    clustered_points[:, 1],
    c=labels[clustered],
    s=0.1,
    cmap='Spectral',
)

In [None]:
# fraction of rows that received a cluster label (label >= 0)
clustered.sum() / len(joint_dynamic_features)