# Combined CCLE Model Training

This notebook combines the CCLE gene expression dataset with the dynamic features and then trains models on the combined feature set.

## Initialisation

In [1]:
import os

import numpy as np
import pandas as pd

path = os.getcwd()
# find the string 'project' in the path, return index (-1 when it is absent)
index_project = path.find('project')
if index_project == -1:
    # Without this guard the slice below would become path[:6] and chdir would
    # either fail with an unrelated message or silently move to a wrong folder.
    raise FileNotFoundError(
        f"Cannot locate the project root: 'project' does not occur in the working directory {path!r}"
    )
# keep everything from the start of the path up to and including 'project'
project_path = path[:index_project + len('project')]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
# Set up the project's data access objects (no data is read in this cell).
from PathLoader import PathLoader
from DataLink import DataLink
# PathLoader is built from the two .env files; it is used below via get_data_path()
path_loader = PathLoader('data_config.env', 'current_user.env')
# DataLink receives the path loader plus the data-codes CSV; datasets are later
# fetched from it with get_data_using_code(<code string>)
data_link = DataLink(path_loader, 'data_codes.csv')

In [9]:
# Load the dynamic-feature table and its labels.
# get_data_using_code returns a (features, labels) pair for a code string.
loading_code = 'generic-gdsc-1-FGFR_0939-LN_IC50-fgfr4_ccle_dynamic_features_v2-true-Unnamed: 0'
# code layout: generic-gdsc-{number}-{drug_name}-{target_label}-{dataset_name}-{replace_index}-{row_index}
dynamic_feature_data, dynamic_label_data = data_link.get_data_using_code(loading_code)
print(f'Data loaded for code {loading_code} Feature Shape {dynamic_feature_data.shape} Label Shape {dynamic_label_data.shape}')

# Load the static CCLE gene-expression table for the same drug and target label.
# NOTE: loading_code is reused, so after this cell it holds the CCLE code only.
loading_code = 'ccle-gdsc-1-FGFR_0939-LN_IC50'
feature_data, label_data = data_link.get_data_using_code(loading_code)
print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')

Data loaded for code generic-gdsc-1-FGFR_0939-LN_IC50-fgfr4_ccle_dynamic_features_v2-true-Unnamed: 0 Feature Shape (665, 260) Label Shape (665,)
Data loaded for code ccle-gdsc-1-FGFR_0939-LN_IC50 Feature Shape (667, 19221) Label Shape (667,)


In [4]:
from toolkit import *

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [5]:
# --- Experiment configuration -------------------------------------------------
folder_name = "FGFR4-combined-model-training"  # sub-folder of data/results/ for this notebook
exp_id = "test"                                # prefix of every saved result file name
fixed_random_seed = 42 # -1 for no seed
save_figure = False
save_data = True                               # when True, result frames are pickled to file_save_path
show_figure = False

# Model names passed to get_model_from_string in the training pipeline
all_models = ['SVR', 'RandomForestRegressor', 'XGBRegressor', 'MLPRegressor', 'KNeighborsRegressor', 'ElasticNet']

# Build the results path once so the created folder and the save prefix cannot
# drift apart, and let makedirs handle an already existing folder itself
# instead of a separate (racy) existence check.
file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'
os.makedirs(file_save_path, exist_ok=True)

In [10]:
# Keep only the CCLE rows (and their labels) whose index also occurs in the
# dynamic feature table; the original row order of the CCLE data is preserved.
new_feature_data = feature_data[feature_data.index.isin(dynamic_feature_data.index)]
new_label_data = label_data[label_data.index.isin(dynamic_feature_data.index)]

# Sanity check: print both shapes, one per line
print(new_feature_data.shape, new_label_data.shape, sep='\n')

(665, 19221)
(665,)


In [11]:
# Column-wise join on the sample index. join='inner' keeps only samples present
# in both tables; the default outer join would add NaN-filled rows for any
# sample that exists in the dynamic table but not in the CCLE table.
combined_feature_data = pd.concat([new_feature_data, dynamic_feature_data], axis=1, join='inner')
# Re-select the labels in the exact row order of the combined features so that
# features and targets stay aligned even if the two indexes are ordered differently.
combined_label_data = new_label_data.loc[combined_feature_data.index]
assert combined_feature_data.index.equals(combined_label_data.index), \
    'combined features and labels are not aligned on the sample index'

In [12]:
# Display the combined table: gene-expression columns followed by the dynamic feature columns
combined_feature_data

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,amTORC2_auc,amTORC2_median,amTORC2_tfc,amTORC2_tmax,amTORC2_max,amTORC2_tmin,amTORC2_min,amTORC2_ttsv,amTORC2_tsv,amTORC2_init
SIDM01132,3.955127,1.416840,6.620293,2.000000,3.333424,0.014355,5.654779,0.028569,2.726831,4.061776,...,0.001622,0.001435,1.732396,0.96,0.003096,0.0,0.001133,0.04,0.001133,0.001133
SIDM00848,4.247928,0.000000,6.174127,2.316146,3.823749,0.189034,1.321928,3.536053,3.943921,4.468583,...,0.463970,0.483035,0.281069,0.96,0.544116,0.0,0.424736,0.04,0.424736,0.424736
SIDM01111,4.327687,0.070389,5.979111,2.906891,4.904484,0.263034,2.235727,0.422233,4.432959,5.139961,...,0.073737,0.076469,0.295231,0.96,0.088013,0.0,0.067951,0.04,0.067951,0.067951
SIDM00909,3.264536,0.000000,6.096557,2.518535,3.040892,0.000000,0.831877,6.575615,4.656496,4.738768,...,0.042001,0.043628,0.375213,0.96,0.051130,0.0,0.037180,0.04,0.037180,0.037180
SIDM00807,5.128871,0.000000,6.691534,2.010780,4.976364,0.163499,1.636915,6.193575,3.505891,3.709291,...,0.153621,0.159928,0.308393,0.96,0.181684,0.0,0.138860,0.04,0.138860,0.138860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SIDM00217,4.489286,0.475085,6.759955,2.451541,3.305971,1.464668,0.056584,5.996615,5.134221,4.635754,...,0.015140,0.015568,0.485826,0.96,0.019511,0.0,0.013131,0.04,0.013131,0.013131
SIDM00214,4.628190,0.298658,5.996615,4.042644,4.784504,0.042644,1.761285,5.067811,5.783457,4.943453,...,0.001227,0.001096,2.566311,0.96,0.002511,0.0,0.000704,0.04,0.000704,0.000704
SIDM00194,4.229588,0.014355,6.664767,2.691534,3.454176,0.042644,0.739848,6.417009,4.442943,2.959770,...,0.059500,0.061853,0.369219,0.96,0.072176,0.0,0.052713,0.04,0.052713,0.052713
SIDM00193,3.477677,0.000000,6.385949,3.570463,3.232661,0.000000,0.275007,6.417515,6.407013,3.727920,...,0.046056,0.047720,0.353875,0.96,0.056037,0.0,0.041390,0.04,0.041390,0.041390


## Negative Control (make_regression data)

This control runs every model on synthetic `make_regression` data to ensure that all models are working as intended.

### Training

In [14]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Synthetic regression problem for the control run. The generator is seeded from
# the notebook-level fixed_random_seed so that a rerun yields the same X and y;
# the configured value -1 means "no seed" and maps to random_state=None.
X, y = make_regression(
    n_samples=1000,
    n_features=500,
    noise=0.1,
    random_state=None if fixed_random_seed == -1 else fixed_random_seed,
)

In [15]:
# Hand the synthetic control data to the toolkit's Powerkit runner
powerkit = Powerkit(X, y)
# Integers 0..99, passed to run_selected_condition further down
# (presumably one random seed per repeat; confirm against Powerkit)
rngs = [*range(100)]

In [None]:
def pipeline_func(X_train, y_train, rng, model_used, **kwargs):
    """Fit one model on filter-scored training features.

    All columns of ``X_train`` are requested from ``f_regression_select``
    (``k`` equals the column count), the estimator named by ``model_used`` is
    created through ``get_model_from_string``, and it is fitted on the matrix
    returned by ``select_preset_features``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    rng
        Accepted as part of the calling convention; not used in this function.
    model_used
        Model name handed to ``get_model_from_string``.
    **kwargs
        Forwarded unchanged to ``get_model_from_string``.

    Returns
    -------
    dict
        Keys ``'model'`` (the fitted estimator), ``'filter_selected_features'``
        and ``'filter_scores'``.
    """
    n_columns = X_train.shape[1]
    ranked_features, filter_scores = f_regression_select(X_train, y_train, n_columns)
    estimator = get_model_from_string(model_used, **kwargs)
    # The feature list stored in the result is the one handed back by
    # select_preset_features, not the raw output of the filter step.
    kept_features, X_filtered = select_preset_features(X_train, y_train, ranked_features)
    estimator.fit(X_filtered, y_train)
    return dict(
        model=estimator,
        filter_selected_features=kept_features,
        filter_scores=filter_scores,
    )


def eval_func(X_test, y_test, pipeline_components=None, **kwargs):
    """Score a fitted pipeline on held-out data with the Pearson correlation.

    Parameters
    ----------
    X_test, y_test
        Test features and targets.
    pipeline_components
        Mapping produced by the training step; the keys ``'model'``,
        ``'filter_selected_features'`` and ``'filter_scores'`` are read.
        Despite the ``None`` default it must be supplied.
    **kwargs
        Accepted but not used.

    Returns
    -------
    dict
        ``'model_performance'`` (correlation between ``y_test`` and the
        predictions), ``'p_vals'`` (its p-value) and ``'feature_importance'``
        (tuple of the stored feature list and the stored filter scores).
    """
    kept_features = pipeline_components['filter_selected_features']
    # Apply the feature selection that was stored at training time to the test matrix
    _, X_filtered = select_preset_features(X_test, y_test, kept_features)
    predictions = pipeline_components['model'].predict(X_filtered)
    # assess performance by pearson correlation
    pearson_r, p_value = pearsonr(y_test, predictions)
    return {
        'model_performance': pearson_r,
        'p_vals': p_value,
        'feature_importance': (kept_features, pipeline_components['filter_scores']),
    }

In [None]:
# Register one Powerkit condition per model, keyed by the model name.
# All conditions share the same pipeline and evaluation functions.
for model_used in all_models:
    powerkit.add_condition(
        model_used,
        False,  # NOTE(review): positional flag, meaning defined by Powerkit.add_condition
        pipeline_func,
        {'model_used': model_used},  # keyword arguments for pipeline_func
        eval_func,
        {},  # no extra keyword arguments for eval_func
    )

In [None]:
# Run every registered condition and optionally pickle its result frame.
for model_used in all_models:
    print(f'Running {model_used}...')
    # NOTE(review): 16 and True are positional arguments whose meaning is defined
    # by Powerkit.run_selected_condition (presumably worker count and a
    # verbosity flag; confirm).
    df = powerkit.run_selected_condition(model_used, rngs, 16, True)
    if save_data:
        save_path = f'{file_save_path}{exp_id}_{model_used}_simple.pkl'
        df.to_pickle(save_path)
        # Report only after the write has succeeded, and state where the file went
        print(f'Saved {model_used} to {save_path}')

### Visualisation

## Positive Control (CCLE Only)

## Combined (CCLE + Dyn)