# Biomarkers for palbociclib - Workbook 2 June 2023 

## Import Data

In [1]:
import pandas as pd
import pickle

# import GDSC2 drug response data using pickle

with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2 = pickle.load(f)
    gdsc2_info = pickle.load(f)

# import CCLE gene expression data using pickle

with open('data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'rb') as f:
    gene_entrez = pickle.load(f)
    ccle = pickle.load(f)

# import CCLE sample info data using pickle

with open('data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', 'rb') as f:
    ccle_sample_info = pickle.load(f)

# import STRING database using pickle

with open('data/protein-interaction/STRING/string_df.pkl', 'rb') as f:
    string_df = pickle.load(f)
    string_df_info = pickle.load(f)
    string_df_alias = pickle.load(f)


# import proteomic expression
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna_processed.pkl', 'rb') as f:
    joined_full_protein_matrix = pickle.load(f)
    joined_sin_peptile_exclusion_matrix = pickle.load(f)

# import STRING database using pickle

with open('data/protein-interaction/STRING/string_df.pkl', 'rb') as f:
    string_df = pickle.load(f)
    string_df_info = pickle.load(f)
    string_df_alias = pickle.load(f)

# open STRING to goncalves mapping file

with open('data\protein-interaction\STRING\goncalve_to_string_id_df.pkl', 'rb') as f:
    goncalve_to_string_id_df = pickle.load(f)


## Palbociclib GDSC with Goncalves et al proteomics (preprocessed & normalised)

In [2]:
# create feature and target 

import DataFunctions as utils

drug_selected = 'Palbociclib'

# create the full dataset

palbociclib_proteomic_df = utils.create_joint_dataset_from_proteome_gdsc(drug_selected, joined_sin_peptile_exclusion_matrix, gdsc2)

feature_data, label_data = utils.create_feature_and_label(palbociclib_proteomic_df)



### Computing Interactors

In [3]:
# using STRING database to select the 1st,2nd and 3rd degree neighbours of the drug target

import pandas as pd

drug_targets = ['CDK4', 'CDK6']
first_degree_neighbours = []
second_degree_neighbours = []
third_degree_neighbours = []

for drug_target in drug_targets:
    string_id = utils.get_protein_id_by_name(drug_target, string_df_info, string_df_alias)
    if string_id is not None:
        first_interactors_string_id = utils.get_protein_interactors(string_id, string_df, score_threshold=900)
        for ii in first_interactors_string_id:
            interactor_name = utils.get_protein_name_by_id(ii, goncalve_to_string_id_df, 
                                                           field_name='goncalve_protein_id',
                                                           check_field_name='string_protein_id')
            if interactor_name is not None:
                first_degree_neighbours.append(interactor_name)

first_degree_neighbours = list(set(first_degree_neighbours))

print(f'first degree neighbours size: {len(first_degree_neighbours)}')
print(f'first degree neighbours: {first_degree_neighbours}')

first degree neighbours size: 43
first degree neighbours: ['Q9P2W1;HOP2_HUMAN', 'Q00534;CDK6_HUMAN', 'P06493;CDK1_HUMAN', 'P31947;1433S_HUMAN', 'P50613;CDK7_HUMAN', 'P20248;CCNA2_HUMAN', 'P00519;ABL1_HUMAN', 'Q13309;SKP2_HUMAN', 'P49918;CDN1C_HUMAN', 'P06400;RB_HUMAN', 'P49715;CEBPA_HUMAN', 'P16989;YBOX3_HUMAN', 'Q9P287;BCCIP_HUMAN', 'Q14186;TFDP1_HUMAN', 'P51948;MAT1_HUMAN', 'P14635;CCNB1_HUMAN', 'P12004;PCNA_HUMAN', 'Q16543;CDC37_HUMAN', 'Q13951;PEBB_HUMAN', 'P24385;CCND1_HUMAN', 'Q00535;CDK5_HUMAN', 'P42771;CDN2A_HUMAN', 'O75832;PSD10_HUMAN', 'O60563;CCNT1_HUMAN', 'P04637;P53_HUMAN', 'P08238;HS90B_HUMAN', 'P07900;HS90A_HUMAN', 'O95067;CCNB2_HUMAN', 'Q13485;SMAD4_HUMAN', 'P10275;ANDR_HUMAN', 'P15090;FABP4_HUMAN', 'P50750;CDK9_HUMAN', 'P42773;CDN2C_HUMAN', 'Q13547;HDAC1_HUMAN', 'P24941;CDK2_HUMAN', 'Q9BWT6;MND1_HUMAN', 'P11802;CDK4_HUMAN', 'O43502;RA51C_HUMAN', 'P07948;LYN_HUMAN', 'P49841;GSK3B_HUMAN', 'P84022;SMAD3_HUMAN', 'P51946;CCNH_HUMAN', 'P12931;SRC_HUMAN']


In [4]:
# get the second degree neighbours using first_interactors_string_id

for ii in first_interactors_string_id:
    second_interactors_string_id = utils.get_protein_interactors(ii, string_df, score_threshold=900)
    for sec_ii in second_interactors_string_id:
        interactor_name = utils.get_protein_name_by_id(sec_ii, goncalve_to_string_id_df, 
                                                       field_name='goncalve_protein_id',
                                                       check_field_name='string_protein_id')
        if interactor_name is not None:
            second_degree_neighbours.append(interactor_name)

second_degree_neighbours = list(set(second_degree_neighbours + first_degree_neighbours))
print(f'second degree neighbours size: {len(second_degree_neighbours)}')
print(f'second degree neighbours: {second_degree_neighbours}')



second degree neighbours size: 967
second degree neighbours: ['Q00534;CDK6_HUMAN', 'P49321;NASP_HUMAN', 'Q15022;SUZ12_HUMAN', 'P04049;RAF1_HUMAN', 'P11233;RALA_HUMAN', 'P62820;RAB1A_HUMAN', 'Q12873;CHD3_HUMAN', 'P63208;SKP1_HUMAN', 'P06400;RB_HUMAN', 'P40763;STAT3_HUMAN', 'P19387;RPB3_HUMAN', 'Q05193;DYN1_HUMAN', 'P16989;YBOX3_HUMAN', 'Q14161;GIT2_HUMAN', 'Q9BZK7;TBL1R_HUMAN', 'Q9UL46;PSME2_HUMAN', 'Q96L91;EP400_HUMAN', 'Q00653;NFKB2_HUMAN', 'P31946;1433B_HUMAN', 'P28340;DPOD1_HUMAN', 'O15160;RPAC1_HUMAN', 'P42356;PI4KA_HUMAN', 'Q04726;TLE3_HUMAN', 'P50991;TCPD_HUMAN', 'Q96T76;MMS19_HUMAN', 'P02452;CO1A1_HUMAN', 'P63096;GNAI1_HUMAN', 'Q9NRG9;AAAS_HUMAN', 'Q9P1U0;RPA12_HUMAN', 'Q15008;PSMD6_HUMAN', 'Q13042;CDC16_HUMAN', 'Q96EB6;SIR1_HUMAN', 'Q96HR3;MED30_HUMAN', 'Q13469;NFAC2_HUMAN', 'Q13191;CBLB_HUMAN', 'Q14997;PSME4_HUMAN', 'P62191;PRS4_HUMAN', 'Q9Y230;RUVB2_HUMAN', 'Q00688;FKBP3_HUMAN', 'Q13761;RUNX3_HUMAN', 'Q16531;DDB1_HUMAN', 'P49642;PRI1_HUMAN', 'P28070;PSB4_HUMAN', 'P55786;PSA_H

In [5]:
# get the third degree neighbours using second_interactors_string_id

for ii in second_interactors_string_id:
    third_interactors_string_id = utils.get_protein_interactors(ii, string_df, score_threshold=900)
    for third_ii in third_interactors_string_id:
        interactor_name = utils.get_protein_name_by_id(third_ii, goncalve_to_string_id_df, 
                                                       field_name='goncalve_protein_id',
                                                       check_field_name='string_protein_id')
        if interactor_name is not None:
            third_degree_neighbours.append(interactor_name)

third_degree_neighbours = list(set(third_degree_neighbours + second_degree_neighbours))
print(f'third degree neighbours size: {len(third_degree_neighbours)}')
print(f'third degree neighbours: {third_degree_neighbours}')

third degree neighbours size: 993
third degree neighbours: ['Q00534;CDK6_HUMAN', 'P49321;NASP_HUMAN', 'P04049;RAF1_HUMAN', 'Q15022;SUZ12_HUMAN', 'P11233;RALA_HUMAN', 'P62820;RAB1A_HUMAN', 'Q12873;CHD3_HUMAN', 'P63208;SKP1_HUMAN', 'P06400;RB_HUMAN', 'P40763;STAT3_HUMAN', 'Q05193;DYN1_HUMAN', 'P10451;OSTP_HUMAN', 'Q9BZK7;TBL1R_HUMAN', 'Q14161;GIT2_HUMAN', 'P19387;RPB3_HUMAN', 'Q9UL46;PSME2_HUMAN', 'Q00653;NFKB2_HUMAN', 'P16989;YBOX3_HUMAN', 'P31946;1433B_HUMAN', 'P28340;DPOD1_HUMAN', 'Q96L91;EP400_HUMAN', 'O15160;RPAC1_HUMAN', 'P42356;PI4KA_HUMAN', 'Q04726;TLE3_HUMAN', 'P50991;TCPD_HUMAN', 'Q96T76;MMS19_HUMAN', 'P02452;CO1A1_HUMAN', 'P63096;GNAI1_HUMAN', 'Q9NRG9;AAAS_HUMAN', 'Q9P1U0;RPA12_HUMAN', 'Q15008;PSMD6_HUMAN', 'Q13042;CDC16_HUMAN', 'Q96EB6;SIR1_HUMAN', 'Q96HR3;MED30_HUMAN', 'Q13469;NFAC2_HUMAN', 'Q13191;CBLB_HUMAN', 'Q14997;PSME4_HUMAN', 'P62191;PRS4_HUMAN', 'P15291;B4GT1_HUMAN', 'Q9Y230;RUVB2_HUMAN', 'Q00688;FKBP3_HUMAN', 'Q13761;RUNX3_HUMAN', 'Q16531;DDB1_HUMAN', 'P49642;PRI1_H

In [6]:
# verify a list is unique

def verify_unique_list(l):
    return len(l) == len(set(l))

# find duplicates in the list

def find_duplicates(l):
    return list(set([x for x in l if l.count(x) > 1]))

print(f'first degree neighbours is unique: {verify_unique_list(first_degree_neighbours)}')
print(f'second degree neighbours is unique: {verify_unique_list(second_degree_neighbours)}')
print(f'third degree neighbours is unique: {verify_unique_list(third_degree_neighbours)}')

# print the duplicates in first degree neighbours

print(f'duplicates in first degree neighbours: {find_duplicates(first_degree_neighbours)}')
print(f'duplicates in second degree neighbours: {find_duplicates(second_degree_neighbours)}')
print(f'duplicates in third degree neighbours: {find_duplicates(third_degree_neighbours)}')

first degree neighbours is unique: True
second degree neighbours is unique: True
third degree neighbours is unique: True
duplicates in first degree neighbours: []
duplicates in second degree neighbours: []
duplicates in third degree neighbours: []


### Validation Framework Implementation

#### Initial Parameters

In [24]:
import Visualisation as vis
import matplotlib.pyplot as plt

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# import random forest regression model
from sklearn.ensemble import RandomForestRegressor

# import support vector machine regression model
from sklearn.svm import SVR

# import elastic net regression model
from sklearn.linear_model import ElasticNet

# import simple mlp regression model
from sklearn.neural_network import MLPRegressor

# import xgb regression model
from xgboost import XGBRegressor

# import k nearest neighbors regression model
from sklearn.neighbors import KNeighborsRegressor

## feature selection
# import feature selection
from sklearn.feature_selection import SelectKBest, f_regression
import shap 

## validation
from sklearn.metrics import r2_score
from scipy.stats import pearsonr


## INPUTS 
# hyperparameters
max_gene_target_disance = 2
statistical_filter_size = 100
monte_carlo_cross_validation_size = 3
models_used = ['ElasticNet', 'RandomForestRegressor']
models_hyperparameters = [{'alpha': 1, 'l1_ratio': 0.5}, {'n_estimators': 100}]

# extra hyperparameters
statistical_filter_threshold = 0.05 
cv_split_size = 0.1
input_parameter_file_path = 'input_parameters.pkl'
output_file_path = 'results.pkl'


# generated hyperparameters
rng_seed_lists = []
for i in range(monte_carlo_cross_validation_size):
    rng_seed_lists.append(np.random.randint(100000))

def get_model_from_string(model_name, **kwargs):
    if model_name == 'ElasticNet':
        return ElasticNet(**kwargs)
    elif model_name == 'RandomForestRegressor':
        return RandomForestRegressor(**kwargs)
    elif model_name == 'SVR':
        return SVR(**kwargs)
    elif model_name == 'MLPRegressor':
        return MLPRegressor(**kwargs)
    elif model_name == 'XGBRegressor':
        return XGBRegressor(**kwargs)
    elif model_name == 'KNeighborsRegressor':
        return KNeighborsRegressor(**kwargs)
    else:
        raise ValueError(f'{model_name} is not supported')
    
nth_degree_neighbours = [drug_targets, first_degree_neighbours, second_degree_neighbours, third_degree_neighbours]

In [25]:
network_features = nth_degree_neighbours[max_gene_target_disance]
X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=cv_split_size,
                                                    random_state=rng_seed_lists[0])

if statistical_filter_size > len(network_features):
    statistical_filter_size = len(network_features)
    print(f'WARNING: statistical_filter_size is too large, set to {statistical_filter_size}')

# perform feature selection on the training set
selector = SelectKBest(f_regression, k=statistical_filter_size)
selector.fit(X_train[network_features], y_train)

# get the selected features
selected_features = X_train[network_features].columns[selector.get_support()]

# get the feature importance
feature_importance = selector.scores_[selector.get_support()]

# DEBUG print the selected features and their importance
# print(f'selected features: {selected_features}')
# print(f'feature importance: {feature_importance}')

In [26]:

verbose = True
perform_whole_dataset_control = True
perform_random_control = True
get_feature_importance_for_whole_dataset_control = False
get_feature_importance_for_random_control = False
run_whole_dataset_control_only = False
run_random_control_only = False
data_collector = []

for model_str in models_used:
    for rng in rng_seed_lists:
        X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=cv_split_size,
                                                            random_state=rng)
    
        if not run_whole_dataset_control_only and not run_random_control_only:
            if verbose:
                print(f'running {model_str} with seed {rng} under experimental conditions')
            network_features = nth_degree_neighbours[max_gene_target_disance]
            # perform feature selection on the training set
            selector = SelectKBest(f_regression, k=statistical_filter_size)
            selector.fit(X_train[network_features], y_train)
            # get the selected features
            selected_features = X_train[network_features].columns[selector.get_support()]
            sel_train, sel_test = X_train[selected_features], X_test[selected_features]
            model = get_model_from_string(model_str, **models_hyperparameters[models_used.index(model_str)])
            model.fit(sel_train, y_train)
            y_pred = model.predict(sel_test)
            score = mean_squared_error(y_test, y_pred)
            corr, p_val = pearsonr(y_test, y_pred)
            r_squared = r2_score(y_test, y_pred)

            if model_str == 'RandomForestRegressor':
                explainer = shap.TreeExplainer(model, sel_train)
            elif model_str == 'ElasticNet':
                explainer = shap.LinearExplainer(model, sel_train)
            else: 
                explainer = shap.KernelExplainer(model.predict, sel_train)
            shap_values = explainer.shap_values(sel_test)

            if verbose:
                print(f'--- result: prediction correlation: {corr}, r-squared: {r_squared}')

            data_collector.append([rng, model_str, 'experimental',
                                score, corr, p_val, r_squared, 
                                shap_values, sel_train, sel_test, 
                                y_test, y_pred])
        
        if not run_whole_dataset_control_only and perform_whole_dataset_control:
            if verbose:
                print(f'running {model_str} with seed {rng} under whole dataset control conditions')
            whole_dataset_control_model = get_model_from_string(model_str, **models_hyperparameters[models_used.index(model_str)])
            whole_dataset_control_model.fit(X_train, y_train)
            y_pred = whole_dataset_control_model.predict(X_test)
            score = mean_squared_error(y_test, y_pred)
            corr, p_val = pearsonr(y_test, y_pred)
            r_squared = r2_score(y_test, y_pred)

            shap_values = None

            if get_feature_importance_for_whole_dataset_control:
                if model_str == 'RandomForestRegressor':
                    explainer = shap.TreeExplainer(whole_dataset_control_model, X_train)
                elif model_str == 'ElasticNet':
                    explainer = shap.LinearExplainer(whole_dataset_control_model, X_train)
                else: 
                    explainer = shap.KernelExplainer(whole_dataset_control_model.predict, X_train)
                shap_values = explainer.shap_values(X_test)

            if verbose:
                print(f'--- result: prediction correlation: {corr}, r-squared: {r_squared}')

            data_collector.append([rng, model_str, 'whole_dataset_control',
                                score, corr, p_val, r_squared,
                                shap_values, X_train, X_test,
                                y_test, y_pred])
            
        if not run_random_control_only and perform_random_control:
            if verbose:
                print(f'running {model_str} with seed {rng} under random control conditions')
            random_control_model = get_model_from_string(model_str, **models_hyperparameters[models_used.index(model_str)])
            # select random features of size statistical_filter_size
            random_features = np.random.choice(feature_data.columns, statistical_filter_size, replace=False)
            sel_train, sel_test = X_train[random_features], X_test[random_features]
            random_control_model.fit(sel_train, y_train)
            y_pred = random_control_model.predict(sel_test)
            score = mean_squared_error(y_test, y_pred)
            corr, p_val = pearsonr(y_test, y_pred)
            r_squared = r2_score(y_test, y_pred)

            shap_values = None

            if get_feature_importance_for_random_control:
                if model_str == 'RandomForestRegressor':
                    explainer = shap.TreeExplainer(random_control_model, sel_train)
                elif model_str == 'ElasticNet':
                    explainer = shap.LinearExplainer(random_control_model, sel_train)
                else: 
                    explainer = shap.KernelExplainer(random_control_model.predict, sel_train)
                shap_values = explainer.shap_values(sel_test)
            
            if verbose:
                print(f'--- result: prediction correlation: {corr}, r-squared: {r_squared}')
            
            data_collector.append([rng, model_str, 'random_control',
                                score, corr, p_val, r_squared,
                                shap_values, sel_train, sel_test,
                                y_test, y_pred])
            
df = pd.DataFrame(data_collector, columns=['rng', 'model', 'exp_condition', 'mse', 'corr', 'p_val', 'r_squared', 'shap_values', 'X_train', 'X_test', 'y_test', 'y_pred'])
df.head()

running ElasticNet with seed 25813 under experimental conditions
--- result: prediction correlation: 0.5601003974727523, r-squared: 0.2753858032585259
running ElasticNet with seed 25813 under whole dataset control conditions
--- result: prediction correlation: 0.5548203787797128, r-squared: 0.2603492949667393
running ElasticNet with seed 25813 under random control conditions
--- result: prediction correlation: 0.49595304397937107, r-squared: 0.19021294583795467
running ElasticNet with seed 16251 under experimental conditions
--- result: prediction correlation: 0.7165530913303219, r-squared: 0.435149517303831
running ElasticNet with seed 16251 under whole dataset control conditions
--- result: prediction correlation: 0.7253907816643994, r-squared: 0.48139184221673537
running ElasticNet with seed 16251 under random control conditions
--- result: prediction correlation: 0.5186305791885967, r-squared: 0.16999890934832296
running ElasticNet with seed 68672 under experimental conditions
--- 