## Initialise Repository

In [1]:
import os

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Users\dawson\Documents\GitHub\ode-biomarker-project


In [2]:
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')

In [3]:
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [4]:
# loading packages 

from tqdm import tqdm
from toolkit import *

# load folder specific python files 
from sklearn.metrics import mean_squared_error, r2_score
def pipeline_tree_methods(X_train, 
                          y_train, 
                          rng, 
                          model_used, 
                          model_extra_args, 
                          pre_filter=True,
                          pre_filter_size=1000,
                          **kwargs):
    
    # RandomForestRegressor or XGBRegressor at the moment 
    if model_used != 'RandomForestRegressor' and model_used != 'XGBRegressor':
        raise ValueError(f'Model not supported for pipeline_tree_methods, use RandomForestRegressor or XGBRegressor, current model_used param is: {model_used}')
    
    # perform feature selection if pre_filter is True
    if pre_filter:
        selected_features, scores = f_regression_select(X_train, y_train, pre_filter_size)
        _, X_selected = select_preset_features(X_train, y_train, selected_features)
    else:
        X_selected = X_train
    model = get_model_from_string(model_used, **model_extra_args)
    model.fit(X_selected, y_train)
    return {'model': model, 
            'model_type': model_used,
            'train_data': X_train,
            'pre_filter': pre_filter,
            'filtered_features': selected_features if pre_filter else None,
            }


def shap_eval_func(X_test, y_test, pipeline_components=None, **kwargs):
    
    '''
    evaluate the performance of a pipeline through pearson correlation, r2, mse, and 
    feature importance scores using mean absolute SHAP values 
    inputs
        X_test: test set features
        y_test: test set labels
        pipeline_components: dictionary of pipeline components, e.g. {'model': model, 'selected_features': selected_features, 'scores': scores}
    '''
    
    ## evaluation of model performance using test set
    X_test, y_test = transform_impute_by_zero_to_min_uniform(X_test, y_test)
    if pipeline_components['filtered_features'] is None: 
        X_selected = X_test
    else:
        _, X_selected = select_preset_features(X_test, y_test, pipeline_components['filtered_features'])
    y_pred = pipeline_components['model'].predict(X_selected)
    # assess performance by pearson correlation
    corr, p_vals = pearsonr(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    ## obtaining SHAP values for each feature, mean absolute SHAP values will 
    ## be used as a way to compute feature importance scores 
    shap_values = get_shap_values(pipeline_components['model'], 
                                  pipeline_components['model_type'],
                                  pipeline_components['train_data'], 
                                  X_selected)
    mean_shap_values = np.abs(shap_values).mean(axis=0)
    ## returning key metrics and results 
    features, scores = X_selected.columns.tolist(), mean_shap_values.tolist()
    # at the end, return a dictionary of all the information you want to return
    return {'model_used': pipeline_components['model_type'],
            'prediction_target': 'cell_LNIC50',
            'model_performance': corr, 
            'pearson_p_vals': p_vals, 
            'r_squared': r2,
            'mse': mse,
            'feature_importance': (features, scores),
            'important_features': features, 
            'feature_scores': scores,
            'y_pred': y_pred, # for plotting purposes
            'y_test': y_test, 
            }

In [5]:
# test data\\proteomic-expression\\ccle-2019-cell\\ccle_proteomics_processed.pkl for access 
data = data_link.get_data_from_code('ccle_protein_expression')


## Retriving Data 

Create a list that stores all drug names 

In [None]:
gdsc = data_link.get_data_from_code('gdsc1')
# Select column 'DRUG_NAME' and make it unique by using set()
all_drug_names = list(set(gdsc['DRUG_NAME']))
all_drug_names

['Ipatasertib',
 'AZD6482',
 'IAP_5620',
 'BMS-345541',
 'KU-55933',
 'Alisertib',
 'VSP34_8731',
 'Nelarabine',
 'Cytarabine',
 'AZD6738',
 'Staurosporine',
 'Leflunomide',
 'AGI-5198',
 'Ibrutinib',
 'PD0325901',
 'Picolinici-acid',
 'MK-2206',
 'I-BET-762',
 'JAK1_8709',
 'GSK2606414',
 'Zoledronate',
 'OSI-027',
 'AZD8186',
 'Docetaxel',
 'MK-1775',
 'Gallibiscoquinazole',
 'Dactolisib',
 'Elephantin',
 'Foretinib',
 'Crizotinib',
 'Temozolomide',
 'PAK_5339',
 'Ulixertinib',
 'Camptothecin',
 'ERK_2440',
 'AZD5991',
 'ML323',
 'UMI-77',
 'LCL161',
 'XAV939',
 'GSK1904529A',
 'PRIMA-1MET',
 'MN-64',
 'Topotecan',
 'AMG-319',
 'Alpelisib',
 'GDC0810',
 'LY2109761',
 'KRAS (G12C) Inhibitor-12',
 'RO-3306',
 'EPZ5676',
 'EPZ004777',
 'OTX015',
 'Obatoclax Mesylate',
 'MK-8776',
 'Vinorelbine',
 'VE-822',
 'Cyclophosphamide',
 'TAF1_5496',
 'VX-11e',
 'Sepantronium bromide',
 'CDK9_5038',
 'GSK2578215A',
 'Wee1 Inhibitor',
 'Taselisib',
 'Erlotinib',
 'PCI-34051',
 'PD173074',
 'Dorama

In [None]:
# load in dynamic features data 

available_drugs = []
for drug_name in tqdm(all_drug_names):
    if '-' in drug_name:
        continue
    # print(drug_name)
    loading_code = f'generic-gdsc-1-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    # generic-gdsc-{number}-{drug_name}-{target_label}-{dataset_name}-{replace_index}-{row_index}
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    # print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    # if the feature data is not empty, append the drug name to the available_drugs list
    if feature_data.shape[0] > 0 and label_data.shape[0] > 0:
        available_drugs.append(drug_name)
        
available_drugs

 38%|███▊      | 73/192 [00:14<00:20,  5.72it/s]

In [None]:
len(available_drugs)

## Create Streamline Functions

In [None]:
folder_name = 'CANISRDatabase'

if not os.path.exists(f'{path_loader.get_data_path()}data/results/{folder_name}'):
    os.makedirs(f'{path_loader.get_data_path()}data/results/{folder_name}')

file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'

In [None]:
def pipeline_func(X_train, y_train, rng, model_used, **kwargs):
    k = X_train.shape[1]
    selected_features, scores = f_regression_select(X_train, y_train, int(k/2))
    model = get_model_from_string(model_used, **kwargs)
    selected_features, X_selected = select_preset_features(X_train, y_train, selected_features)
    model.fit(X_selected, y_train)
    return {'model': model,
            'filter_selected_features': selected_features,
            'filter_scores': scores}


def eval_func(X_test, y_test, pipeline_components=None, **kwargs):
    selected_features, X_selected = select_preset_features(X_test, y_test, pipeline_components['filter_selected_features'])
    y_pred = pipeline_components['model'].predict(X_selected)
    # assess performance by pearson correlation
    corr, p_vals = pearsonr(y_test, y_pred)
    feat_imp = (pipeline_components['filter_selected_features'], pipeline_components['filter_scores'])
    return {'model_performance': corr, 'p_vals': p_vals, 'feature_importance': feat_imp}

In [None]:
def run_drug(drug_name, rng): 
    loading_code = f'generic-gdsc-2-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    
    ### Extra Preprocessing Steps 
    # ensure all feature column names are strings
    feature_data.columns = [str(col) for col in feature_data.columns]
    # remove Nan values from the feature data
    feature_data = feature_data.dropna(axis=1)
    # ensure all column names are unique by dropping duplicates
    feature_data = feature_data.loc[:,~feature_data.columns.duplicated()]
    print(f'Feature Shape after preprocessing and dropping duplicates {feature_data.shape}')
    powerkit = Powerkit(feature_data, label_data)
    powerkit.add_condition(drug_name, True, pipeline_func, {'model_used': 'XGBRegressor'}, eval_func, {})
    df = powerkit.run_selected_condition(drug_name, [rng], 1, True)
    return df 

In [None]:
drugs = ['Alisertib', 'Palbociclib']
all_dfs = []
for drug in drugs:
    df = run_drug(drug, 0)
    all_dfs.append(df)

In [None]:
# combine all dataframes into one dataframe
df = pd.concat(all_dfs)
# reset index
df = df.reset_index(drop=True)

In [None]:
df

In [None]:
# convert feature importance to a dataframe
feature_importance = df['feature_importance'][0].iloc[0]


In [None]:
feature_importance_df = pd.DataFrame({'Feature': feature_importance[0], 'Score': feature_importance[1]})
# set the value to be the absolute value of the score column
feature_importance_df['Score'] = abs(feature_importance_df['Score'])
# sort the dataframe by the score column in descending order
feature_importance_df = feature_importance_df.sort_values(by='Score', ascending=False)
feature_importance_df

### Streamline

In [None]:
# ccle_protein_expression-true-Cell_Line

sample_kwargs = {
    'drugs': available_drugs,
    'data_link': data_link,
    'drug_database': 'gdsc-2',
    'feature_database_string': 'ccle_protein_expression-true-Cell_Line',
    'target_name': 'LN_IC50',
    'pipeline': pipeline_func,
    'pipeline_args': {'model_used': 'RandomForestRegressor', 'model_extra_args': {}},
    'evaluation_func': shap_eval_func,
    'evaluation_args': {},
    'experiment_id': 'test_sample_10',
    'random_seeds': [i for i in range(10)],
    'n_cores': 1,
}

In [None]:
def run_drugs(drug, **kwargs):
    # breakdown the kwargs
    drug_name = drug
    data_link = kwargs['data_link']
    drug_database = kwargs['drug_database']
    feature_database_string = kwargs['feature_database_string']
    target_name = kwargs['target_name']
    pipeline = kwargs['pipeline']
    pipeline_args = kwargs['pipeline_args']
    evaluation_func = kwargs['evaluation_func']
    evaluation_args = kwargs['evaluation_args']
    random_seeds = kwargs['random_seeds']
    n_cores = kwargs['n_cores']
    
    loading_code = f'generic-{drug_database}-{drug_name}-{target_name}-{feature_database_string}'
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    
    ### Extra Preprocessing Steps 
    # ensure all feature column names are strings
    feature_data.columns = [str(col) for col in feature_data.columns]
    # remove Nan values from the feature data
    feature_data = feature_data.dropna(axis=1)
    # ensure all column names are unique by dropping duplicates
    feature_data = feature_data.loc[:,~feature_data.columns.duplicated()]
    print(f'Feature Shape after preprocessing and dropping duplicates {feature_data.shape}')
    powerkit = Powerkit(feature_data, label_data)
    powerkit.add_condition(drug_name, True, pipeline, pipeline_args, evaluation_func, evaluation_args)
    df = powerkit.run_selected_condition(drug_name, random_seeds, n_cores, True)
    return df 


def run_all_drugs(**kwargs):
    drugs = kwargs['drugs']
    all_dfs = []
    for drug in drugs:
        df = run_drugs(drug, **kwargs)
        all_dfs.append(df)
    # combine all dataframes into one dataframe
    df = pd.concat(all_dfs)
    # reset index
    df = df.reset_index(drop=True)
    return df

In [None]:
df = run_all_drugs(**sample_kwargs) 

In [None]:
df

### Connect to Database and upload data

In [None]:
from sqlalchemy import create_engine

# load .env file to get the current
engine = create_engine(
    "postgresql+pg8000://canisr:canisr@192.168.3.106:9080/db"
)

In [None]:
from sqlalchemy.dialects.postgresql import ARRAY, TEXT, INTEGER, FLOAT

# write df to database

df.to_sql('test_sample_10', engine, if_exists='replace', index=False, dtype={
    'important_features': ARRAY(TEXT),
    'feature_scores': ARRAY(FLOAT), 
    'y_pred': ARRAY(FLOAT), 
    'y_test': ARRAY(FLOAT),
})

In [None]:
print(engine)

In [None]:
from sqlalchemy import text 

# select data from database and load into a dataframe to check if the data was written correctly
# Establish a connection
with engine.connect() as connection:
    query = text('SELECT * FROM test_sample_10')
    df = pd.read_sql_query(query, connection)
    
df

In [None]:
# extract 'importance_features' and 'feature_scores' columns from the dataframe and plot them as bar chart

feature_importance = df['important_features'][0]
feature_scores = df['feature_scores'][0]

feature_importance_df = pd.DataFrame({'Feature': feature_importance, 'Score': feature_scores})
# set the value to be the absolute value of the score column

feature_importance_df['Score'] = abs(feature_importance_df['Score'])
# sort the dataframe by the score column in descending order
feature_importance_df = feature_importance_df.sort_values(by='Score', ascending=False)

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context('talk')
sns.set_style('whitegrid')

plt.figure(figsize=(20, 10))
sns.barplot(x='Score', y='Feature', data=feature_importance_df[:10])
plt.title('Feature Importance')
plt.show()

In [None]:
# plot y_pred and y_test columns from the dataframe

y_pred = df['y_pred'][0]
y_test = df['y_test'][0]

# make into np arrays
y_pred = np.array(y_pred)
y_test = np.array(y_test)

plt.figure(figsize=(20, 10))
plt.scatter(y_test, y_pred)
# show a trend line
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.plot(y_test, np.poly1d(np.polyfit(y_test, y_pred, 1))(y_test), color='red', lw=2)
plt.title('Predictions vs True Values')
# also show the correlation coefficient
plt.text(0, 6, f'Pearson Correlation: {df["model_performance"][0]:.2f}', fontsize=24, ha='center')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()


### Streamline 2

In [None]:
def pipeline_func(X_train, y_train, rng, model_used, **kwargs):
    k = 100 # have to minimize the number of features to 100 for the model to work with kernelexplainer 
    selected_features, scores = f_regression_select(X_train, y_train, k)
    model = get_model_from_string(model_used, **kwargs)
    selected_features, X_selected = select_preset_features(X_train, y_train, selected_features)
    model.fit(X_selected, y_train)
    return {'model': model,
            'model_type': model_used,
            'train_data': X_selected,
            'filtered_features': selected_features,
            'filtered_scores': scores}

folder_name = 'CANISRDatabase'
if not os.path.exists(f'{path_loader.get_data_path()}data/results/{folder_name}'):
    os.makedirs(f'{path_loader.get_data_path()}data/results/{folder_name}')

file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'


In [None]:
all_models = ['RandomForestRegressor', 'XGBRegressor', 'MLPRegressor', 'KNeighborsRegressor', 'ElasticNet', 'LinearRegression', 'SVR']

In [None]:
streamline2_kwargs = {
    'drugs': available_drugs, # FIXME: Change this to available_drugs on nml
    'models_used': all_models, # FIXME: Change this to all_models on nml
    'data_link': data_link,
    'drug_database': 'gdsc-1',
    'feature_database_string': 'ccle_protein_expression-true-Cell_Line',
    'target_name': 'LN_IC50',
    'pipeline': pipeline_func,
    'pipeline_args': {'model_used': 'RandomForestRegressor'},
    'evaluation_func': shap_eval_func,
    'evaluation_args': {},
    'experiment_id': 'drug_response_models',
    'random_seeds': [i for i in range(10)], # FIXME: Change this to 10 on nml
    'n_cores': -1,
}

In [None]:
def run_drugs_with_model(drug, model_used, **kwargs):
    # breakdown the kwargs
    drug_name = drug
    data_link = kwargs['data_link']
    drug_database = kwargs['drug_database']
    feature_database_string = kwargs['feature_database_string']
    target_name = kwargs['target_name']
    pipeline = kwargs['pipeline']
    evaluation_func = kwargs['evaluation_func']
    evaluation_args = kwargs['evaluation_args']
    random_seeds = kwargs['random_seeds']
    n_cores = kwargs['n_cores']
    
    loading_code = f'generic-{drug_database}-{drug_name}-{target_name}-{feature_database_string}'
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    # print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    
    ### Extra Preprocessing Steps 
    # ensure all feature column names are strings
    feature_data.columns = [str(col) for col in feature_data.columns]
    # remove Nan values from the feature data
    feature_data = feature_data.dropna(axis=1)
    # ensure all column names are unique by dropping duplicates
    feature_data = feature_data.loc[:,~feature_data.columns.duplicated()]
    # print(f'Feature Shape after preprocessing and dropping duplicates {feature_data.shape}')
    powerkit = Powerkit(feature_data, label_data)
    pipeline_args = {'model_used': model_used}
    powerkit.add_condition(drug_name, True, pipeline, pipeline_args, evaluation_func, evaluation_args)
    df = powerkit.run_selected_condition(drug_name, random_seeds, n_cores, True)
    return df 

import tqdm 
import itertools
from tqdm.notebook import trange, tqdm

def run_all_drugs_2(**kwargs):
    drugs = kwargs['drugs']
    all_model_used = kwargs['models_used']
    # all_dfs = []
    # for drug in drugs:
    #     for model_used in all_model_used:
    #         df = run_drugs_with_model(drug, model_used, **kwargs)
    #         all_dfs.append(df)
    # use tqdm to show progress bar
    all_dfs = []
    # zip the drugs and models together
    drug_model_pairs = list(itertools.product(drugs, all_model_used))
    print(f'Running {len(drug_model_pairs)} drug-model pairs')
    for drug, model_used in tqdm(drug_model_pairs, desc="Running drug-model pairs"):
        # print(f'Running drug {drug} with model {model_used}')
        df = run_drugs_with_model(drug, model_used, **kwargs)
        all_dfs.append(df)         
    # combine all dataframes into one dataframe
    df = pd.concat(all_dfs)
    # reset index
    df = df.reset_index(drop=True)
    return df

In [None]:
df = run_all_drugs_2(**streamline2_kwargs) 

In [None]:
# change the column names to be more readable
df = df.rename(columns={'condition': 'drugname'})

In [None]:
# add a column called 'source_id' which will be a unique id for the entire cohort 
source_id = "goncalves_proteomics_gdsc1_shap_v1"
# make all the source_id the same
df['source_id'] = source_id

In [None]:
df

In [None]:
# extract 'importance_features' and 'feature_scores' columns from the dataframe and plot them as bar chart

feature_importance = df['important_features'][0]
feature_scores = df['feature_scores'][0]

feature_importance_df = pd.DataFrame({'Feature': feature_importance, 'Score': feature_scores})
# set the value to be the absolute value of the score column

feature_importance_df['Score'] = abs(feature_importance_df['Score'])
# sort the dataframe by the score column in descending order
feature_importance_df = feature_importance_df.sort_values(by='Score', ascending=False)

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context('talk')
sns.set_style('whitegrid')

plt.figure(figsize=(20, 10))
sns.barplot(x='Score', y='Feature', data=feature_importance_df[:10])
plt.title('Feature Importance')
plt.show()

In [None]:
# plot y_pred and y_test columns from the dataframe

y_pred = df['y_pred'][0]
y_test = df['y_test'][0]

# make into np arrays
y_pred = np.array(y_pred)
y_test = np.array(y_test)

plt.figure(figsize=(20, 10))
plt.scatter(y_test, y_pred)
# show a trend line
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.plot(y_test, np.poly1d(np.polyfit(y_test, y_pred, 1))(y_test), color='red', lw=2)
plt.title('Predictions vs True Values')
# also show the correlation coefficient
plt.text(0, 6, f'Pearson Correlation: {df["model_performance"][0]:.2f}', fontsize=24, ha='center')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()


In [None]:
# save df to the folder as a pickle file
df.to_pickle(f'{file_save_path}drug_response_models.pkl')
print(f'Data saved to {file_save_path}drug_response_models.pkl')