## Initialise Repository

In [1]:
import os

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')

In [3]:
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [4]:
# loading packages 

from tqdm import tqdm
from toolkit import *

# load folder specific python files 

from functions import *

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Retriving Data 

Create a list that stores all drug names 

In [5]:
gdsc = data_link.get_data_from_code('gdsc2')
# Select column 'DRUG_NAME' and make it unique by using set()
all_drug_names = list(set(gdsc['DRUG_NAME']))
all_drug_names

['Osimertinib',
 'Mirin',
 'MN-64',
 'NU7441',
 'CDK9_5576',
 'Erlotinib',
 'Gallibiscoquinazole',
 'EPZ004777',
 'AZD8186',
 'Doramapimod',
 'Taselisib',
 'XAV939',
 'Ulixertinib',
 'BMS-345541',
 'Cyclophosphamide',
 'P22077',
 'JQ1',
 'MK-8776',
 'Foretinib',
 'GSK1904529A',
 'Cytarabine',
 'YK-4-279',
 'Olaparib',
 'Buparlisib',
 'Topotecan',
 'OTX015',
 'PLX-4720',
 'GSK343',
 'AZD2014',
 'WZ4003',
 'Dabrafenib',
 'Temozolomide',
 'Sepantronium bromide',
 'AZD7762',
 'PRIMA-1MET',
 'Afuresertib',
 'AZD5363',
 'PRT062607',
 'Palbociclib',
 'AZD5991',
 'Acetalax',
 'MG-132',
 'Wnt-C59',
 'MK-2206',
 'PCI-34051',
 'Ruxolitinib',
 'Picolinici-acid',
 'ERK_2440',
 'Pevonedistat',
 'Vinorelbine',
 'Obatoclax Mesylate',
 'Dihydrorotenone',
 'LY2109761',
 'Uprosertib',
 'Ribociclib',
 'Docetaxel',
 'Staurosporine',
 'BI-2536',
 'IWP-2',
 'VE-822',
 'OF-1',
 'AZD1208',
 'Fulvestrant',
 'Vinblastine',
 'GSK2578215A',
 'GSK269962A',
 'RO-3306',
 'Telomerase Inhibitor IX',
 'SCH772984',
 'Gef

In [6]:
# load in dynamic features data 

available_drugs = []
for drug_name in tqdm(all_drug_names):
    if '-' in drug_name:
        continue
    # print(drug_name)
    loading_code = f'generic-gdsc-2-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    # generic-gdsc-{number}-{drug_name}-{target_label}-{dataset_name}-{replace_index}-{row_index}
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    # print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    # if the feature data is not empty, append the drug name to the available_drugs list
    if feature_data.shape[0] > 0 and label_data.shape[0] > 0:
        available_drugs.append(drug_name)
        
available_drugs

100%|██████████| 192/192 [00:13<00:00, 14.68it/s]


['Osimertinib',
 'Mirin',
 'NU7441',
 'CDK9_5576',
 'Erlotinib',
 'Gallibiscoquinazole',
 'EPZ004777',
 'AZD8186',
 'Doramapimod',
 'Taselisib',
 'XAV939',
 'Ulixertinib',
 'Cyclophosphamide',
 'P22077',
 'JQ1',
 'Foretinib',
 'GSK1904529A',
 'Cytarabine',
 'Olaparib',
 'Buparlisib',
 'Topotecan',
 'OTX015',
 'GSK343',
 'AZD2014',
 'WZ4003',
 'Dabrafenib',
 'Temozolomide',
 'Sepantronium bromide',
 'AZD7762',
 'Afuresertib',
 'AZD5363',
 'PRT062607',
 'Palbociclib',
 'AZD5991',
 'Acetalax',
 'Ruxolitinib',
 'ERK_2440',
 'Pevonedistat',
 'Vinorelbine',
 'Obatoclax Mesylate',
 'Dihydrorotenone',
 'LY2109761',
 'Uprosertib',
 'Ribociclib',
 'Docetaxel',
 'Staurosporine',
 'AZD1208',
 'Fulvestrant',
 'Vinblastine',
 'GSK2578215A',
 'GSK269962A',
 'Telomerase Inhibitor IX',
 'SCH772984',
 'Gefitinib',
 'Selumetinib',
 'Alisertib',
 'PFI3',
 'AZD6738',
 'ERK_6604',
 'Irinotecan',
 'PAK_5339',
 'Elephantin',
 'VE821',
 'Crizotinib',
 'EPZ5676',
 'GDC0810',
 'AZD5438',
 'AZD4547',
 'AZD5582',


In [10]:
len(available_drugs)

152

## Create Streamline Functions

In [68]:
folder_name = 'CANISRDatabase'

if not os.path.exists(f'{path_loader.get_data_path()}data/results/{folder_name}'):
    os.makedirs(f'{path_loader.get_data_path()}data/results/{folder_name}')

file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'

In [69]:
def pipeline_func(X_train, y_train, rng, model_used, **kwargs):
    k = X_train.shape[1]
    selected_features, scores = f_regression_select(X_train, y_train, k)
    model = get_model_from_string(model_used, **kwargs)
    selected_features, X_selected = select_preset_features(X_train, y_train, selected_features)
    model.fit(X_selected, y_train)
    return {'model': model,
            'filter_selected_features': selected_features,
            'filter_scores': scores}


def eval_func(X_test, y_test, pipeline_components=None, **kwargs):
    selected_features, X_selected = select_preset_features(X_test, y_test, pipeline_components['filter_selected_features'])
    y_pred = pipeline_components['model'].predict(X_selected)
    # assess performance by pearson correlation
    corr, p_vals = pearsonr(y_test, y_pred)
    feat_imp = (pipeline_components['filter_selected_features'], pipeline_components['filter_scores'])
    return {'model_performance': corr, 'p_vals': p_vals, 'feature_importance': feat_imp}

In [None]:
def run_drug(drug_name, rng): 
    loading_code = f'generic-gdsc-2-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    
    ### Extra Preprocessing Steps 
    # ensure all feature column names are strings
    feature_data.columns = [str(col) for col in feature_data.columns]
    # remove Nan values from the feature data
    feature_data = feature_data.dropna(axis=1)
    # ensure all column names are unique by dropping duplicates
    feature_data = feature_data.loc[:,~feature_data.columns.duplicated()]
    print(f'Feature Shape after preprocessing and dropping duplicates {feature_data.shape}')
    powerkit = Powerkit(feature_data, label_data)
    powerkit.add_condition(drug_name, True, pipeline_func, {'model_used': 'XGBRegressor'}, eval_func, {})
    df = powerkit.run_selected_condition(drug_name, [rng], 1, True)
    return df 

In [71]:
df = run_drug('Palbociclib')

Data loaded for code generic-gdsc-2-Palbociclib-LN_IC50-ccle_protein_expression-true-Cell_Line Feature Shape (258, 12755) Label Shape (258,)
Feature Shape after preprocessing and dropping duplicates (258, 5109)


In [72]:
df

Unnamed: 0,rng,condition,model_performance,p_vals,feature_importance
0,1,Palbociclib,0.362175,0.069032,"([SLC12A2, KDM1A, SRP14, SOGA1, NUP133, RPL4, ..."


In [73]:
# convert feature importance to a dataframe
feature_importance = df['feature_importance'][0]
feature_importance_df = pd.DataFrame({'Feature': feature_importance[0], 'Score': feature_importance[1]})
# set the value to be the absolute value of the score column
feature_importance_df['Score'] = abs(feature_importance_df['Score'])
# sort the dataframe by the score column in descending order
feature_importance_df = feature_importance_df.sort_values(by='Score', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Score
3834,GNG12,5.859038e+01
3611,IL6ST,5.762527e+01
931,RAI14,5.014236e+01
2764,SDC4,4.953916e+01
2782,CORO7,4.490675e+01
...,...,...
2509,RRP36,1.638636e-05
3336,ETV6,9.627023e-06
3748,ATAD1,5.225055e-06
261,CEP290,2.231663e-06
