## Initialise Repository

In [1]:
import os

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')

In [3]:
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [4]:
# loading packages 

from tqdm import tqdm
from toolkit import *

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Retriving Data 

Create a list that stores all drug names 

In [5]:
gdsc = data_link.get_data_from_code('gdsc2')
# Select column 'DRUG_NAME' and make it unique by using set()
all_drug_names = list(set(gdsc['DRUG_NAME']))
all_drug_names

['XAV939',
 'Nelarabine',
 'WEHI-539',
 'WZ4003',
 'ML323',
 'Epirubicin',
 'Irinotecan',
 'VSP34_8731',
 'Staurosporine',
 'Vinblastine',
 'Taselisib',
 'Dihydrorotenone',
 '5-Fluorouracil',
 'Zoledronate',
 'ZM447439',
 'MG-132',
 'PRIMA-1MET',
 'AZD6482',
 'ABT737',
 'AZD7762',
 'Tamoxifen',
 'JAK_8517',
 'GSK2578215A',
 'Vorinostat',
 'MIRA-1',
 'GSK591',
 'BMS-345541',
 'Rapamycin',
 'SB505124',
 'PD173074',
 'CZC24832',
 'Fludarabine',
 'UMI-77',
 'BIBR-1532',
 'Acetalax',
 'Paclitaxel',
 'Uprosertib',
 'CDK9_5576',
 'GSK2606414',
 'Dactinomycin',
 'Picolinici-acid',
 'ERK_2440',
 'Fulvestrant',
 'Axitinib',
 'Cisplatin',
 'Erlotinib',
 'KU-55933',
 'BMS-754807',
 'Dasatinib',
 'Camptothecin',
 'Daporinad',
 'Sinularin',
 'Sapitinib',
 'Afatinib',
 'Ulixertinib',
 'Nilotinib',
 'Olaparib',
 'VX-11e',
 'EPZ004777',
 'Docetaxel',
 'AZD2014',
 'Obatoclax Mesylate',
 'AZD5991',
 'KRAS (G12C) Inhibitor-12',
 'PRT062607',
 'AZD8186',
 'Ipatasertib',
 'Sorafenib',
 'BI-2536',
 'JAK1_870

In [6]:
# load in dynamic features data 

available_drugs = []
for drug_name in tqdm(all_drug_names):
    if '-' in drug_name:
        continue
    # print(drug_name)
    loading_code = f'generic-gdsc-2-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    # generic-gdsc-{number}-{drug_name}-{target_label}-{dataset_name}-{replace_index}-{row_index}
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    # print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    # if the feature data is not empty, append the drug name to the available_drugs list
    if feature_data.shape[0] > 0 and label_data.shape[0] > 0:
        available_drugs.append(drug_name)
        
available_drugs

 41%|████      | 78/192 [00:06<00:08, 12.76it/s]


KeyboardInterrupt: 

In [None]:
len(available_drugs)

152

## Create Streamline Functions

In [None]:
folder_name = 'CANISRDatabase'

if not os.path.exists(f'{path_loader.get_data_path()}data/results/{folder_name}'):
    os.makedirs(f'{path_loader.get_data_path()}data/results/{folder_name}')

file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'

In [None]:
def pipeline(drug_name):
    loading_code = f'generic-gdsc-2-{drug_name}-LN_IC50-ccle_protein_expression-true-Cell_Line'
    feature_data, label_data = data_link.get_data_using_code(loading_code)
    print(f'Data loaded for code {loading_code} Feature Shape {feature_data.shape} Label Shape {label_data.shape}')
    
    # split the data into training and testing data
    