# Exploring Classifier Based Models

## Init

In [1]:
import os

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

## Create Classifier Model

In [3]:
## Loading Data 

from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

X, y = make_classification(n_samples=1000, n_features=20)

print('Loading CCLE Palbociclib..')

loading_code = 'ccle-gdsc-2-Palbociclib-LN_IC50'
feature_data, label_data = data_link.get_data_using_code(loading_code)
print(f'Data loaded for code {loading_code}')

Loading CCLE Palbociclib..
Data loaded for code ccle-gdsc-2-Palbociclib-LN_IC50


In [4]:
X.shape, y.shape

((1000, 20), (1000,))

In [6]:
# import sklearn support vector machine classifier
from sklearn.svm import SVC
# import roc_auc_score to evaluate the model    
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from toolkit import *

def pipeline_func(X_train, y_train, rng):
    model = SVC()
    model.fit(X_train, y_train)
    # print('Model fitted')
    return {'model': model}
    
def eval_func(X_test, y_test, pipeline_components=None, save_model=False, **kwargs):
    
    y_pred = pipeline_components['model'].predict(X_test)
    # accuracy, precision, recall, f1, and roc_auc
    # assess accuracy of the model
    accuracy = pipeline_components['model'].score(X_test, y_test)
    # assess precision of the model
    precision = precision_score(y_test, y_pred)
    # assess recall of the model
    recall = recall_score(y_test, y_pred)
    # assess f1 of the model
    f1 = f1_score(y_test, y_pred)
    
    # assess roc_auc of the model
    roc_auc = roc_auc_score(y_test, y_pred)

    # print(f'Correlation: {corr}')
    returned_data = {'model_performance': accuracy, ### DO NOT GET RID
                     'feature_importance': None, ### DO NOT GET RID 
                     'recall': recall,
                    'precision': precision,
                    'f1': f1,  
                    'roc_auc': roc_auc,
                    'model': None 
    }
    if save_model:
        returned_data['model'] = pipeline_components['model']
    return returned_data

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [8]:
powerkit = Powerkit(X, y)
powerkit.add_condition('classifier', False, pipeline_func, {}, eval_func, {})
df = powerkit.run_all_conditions([42], 1)

In [9]:
df

Unnamed: 0,rng,condition,model_performance,recall,precision,f1,roc_auc,model
0,42,classifier,0.9,0.86,0.934783,0.895833,0.9,
