In [96]:
# import necessary libraries

import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pickle
import time

from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [2]:
# Load the Kaggle CSV files into DataFrames.
# The paths are listed in the same order as the names they are unpacked into.

test_features, train_features, tt_nonscored, tt_scored, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kaggle_data/test_features.csv',
        'kaggle_data/train_features.csv',
        'kaggle_data/train_targets_nonscored.csv',
        'kaggle_data/train_targets_scored.csv',
        'kaggle_data/sample_submission.csv',
    )
)

In [3]:
# Cross-validation splitter and the four candidate pipelines.
# Each pipeline standardises the features ('ss' step) and then fits a
# one-vs-rest wrapper ('classifier' step) around a different base estimator.

kf = KFold(n_splits=5, shuffle=True, random_state=22)

# Pipeline 1: SGDClassifier with loss='log'
sgd_base = SGDClassifier(
    loss='log',
    max_iter=25000,
    n_jobs=-1,
    random_state=34,
    learning_rate='constant',
    eta0=0.002,
    alpha=0.004,
    shuffle=True,
)
ovr_class = OneVsRestClassifier(sgd_base, n_jobs=-1)
clf = Pipeline([('ss', StandardScaler()), ('classifier', ovr_class)])

# Pipeline 2: LogisticRegression
logreg_base = LogisticRegression(random_state=34, n_jobs=-1, multi_class='multinomial')
ovr_class_2 = OneVsRestClassifier(logreg_base, n_jobs=-1)
clf2 = Pipeline([('ss', StandardScaler()), ('classifier', ovr_class_2)])

# Pipeline 3: XGBClassifier, configured through the `params` dict
params = dict(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    booster='gbtree',
    tree_method='hist',
    n_jobs=-1,
    random_state=34,
)
xgb_base = XGBClassifier(**params)
xgb_cl = OneVsRestClassifier(xgb_base, n_jobs=-1)
clf3 = Pipeline([('ss', StandardScaler()), ('classifier', xgb_cl)])

# Pipeline 4: RandomForestClassifier
rf_base = RandomForestClassifier(n_estimators=200, max_depth=10, n_jobs=-1, random_state=34)
rf_cl = OneVsRestClassifier(rf_base, n_jobs=-1)
clf4 = Pipeline([('ss', StandardScaler()), ('classifier', rf_cl)])

In [4]:
# sns.heatmap(pd.concat([tr_gene_df, tr_cell_df]))


In [5]:
# drop 'sig_id' column
# Only the scored-targets frame is listed here; the loop at the end of this
# cell calls col_drop on every frame in this list.

full_dfs = [tt_scored]
def col_drop(df):
    """Remove the 'sig_id' column from ``df`` in place and return ``df``.

    The frame is mutated in place because callers rely on that side effect and
    discard the return value. A frame that no longer has a 'sig_id' column is
    left untouched, so re-running the calling cell does not raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip the identifier column from.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (previously this was always ``None``
        because the result of an ``inplace=True`` drop was returned).
    """
    df.drop(columns=['sig_id'], inplace=True, errors='ignore')
    return df
    
    
# col_drop is called for its in-place effect on each frame; the return value
# is not used here.
for df in full_dfs:
    col_drop(df)

In [6]:
# create list of DataFrames
# (the train and test feature frames, in that order)

dfs = [train_features, test_features]

In [7]:
# send DataFrames through mapping function

def cleaner(df):
    """Encode the categorical 'cp_type', 'cp_time' and 'cp_dose' columns as integers.

    The columns are overwritten on ``df`` itself and ``df`` is returned.
    Values that are not keys of the relevant mapping become NaN, which is what
    ``Series.map`` does with a dict.

    A column whose non-null values are all already encoded is skipped. Without
    that guard a second call (for example re-running the notebook cell) would
    map the encoded integers through the dict again and turn the whole column
    into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'cp_type', 'cp_time' and 'cp_dose' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    encodings = {
        'cp_type': {'ctl_vehicle': 0, 'trt_cp': 1},
        'cp_time': {24: 1, 48: 2, 72: 3},
        'cp_dose': {'D1': 0, 'D2': 1},
    }
    for column, mapping in encodings.items():
        # Raw keys and encoded values are disjoint in every mapping, so
        # "all values are encoded values" identifies an already-cleaned column.
        if df[column].dropna().isin(list(mapping.values())).all():
            continue
        df[column] = df[column].map(mapping)
    return df


# cleaner assigns the encoded columns back onto each frame, so its return
# value is not needed here.
for df in dfs:
    cleaner(df)

In [8]:
# Check shape of DataFrames as (rows, columns); the recorded output shows
# 876 columns in both the train and the test feature frame.

train_features.shape, test_features.shape

((23814, 876), (3982, 876))

In [9]:
## Find the indices of rows whose 'cp_type' is not 0 (i.e. not 'ctl_vehicle'), then slice the dataframes down to those rows
## This code is kept commented out because applying the filter made the models perform worse

#keep_idx_test = test_features[test_features.cp_type != 0].index
#keep_idx_train = train_features[train_features.cp_type != 0].index

#test_features = test_features.loc[keep_idx_test]
#train_features = train_features.loc[keep_idx_train]
#tt_scored = tt_scored.loc[keep_idx_train]
#train_features.shape, test_features.shape

In [106]:
# Create per-feature-family column lists and dataframes for further examination,
# plus the model-input frames used to limit dimensionality when training.

# Column names of the 'g-' (gene) and 'c-' (cell) feature families.
# A prefix test is used so a name merely containing 'g-' or 'c-' cannot match.
g_cols = [col for col in train_features if col.startswith('g-')]
c_cols = [col for col in train_features if col.startswith('c-')]

tr_gene_df = train_features.loc[:, 'g-0':'g-771']
tr_cell_df = train_features.loc[:, 'c-0':]

# Everything from 'cp_type' onwards. Explicit copies are taken because these
# frames are later modified with in-place column drops; without .copy() they
# are slices of train_features / test_features.
tr_cols = train_features.loc[:, 'cp_type':].copy()
test_cols = test_features.loc[:, 'cp_type':].copy()

tr_cols.shape, tt_scored.shape, test_cols.shape

((23814, 875), (23814, 206), (3982, 875))

In [107]:
# Found through the EDA notebook: remove these extreme-outlier features to reduce dimensionality.
# NOTE: this rebinds `dfs`, which an earlier cell set to [train_features, test_features].
# col_list holds 42 'g-' column names.

dfs = [tr_cols,test_cols]
col_list = ['g-496', 'g-333', 'g-676', 'g-127', 'g-39', 'g-360', 'g-28', 'g-19', 'g-184', 'g-110', 'g-687', 'g-216',
            'g-15', 'g-626', 'g-393', 'g-667', 'g-164', 'g-688', 'g-754', 'g-557', 'g-363', 'g-132', 'g-435', 'g-536',
            'g-550', 'g-481','g-611', 'g-18', 'g-756', 'g-331', 'g-618', 'g-718', 'g-370', 'g-219','g-153','g-46','g-238',
            'g-23','g-707','g-213','g-307','g-104']

 
def outlier_drop(df, col):
    """Remove the column named ``col`` from ``df`` in place and return ``df``.

    The frame is mutated in place because callers rely on that side effect and
    discard the return value. A column that is already absent is ignored, so
    re-running the calling cell does not raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove the column from.
    col : hashable
        Label of the single column to remove.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (previously this was always ``None``
        because the result of an ``inplace=True`` drop was returned).
    """
    df.drop(columns=[col], inplace=True, errors='ignore')
    return df
# Drop every listed column from both frames, one column at a time.
# outlier_drop works in place, so tr_cols and test_cols themselves shrink.
for col in col_list:
    for df in dfs:
        outlier_drop(df, col)
# Both frames should report the same reduced column count.
tr_cols.shape, test_cols.shape

((23814, 833), (3982, 833))

In [None]:
# convert dataframes to matrices (NumPy arrays) for model input
# X: training features, y: scored training targets, test: test features

X, y, test = np.array(tr_cols), np.array(tt_scored), np.array(test_cols)

In [23]:
# create a zero-filled array and empty lists for metric/evaluation outputs
# oof_preds has the same shape as y and is filled fold by fold with the
# first model's validation predictions.

oof_preds = np.zeros(y.shape)
oof_losses = []    # one log-loss value per fold
list_preds = []    # one set of test-set predictions per fold

In [24]:
# first model: K-fold evaluation of the `clf` pipeline

for train_idx, val_idx in kf.split(X, y):
    fold_start = time.time()

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    clf1 = clf.fit(X_train, y_train)

    # Out-of-fold predictions for this fold's validation rows.
    val_preds = np.array(clf.predict_proba(X_val))
    oof_preds[val_idx] = val_preds

    # Fold log loss over the flattened targets and predictions.
    loss = log_loss(y_val.ravel(), val_preds.ravel())
    oof_losses.append(loss)

    # Test-set predictions from this fold's fit.
    preds = clf.predict_proba(test)
    list_preds.append(preds)

    fold_end = time.time()
    print('fold time: ', fold_end - fold_start)

print(oof_losses)

fold time:  11.34304928779602
fold time:  11.634966850280762
fold time:  11.603526830673218
fold time:  11.533483266830444
fold time:  11.438003778457642
[0.01670911375545155, 0.016992782722502972, 0.01724335821926487, 0.016240868223501997, 0.016883533478118513]


In [52]:
# second model: K-fold evaluation of the `clf2` pipeline

oof_losses2 = []
oof_preds2 = np.zeros(y.shape)

for train_idx, val_idx in kf.split(X, y):
    fold_start = time.time()

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    clf2 = clf2.fit(X_train, y_train)

    # Out-of-fold predictions for this fold's validation rows.
    val_preds2 = np.array(clf2.predict_proba(X_val))
    oof_preds2[val_idx] = val_preds2

    # Fold log loss over the flattened targets and predictions.
    loss2 = log_loss(y_val.ravel(), val_preds2.ravel())
    oof_losses2.append(loss2)

    fold_end = time.time()
    print('fold time: ', fold_end - fold_start)

print(oof_losses2)

fold time:  583.1211156845093
fold time:  580.9785079956055
fold time:  576.304976940155
fold time:  581.6895434856415
fold time:  576.6104669570923
[0.030242662109904228, 0.031145176664757922, 0.031212612469562716, 0.02901750874922711, 0.031257828227517416]


In [14]:
# third model: K-fold evaluation of the `clf3` pipeline

oof_losses3 = []
oof_preds3 = np.zeros(y.shape)

for train_idx, val_idx in kf.split(X, y):
    fold_start = time.time()

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    clf3 = clf3.fit(X_train, y_train)

    # Out-of-fold predictions for this fold's validation rows.
    val_preds3 = np.array(clf3.predict_proba(X_val))
    oof_preds3[val_idx] = val_preds3

    # Fold log loss over the flattened targets and predictions.
    loss3 = log_loss(y_val.ravel(), val_preds3.ravel())
    oof_losses3.append(loss3)

    fold_end = time.time()
    print('fold time: ', fold_end - fold_start)
print(oof_losses3)

fold time:  744.5497477054596
fold time:  753.6238639354706
fold time:  770.0116214752197
fold time:  802.581018447876
fold time:  746.485387802124
[0.021129359037136582, 0.021639216346358108, 0.021779772649308103, 0.020202002061217948, 0.02137149255399953]


In [15]:
# fourth model: K-fold evaluation of the `clf4` pipeline

oof_losses4 = []
oof_preds4 = np.zeros(y.shape)

for train_idx, val_idx in kf.split(X, y):
    fold_start = time.time()

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    clf4 = clf4.fit(X_train, y_train)

    # Out-of-fold predictions for this fold's validation rows.
    val_preds4 = np.array(clf4.predict_proba(X_val))
    oof_preds4[val_idx] = val_preds4

    # Fold log loss over the flattened targets and predictions.
    loss4 = log_loss(y_val.ravel(), val_preds4.ravel())
    oof_losses4.append(loss4)

    fold_end = time.time()
    print('fold time: ', fold_end - fold_start)
print(oof_losses4)

fold time:  575.3615190982819
fold time:  576.5796873569489
fold time:  572.8690297603607
fold time:  580.7065134048462
fold time:  575.86767578125
[0.01991970239313964, 0.020351519318704846, 0.02093639339655242, 0.019168043720590784, 0.020690117126422273]


In [97]:
# Create a list and record, for each fitted model, how long one predict() call
# on the test set takes (seconds, rounded to 2 decimals).

# Display labels, in the same order as `clfs`. The second pipeline wraps
# LogisticRegression, so it is labelled accordingly.
models = ['SGDClassifier', 'Logistic Regression', 'XGBoostClassifier', 'RandomForestClassifier']
clfs = [clf1, clf2, clf3, clf4]
model_predict_time = []

# The loop variable is deliberately not called `clf`: that name is the SGD
# pipeline used by the first training cell, and rebinding it here would leave
# it pointing at the last pipeline in `clfs`.
for fitted_clf in clfs:
    pred_start = time.time()
    fitted_clf.predict(test)
    pred_finish = time.time()
    model_predict_time.append(round(pred_finish - pred_start, 2))

model_predict_time

[0.92, 0.92, 10.13, 12.41]

In [98]:
# For each model, compute the mean train-and-predict time over the folds and the
# log loss over all of its out-of-fold predictions.
# NOTE: SGD_time, LR_time, XGB_time and RF_time are hard-coded in the cell that
# follows this one in the notebook; on a fresh kernel that cell has to be
# executed before this one.

vals = [oof_preds, oof_preds2, oof_preds3, oof_preds4]
model_times = [SGD_time, LR_time, XGB_time, RF_time]

mean_ll = []
mean_tp_time = []

# The loop variable must not be called `time`: that would rebind the `time`
# module imported at the top of the notebook and break every later
# time.time() call in the kernel.
for oof, fold_times in zip(vals, model_times):
    mean_ll.append(log_loss(np.ravel(y), np.ravel(oof)))
    mean_tp_time.append(np.mean(fold_times))
mean_tp_time, mean_ll

([11.690075635910034, 578.7450776100159, 763.45032787323, 576.2768850803375],
 [0.016813928357025127,
  0.030575128977418726,
  0.02122436234413888,
  0.020213135162446088])

In [99]:
# Lists of each KFold log loss output and train & predict times, respectively.
# These values are hard-coded (copied from earlier runs), not computed here.
# NOTE(review): SGD_time and LR_time differ from the fold times printed by the
# first and second training cells in this notebook, presumably because they
# come from a different run; XGB_time and RF_time match the printed values.
# The *_lls lists are not referenced anywhere else in the visible notebook.


SGD_lls = [0.01670911375545155, 0.016992782722502972, 0.01724335821926487, 0.016240868223501997, 0.016883533478118513] 
LR_lls = [0.030242662109904228, 0.031145176664757922, 0.031212612469562716, 0.02901750874922711, 0.031257828227517416]
XGB_lls = [0.021129359037136582, 0.021639216346358108, 0.021779772649308103, 0.020202002061217948, 0.02137149255399953]
RF_lls = [0.01991970239313964, 0.020351519318704846, 0.02093639339655242, 0.019168043720590784, 0.020690117126422273]


# Seconds per fold (fit plus predictions), one list per model.
SGD_time = [12.514066696166992, 11.79899001121521, 11.581927299499512, 11.190450191497803, 11.364943981170654]
LR_time = [582.286411523819, 580.2102625370026, 575.5257470607758, 580.1989614963531, 575.5040054321289]
XGB_time = [744.5497477054596, 753.6238639354706, 770.0116214752197, 802.581018447876, 746.485387802124]
RF_time = [575.3615190982819, 576.5796873569489, 572.8690297603607, 580.7065134048462, 575.86767578125]




In [120]:
# Assemble the final evaluation metrics and run descriptions into a dataframe
# and export it as a csv file.

# Built key by key, in the column order wanted in the output.
metrics = {'model': models}
# Scalar flags are repeated on every row when the frame is built.
metrics.update(dict.fromkeys(('OVR_wrap', 'StandardScaler', 'Pipeline'), True))
metrics['mean_log_loss: 5-fold'] = mean_ll
metrics['mean_train_&_predict_time: (sec)'] = mean_tp_time
metrics['model_predict_time'] = model_predict_time
metrics['features'] = 'all original, minus 42 "gene expression" columns'

df = pd.DataFrame(metrics)
df.to_csv('Model_metrics.csv')
df.head()

Unnamed: 0,model,OVR_wrap,StandardScaler,Pipeline,mean_log_loss: 5-fold,mean_train_&_predict_time: (sec),model_predict_time,features
0,SGDClassifier,True,True,True,0.016814,11.690076,0.92,"all original, minus 42 ""gene expression"" columns"
1,Linear Regression,True,True,True,0.030575,578.745078,0.92,"all original, minus 42 ""gene expression"" columns"
2,XGBoostClassifier,True,True,True,0.021224,763.450328,10.13,"all original, minus 42 ""gene expression"" columns"
3,RandomForestClassifier,True,True,True,0.020213,576.276885,12.41,"all original, minus 42 ""gene expression"" columns"


In [122]:
# use pickle to save the first model's fitted pipeline to the file 'OvR'

# The context manager closes the file handle once the dump has finished;
# passing a bare open(...) to pickle.dump left the handle unclosed.
with open('OvR', 'wb') as model_file:
    pickle.dump(clf1, model_file)