In [1]:
import numpy as np
import pandas as pd

#Metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

##Classifiers
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

#Hyperparameters
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from pprint import pprint
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

from sklearn.model_selection import cross_val_score

In [9]:
from google.colab import files
uploaded = files.upload()

Saving x_test.csv to x_test (1).csv
Saving x_train.csv to x_train.csv
Saving y_test.csv to y_test (1).csv
Saving y_train.csv to y_train.csv


In [15]:
X = pd.read_csv('x_test.csv')
y = pd.read_csv('y_test.csv').squeeze()
X_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv').squeeze()


In [16]:
X = X.drop(columns=['is_weekday', 'is_workday', 'is_holiday', 'correct_diagnosis'])
X_train = X_train.drop(columns=['is_weekday', 'is_workday', 'is_holiday', 'correct_diagnosis'])

In [19]:
print('Test data dimensions:', X.shape)
print()
print('Train data dimensions:', X_train.shape)

Test data dimensions: (111191, 15)

Train data dimensions: (444760, 15)


In [95]:
metrics = {'accuracy':accuracy_score,'precision':precision_score,'recall':recall_score,
          'f1':f1_score,'roc_auc':roc_auc_score}

def evaluate_model_on_test(model):
  #model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
   #           n_estimators = 10, algorithm="SAMME.R", learning_rate = 0.5)
    
  scores = {metric:[] for metric in metrics.keys()} 
  model.fit(X_train.values, y_train.values)
  ypred = model.predict(X.values)

  for i, (metric, score) in enumerate(metrics.items()):
    scores[metric].append(score(y.values, ypred))

  return pd.DataFrame(scores).transpose()

## Decision Tree Baseline Model

In [96]:
baseline = evaluate_model_on_test(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
            n_estimators = 10, algorithm="SAMME.R", learning_rate = 0.5))
baseline

Unnamed: 0,0
accuracy,0.799723
precision,0.814169
recall,0.94241
f1,0.873609
roc_auc,0.673749


## Random Hyperparameter: Best params on test data

In [None]:
ada_random.best_params_

{'learning_rate': 0.7000000000000001, 'n_estimators': 6}

In [97]:
random_tuning_on_test = evaluate_model_on_test(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
            n_estimators = 6,
            algorithm="SAMME.R",
            learning_rate = 0.7))
random_tuning_on_test

Unnamed: 0,0
accuracy,0.802403
precision,0.80912
recall,0.956639
f1,0.876718
roc_auc,0.666232


## Grid Search: Best params on test data

In [None]:
ada_grid.best_params_ 

{'learning_rate': 0.1, 'n_estimators': 9}

In [98]:
grid_tuning_on_test = evaluate_model_on_test(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
            n_estimators = 9,
            algorithm="SAMME.R",
            learning_rate = 0.1))
grid_tuning_on_test

Unnamed: 0,0
accuracy,0.787672
precision,0.787461
recall,0.973709
f1,0.870737
roc_auc,0.623425


## Comparison on test data

In [104]:
names = ['Baseline', 'Random Search', 'Grid Search']
results = pd.concat([baseline, random_tuning_on_test, grid_tuning_on_test], axis=1)
results.columns = names
results

Unnamed: 0,Baseline,Random Search,Grid Search
accuracy,0.799723,0.802403,0.787672
precision,0.814169,0.80912,0.787461
recall,0.94241,0.956639,0.973709
f1,0.873609,0.876718,0.870737
roc_auc,0.673749,0.666232,0.623425
