In [1]:
import os 
import pandas as pd
import numpy as np

# import sklearn methods 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from catboost import CatBoostClassifier

In [2]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")

    #print AUC score
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")

    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [3]:
################ not scaled dataset ###################
df_original = pd.read_pickle('../input/ddd-final-datasets/final_step2.pkl')

X = df_original.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]

y = df_original.loc[:, "DROWSINESS"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [4]:
################ scaled dataset ###################
df_original2 = pd.read_pickle('../input/ddd-final-datasets/final_step2_scaled.pkl')

X2 = df_original2.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]

y2 = df_original2.loc[:, "DROWSINESS"]


X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, stratify = y2)

In [5]:
# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=10, random_state = 42)

# CART decision tree
cat = CatBoostClassifier(random_seed=42)

# parameters 
parameters = {
            'l2_leaf_reg': [0.1, 1, 10],
            'iterations': [100, 500],
            'depth': [5, 10],
            'learning_rate':[0.001, 0.01, 0.1]

}

In [6]:
# grid search for parameters
grid = GridSearchCV(estimator=cat, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)



0:	learn: 0.6688448	total: 252ms	remaining: 2m 5s
1:	learn: 0.6508698	total: 418ms	remaining: 1m 44s
2:	learn: 0.6355976	total: 599ms	remaining: 1m 39s
3:	learn: 0.6242818	total: 775ms	remaining: 1m 36s
4:	learn: 0.6143555	total: 948ms	remaining: 1m 33s
5:	learn: 0.6060068	total: 1.12s	remaining: 1m 32s
6:	learn: 0.5999694	total: 1.3s	remaining: 1m 31s
7:	learn: 0.5950102	total: 1.46s	remaining: 1m 29s
8:	learn: 0.5901474	total: 1.63s	remaining: 1m 28s
9:	learn: 0.5860694	total: 1.81s	remaining: 1m 28s
10:	learn: 0.5830635	total: 2.08s	remaining: 1m 32s
11:	learn: 0.5798128	total: 2.36s	remaining: 1m 35s
12:	learn: 0.5774168	total: 2.62s	remaining: 1m 38s
13:	learn: 0.5753279	total: 2.88s	remaining: 1m 40s
14:	learn: 0.5732556	total: 3.15s	remaining: 1m 41s
15:	learn: 0.5714910	total: 3.32s	remaining: 1m 40s
16:	learn: 0.5696284	total: 3.51s	remaining: 1m 39s
17:	learn: 0.5678589	total: 3.7s	remaining: 1m 39s
18:	learn: 0.5664777	total: 3.87s	remaining: 1m 38s
19:	learn: 0.5652896	tota

fit for scaled dataset

In [7]:
# grid search for parameters
grid2 = GridSearchCV(estimator=cat, param_grid=parameters, cv=cv, n_jobs=-1)
grid2.fit(X_train2, y_train2)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid2.best_params_, grid2.best_score_))

# prediction results
y_pred2 = grid2.predict(X_test2)

# print accuracy metrics
results2, false2 = display_test_scores(y_test2, y_pred2)
print(results2)



0:	learn: 0.6689349	total: 272ms	remaining: 2m 15s
1:	learn: 0.6507635	total: 455ms	remaining: 1m 53s
2:	learn: 0.6359597	total: 677ms	remaining: 1m 52s
3:	learn: 0.6247516	total: 853ms	remaining: 1m 45s
4:	learn: 0.6150410	total: 1.04s	remaining: 1m 42s
5:	learn: 0.6078960	total: 1.21s	remaining: 1m 39s
6:	learn: 0.6009823	total: 1.4s	remaining: 1m 38s
7:	learn: 0.5952072	total: 1.58s	remaining: 1m 36s
8:	learn: 0.5907006	total: 1.74s	remaining: 1m 34s
9:	learn: 0.5867822	total: 1.91s	remaining: 1m 33s
10:	learn: 0.5833639	total: 2.09s	remaining: 1m 33s
11:	learn: 0.5807390	total: 2.27s	remaining: 1m 32s
12:	learn: 0.5784470	total: 2.45s	remaining: 1m 31s
13:	learn: 0.5763975	total: 2.63s	remaining: 1m 31s
14:	learn: 0.5743022	total: 2.81s	remaining: 1m 30s
15:	learn: 0.5721623	total: 2.99s	remaining: 1m 30s
16:	learn: 0.5703229	total: 3.18s	remaining: 1m 30s
17:	learn: 0.5688288	total: 3.35s	remaining: 1m 29s
18:	learn: 0.5674729	total: 3.54s	remaining: 1m 29s
19:	learn: 0.5658092	to