In [1]:
import os 
import pandas as pd
import numpy as np

# import sklearn methods 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")


    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [3]:
################ scaled dataset ###################
df_original2 = pd.read_pickle('../input/rlddandmerged/merged_normalized_scaled.pkl')

X2 = df_original2.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]

y2 = df_original2.loc[:, "DROWSINESS"]
y2 = y2.map({0: 0, 0.5: 1, 1:2})
df_original2.loc[:, "DROWSINESS"] = y2

y2_f = df_original2.loc[:, "DROWSINESS"]


X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2_f, test_size = 0.2, stratify = y2)

In [4]:
# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=10, random_state = 42)

# CART decision tree
extra_trees = ExtraTreesClassifier(random_state=0)


# parameters 
parameters = {
                #"max_features": [None, "sqrt"],
                #"class_weight": [None, "balanced_subsample"],
                #"max_samples": [0.3, 0.7],
                "n_estimators": [10]
                }

In [5]:
# grid search for parameters
grid2 = GridSearchCV(estimator=extra_trees, param_grid=parameters, cv=cv, n_jobs=-1)
grid2.fit(X_train2, y_train2)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid2.best_params_, grid2.best_score_))

# prediction results
y_pred2 = grid2.predict(X_test2)

# print accuracy metrics
results2, false2 = display_test_scores(y_test2, y_pred2)
print(results2)

The best parameters are {'n_estimators': 10} with a score of 0.7974
TEST SCORES

ACCURACY: 0.8010

CONFUSION MATRIX:
[[209631  14157  25921]
 [ 29364 149669  19287]
 [ 37257  17907 219823]]

FALSE POSITIVES:
29364 out of 179033 positive labels (0.1640%)

FALSE NEGATIVES:
14157 out of 223788 negative labels (0.0633%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.76      0.84      0.80    249709
           1       0.82      0.75      0.79    198320
           2       0.83      0.80      0.81    274987

    accuracy                           0.80    723016
   macro avg       0.80      0.80      0.80    723016
weighted avg       0.80      0.80      0.80    723016

