In [1]:
import os 
import pandas as pd
import numpy as np 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostClassifier

In [2]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")

    
    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [3]:
################ rldd_normalized_scaled dataset ###################
df_original = pd.read_pickle('../input/rlddandmerged/rldd_normalized_scaled.pkl')

X = df_original.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]

### encode y values to multiclass ###
y = df_original.loc[:, "DROWSINESS"]
y = y.map({0: 0, 0.5: 1, 1:2})
df_original.loc[:, "DROWSINESS"] = y


y_f = df_original.loc[:, "DROWSINESS"]


X_train, X_test, y_train, y_test = train_test_split(X, y_f, test_size = 0.2, stratify = y)

In [4]:
# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=5, random_state = 42)

#creating an AdaBoost instance
ada = AdaBoostClassifier(random_state=0)

# parameters 
parameters = {
                "n_estimators": [200, 300, 400]
                
}

In [5]:
# grid search for parameters
grid = GridSearchCV(estimator=ada, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

The best parameters are {'n_estimators': 400} with a score of 0.6172
TEST SCORES

ACCURACY: 0.6175

CONFUSION MATRIX:
[[152369  27023  19492]
 [ 54783  91034  52503]
 [ 33003  42756 127134]]

FALSE POSITIVES:
54783 out of 145817 positive labels (0.3757%)

FALSE NEGATIVES:
27023 out of 179392 negative labels (0.1506%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.63      0.77      0.69    198884
           1       0.57      0.46      0.51    198320
           2       0.64      0.63      0.63    202893

    accuracy                           0.62    600097
   macro avg       0.61      0.62      0.61    600097
weighted avg       0.61      0.62      0.61    600097

