In [1]:
import os 
import pandas as pd
import numpy as np 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")

    

    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [3]:
################ rldd+nthu dataset ###################
df_original = pd.read_pickle('../input/rlddandmerged/merged_normalized_scaled.pkl')

X = df_original.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]


### encode y values to multiclass ###
y = df_original.loc[:, "DROWSINESS"]
y = y.map({0: 0, 0.5: 1, 1:2})
df_original.loc[:, "DROWSINESS"] = y

y_f = df_original.loc[:, "DROWSINESS"]


X_train, X_test, y_train, y_test = train_test_split(X, y_f, test_size = 0.2, stratify = y)

In [4]:
# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=5, random_state = 42)


#creating an extreme Gradient boosting instance
xgb = XGBClassifier(random_state=0, tree_method = "gpu_hist")

# parameters 
parameters = {
                "n_estimators": [200, 300],
                "max_depth": [9, 11, 13],
                "subsample": [0.5, 0.7, 1]            
}

In [5]:
# grid search for parameters
grid = GridSearchCV(estimator=xgb, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)



The best parameters are {'max_depth': 13, 'n_estimators': 300, 'subsample': 0.7} with a score of 0.8491
TEST SCORES

ACCURACY: 0.8512

CONFUSION MATRIX:
[[214515  11105  24089]
 [ 19327 164059  14934]
 [ 27391  10705 236891]]

FALSE POSITIVES:
19327 out of 183386 positive labels (0.1054%)

FALSE NEGATIVES:
11105 out of 225620 negative labels (0.0492%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84    249709
           1       0.88      0.83      0.85    198320
           2       0.86      0.86      0.86    274987

    accuracy                           0.85    723016
   macro avg       0.85      0.85      0.85    723016
weighted avg       0.85      0.85      0.85    723016

