In [1]:
import os 
import pandas as pd
import numpy as np

# import sklearn methods 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

In [2]:
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")


    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

In [3]:
################ not scaled dataset ###################
df_original = pd.read_pickle('../input/rlddandmerged/merged_normalized_scaled.pkl')

X = df_original.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]


y = df_original.loc[:, "DROWSINESS"]
y = y.map({0: 0, 0.5: 1, 1:2})
df_original.loc[:, "DROWSINESS"] = y


y = df_original.loc[:, "DROWSINESS"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [4]:
################ scaled dataset ###################
df_original2 = pd.read_pickle('../input/rlddandmerged/rldd_normalized_scaled.pkl')

X2 = df_original2.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]


y2 = df_original2.loc[:, "DROWSINESS"]
y2 = y2.map({0: 0, 0.5: 1, 1:2})
df_original2.loc[:, "DROWSINESS"] = y2


y2 = df_original2.loc[:, "DROWSINESS"]


X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, stratify = y2)

In [5]:
# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=10, random_state = 42)

# CART decision tree
logistic_regression = LogisticRegression(random_state=0)
print(logistic_regression.get_params().keys())

# parameters 
parameters = {
                "C": [0.01, 0.1, 1, 10, 100],
                "max_iter": [1000,5000]
                }

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])


In [6]:
# grid search for parameters
grid = GridSearchCV(estimator=logistic_regression, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

The best parameters are {'C': 100, 'max_iter': 1000} with a score of 0.5137
TEST SCORES

ACCURACY: 0.5136

CONFUSION MATRIX:
[[185627   5985  58097]
 [110728  24453  63139]
 [ 95170  18539 161278]]

FALSE POSITIVES:
110728 out of 135181 positive labels (0.8191%)

FALSE NEGATIVES:
5985 out of 191612 negative labels (0.0312%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.47      0.74      0.58    249709
           1       0.50      0.12      0.20    198320
           2       0.57      0.59      0.58    274987

    accuracy                           0.51    723016
   macro avg       0.51      0.48      0.45    723016
weighted avg       0.52      0.51      0.47    723016



In [7]:
# grid search for parameters
grid2 = GridSearchCV(estimator=logistic_regression, param_grid=parameters, cv=cv, n_jobs=-1)
grid2.fit(X_train2, y_train2)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid2.best_params_, grid2.best_score_))

# prediction results
y_pred2 = grid2.predict(X_test2)

# print accuracy metrics
results2, false2 = display_test_scores(y_test2, y_pred2)
print(results2)

The best parameters are {'C': 10, 'max_iter': 1000} with a score of 0.4838
TEST SCORES

ACCURACY: 0.4841

CONFUSION MATRIX:
[[143643  31107  24134]
 [ 97305  53277  47738]
 [ 59763  49532  93598]]

FALSE POSITIVES:
97305 out of 150582 positive labels (0.6462%)

FALSE NEGATIVES:
31107 out of 174750 negative labels (0.1780%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.48      0.72      0.58    198884
           1       0.40      0.27      0.32    198320
           2       0.57      0.46      0.51    202893

    accuracy                           0.48    600097
   macro avg       0.48      0.48      0.47    600097
weighted avg       0.48      0.48      0.47    600097

