In [1]:
import os 
import pandas as pd
import numpy as np

# import sklearn methods 
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import sys 
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification
from sklearn.svm import SVC

In [2]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print accuracy
    accuracy = accuracy_score(test, pred)
    str_out += ("ACCURACY: {:.4f}\n".format(accuracy))
    str_out += ("\n")
    
    #print confusion matrix
    str_out += ("CONFUSION MATRIX:\n")
    conf_mat = confusion_matrix(test, pred)
    str_out += ("{}".format(conf_mat))
    str_out += ("\n")
    str_out += ("\n")

    #print FP, FN
    str_out += ("FALSE POSITIVES:\n")
    fp = conf_mat[1][0]
    pos_labels = conf_mat[1][0]+conf_mat[1][1]
    str_out += ("{} out of {} positive labels ({:.4f}%)\n".format(fp, pos_labels,fp/pos_labels))
    str_out += ("\n")

    str_out += ("FALSE NEGATIVES:\n")
    fn = conf_mat[0][1]
    neg_labels = conf_mat[0][1]+conf_mat[0][0]
    str_out += ("{} out of {} negative labels ({:.4f}%)\n".format(fn, neg_labels, fn/neg_labels))
    str_out += ("\n")

    
    #print classification report
    str_out += ("PRECISION, RECALL, F1 scores:\n")
    str_out += ("{}".format(classification_report(test, pred)))
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

# RLDD DATA

In [3]:
df_original2 = pd.read_pickle('../input/rlddandmerged/rldd_normalized_scaled.pkl')

X2 = df_original2.loc[:, ["n_EAR", 
                    "n_MAR", "n_MOE", "n_EC",
                    "n_LEB", "n_SOP", "PERCLOS", "CLOSENESS"]]

y2 = df_original2.loc[:, "DROWSINESS"].map({0: 0, 0.5: 1, 1:2})

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, stratify = y2)

# cross-validation with 5 splits
cross_val = StratifiedShuffleSplit(n_splits=10, random_state = 42)

# SVM
svm = SVC(tol=1e-3, random_state=0, max_iter=5000)

# parameters 
parameters = {
                'kernel': ['rbf', 'sigmoid'],
                #'C': [0.01, 0.1, 1, 10, 100],
                #'max_iter': [1000, 5000]
                #'gamma' : [2**(-10), 2**(-5), 2**(-3), 2, 2**3],
                #'gamma' : ['scale', 'auto']
}

# grid search for parameters
grid2 = GridSearchCV(estimator=svm, param_grid=parameters, cv=cross_val, n_jobs=-1)#multithreading; all cores are used
grid2.fit(X_train2, y_train2)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid2.best_params_, grid2.best_score_))

# prediction results
y_pred2 = grid2.predict(X_test2)

# print accuracy metrics
results2, false2 = display_test_scores(y_test2, y_pred2)
print(results2)



The best parameters are {'kernel': 'sigmoid'} with a score of 0.3794
TEST SCORES

ACCURACY: 0.3925

CONFUSION MATRIX:
[[96084 84179 18621]
 [71399 94768 32153]
 [67777 90424 44692]]

FALSE POSITIVES:
71399 out of 166167 positive labels (0.4297%)

FALSE NEGATIVES:
84179 out of 180263 negative labels (0.4670%)

PRECISION, RECALL, F1 scores:
              precision    recall  f1-score   support

           0       0.41      0.48      0.44    198884
           1       0.35      0.48      0.41    198320
           2       0.47      0.22      0.30    202893

    accuracy                           0.39    600097
   macro avg       0.41      0.39      0.38    600097
weighted avg       0.41      0.39      0.38    600097

