## Traditional ML classification models under wavelet analysis structures

In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### SVM
#### Read Data

In [2]:
# Configurations
LOWLUX = 'lowlux'
MIDLUX = 'middlelux'
HIGHLUX = 'highlux'
NOBACK = 'nothing'
ONEBACK = 'ONEBACK'
TWOBACK = 'TWOBACK'
THREEBACK = 'THREEBACK'
FILENAME = 'Filename'
IPA2 = 'IPA 2'
INDEX = 'index'
LUX = 'Luminance'
LABELS = 'Labels'
ISBLINK_LEFT = 'isBlink-Left'
ISBLINK_RIGHT = 'isBlink-Right'

# Training and testing ratio.
TEST_SIZE_PCT = 0.1

# Setting thresholds for 2 eyes' blinking rates.
THRES_BLINKING_RATE = 0.7

In [3]:
# Read data and make them into processable formalities.
filepath = '../Data/Results/21-11-13-13/results.csv'
df_raw_features = pd.read_csv(filepath)

In [4]:
df_raw_features

Unnamed: 0,Left-0,Left-1,Left-2,Left-3,Left-4,Left-5,Left-6,Left-7,Left-8,Left-9,...,Right-147,Right-148,Right-149,Averaged Diameter,Difference Diameter,isBlink-Left,isBlink-Right,Luminance,PID,Labels
0,90.878101,90.000122,89.429937,90.488841,90.425234,90.336186,90.534690,90.501258,90.304912,90.299277,...,94.824154,95.681502,95.550234,46.214449,-2.993481,0.325000,0.370000,lowlux,7,nothing
1,90.272990,90.943095,90.475785,90.534682,90.372235,90.283447,90.175733,90.071457,89.966849,89.862239,...,94.965808,95.486500,94.833888,46.102734,-2.943094,0.168333,0.203333,lowlux,7,nothing
2,89.356322,88.406687,88.938648,89.101954,88.632723,89.213376,88.886822,88.681709,89.061931,88.684184,...,96.326834,96.544795,95.752655,46.330987,-3.043528,0.000000,0.000000,lowlux,7,nothing
3,89.591427,90.381350,89.822829,90.178344,90.201855,90.039328,89.327095,90.271126,89.445756,89.488081,...,94.325592,93.692027,93.015056,46.389218,-2.974158,0.135000,0.195000,lowlux,7,nothing
4,89.352206,89.354459,90.085756,90.154200,90.493916,88.939724,89.547730,90.279304,90.332414,91.536459,...,93.245723,93.272628,94.553329,45.993263,-2.558402,0.135000,0.195000,lowlux,7,nothing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2807,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.581850,91.573155,91.780403,46.211527,0.826298,1.000000,0.431667,lowlux,2,THREEBACK
2808,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.303723,91.315547,91.318091,46.219696,0.809960,1.000000,0.530000,lowlux,2,THREEBACK
2809,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.241145,91.231029,91.214267,46.202013,0.845325,1.000000,0.663333,lowlux,2,THREEBACK
2810,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,89.802735,90.034177,89.511720,46.105213,1.038926,1.000000,0.816667,lowlux,2,THREEBACK


#### Encoder

This part encodes features into numeric values.

In [5]:
# Encoding and Labeling
luxes = []
nbacks = []
df_numeric_features = df_raw_features.copy()

for index, row in df_numeric_features.iterrows():
    # Label luxes
    if LOWLUX in row[LUX]:
        luxes.append(1)
    elif MIDLUX in row[LUX]:
        luxes.append(2)
    elif HIGHLUX in row[LUX]:
        luxes.append(3)
    
    # Label task difficulties
    if NOBACK in row[LABELS]:
        nbacks.append(0)
    elif ONEBACK in row[LABELS]:
        nbacks.append(1)
    elif TWOBACK in row[LABELS]:
        nbacks.append(2)
    elif THREEBACK in row[LABELS]:
        nbacks.append(3)
        
df_numeric_features[LUX] = luxes
df_numeric_features[LABELS] = nbacks

In [6]:
df_numeric_features

Unnamed: 0,Left-0,Left-1,Left-2,Left-3,Left-4,Left-5,Left-6,Left-7,Left-8,Left-9,...,Right-147,Right-148,Right-149,Averaged Diameter,Difference Diameter,isBlink-Left,isBlink-Right,Luminance,PID,Labels
0,90.878101,90.000122,89.429937,90.488841,90.425234,90.336186,90.534690,90.501258,90.304912,90.299277,...,94.824154,95.681502,95.550234,46.214449,-2.993481,0.325000,0.370000,1,7,0
1,90.272990,90.943095,90.475785,90.534682,90.372235,90.283447,90.175733,90.071457,89.966849,89.862239,...,94.965808,95.486500,94.833888,46.102734,-2.943094,0.168333,0.203333,1,7,0
2,89.356322,88.406687,88.938648,89.101954,88.632723,89.213376,88.886822,88.681709,89.061931,88.684184,...,96.326834,96.544795,95.752655,46.330987,-3.043528,0.000000,0.000000,1,7,0
3,89.591427,90.381350,89.822829,90.178344,90.201855,90.039328,89.327095,90.271126,89.445756,89.488081,...,94.325592,93.692027,93.015056,46.389218,-2.974158,0.135000,0.195000,1,7,0
4,89.352206,89.354459,90.085756,90.154200,90.493916,88.939724,89.547730,90.279304,90.332414,91.536459,...,93.245723,93.272628,94.553329,45.993263,-2.558402,0.135000,0.195000,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2807,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.581850,91.573155,91.780403,46.211527,0.826298,1.000000,0.431667,1,2,3
2808,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.303723,91.315547,91.318091,46.219696,0.809960,1.000000,0.530000,1,2,3
2809,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.241145,91.231029,91.214267,46.202013,0.845325,1.000000,0.663333,1,2,3
2810,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,89.802735,90.034177,89.511720,46.105213,1.038926,1.000000,0.816667,1,2,3


#### Data visualization

In [7]:
# sns.pairplot(df_labels_IPA2, hue=LABELS)

#### Machine Learning

In [8]:
# Remove blinks.
print(len(df_numeric_features))
df_numeric_features = df_numeric_features.loc[df_numeric_features[ISBLINK_LEFT] <= THRES_BLINKING_RATE]
df_numeric_features = df_numeric_features.loc[df_numeric_features[ISBLINK_RIGHT] <= THRES_BLINKING_RATE]
df_numeric_features = df_numeric_features.reset_index()
print(len(df_numeric_features))
# Get the X and y sets.
X = df_numeric_features.loc[:, df_numeric_features.columns != LABELS]
y = df_numeric_features.loc[:, df_numeric_features.columns == LABELS]

# Split the data into training dataset and validation dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE_PCT, random_state=999)

# # Feature Selection
# FEATURE_SET = [IPA2, LUX] # The lux labels should be researved, or all the predicted results are labeled as 1 same label.
# # FEATURE_SET = [IPA2]

# # Training data
# X_train = df_training[FEATURE_SET].to_numpy() 
# y_train = df_training[[LABELS]].to_numpy()
# y_train = np.reshape(y_train, -1)
                                      
# # Validation data 
# df_validation = df_labels_IPA2.copy().loc[VALIDATIONSET]
# X_test = df_validation[FEATURE_SET].to_numpy()
# y_test = df_validation[[LABELS]].to_numpy()
# y_test = np.reshape(y_test, -1)

2812
2028


In [9]:
# %%time
# # Try SVM, bacause it is one of the most widely used models in cognitive workload claissfications.
# clf = make_pipeline(StandardScaler(),SVC(kernel='poly', degree=7)) #SVC(kernel='rbf', gamma='auto'))
# clf.fit(X_train, y_train)

# y_pred_svm = clf.predict(X_test)

# accuracy_score(y_test, y_pred_svm)

### KNN

In [10]:
%%time
# Train the model.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

# Test.
y_pred_knn = neigh.predict(X_test)
# print(y_pred, y_test)

# Accuracy score calculation.
accuracy_score(y_test, y_pred_knn)

CPU times: total: 93.8 ms
Wall time: 32.3 ms


  return self._fit(X, y)


0.8669950738916257

In [11]:
# Now the best parameter set:
# param = {train_test_size = 0.1, threshold_blinking_rate=0.7, SVM acc=0.4333, knn_num_neighbor=5, knn acc=0.86699}