# Classifiers - busy vs relaxed - no TFv1
Exploring different classifiers with different autoencoders. No contractive autoencoder because it needs TFv1 compatibility.

#### Table of contents:  

classifiers:  
[Dense classifier](#Dense-classifier)  
[LSTM-based classifier](#LSTM-based-classifier)  
[kNN](#kNN)  
[SVC](#SVC)  
[Random Forest](#Random-Forest)  
[XGBoost](#XGBoost)  

In [1]:
import datareader # made by the previous author for reading the collected data
import dataextractor # same as above
import scipy
import pandas
import numpy as np

import tensorflow as tf
tf.get_logger().setLevel('ERROR') # Disable printing some annoying warnings

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow.keras.backend as K

import json
from datetime import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale
from sklearn.model_selection import RandomizedSearchCV

import warnings

import talos
from talos.utils import lr_normalizer

from tensorflow import keras
from tensorflow.keras import layers, regularizers
import matplotlib.pyplot as plt

tf.keras.backend.set_floatx('float32') # set keras to use float32 to avoid a warning message
metrics = ['accuracy']

In [2]:
# Start the notebook in the terminal with "PYTHONHASHSEED=0 jupyter notebook" 
# or in anaconda "set PYTHONHASHSEED=0" then start jupyter notebook
import os

if os.environ.get("PYTHONHASHSEED") != "0":
    raise Exception("You must set PYTHONHASHSEED=0 before starting the Jupyter server to get reproducible results.")

In [3]:
import random

random.seed(1)
np.random.seed(4)
tf.random.set_seed(2)

This is modfied original author's code for reading data:

In [4]:
def get_busy_vs_relax_timeframes_br_hb(path, ident, seconds):
    """Returns raw data from either 'on task' or 'relax' time frames and their class (0 or 1)."""
    
    dataread = datareader.DataReader(path, ident)  # initialize path to data
    data = dataread.read_grc_data()  # read from files
    samp_rate = int(round(len(data[1]) / max(data[0])))
    cog_res = dataread.read_cognitive_load_study(str(ident) + '-primary-extract.txt')

    tasks_data = np.empty((0, seconds*samp_rate))
    tasks_y = np.empty((0, 1))
    breathing = np.empty((0,12))
    heartbeat = np.empty((0,10))

    busy_n = dataread.get_data_task_timestamps(return_indexes=True)
    relax_n = dataread.get_relax_timestamps(return_indexes=True)

    for i in cog_res['task_number']:
        task_num_table = i - 225  # 0 - 17
        tmp_tasks_data = np.empty((0, seconds*samp_rate))
        tmp_tasks_y = np.empty((0, 1))
        tmp_breathing = np.empty((0,12))
        tmp_heartbeat = np.empty((0,10))

        ### task versus relax (1 sample each)
        dataextract = dataextractor.DataExtractor(data[0][busy_n[task_num_table][0]:busy_n[task_num_table][1]],
                                                  data[1][busy_n[task_num_table][0]:busy_n[task_num_table][1]],
                                                  samp_rate)

        dataextract_relax = dataextractor.DataExtractor(data[0][relax_n[task_num_table][0]:relax_n[task_num_table][1]],
                                                        data[1][relax_n[task_num_table][0]:relax_n[task_num_table][1]],
                                                        samp_rate)

        try:

            # get extracted features for breathing
            tmpBR_busy = dataextract.extract_from_breathing_time(dataextract.t[-samp_rate*seconds:],
                                                                 dataextract.y[-samp_rate*seconds:])
            tmpBR_relax = dataextract_relax.extract_from_breathing_time(dataextract_relax.t[-samp_rate*seconds:],
                                                                 dataextract_relax.y[-samp_rate*seconds:])
            #get extracted features for heartbeat
            tmpHB_busy = dataextract.extract_from_heartbeat_time(dataextract.t[-samp_rate*seconds:],
                                                                 dataextract.y[-samp_rate*seconds:])
            tmpHB_relax = dataextract.extract_from_heartbeat_time(dataextract_relax.t[-samp_rate*seconds:],
                                                                 dataextract_relax.y[-samp_rate*seconds:])

            tmp_tasks_data = np.vstack((tmp_tasks_data, dataextract.y[-samp_rate * seconds:]))
            tmp_tasks_y = np.vstack((tasks_y, 1))
            tmp_tasks_data = np.vstack((tmp_tasks_data, dataextract_relax.y[-samp_rate * seconds:]))
            tmp_tasks_y = np.vstack((tmp_tasks_y, 0))

            # put busy frames then relaxed frames under the previous frames
            tmp_breathing = np.vstack((tmp_breathing, tmpBR_busy.to_numpy(dtype='float64', na_value=0)[0][:-1]))
            tmp_breathing = np.vstack((tmp_breathing, tmpBR_relax.to_numpy(dtype='float64', na_value=0)[0][:-1]))

            tmp_heartbeat = np.vstack((tmp_heartbeat, tmpHB_busy.to_numpy(dtype='float64', na_value=0)[0][:-1]))
            tmp_heartbeat = np.vstack((tmp_heartbeat, tmpHB_relax.to_numpy(dtype='float64', na_value=0)[0][:-1]))

        except ValueError:
#             print(ident)  # ignore short windows
            continue

        # put busy frames then relaxed frames under the previous frames
        tasks_data = np.vstack((tasks_data, dataextract.y[-samp_rate * seconds:]))
        tasks_y = np.vstack((tasks_y, 1))
        tasks_data = np.vstack((tasks_data, dataextract_relax.y[-samp_rate * seconds:]))
        tasks_y = np.vstack((tasks_y, 0))

        breathing = np.vstack((breathing, tmpBR_busy.to_numpy(dtype='float64', na_value=0)[0][:-1]))
        breathing = np.vstack((breathing, tmpBR_relax.to_numpy(dtype='float64', na_value=0)[0][:-1]))

        heartbeat = np.vstack((heartbeat, tmpHB_busy.to_numpy(dtype='float64', na_value=0)[0][:-1]))
        heartbeat = np.vstack((heartbeat, tmpHB_relax.to_numpy(dtype='float64', na_value=0)[0][:-1]))

    return tasks_data, tasks_y, breathing, heartbeat

In [5]:
def get_data_from_idents_br_hb(path, idents, seconds):
    """Go through all user data and take out windows of only <seconds> long time frames,
    along with the given class (from 'divide_each_task' function).
    """
    samp_rate = 43  # hard-coded sample rate
    data, ys = np.empty((0, samp_rate*seconds)), np.empty((0, 1))
    brs = np.empty((0,12))
    hbs = np.empty((0,10))
    combined = np.empty((0,22))
    
    # was gettign some weird warnings; stack overflow said to ignore them
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for i in idents:
            x, y, br, hb = get_busy_vs_relax_timeframes_br_hb(path, i, seconds) # either 'get_busy_vs_relax_timeframes',
            # get_engagement_increase_vs_decrease_timeframes, get_task_complexities_timeframes or get_TLX_timeframes

            data = np.vstack((data, x))
            ys = np.vstack((ys, y))
            brs = np.vstack((brs, br))
            hbs = np.vstack((hbs, hb))
        combined = np.hstack((brs,hbs))
    
    return data, ys, brs, hbs, combined

In [6]:
# Accs is a dictionary which holds 1d arrays of accuracies in each key
# except the key 'test id' which holds strings of the id which yielded the coresponding accuracies
def print_accs_stats(accs):
    
    printDict = {}
    # loop over each key
    for key in accs:
    
        if (key == 'test id'):
            # skip calculating ids
            continue
        printDict[key] = {}
        tmpDict = printDict[key]
        # calculate and print some statistics
        tmpDict['min'] = np.min(accs[key])
        tmpDict['max'] = np.max(accs[key])
        tmpDict['mean'] = np.mean(accs[key])
        tmpDict['median'] = np.median(accs[key])
    
    print(pandas.DataFrame.from_dict(printDict).to_string())

In [7]:
def clear_session_and_set_seeds():
    # clear session and set seeds again
    K.clear_session()
    random.seed(1)
    np.random.seed(4)
    tf.random.set_seed(2)

## Prepare data

Initialize variables:

In [8]:
# used for reading the data into an array
seconds = 30  # time window length
samp_rate = 43  # hard-coded sample rate
phase_shape = np.empty((0, samp_rate*seconds))
y_shape = np.empty((0, 1))
breathing_shape = np.empty((0,12))
heartbeat_shape = np.empty((0,10))
combined_shape = np.empty((0,22))
idents = ['2gu87', 'iz2ps', '1mpau', '7dwjy', '7swyk', '94mnx', 'bd47a', 'c24ur', 'ctsax', 'dkhty', 'e4gay',
              'ef5rq', 'f1gjp', 'hpbxa', 'pmyfl', 'r89k1', 'tn4vl', 'td5pr', 'gyqu9', 'fzchw', 'l53hg', '3n2f9',
              '62i9y']
path = '../../../StudyData/'


# change to len(idents) at the end to use all the data
n = len(idents)

In [9]:
# load all phase data to use for training autoencoders
phase_all_train = get_data_from_idents_br_hb(path, idents[:-2], seconds)[0]
# Scale each row with MinMax to range [0,1]
phase_all_train = MinMaxScaler().fit_transform(phase_all_train.T).T

# load all validation phase data to use for training autoencoders
phase_all_valid = get_data_from_idents_br_hb(path, idents[-2:], seconds)[0]
# Scale each row with MinMax to range [0,1]
phase_all_valid = MinMaxScaler().fit_transform(phase_all_valid.T).T

In [10]:
# Holds all the data so it doesnt have to be read from file each time
data_dict = {}

Fill the data dictionary:

In [11]:
for ident in idents:
    
    # initialize ident in 
    data_dict[ident] = {}
    tmpDataDict = data_dict[ident]
    
    # read data
    phase, y, breathing, heartbeat, combined = get_data_from_idents_br_hb(path, [ident], seconds)
    
    # load data into dictionary
    tmpDataDict['phase'] = phase
    tmpDataDict['y'] = y
    tmpDataDict['breathing'] = breathing

## Autoencoders  
Train autoencoders to save their encoded representations in the data dictionary:

In [12]:
# AE Training params
batch_size = 128
epochs = 1000
encoding_dim = 30
ae_encoded_shape = np.empty((0,encoding_dim))

#### Undercomplete Autoencoder  
from https://blog.keras.io/building-autoencoders-in-keras.html

In [13]:
def undercomplete_ae(x, encoding_dim=64, encoded_as_model=False):
    # Simplest possible autoencoder from https://blog.keras.io/building-autoencoders-in-keras.html

    # this is our input placeholder
    input_data = Input(shape=x[0].shape, name="input")
    dropout = Dropout(0.125, name="dropout", seed=42)(input_data)
    # "encoded" is the encoded representation of the input
    encoded = Dense(encoding_dim, activation='sigmoid', name="encoded")(dropout)
    
    # "decoded" is the lossy reconstruction of the input
    decoded = Dense(x[0].shape[0], activation='sigmoid', name="decoded")(encoded)

    autoencoder = Model(input_data, decoded)
    
    # compile the model
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)
    
    # if return encoder in the encoded variable
    if encoded_as_model:
        encoded = Model(input_data, encoded)
    
    return autoencoder, encoded

Train autoencoder on data:

In [14]:
clear_session_and_set_seeds()
uc_ae, uc_enc = undercomplete_ae(phase_all_train, encoding_dim=encoding_dim, encoded_as_model=True)
uc_ae.fit(phase_all_train, phase_all_train,
          validation_data=(phase_all_valid, phase_all_valid),
          batch_size=batch_size,
          shuffle=True,
          epochs=epochs,
          verbose=0)

<tensorflow.python.keras.callbacks.History at 0x1ca2b39b908>

Store the encoded representations in the data dictionary:

In [15]:
for ident in data_dict:
    
    tmpDataDict = data_dict[ident]
    
    # read data
    phase = tmpDataDict['phase']
    
    uc_data = uc_enc.predict(phase)
    
    # load data into dictionary
    tmpDataDict['undercomplete_encoded'] = uc_data

Helper function to get data from the dictionary:

In [16]:
def get_ident_data_from_dict(idents, data_dict):
    
    # Initialize data variables
    y = y_shape.copy()
    phase = phase_shape.copy()
    breathing = breathing_shape.copy()
    heartbeat = heartbeat_shape.copy()
    combined = combined_shape.copy()
    undercomplete_encoded = ae_encoded_shape.copy()
    sparse_encoded = ae_encoded_shape.copy()
    deep_encoded = ae_encoded_shape.copy()
    
    # Stack data form each ident into the variables
    for tmp_id in idents:
        y = np.vstack((y, data_dict[tmp_id]['y']))
        phase = np.vstack((phase, data_dict[tmp_id]['phase']))
        breathing = np.vstack((breathing, data_dict[tmp_id]['breathing']))
        undercomplete_encoded = np.vstack((undercomplete_encoded, data_dict[tmp_id]['undercomplete_encoded']))
    
    return y, phase, breathing, undercomplete_encoded

# Classifiers

Load train and test data into variables:

In [17]:
train_idents = idents

train_y, train_phase, train_breathing, train_undercomplete_encoded = get_ident_data_from_dict(train_idents, data_dict)

# train_phase = scale(train_phase) # standardize by row
train_phase = MinMaxScaler().fit_transform(train_phase) # normalize by row

sscaler_breathing = StandardScaler() # Standardize by column
train_breathing = sscaler_breathing.fit_transform(train_breathing)

sscaler_undercomplete = StandardScaler() # Standardize by column
train_undercomplete_encoded = sscaler_undercomplete.fit_transform(train_undercomplete_encoded)

In [18]:
train_phase.shape

(742, 1290)

In [19]:
train_breathing.shape

(742, 12)

In [20]:
train_undercomplete_encoded.shape

(742, 30)

## Dense classifier

### Phase data:

In [21]:
def create_dense_phase_classifier(dropout=0.25, hidden_size=32, activation="relu", last_activation="sigmoid",
                            loss="binary_crossentropy", optimizer="Nadam", learning_rate=0.001):
    
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(1, activation=last_activation))

    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=metrics)

    return model

In [22]:
params_dense_phase = {
    'dropout': [0.3,0.4,0.5],
    'hidden_size': [26,28,32],
#     'activation': ['sigmoid', 'relu'],
#     'loss': ['binary_crossentropy', 'poisson'],
#     'optimizer': ['Adam', 'Nadam'],
    'batch_size': [128],
    'epochs': [200,300,400]
}

In [23]:
clear_session_and_set_seeds()
model = KerasClassifier(create_dense_phase_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_dense_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_phase, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  4.5min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA370D2688>,
                   n_iter=40,
                   param_distributions={'batch_size': [128],
                                        'dropout': [0.3, 0.4, 0.5],
                                        'epochs': [200, 300, 400],
                                        'hidden_size': [26, 28, 32]},
                   random_state=42, scoring='accuracy', verbose=1)

In [24]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
12,26,300,0.4,128,0.667086,1
5,32,300,0.3,128,0.661709,2
16,28,400,0.4,128,0.656295,3
7,28,400,0.3,128,0.654956,4
17,32,400,0.4,128,0.654934,5


In [25]:
dfd.tail(5)

Unnamed: 0,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
11,32,200,0.4,128,0.614541,23
9,26,200,0.4,128,0.614519,24
10,28,200,0.4,128,0.609154,25
19,28,200,0.5,128,0.605127,26
0,26,200,0.3,128,0.603734,27


In [26]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6670856732401723 
 {'hidden_size': 26, 'epochs': 300, 'dropout': 0.4, 'batch_size': 128}


### Breathing data:

In [27]:
def create_dense_breathing_classifier(dropout=0.25, hidden_size=32, activation="relu", last_activation="sigmoid",
                            loss="poisson", optimizer=Nadam, learning_rate=0.001):
    
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(1, activation=last_activation))

    model.compile(loss=loss,
                  optimizer=optimizer(learning_rate=learning_rate),
                  metrics=metrics)

    return model

In [28]:
params_dense_breathing = {
    'dropout': [0.05,0.1],
    'hidden_size': [24,26,28],
#     'activation': ['sigmoid', 'relu'],
#     'loss': ['poisson', 'binary_crossentropy'],
#     'optimizer': [Adam, Nadam],
    'learning_rate': [0.05],
    'batch_size': [128],
    'epochs': [100,200,300]
}

In [29]:
clear_session_and_set_seeds()
model = KerasClassifier(create_dense_breathing_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_dense_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_breathing, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.2min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA36F54FC8>,
                   n_iter=40,
                   param_distributions={'batch_size': [128],
                                        'dropout': [0.05, 0.1],
                                        'epochs': [100, 200, 300],
                                        'hidden_size': [24, 26, 28],
                                        'learning_rate': [0.05]},
                   random_state=42, scoring='accuracy', verbose=1)

In [30]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_learning_rate,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
0,0.05,24,100,0.05,128,0.700764,1
16,0.05,26,300,0.1,128,0.687296,2
17,0.05,28,300,0.1,128,0.683296,3
10,0.05,26,100,0.1,128,0.68322,4
2,0.05,28,100,0.05,128,0.677849,5


In [31]:
dfd.tail(5)

Unnamed: 0,param_learning_rate,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
14,0.05,28,200,0.1,128,0.646794,14
6,0.05,24,300,0.05,128,0.645526,15
1,0.05,26,100,0.05,128,0.6441,16
8,0.05,28,300,0.05,128,0.641488,17
7,0.05,26,300,0.05,128,0.640155,18


In [32]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.7007640067911715 
 {'learning_rate': 0.05, 'hidden_size': 24, 'epochs': 100, 'dropout': 0.05, 'batch_size': 128}


### Undercomplete encoded data:

In [33]:
def create_dense_ae_enc_classifier(dropout=0.25, hidden_size=32, activation="relu", last_activation="sigmoid",
                            loss="binary_crossentropy", optimizer=Adam, learning_rate=0.001):
    
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(1, activation=last_activation))

    model.compile(loss=loss,
                  optimizer=optimizer(learning_rate=learning_rate),
                  metrics=metrics)

    return model

In [34]:
params_dense_ae_enc = {
    'dropout': [0.05,0.1],
    'hidden_size': [30,32,34],
#     'activation': ['relu'],
#     'loss': ['binary_crossentropy'],
#     'optimizer': [Adam],
    'learning_rate': [0.005,0.01],
    'batch_size': [106,156],
    'epochs': [100,200,300]
}

In [35]:
clear_session_and_set_seeds()
model = KerasClassifier(create_dense_ae_enc_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_dense_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_undercomplete_encoded, train_y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  2.6min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA39F37D88>,
                   n_iter=40,
                   param_distributions={'batch_size': [106, 156],
                                        'dropout': [0.05, 0.1],
                                        'epochs': [100, 200, 300],
                                        'hidden_size': [30, 32, 34],
                                        'learning_rate': [0.005, 0.01]},
                   random_state=42, scoring='accuracy', verbose=1)

In [36]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_learning_rate,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
6,0.005,34,200,0.05,106,0.641505,1
31,0.01,30,100,0.1,106,0.64015,2
23,0.005,32,200,0.05,156,0.638805,3
18,0.01,32,300,0.1,106,0.638789,4
36,0.005,34,200,0.1,156,0.630687,5


In [37]:
dfd.tail(5)

Unnamed: 0,param_learning_rate,param_hidden_size,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
19,0.01,32,100,0.05,156,0.595648,36
9,0.005,30,100,0.1,156,0.595637,37
35,0.01,30,300,0.05,106,0.595631,38
14,0.01,34,100,0.05,106,0.59297,39
26,0.01,30,100,0.1,156,0.583573,40


In [38]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6415045056810761 
 {'learning_rate': 0.005, 'hidden_size': 34, 'epochs': 200, 'dropout': 0.05, 'batch_size': 106}


# LSTM-based classifier  
based on the original author's code

In [39]:
# Reshape data to fit some layers
xt_phase = train_phase.reshape(-1, train_phase[0].shape[0], 1)
xt_breathing = train_breathing.reshape(-1, train_breathing[0].shape[0], 1)
xt_undercomplete_encoded = train_undercomplete_encoded.reshape(-1, train_undercomplete_encoded[0].shape[0], 1)

### Phase data:

In [40]:
def create_lstm_phase_classifier(kernel_size=4, filters=64, strides=4, pool_size=4, dropout=0.09,
               lstm_output_size=256, activation='relu', last_activation='sigmoid',
               loss='binary_crossentropy', optimizer=Nadam, learning_rate=0.001):
    
    # Define the model
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation=activation,
                     strides=strides))

    model.add(MaxPooling1D(pool_size=pool_size))
    
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation=activation,
                     strides=strides))

    model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Dropout(dropout))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(1))
    model.add(Activation(last_activation))

    # Compile the model
    model.compile(loss=loss,
                  optimizer=optimizer(learning_rate=learning_rate),
                  metrics=metrics)
    
    return model

In [41]:
params_lstm_phase = {
    'kernel_size': [4],
    'filters': [32],
    'strides': [2,4],
    'pool_size': [4],
    'dropout': [0.01,0.1],
    'lstm_output_size': [22],
#     'activation': ['relu'],
#     'last_activation': ['sigmoid'],
    'loss': ['binary_crossentropy', 'poisson'],
#     'optimizer': [Nadam],
    'learning_rate': [0.005,0.01],
    'batch_size': [186],
    'epochs': [100, 200]
}

In [42]:
clear_session_and_set_seeds()
model = KerasClassifier(create_lstm_phase_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_lstm_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(xt_phase, train_y)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 23.6min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA3B965CC8>,
                   n_iter=40,
                   param_distributions={'batch_size': [186],
                                        'dropout': [0.01, 0.1],
                                        'epochs': [100, 200], 'filters': [32],
                                        'kernel_size': [4],
                                        'learning_rate': [0.005, 0.01],
                                        'loss': ['binary_crossentropy',
                                                 'poisson'],
                                        'lstm_output_size': [22],
                                        'pool_size': [4], 'strides': [2, 4]},
                   random_state=42, scoring='accuracy', verbose=1)

In [43]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_loss,param_learning_rate,param_kernel_size,param_filters,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
24,2,4,22,binary_crossentropy,0.005,4,32,200,0.1,186,0.733125,1
16,2,4,22,binary_crossentropy,0.005,4,32,100,0.1,186,0.719581,2
8,2,4,22,binary_crossentropy,0.005,4,32,200,0.01,186,0.716947,3
28,2,4,22,binary_crossentropy,0.01,4,32,200,0.1,186,0.715576,4
0,2,4,22,binary_crossentropy,0.005,4,32,100,0.01,186,0.714221,5


In [44]:
dfd.tail(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_loss,param_learning_rate,param_kernel_size,param_filters,param_epochs,param_dropout,param_batch_size,mean_test_score,rank_test_score
7,4,4,22,poisson,0.01,4,32,100,0.01,186,0.654994,28
2,2,4,22,poisson,0.005,4,32,100,0.01,186,0.653667,29
22,2,4,22,poisson,0.01,4,32,100,0.1,186,0.642865,30
14,2,4,22,poisson,0.01,4,32,200,0.01,186,0.630632,31
6,2,4,22,poisson,0.01,4,32,100,0.01,186,0.614519,32


In [45]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.733125462539724 
 {'strides': 2, 'pool_size': 4, 'lstm_output_size': 22, 'loss': 'binary_crossentropy', 'learning_rate': 0.005, 'kernel_size': 4, 'filters': 32, 'epochs': 200, 'dropout': 0.1, 'batch_size': 186}


### Breathing data:

In [46]:
def create_lstm_breathing_classifier(kernel_size=4, filters=64, strides=4, pool_size=4, dropout=0.09,
               lstm_output_size=256, activation='relu', last_activation='sigmoid',
               loss='poisson', optimizer=Nadam, learning_rate=0.001):
    
    # Define the model
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation=activation,
                     strides=strides))

    model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Dropout(dropout))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(1))
    model.add(Activation(last_activation))

    # Compile the model
    model.compile(loss=loss,
                  optimizer=optimizer(learning_rate=learning_rate),
                  metrics=metrics)
    
    return model

In [47]:
params_lstm_breathing = {
    'kernel_size': [2,4],
    'filters': [6,8,12],
    'strides': [1,2],
    'pool_size': [1,2],
    'dropout': [0.01],
    'lstm_output_size': [32,64],
#     'activation': ['sigmoid','relu'],
#     'last_activation': ['sigmoid','relu'],
#     'loss': ['binary_crossentropy','poisson'],
#     'optimizer': [Adam,Nadam],
    'learning_rate': [0.001,0.01],
    'batch_size': [64,256],
    'epochs': [100]
}

In [48]:
clear_session_and_set_seeds()
model = KerasClassifier(create_lstm_breathing_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_lstm_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(xt_breathing, train_y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  9.8min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA78D8F908>,
                   n_iter=40,
                   param_distributions={'batch_size': [64, 256],
                                        'dropout': [0.01], 'epochs': [100],
                                        'filters': [6, 8, 12],
                                        'kernel_size': [2, 4],
                                        'learning_rate': [0.001, 0.01],
                                        'lstm_output_size': [32, 64],
                                        'pool_size': [1, 2],
                                        'strides': [1, 2]},
                   random_state=42, scoring='accuracy', verbose=1)

In [49]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(epochs)|(batch_size)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_learning_rate,param_kernel_size,param_filters,param_dropout,mean_test_score,rank_test_score
35,2,1,64,0.001,2,6,0.01,0.680521,1
21,1,2,32,0.01,4,8,0.01,0.679204,2
19,1,1,64,0.001,2,12,0.01,0.673817,3
39,1,2,32,0.001,4,12,0.01,0.671091,4
5,2,1,32,0.001,4,12,0.01,0.668419,5


In [50]:
dfd.tail(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_learning_rate,param_kernel_size,param_filters,param_dropout,mean_test_score,rank_test_score
32,2,2,32,0.001,4,6,0.01,0.618524,36
25,2,2,32,0.001,4,6,0.01,0.618519,37
20,2,2,32,0.01,2,12,0.01,0.598396,38
3,2,2,64,0.01,2,8,0.01,0.583524,39
6,2,2,64,0.01,2,6,0.01,0.579502,40


In [51]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6805210918114145 
 {'strides': 2, 'pool_size': 1, 'lstm_output_size': 64, 'learning_rate': 0.001, 'kernel_size': 2, 'filters': 6, 'epochs': 100, 'dropout': 0.01, 'batch_size': 64}


### Undercomplete encoded data:

In [52]:
def create_lstm_ae_enc_classifier(kernel_size=4, filters=64, strides=4, pool_size=4, dropout=0.09,
               lstm_output_size=256, activation='relu', last_activation='sigmoid',
               loss='poisson', optimizer=Nadam, learning_rate=0.001):
    
    # Define the model
    model = Sequential()
    model.add(Dropout(dropout))
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation=activation,
                     strides=strides))

    model.add(MaxPooling1D(pool_size=pool_size))

    model.add(Dropout(dropout))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(1))
    model.add(Activation(last_activation))

    # Compile the model
    model.compile(loss=loss,
                  optimizer=optimizer(learning_rate=learning_rate),
                  metrics=metrics)
    
    return model

In [53]:
params_lstm_ae_enc = {
    'kernel_size': [2],
    'filters': [6,8,12],
    'strides': [2],
    'pool_size': [2],
    'dropout': [0.01],
    'lstm_output_size': [32],
#     'activation': ['sigmoid','relu'],
#     'last_activation': ['sigmoid','relu'],
#     'loss': ['binary_crossentropy','poisson'],
#     'optimizer': [Adam,Nadam],
    'learning_rate': [0.001,0.01],
    'batch_size': [64],
    'epochs': [100]
}

In [54]:
clear_session_and_set_seeds()
model = KerasClassifier(create_lstm_ae_enc_classifier, verbose=0)
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_lstm_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(xt_undercomplete_encoded, train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001CA553C1108>,
                   n_iter=40,
                   param_distributions={'batch_size': [64], 'dropout': [0.01],
                                        'epochs': [100], 'filters': [6, 8, 12],
                                        'kernel_size': [2],
                                        'learning_rate': [0.001, 0.01],
                                        'lstm_output_size': [32],
                                        'pool_size': [2], 'strides': [2]},
                   random_state=42, scoring='accuracy', verbose=1)

In [55]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(epochs)|(batch_size)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_learning_rate,param_kernel_size,param_filters,param_dropout,mean_test_score,rank_test_score
0,2,2,32,0.001,2,6,0.01,0.592927,1
4,2,2,32,0.001,2,12,0.01,0.574061,2
2,2,2,32,0.001,2,8,0.01,0.570012,3
1,2,2,32,0.01,2,6,0.01,0.555233,4
3,2,2,32,0.01,2,8,0.01,0.533667,5


In [56]:
dfd.tail(5)

Unnamed: 0,param_strides,param_pool_size,param_lstm_output_size,param_learning_rate,param_kernel_size,param_filters,param_dropout,mean_test_score,rank_test_score
4,2,2,32,0.001,2,12,0.01,0.574061,2
2,2,2,32,0.001,2,8,0.01,0.570012,3
1,2,2,32,0.01,2,6,0.01,0.555233,4
3,2,2,32,0.01,2,8,0.01,0.533667,5
5,2,2,32,0.01,2,12,0.01,0.522811,6


In [57]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.5929269513734708 
 {'strides': 2, 'pool_size': 2, 'lstm_output_size': 32, 'learning_rate': 0.001, 'kernel_size': 2, 'filters': 6, 'epochs': 100, 'dropout': 0.01, 'batch_size': 64}


## kNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier

def KNN_classifier():
    model = KNeighborsClassifier(p=3, n_neighbors=7, metric='cosine')
    return model

### Phase data:

In [59]:
params_knn_phase = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'metric': ['cosine', 'manhattan', 'l1', 'l2',],
}

In [60]:
clear_session_and_set_seeds()
model = KNN_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_knn_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_phase, train_y)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:   18.2s finished


RandomizedSearchCV(cv=3,
                   estimator=KNeighborsClassifier(metric='cosine',
                                                  n_neighbors=7, p=3),
                   n_iter=40,
                   param_distributions={'metric': ['cosine', 'manhattan', 'l1',
                                                   'l2'],
                                        'n_neighbors': [3, 5, 7, 9, 11, 13,
                                                        15]},
                   random_state=42, scoring='accuracy', verbose=1)

In [61]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
1,5,cosine,0.634779,1
4,11,cosine,0.626687,2
5,13,cosine,0.622649,3
6,15,cosine,0.622644,4
3,9,cosine,0.617262,5


In [62]:
dfd.tail(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
10,9,manhattan,0.543125,23
18,11,l1,0.540437,25
11,11,manhattan,0.540437,25
9,7,manhattan,0.533706,27
16,7,l1,0.533706,27


In [63]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6347786339297374 
 {'n_neighbors': 5, 'metric': 'cosine'}


### Breathing data:

In [64]:
params_knn_breathing = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'metric': ['cosine', 'manhattan', 'l1', 'l2',],
}

In [65]:
clear_session_and_set_seeds()
model = KNN_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_knn_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_breathing, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    0.7s finished


RandomizedSearchCV(cv=3,
                   estimator=KNeighborsClassifier(metric='cosine',
                                                  n_neighbors=7, p=3),
                   n_iter=40,
                   param_distributions={'metric': ['cosine', 'manhattan', 'l1',
                                                   'l2'],
                                        'n_neighbors': [3, 5, 7, 9, 11, 13,
                                                        15]},
                   random_state=42, scoring='accuracy', verbose=1)

In [66]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
13,15,manhattan,0.695404,1
20,15,l1,0.695404,1
18,11,l1,0.695399,3
11,11,manhattan,0.695399,3
19,13,l1,0.688662,5


In [67]:
dfd.tail(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
1,5,cosine,0.653661,24
14,3,l1,0.652279,25
7,3,manhattan,0.652279,25
2,7,cosine,0.649585,27
21,3,l2,0.64015,28


In [68]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6954039876365853 
 {'n_neighbors': 15, 'metric': 'manhattan'}


### Undercomplete encoded data:

In [69]:
from sklearn.neighbors import KNeighborsClassifier

def KNN_classifier():
    model = KNeighborsClassifier(p=3, n_neighbors=7, metric='cosine')
    return model

In [70]:
params_knn_ae_enc = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'metric': ['cosine', 'manhattan', 'l1', 'l2',],
}

In [71]:
clear_session_and_set_seeds()
model = KNN_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_knn_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_undercomplete_encoded, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    1.0s finished


RandomizedSearchCV(cv=3,
                   estimator=KNeighborsClassifier(metric='cosine',
                                                  n_neighbors=7, p=3),
                   n_iter=40,
                   param_distributions={'metric': ['cosine', 'manhattan', 'l1',
                                                   'l2'],
                                        'n_neighbors': [3, 5, 7, 9, 11, 13,
                                                        15]},
                   random_state=42, scoring='accuracy', verbose=1)

In [72]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
6,15,cosine,0.574159,1
18,11,l1,0.574137,2
11,11,manhattan,0.574137,2
4,11,cosine,0.572771,4
12,13,manhattan,0.571438,5


In [73]:
dfd.tail(5)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_score,rank_test_score
16,7,l1,0.551228,23
24,9,l2,0.549873,25
25,11,l2,0.54583,26
27,15,l2,0.544485,27
26,13,l2,0.540431,28


In [74]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.5741587218667015 
 {'n_neighbors': 15, 'metric': 'cosine'}


##  SVC

In [75]:
from sklearn.svm import SVC

def SVC_classifier():
    model = SVC(kernel='rbf', C=1.5)
    return model

### Phase data:

In [76]:
params_svc_phase = {
    'C': [2, 3, 6, 8, 10, 13, 15, 20],
    'kernel': ['rbf'],
    'gamma': ['scale']
}

In [77]:
clear_session_and_set_seeds()
model = SVC_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_svc_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_phase, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   10.9s finished


RandomizedSearchCV(cv=3, estimator=SVC(C=1.5), n_iter=40,
                   param_distributions={'C': [2, 3, 6, 8, 10, 13, 15, 20],
                                        'gamma': ['scale'], 'kernel': ['rbf']},
                   random_state=42, scoring='accuracy', verbose=1)

In [78]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
4,rbf,scale,10,0.615907,1
3,rbf,scale,8,0.613224,2
7,rbf,scale,20,0.609176,3
2,rbf,scale,6,0.607837,4
5,rbf,scale,13,0.607826,5


In [79]:
dfd.tail(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
2,rbf,scale,6,0.607837,4
5,rbf,scale,13,0.607826,5
0,rbf,scale,2,0.607821,6
6,rbf,scale,15,0.603794,7
1,rbf,scale,3,0.599756,8


In [80]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6159070131905445 
 {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}


### Breathing data:

In [81]:
params_svc_breathing = {
    'C': [2, 3, 4, 5],
    'kernel': ['poly', 'rbf'],
    'gamma': ['scale', 'auto']
}

In [82]:
clear_session_and_set_seeds()
model = SVC_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_svc_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_breathing, train_y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.3s finished


RandomizedSearchCV(cv=3, estimator=SVC(C=1.5), n_iter=40,
                   param_distributions={'C': [2, 3, 4, 5],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['poly', 'rbf']},
                   random_state=42, scoring='accuracy', verbose=1)

In [83]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
3,rbf,auto,2,0.692651,1
1,rbf,scale,2,0.689957,2
9,rbf,scale,4,0.687252,3
11,rbf,auto,4,0.687247,4
13,rbf,scale,5,0.685908,5


In [84]:
dfd.tail(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
0,poly,scale,2,0.672429,12
10,poly,auto,4,0.671118,13
14,poly,auto,5,0.671102,14
2,poly,auto,2,0.671085,15
8,poly,scale,4,0.669752,16


In [85]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6926505158678333 
 {'kernel': 'rbf', 'gamma': 'auto', 'C': 2}


### Undercomplete encoded data:

In [86]:
params_svc_ae_enc = {
    'C': [0.1,1,3,3.5,4,4.5,5],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto']
}

In [87]:
clear_session_and_set_seeds()
model = SVC_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_svc_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_undercomplete_encoded, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    0.6s finished


RandomizedSearchCV(cv=3, estimator=SVC(C=1.5), n_iter=40,
                   param_distributions={'C': [0.1, 1, 3, 3.5, 4, 4.5, 5],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['rbf']},
                   random_state=42, scoring='accuracy', verbose=1)

In [88]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
5,rbf,auto,3,0.611826,1
3,rbf,auto,1,0.611804,2
2,rbf,scale,1,0.611799,3
4,rbf,scale,3,0.610476,4
12,rbf,scale,5,0.607777,5


In [89]:
dfd.tail(5)

Unnamed: 0,param_kernel,param_gamma,param_C,mean_test_score,rank_test_score
10,rbf,scale,4.5,0.602385,10
9,rbf,auto,4.0,0.601051,11
13,rbf,auto,5.0,0.60103,12
0,rbf,scale,0.1,0.557992,13
1,rbf,auto,0.1,0.552604,14


In [90]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6118257803317225 
 {'kernel': 'rbf', 'gamma': 'auto', 'C': 3}


## Random Forest

In [91]:
from sklearn.ensemble import RandomForestClassifier
def random_forest_classifier():
    model = RandomForestClassifier(random_state=42, max_features='log2', criterion='entropy', bootstrap=True)
    return model

### Phase data:

In [92]:
params_rf_phase = {
    'n_estimators': [170,180,190],
#     'criterion': ['entropy'],
    'max_depth': [50,100,None],
    'min_samples_split': [3,4],
    'min_samples_leaf': [1,2],
#     'max_features': ['log2'],
#     'bootstrap': [True],
    'oob_score': [False],
    'ccp_alpha': [0.001, 0.005]
}

In [93]:
clear_session_and_set_seeds()
model = random_forest_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_rf_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_phase, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    max_features='log2',
                                                    random_state=42),
                   n_iter=40,
                   param_distributions={'ccp_alpha': [0.001, 0.005],
                                        'max_depth': [50, 100, None],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [3, 4],
                                        'n_estimators': [170, 180, 190],
                                        'oob_score': [False]},
                   random_state=42, scoring='accuracy', verbose=1)

In [94]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split._)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
0,False,180,4,1,50.0,0.001,0.652322,1
21,False,180,4,1,100.0,0.001,0.652322,1
4,False,180,4,1,,0.001,0.652322,1
36,False,180,4,1,,0.005,0.646935,4
34,False,180,4,1,50.0,0.005,0.646935,4


In [95]:
dfd.tail(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
20,False,190,4,2,100.0,0.005,0.638849,36
30,False,190,4,2,,0.005,0.638849,36
16,False,190,3,2,100.0,0.005,0.638849,36
10,False,190,4,2,50.0,0.005,0.638849,36
23,False,190,3,2,50.0,0.005,0.638849,36


In [96]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6523224935788603 
 {'oob_score': False, 'n_estimators': 180, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 50, 'ccp_alpha': 0.001}


### Breathing data:

In [97]:
params_rf_breathing = {
    'n_estimators': [170,180,190],
#     'criterion': ['entropy'],
    'max_depth': [20,30,None],
    'min_samples_split': [2,3,4],
    'min_samples_leaf': [3],
#     'max_features': ['log2'],
#     'bootstrap': [True],
    'oob_score': [True, False],
    'ccp_alpha': [0.005, 0.015]
}

In [98]:
clear_session_and_set_seeds()
model = random_forest_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_rf_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_breathing, train_y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   43.9s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    max_features='log2',
                                                    random_state=42),
                   n_iter=40,
                   param_distributions={'ccp_alpha': [0.005, 0.015],
                                        'max_depth': [20, 30, None],
                                        'min_samples_leaf': [3],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [170, 180, 190],
                                        'oob_score': [True, False]},
                   random_state=42, scoring='accuracy', verbose=1)

In [99]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split._)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
33,False,180,4,3,30.0,0.005,0.722329,1
32,False,180,3,3,20.0,0.005,0.722329,1
29,True,180,3,3,,0.005,0.722329,1
7,False,180,3,3,,0.005,0.722329,1
28,False,180,4,3,20.0,0.005,0.722329,1


In [100]:
dfd.tail(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
16,True,180,2,3,20,0.015,0.704791,36
4,True,180,3,3,20,0.015,0.704791,36
10,False,180,3,3,30,0.015,0.704791,36
13,True,180,3,3,30,0.015,0.704791,36
15,False,180,4,3,20,0.015,0.704791,36


In [101]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.7223292412171869 
 {'oob_score': False, 'n_estimators': 180, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': None, 'ccp_alpha': 0.005}


### Undercomplete encoded data:

In [102]:
params_rf_ae_enc = {
    'n_estimators': [50,100,130],
#     'criterion': ['entropy'],
    'max_depth': [100,200,None],
    'min_samples_split': [2,3,4,5],
    'min_samples_leaf': [3,4,5],
#     'max_features': ['log2'],
#     'bootstrap': [True],
    'oob_score': [True, False],
    'ccp_alpha': [0.005, 0.015]
}

In [103]:
clear_session_and_set_seeds()
model = random_forest_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_rf_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_undercomplete_encoded, train_y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   29.0s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    max_features='log2',
                                                    random_state=42),
                   n_iter=40,
                   param_distributions={'ccp_alpha': [0.005, 0.015],
                                        'max_depth': [100, 200, None],
                                        'min_samples_leaf': [3, 4, 5],
                                        'min_samples_split': [2, 3, 4, 5],
                                        'n_estimators': [50, 100, 130],
                                        'oob_score': [True, False]},
                   random_state=42, scoring='accuracy', verbose=1)

In [104]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split._)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
11,True,130,5,5,100.0,0.005,0.615929,1
37,True,100,5,5,200.0,0.005,0.614568,2
8,False,100,2,5,,0.005,0.614568,2
4,True,100,3,4,,0.015,0.611875,4
31,False,100,4,4,,0.015,0.611875,4


In [105]:
dfd.tail(5)

Unnamed: 0,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_ccp_alpha,mean_test_score,rank_test_score
30,True,50,3,5,200.0,0.005,0.587605,35
12,False,50,3,5,,0.005,0.587605,35
22,True,50,5,5,,0.015,0.586277,38
6,True,50,3,5,,0.015,0.586277,38
34,False,50,5,5,,0.015,0.586277,38


In [106]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6159287797657916 
 {'oob_score': True, 'n_estimators': 130, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_depth': 100, 'ccp_alpha': 0.005}


## XGBoost

In [107]:
from xgboost import XGBClassifier

def XGBoost_classifier():
    model = XGBClassifier(random_state=42)
    return model

### Phase data:

In [123]:
params_xgb_phase = {
    'n_estimators': [50,100,130],
    'max_depth': [50,100,200],
    'booster': ['gbtree', 'dart']
}

In [124]:
clear_session_and_set_seeds()
model = XGBoost_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_xgb_phase, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_phase, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.3min finished


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=42, reg_alpha=None,
                            

In [125]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
0,50,50,gbtree,0.628036,1
15,50,200,dart,0.628036,1
3,50,100,gbtree,0.628036,1
6,50,200,gbtree,0.628036,1
9,50,50,dart,0.628036,1


In [126]:
dfd.tail(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
8,130,200,gbtree,0.614579,13
5,130,100,gbtree,0.614579,13
2,130,50,gbtree,0.614579,13
11,130,50,dart,0.614579,13
17,130,200,dart,0.614579,13


In [127]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6280364372469636 
 {'n_estimators': 50, 'max_depth': 50, 'booster': 'gbtree'}


### Breathing data:

In [153]:
params_xgb_breathing = {
    'n_estimators': [50,100,130],
    'max_depth': [2,3,4,5,6,9],
    'booster': ['gbtree', 'dart']
}

In [154]:
clear_session_and_set_seeds()
model = XGBoost_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_xgb_breathing, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_breathing, train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    5.3s finished


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=42, reg_alpha=None,
                            

In [155]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
26,130,4,dart,0.691377,1
8,130,4,gbtree,0.691377,1
6,50,4,gbtree,0.69,3
24,50,4,dart,0.69,3
4,100,3,gbtree,0.68599,5


In [156]:
dfd.tail(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
5,130,3,gbtree,0.6725,31
11,130,5,gbtree,0.671172,33
29,130,5,dart,0.671172,33
31,100,6,dart,0.665791,35
13,100,6,gbtree,0.665791,35


In [157]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.6913771712158808 
 {'n_estimators': 130, 'max_depth': 4, 'booster': 'gbtree'}


### Undercomplete encoded data:

In [160]:
params_xgb_ae_enc = {
    'n_estimators': [50,100,130],
    'max_depth': [2,3,4,5,6,9],
    'booster': ['gbtree', 'dart']
}

In [161]:
clear_session_and_set_seeds()
model = XGBoost_classifier()
random_search_model = RandomizedSearchCV(estimator=model, param_distributions=params_xgb_ae_enc, n_iter=40,
                                         cv=3, verbose=1, random_state=42, scoring="accuracy")

random_search_model.fit(train_undercomplete_encoded, train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    7.0s finished


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=42, reg_alpha=None,
                            

In [162]:
df = pandas.DataFrame(random_search_model.cv_results_)
dfd = df.drop(df.filter(regex="(time)|(split)|(std_test_score)|(params)"), axis=1)
dfd = dfd.sort_values('rank_test_score')
dfd.head(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
28,100,5,dart,0.590293,1
10,100,5,gbtree,0.590293,1
29,130,5,dart,0.588955,3
11,130,5,gbtree,0.588955,3
27,50,5,dart,0.588949,5


In [163]:
dfd.tail(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_booster,mean_test_score,rank_test_score
19,100,2,dart,0.568771,31
24,50,4,dart,0.564701,33
6,50,4,gbtree,0.564701,33
12,50,6,gbtree,0.559298,35
30,50,6,dart,0.559298,35


In [164]:
print(random_search_model.best_score_, "\n", random_search_model.best_params_)

0.5902931957685777 
 {'n_estimators': 100, 'max_depth': 5, 'booster': 'gbtree'}
