# Machine learning and deep learning methods applied to predicting customer status

Purpose: Predict active and inactive clients based on the proposed data structure and the naive data structure. Here, we implement Support Vector Machines (SVM), Random Forest (RF), K-Nearest Neighbours (KNN) and Lasso.

Author: Gabriel Rodrigues Palma and Rafael de Andrade Moral

# Packages used in the project

In [1]:
# visualisation modules
import matplotlib.pyplot as plt

# Data manipulation modules
import numpy as np
import pandas as pd

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Deep learning modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Machine learning packages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import collections
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import LeaveOneOut

# Additional packages
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Conv2DTranspose, UpSampling2D, Flatten, Reshape
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import L1
from keras import metrics
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
import tensorflow as tf

# Testing GPU from macOS: print how many GPU devices TensorFlow can see
# (0 means the models will run on the CPU).
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# Functions used in the project

In [22]:
def create_dataset(path, response_class):
    """Read a CSV file and prepare it for the ML and DL methods.

    The columns 'Unnamed: 0' and 'Subject' are dropped, the column named by
    ``response_class`` is separated from the remaining (explanatory) columns,
    and the response is encoded in two ways.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    response_class : str
        Name of the column holding the class labels.

    Returns
    -------
    tuple
        ``(explanatory_variables, hot_encode_response_variable,
        response_variables)`` where the first element is a DataFrame with the
        explanatory columns, the second is the dense one-hot encoding of the
        binarized labels and the third is the output of ``LabelBinarizer``.
    """
    data = pd.read_csv(path).drop(columns=['Unnamed: 0', 'Subject'])
    explanatory_variables = data.drop(columns=response_class)

    response_variables = LabelBinarizer().fit_transform(data[response_class])

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; fall back to the old keyword on releases that predate it.
    try:
        onehot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        onehot_encoder = OneHotEncoder(sparse=False)

    # The binarized labels are reshaped to a single column before encoding.
    hot_encode_response_variable = onehot_encoder.fit_transform(
        np.array(response_variables).reshape(-1, 1))

    return(explanatory_variables,
           hot_encode_response_variable, response_variables)

def check_zero_division_and_get_rates(cm):
    """Compute the true and false positive rates from a confusion matrix.

    Row 0 of ``cm`` yields the false positive rate and row 1 the true positive
    rate; both are the share of the row that falls in column 1. When any row
    sums to zero a tiny constant is added to the denominators so the division
    is always defined. A NaN true positive rate is replaced by 0.

    Returns
    -------
    tuple
        ``(tpr, fpr)``.
    """
    row_totals = np.sum(cm, axis=1)
    predicted_positive = cm[:, 1]

    if not any(row_totals == 0):
        ratios = predicted_positive / row_totals
        tpr, fpr = ratios[1], ratios[0]
    else:
        # At least one class has no observations: guard the denominators.
        epsilon = 1e-16
        tpr = predicted_positive[1] / (row_totals[1] + epsilon)
        fpr = predicted_positive[0] / (row_totals[0] + epsilon)

    return ((0 if np.isnan(tpr) else tpr), fpr)

def check_and_compute_rates(predictions,
                            classes,
                            cm):
    """Return ``(tpr, fpr)``, handling the degenerate single-class cases.

    Two situations are answered directly without looking at ``cm``:

    * predictions and classes both sum to zero -> ``(0, 0)``;
    * predictions and classes both sum to their length -> ``(1, 0)``.

    Every other input is delegated to ``check_zero_division_and_get_rates``
    together with the confusion matrix ``cm``.
    """
    predicted_total = sum(predictions)

    if predicted_total == 0 and sum(classes) == 0:
        return (0, 0)

    if predicted_total == len(predictions) and sum(classes) == len(classes):
        return (1, 0)

    tpr, fpr = check_zero_division_and_get_rates(cm)
    return (tpr, fpr)
    
def get_rates(y_pred,
              y_true):
    """Return the true and false positive rates of the predicted labels.

    The confusion matrix of ``y_true`` against ``y_pred`` is built first and
    then handed, with the labels, to ``check_and_compute_rates``.

    Returns
    -------
    tuple
        ``(tpr, fpr)``.
    """
    matrix = confusion_matrix(y_pred=y_pred, y_true=y_true)
    true_positive_rate, false_positive_rate = check_and_compute_rates(y_pred,
                                                                      y_true,
                                                                      matrix)
    return (true_positive_rate, false_positive_rate)

def get_rates_by_cross_validation(raw_data):
    """Average the true and false positive rates over a 5-fold cross validation.

    NOTE(review): this function relies on names that are not defined in the
    visible part of the file (``patterns``, ``classes``, ``pbp_prediction``,
    ``clustered_patterns``, ``d_base``, ``alpha`` and ``prediction``). They
    must exist at module level before the function is called -- TODO confirm
    whether this function is still needed.

    Parameters
    ----------
    raw_data
        Not used by the body; kept so the signature stays unchanged.

    Returns
    -------
    tuple
        ``(mean_tpr, mean_fpr)`` across the folds, ignoring NaN entries.
    """
    # The per-fold rates are accumulated locally.
    tpr, fpr = [], []

    for train_index, test_index in KFold(n_splits=5, shuffle=True).split(patterns):

        x_train, x_test = patterns[train_index], patterns[test_index]
        y_train, y_test = classes[train_index], classes[test_index]

        pbp_predictions = pbp_prediction(patterns_array=x_test,
                                         clustered_patterns=clustered_patterns,
                                         d_base=d_base, alpha=alpha,
                                         outbreak_p_means=prediction.obtain_p_means_with_distance,
                                         outbreak_prediction=prediction.predict_with_distance)
        # get_rates expects the keywords y_pred / y_true.
        rates = get_rates(y_pred=pbp_predictions, y_true=y_test)
        tpr.append(rates[0])
        fpr.append(rates[1])

    # nanmean skips NaN entries; comparing a list with np.nan does not filter it.
    return(np.nanmean(tpr), np.nanmean(fpr))

def get_statistics(y_pred, y_true):
    """Summarise a set of predictions with four statistics.

    Returns
    -------
    list
        ``[accuracy, true positive rate, false positive rate, AUROC]``.
    """
    area_under_curve = roc_auc_score(y_true=y_true, y_score=y_pred)
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    true_positive_rate, false_positive_rate = get_rates(y_pred=y_pred,
                                                        y_true=y_true)
    return [accuracy, true_positive_rate, false_positive_rate, area_under_curve]

# def get_methods_performance(X_train, X_test, 
#                             y_train, y_test, 
#                             raw_data):
#     ''' This function obtain the performance of each selected model'''
#     kf = KFold(n_splits = 5, random_state = 42, shuffle = True)
#     # Support Vector Machine -----
#     ## Non-linear
#     nonlinear_svm = svm.NuSVC(gamma="auto")
#     # Create the parameter space
#     params = {"kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
#               "gamma": ['auto', 'scale'],
#              "degree": np.arange(1, 10, 1)}
#     nonlinear_svm_cv = RandomizedSearchCV(nonlinear_svm, params, cv=kf)
#     nonlinear_svm_cv.fit(X_train, y_train)
#     nonlinear_svm_predictions = nonlinear_svm_cv.predict(X_test)
    
#     ## Polinomial
#     polinomial_svm = svm.SVC(kernel='poly', degree = 3)
#     polinomial_svm.fit(X_train, y_train)
#     polinomial_svm_predictions = polinomial_svm.predict(X_test)
    
#     # Random forest -----
#     rf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators = 1000)
#     rf.fit(X_train, y_train)
#     random_forest_predictions = rf.predict(X_test)
    
#     # KNN -----
#     knn = KNeighborsClassifier(n_neighbors=1)
#     knn.fit(X_train, y_train)
#     knn_predictions = knn.predict(X_test)
    
#     # Lasso -----
#     lasso = Lasso(alpha=0.02)
#     lasso.fit(X_train, y_train)
#     lasso_predictions = np.round(lasso.predict(X_test))
    
#     # Deep Neural Network -----
# #     if raw_data==True:
# #         dnn = keras.models.load_model('DNN/DNN_model_params.h5')
# #         dnn_predictions = np.round(dnn.predict(X_test))
# #     elif raw_data == 'combined':
# #         dnn = keras.models.load_model('DNN/DNN_model_fulldata.h5')
# #         dnn_predictions = np.round(dnn.predict(X_test))
# #     else:    
# #         dnn = keras.models.load_model('DNN/DNN_model_hmm.h5')
# #         dnn_predictions = np.round(dnn.predict(X_test))

    
#     # Obtaining statistics -----
#     polynomial_svm_performance = get_statistics(y_pred = polinomial_svm_predictions,
#                                         y_true = y_test)
#     nonlinear_svm_performance = get_statistics(y_pred = nonlinear_svm_predictions,
#                                        y_true = y_test)
#     random_forest_performance = get_statistics(y_pred = random_forest_predictions,
#                                        y_true = y_test)
#     knn_performance = get_statistics(y_pred = knn_predictions,
#                                        y_true = y_test)    
#     lasso_performance = get_statistics(y_pred = lasso_predictions,
#                                        y_true = y_test)
# #     dnn_performance = get_statistics(y_pred = dnn_predictions,
# #                                        y_true = y_test)

#     return(polynomial_svm_performance, nonlinear_svm_performance, 
#            random_forest_performance, knn_performance, 
#            lasso_performance)
    
# def get_results_data(explanatory_variables,                             
#                      response_variables,                                                         
#                      test_size, raw_data):
#     ''' This function obtain the performance of the ML and DL methods based on
#        the prediction of client status'''
#     X_train, X_test, y_train, y_test = train_test_split(explanatory_variables, 
#                                                         response_variables, 
#                                                         test_size = test_size, 
#                                                         random_state = 42)
   
#     # Obtaining methods performance
#     polynomial_svm_performance, nonlinear_svm_performance, \
#     random_forest_performance, knn_performance, \
#     lasso_performance = get_methods_performance(X_train, X_test, 
#                                               y_train, y_test, 
#                                               raw_data)
        
    
#     model_outputs_performance = pd.DataFrame({'Statistics':['Accuracy', 
#                                                             'True Positive Rate', 
#                                                             'False Positive Rate', 
#                                                             'AUROC'],
#                                               'Polinomial SVM': [polynomial_svm_performance[0], 
#                                                                  polynomial_svm_performance[1], 
#                                                                  polynomial_svm_performance[2],
#                                                                 polynomial_svm_performance[3]], 
#                                               'Non linear SVM': [nonlinear_svm_performance[0], 
#                                                                  nonlinear_svm_performance[1], 
#                                                                  nonlinear_svm_performance[2], 
#                                                                 nonlinear_svm_performance[3]], 
#                                               'Random Forest': [random_forest_performance[0], 
#                                                                 random_forest_performance[1], 
#                                                                 random_forest_performance[2], 
#                                                                random_forest_performance[3]], 
#                                               'KNN': [knn_performance[0], 
#                                                       knn_performance[1], 
#                                                       knn_performance[2], 
#                                                       knn_performance[3]],                                                
#                                               'Lasso': [lasso_performance[0], 
#                                                         lasso_performance[1], 
#                                                         lasso_performance[2], 
#                                                         lasso_performance[3]]})

#     return(model_outputs_performance)
    

In [23]:
def get_methods_predictions(X_train, X_test, 
                            y_train, y_test, 
                            raw_data):
    """Fit every selected model on the training data and predict the test data.

    Parameters
    ----------
    X_train, X_test
        Explanatory variables used for fitting and for predicting.
    y_train
        Labels used for fitting.
    y_test
        Not used by the body; kept so existing callers keep working.
    raw_data
        Only referenced by the disabled deep neural network section below.

    Returns
    -------
    tuple
        Predictions for ``X_test`` in the order: polynomial SVM, NuSVC,
        random forest, KNN, Lasso.
    """
    # Support Vector Machine -----
    ## Non-linear
    nonlinear_svm = svm.NuSVC()
    nonlinear_svm.fit(X_train, y_train)
    nonlinear_svm_predictions = nonlinear_svm.predict(X_test)

    ## Polynomial
    polinomial_svm = svm.SVC(kernel='poly', degree=3)
    polinomial_svm.fit(X_train, y_train)
    polinomial_svm_predictions = polinomial_svm.predict(X_test)

    # Random forest -----
    rf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators=1000)
    rf.fit(X_train, y_train)
    random_forest_predictions = rf.predict(X_test)

    # KNN -----
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train, y_train)
    knn_predictions = knn.predict(X_test)

    # Lasso -----
    # Lasso is a regression model, so its output is rounded to obtain labels.
    # NOTE(review): the rounded values are not clipped, so labels outside
    # {0, 1} are possible -- confirm this is acceptable.
    lasso = Lasso(alpha=0.02)
    lasso.fit(X_train, y_train)
    lasso_predictions = np.round(lasso.predict(X_test))

    # Deep Neural Network ----- (currently disabled)
#     if raw_data==True:
#         dnn = keras.models.load_model('DNN/DNN_model_params.h5')
#         dnn_predictions = np.round(dnn.predict(X_test_scale))
#     elif raw_data == 'combined':
#         dnn = keras.models.load_model('DNN/DNN_model_fulldata.h5')
#         dnn_predictions = np.round(dnn.predict(X_test_scale))
#     else:    
#         dnn = keras.models.load_model('DNN/DNN_model_hmm.h5')
#         dnn_predictions = np.round(dnn.predict(X_test_scale))

    return(polinomial_svm_predictions, nonlinear_svm_predictions, 
           random_forest_predictions, knn_predictions, 
           lasso_predictions)
def get_methods_predictions_loo(polinomial_svm_predictions, nonlinear_svm_predictions, 
                                random_forest_predictions, knn_predictions, 
                                lasso_predictions, dnn_predictions, 
                                reponse_variable):
    """Turn the leave-one-out predictions of each method into statistics.

    Each list of predictions is compared against ``reponse_variable`` with
    ``get_statistics``. ``dnn_predictions`` is accepted but not evaluated
    because the deep neural network is currently disabled.

    Returns
    -------
    tuple
        The ``get_statistics`` result for polynomial SVM, non-linear SVM,
        random forest, KNN and Lasso, in that order.
    """
    evaluated_methods = (polinomial_svm_predictions,
                         nonlinear_svm_predictions,
                         random_forest_predictions,
                         knn_predictions,
                         lasso_predictions)

    return tuple(get_statistics(y_pred=method_predictions,
                                y_true=reponse_variable)
                 for method_predictions in evaluated_methods)

    
def get_results_data_loo(explanatory_variables,                         
                         response_variables,                                                         
                         raw_data):
    """Evaluate every method with leave-one-out cross validation.

    For each split the models are refitted on all observations but one and
    the prediction for the held-out observation is stored. Once every
    observation has been predicted, the statistics of each method are
    computed and arranged in a table.

    Returns
    -------
    pandas.DataFrame
        One row per statistic (Accuracy, True Positive Rate, False Positive
        Rate, AUROC) and one column per method.
    """
    splitter = LeaveOneOut()
    splitter.get_n_splits(explanatory_variables)  # return value is not used

    method_names = ('Polinomial SVM', 'Non linear SVM',
                    'Random Forest', 'KNN', 'Lasso')
    held_out_predictions = [[] for _ in method_names]
    dnn_predictions = []  # stays empty while the DNN is disabled

    for train_index, test_index in splitter.split(explanatory_variables):
        X_train = explanatory_variables[train_index]
        X_test = explanatory_variables[test_index]

        # ravel() flattens the labels into the 1-d layout the models expect
        y_train = response_variables[train_index].ravel()
        y_test = response_variables[test_index].ravel()

        fold_predictions = get_methods_predictions(X_train, X_test,
                                                   y_train, y_test,
                                                   raw_data)

        # Each fold has a single test observation: keep its prediction only
        for store, prediction in zip(held_out_predictions, fold_predictions):
            store.append(prediction[0])

    performances = get_methods_predictions_loo(*held_out_predictions,
                                               dnn_predictions,
                                               list(response_variables.flatten()))

    table = {'Statistics': ['Accuracy',
                            'True Positive Rate',
                            'False Positive Rate',
                            'AUROC']}
    for method_name, performance in zip(method_names, performances):
        table[method_name] = [performance[position] for position in range(4)]

    return pd.DataFrame(table)

# Importing Datasets

Before obtaining the performance of the learning algorithms selected for this paper, we need to import the datasets related to the Hidden Markov Models features and the peak features.

## Raw Midline EEG data Trial 1

In [55]:
# Load the raw midline EEG features of Trial 1
(all_raw_explanatory_variables_t1,
 all_raw_hot_encode_response_variable_t1,
 all_raw_response_variable_t1) = create_dataset(
    path='../output_data/New_data/SplitDatasets/RawMidlineTrial1.csv',
    response_class='features_class')

In [29]:
# Preview the first rows of the raw Trial 1 explanatory variables
all_raw_explanatory_variables_t1.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,1.085978e-07,-1.050929e-07,8.29414e-07,-5.906837e-10,0.868,0.604,0.129,0.074,0.5159,0.4841,...,0.022,0.116,0.032,0.047,0.025,0.180843,0.240996,0.243103,0.144828,0.19023
1,1.080854e-07,-1.045971e-07,8.527908e-07,-6.459901e-10,0.59,0.899,0.096,0.184,0.530876,0.469124,...,0.224,0.033,0.046,0.051,0.03,0.156503,0.214782,0.282709,0.138557,0.207449
2,-6.47864e-08,6.269551e-08,6.327935e-07,-5.413374e-10,0.709,0.466,0.127,0.065,0.404959,0.595041,...,0.036,0.032,0.12,0.021,0.018,0.229291,0.181818,0.179896,0.210456,0.198539
3,-1.573635e-09,1.522849e-09,6.501585e-07,-4.326145e-10,0.699,0.481,0.089,0.067,0.440678,0.559322,...,0.021,0.072,0.028,0.019,0.044,0.190293,0.208398,0.214176,0.224769,0.162365
4,2.00196e-07,-1.93735e-07,9.778229e-07,-7.200449e-10,0.722,1.087,0.105,0.164,0.627565,0.372435,...,0.048,0.15,0.056,0.03,0.027,0.237708,0.252226,0.111692,0.255517,0.142857


## Raw Midline EEG data Trial 12

In [32]:
# Load the raw midline EEG features of Trial 12
(all_raw_explanatory_variables_t12,
 all_raw_hot_encode_response_variable_t12,
 all_raw_response_variable_t12) = create_dataset(
    path='../output_data/New_data/SplitDatasets/RawMidlineTrial12.csv',
    response_class='features_class')

In [33]:
# Preview the first rows of the raw Trial 12 explanatory variables
all_raw_explanatory_variables_t12.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,1.156881e-07,-1.077369e-07,9.16539e-07,-7.693143e-10,0.959,0.677,0.146,0.095,0.315457,0.684543,...,0.034,0.015,0.056,0.027,0.133,0.194532,0.219769,0.256572,0.170347,0.15878
1,9.669676e-08,-9.005086e-08,8.743374e-07,-7.271431e-10,0.62,1.054,0.058,0.297,0.590447,0.409553,...,0.02,0.029,0.015,0.088,0.305,0.14939,0.257114,0.183943,0.295732,0.113821
2,-4.570386e-08,4.256267e-08,5.965999e-07,-4.592086e-10,0.425,0.694,0.044,0.18,0.470644,0.529356,...,0.031,0.007,0.013,0.052,0.182,0.129735,0.188447,0.133523,0.357008,0.191288
3,1.880012e-08,-1.7508e-08,7.036014e-07,-5.416457e-10,0.787,0.513,0.118,0.062,0.423729,0.576271,...,0.011,0.027,0.018,0.052,0.095,0.09545,0.32917,0.11686,0.266726,0.191793
4,1.369923e-07,-1.27577e-07,9.263932e-07,-7.455344e-10,1.006,0.643,0.159,0.118,0.412071,0.587929,...,0.044,0.052,0.015,0.022,0.136,0.262227,0.234131,0.123829,0.138398,0.241415


## Minmax scale Midline EEG data Trial 1

In [34]:
# Load the min-max scaled midline EEG features of Trial 1
(all_Minmax_scale_explanatory_variables_t1,
 all_Minmax_scale_hot_encode_response_variable_t1,
 all_Minmax_scale_response_variable_t1) = create_dataset(
    path='../output_data/New_data/SplitDatasets/MinmaxscaleTrial1.csv',
    response_class='features_class')

In [35]:
# Preview the first rows of the min-max scaled Trial 1 explanatory variables
all_Minmax_scale_explanatory_variables_t1.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,0.052283,-10.740852,0.329249,-0.000296,0.348,0.216,0.065,0.037,0.5159,0.4841,...,0.011,0.058,0.016,0.024,0.012,0.180843,0.240996,0.243103,0.144828,0.19023
1,0.052178,-10.722025,0.340976,-0.000324,0.209,0.364,0.048,0.092,0.530876,0.469124,...,0.112,0.016,0.023,0.026,0.015,0.156503,0.214782,0.282709,0.138557,0.207449
2,-0.030231,6.213944,0.230614,-0.000272,0.269,0.147,0.064,0.032,0.404959,0.595041,...,0.018,0.016,0.06,0.01,0.009,0.229291,0.181818,0.179896,0.210456,0.198539
3,-0.000309,0.064725,0.239325,-0.000217,0.264,0.155,0.045,0.034,0.440678,0.559322,...,0.011,0.036,0.014,0.009,0.022,0.190485,0.208398,0.214176,0.224576,0.162365
4,0.094943,-19.507132,0.403699,-0.000361,0.275,0.459,0.053,0.082,0.627565,0.372435,...,0.024,0.075,0.028,0.015,0.014,0.237708,0.252226,0.111692,0.255517,0.142857


## Minmax scale Midline EEG data Trial 12

In [36]:
# Load the min-max scaled midline EEG features of Trial 12
(all_Minmax_scale_explanatory_variables_t12,
 all_Minmax_scale_hot_encode_response_variable_t12,
 all_Minmax_scale_response_variable_t12) = create_dataset(
    path='../output_data/New_data/SplitDatasets/MinmaxscaleTrial12.csv',
    response_class='features_class')

In [37]:
# Preview the first rows of the min-max scaled Trial 12 explanatory variables
all_Minmax_scale_explanatory_variables_t12.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,0.06113,0.70504,0.372955,-0.000386,0.394,0.253,0.073,0.048,0.315457,0.684543,...,0.017,0.008,0.028,0.014,0.067,0.194532,0.219769,0.256572,0.170347,0.15878
1,0.020163,-88.391867,0.351785,-0.000365,0.224,0.442,0.029,0.149,0.590447,0.409553,...,0.01,0.014,0.008,0.044,0.153,0.14939,0.257114,0.183943,0.295732,0.113821
2,-0.044913,-101.184206,0.212457,-0.00023,0.126,0.261,0.022,0.09,0.470644,0.529356,...,0.016,0.004,0.007,0.026,0.091,0.129735,0.188447,0.133523,0.357008,0.191288
3,-0.001816,-48.01673,0.266135,-0.000272,0.308,0.171,0.059,0.031,0.423729,0.576271,...,0.006,0.013,0.009,0.026,0.048,0.09545,0.32917,0.11686,0.266726,0.191793
4,0.037201,-95.792486,0.377899,-0.000374,0.418,0.236,0.08,0.059,0.412071,0.587929,...,0.022,0.026,0.007,0.011,0.068,0.262227,0.234131,0.123829,0.138398,0.241415


## Zscored scale Midline EEG data Trial 1

In [38]:
# Load the z-scored midline EEG features of Trial 1
(all_Zscored_scale_explanatory_variables_t1,
 all_Zscored_scale_hot_encode_response_variable_t1,
 all_Zscored_scale_response_variable_t1) = create_dataset(
    path='../output_data/New_data/SplitDatasets/ZscoredScaledTrial1.csv',
    response_class='features_class')

In [39]:
# Preview the first rows of the z-scored Trial 1 explanatory variables
all_Zscored_scale_explanatory_variables_t1.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,0.430807,-9.240701,0.899261,-0.002411,0.348,0.216,0.065,0.037,0.5159,0.4841,...,0.011,0.058,0.016,0.024,0.012,0.180843,0.240996,0.243103,0.144828,0.19023
1,0.379726,-28.295271,0.994679,-0.002637,0.209,0.364,0.048,0.092,0.530876,0.469124,...,0.112,0.016,0.023,0.026,0.015,0.156503,0.214782,0.282709,0.138557,0.207449
2,-0.19053,30.395558,0.096708,-0.00221,0.269,0.147,0.064,0.032,0.404959,0.595041,...,0.018,0.016,0.06,0.01,0.009,0.229291,0.181818,0.179896,0.210456,0.198539
3,0.018348,11.094229,0.167587,-0.001766,0.264,0.155,0.045,0.034,0.440678,0.559322,...,0.011,0.036,0.014,0.009,0.022,0.190485,0.208398,0.214176,0.224576,0.162365
4,0.738628,-31.642053,1.505026,-0.002939,0.275,0.459,0.053,0.082,0.627565,0.372435,...,0.024,0.075,0.028,0.015,0.014,0.237708,0.252226,0.111692,0.255517,0.142857


## Zscored scale Midline EEG data Trial 12

In [40]:
# Load the z-scored midline EEG features of Trial 12
(all_Zscored_scale_explanatory_variables_t12,
 all_Zscored_scale_hot_encode_response_variable_t12,
 all_Zscored_scale_response_variable_t12) = create_dataset(
    path='../output_data/New_data/SplitDatasets/ZscoredScaledTrial12.csv',
    response_class='features_class')

In [41]:
# Preview the first rows of the z-scored Trial 12 explanatory variables
all_Zscored_scale_explanatory_variables_t12.head()

Unnamed: 0,MM_Intercept,MM_Hess,average_peak_magnitude,average_peak_curvature,hmm2_mean.1,hmm2_mean.2,hmm2_sd.1,hmm2_sd.2,hmm2_p1,hmm2_p2,...,hmm5_sd.1,hmm5_sd.2,hmm5_sd.3,hmm5_sd.4,hmm5_sd.5,hmm5_p1,hmm5_p2,hmm5_p3,hmm5_p4,hmm5_p5
0,0.505831,3.734294,1.254882,-0.00314,0.394,0.253,0.073,0.048,0.315457,0.684543,...,0.017,0.008,0.028,0.014,0.067,0.194532,0.219769,0.256572,0.170347,0.15878
1,0.154855,-90.618525,1.082626,-0.002968,0.224,0.442,0.029,0.149,0.590447,0.409553,...,0.01,0.014,0.008,0.044,0.153,0.14939,0.257114,0.183943,0.295732,0.113821
2,-0.381763,-108.664622,-0.051024,-0.001874,0.126,0.261,0.022,0.09,0.470644,0.529356,...,0.016,0.004,0.007,0.026,0.091,0.129735,0.188447,0.133523,0.357008,0.191288
3,-0.024115,-51.478291,0.385727,-0.002211,0.308,0.171,0.059,0.031,0.423729,0.576271,...,0.006,0.013,0.009,0.026,0.048,0.09545,0.32917,0.11686,0.266726,0.191793
4,0.289524,-99.254789,1.295104,-0.003043,0.418,0.236,0.08,0.059,0.412071,0.587929,...,0.022,0.026,0.007,0.011,0.068,0.262227,0.234131,0.123829,0.138398,0.241415


# Obtaining learning algorithms performance based on Leave one out cross validation

Now, we obtain the accuracy, true and false positive rates, and AUROC for the polynomial and non-linear Support Vector Machines, Random Forest, K-nearest neighbours and Lasso learning algorithms.

## Raw Midline EEG data Trial 1

For the raw Midline EEG data, we are going to train the models and analyse the variables in the following way:

    1. All the features;
    2. The features for HMM with 2 parameters;
    3. The features for HMM with 3 parameters;
    4. The features for HMM with 4 parameters;
    5. The features for HMM with 5 parameters;

### All the features

In [42]:
# Leave-one-out performance of every method using all the raw Trial 1 features
get_results_data_loo(explanatory_variables = np.array(all_raw_explanatory_variables_t1),
                     response_variables = all_raw_response_variable_t1,
                     raw_data = True)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.297872,0.446809,0.382979,0.425532,0.489362
1,True Positive Rate,0.227273,0.409091,0.272727,0.227273,0.454545
2,False Positive Rate,0.64,0.52,0.52,0.4,0.48
3,AUROC,0.293636,0.444545,0.376364,0.413636,0.487273


### The features for HMM with 2 parameters

In [43]:
# Peak features plus the features of the 2-component HMM
selected_columns_m2 = [
    'MM_Intercept',
    'MM_Hess',
    'average_peak_magnitude',
    'average_peak_curvature',
    'hmm2_mean.1',
    'hmm2_mean.2',
    'hmm2_sd.1',
    'hmm2_sd.2',
    'hmm2_p1',
]
# Boolean mask flagging, column by column, whether it belongs to the selection
selected_columns_m2_array = [column_name in selected_columns_m2
                             for column_name in all_raw_explanatory_variables_t1.columns]

In [44]:
# Keep only the columns selected for the 2-component HMM
all_raw_explanatory_variables_t1_m2 = all_raw_explanatory_variables_t1.loc[:, selected_columns_m2]

In [45]:
# Leave-one-out performance of every method using the HMM-2 feature subset
get_results_data_loo(explanatory_variables = np.array(all_raw_explanatory_variables_t1_m2),
                     response_variables = all_raw_response_variable_t1,
                     raw_data = True)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.425532,0.531915,0.446809,0.446809,0.531915
1,True Positive Rate,0.227273,0.409091,0.363636,0.5,0.0
2,False Positive Rate,0.4,0.36,0.48,0.6,0.0
3,AUROC,0.413636,0.524545,0.441818,0.45,0.5


### The features for HMM with 3 parameters

In [46]:
# Peak features plus the features of the 3-component HMM
selected_columns_m3 = [
    'MM_Intercept',
    'MM_Hess',
    'average_peak_magnitude',
    'average_peak_curvature',
    'hmm3_mean.1',
    'hmm3_mean.2',
    'hmm3_mean.3',
    'hmm3_sd.1',
    'hmm3_sd.2',
    'hmm3_sd.3',
    'hmm3_p1',
    'hmm3_p2',
]
# Boolean mask flagging, column by column, whether it belongs to the selection
selected_columns_m3_array = [column_name in selected_columns_m3
                             for column_name in all_raw_explanatory_variables_t1.columns]

In [47]:
# Keep only the columns selected for the 3-component HMM
all_raw_explanatory_variables_t1_m3 = all_raw_explanatory_variables_t1.loc[:, selected_columns_m3]

In [48]:
# Run get_results_data_loo on the hmm3 feature subset of the t1 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t1_m3),
    response_variables=all_raw_response_variable_t1,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.212766,0.553191,0.382979,0.617021,0.531915
1,True Positive Rate,0.0,0.409091,0.272727,0.590909,0.0
2,False Positive Rate,0.6,0.32,0.52,0.36,0.0
3,AUROC,0.2,0.544545,0.376364,0.615455,0.5


### The features for HMM with 4 parameters

In [49]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm4_* columns.
selected_columns_m4 = ['MM_Intercept', 'MM_Hess', 'average_peak_magnitude', 
                       'average_peak_curvature', 'hmm4_mean.1', 'hmm4_mean.2', 'hmm4_mean.3',
                       'hmm4_mean.4', 'hmm4_sd.1', 'hmm4_sd.2', 'hmm4_sd.3', 'hmm4_sd.4',
                       'hmm4_p1', 'hmm4_p2', 'hmm4_p3']
# Boolean mask over the columns of all_raw_explanatory_variables_t1.
# Built from selected_columns_m4 (not selected_columns_m3) so that the mask
# flags exactly the columns of this subset.
selected_columns_m4_array = [elem in selected_columns_m4 for elem in all_raw_explanatory_variables_t1.columns]

In [50]:
# Keep every row, restricted to the columns listed in selected_columns_m4.
all_raw_explanatory_variables_t1_m4 = all_raw_explanatory_variables_t1.loc[:, selected_columns_m4]

In [51]:
# Run get_results_data_loo on the hmm4 feature subset of the t1 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t1_m4),
    response_variables=all_raw_response_variable_t1,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.574468,0.595745,0.510638,0.553191,0.489362
1,True Positive Rate,0.772727,0.636364,0.454545,0.5,0.454545
2,False Positive Rate,0.6,0.44,0.44,0.4,0.48
3,AUROC,0.586364,0.598182,0.507273,0.55,0.487273


### The features for HMM with 5 parameters

In [56]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm5_* columns.
selected_columns_m5 = [
    'MM_Intercept', 'MM_Hess',
    'average_peak_magnitude', 'average_peak_curvature',
    'hmm5_mean.1', 'hmm5_mean.2', 'hmm5_mean.3', 'hmm5_mean.4', 'hmm5_mean.5',
    'hmm5_sd.1', 'hmm5_sd.2', 'hmm5_sd.3', 'hmm5_sd.4', 'hmm5_sd.5',
    'hmm5_p1', 'hmm5_p2', 'hmm5_p3', 'hmm5_p4',
]
# Boolean mask over the columns of all_raw_explanatory_variables_t1:
# True where the column belongs to the subset above.
selected_columns_m5_array = [
    column_name in selected_columns_m5
    for column_name in all_raw_explanatory_variables_t1.columns
]

In [57]:
# Keep every row, restricted to the columns listed in selected_columns_m5.
all_raw_explanatory_variables_t1_m5 = all_raw_explanatory_variables_t1.loc[:, selected_columns_m5]

In [58]:
# Run get_results_data_loo on the hmm5 feature subset of the t1 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t1_m5),
    response_variables=all_raw_response_variable_t1,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.510638,0.404255,0.340426,0.382979,0.319149
1,True Positive Rate,0.409091,0.409091,0.272727,0.363636,0.0
2,False Positive Rate,0.4,0.6,0.6,0.6,0.4
3,AUROC,0.504545,0.404545,0.336364,0.381818,0.3


## Raw Midline EEG data Trial 12

For the Minmax scale EEG data, we are going to train the models and analyse the variables in the following way:

    1. All the features;
    2. The features for HMM with 2 parameters;
    3. The features for HMM with 3 parameters;
    4. The features for HMM with 4 parameters;
    5. The features for HMM with 5 parameters;

### All the features

In [66]:
# Run get_results_data_loo on every column of the t12 data; the returned
# per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t12),
    response_variables=all_raw_response_variable_t12,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.468085,0.553191,0.510638,0.425532,0.531915
1,True Positive Rate,0.318182,0.5,0.318182,0.227273,0.363636
2,False Positive Rate,0.4,0.4,0.32,0.4,0.32
3,AUROC,0.459091,0.55,0.499091,0.413636,0.521818


### The features for HMM with 2 parameters

In [67]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm2_* columns.
selected_columns_m2 = [
    'MM_Intercept',
    'MM_Hess',
    'average_peak_magnitude',
    'average_peak_curvature',
    'hmm2_mean.1',
    'hmm2_mean.2',
    'hmm2_sd.1',
    'hmm2_sd.2',
    'hmm2_p1',
]
# Boolean mask over the columns of all_raw_explanatory_variables_t12:
# True where the column belongs to the subset above.
selected_columns_m2_array = [
    column_name in selected_columns_m2
    for column_name in all_raw_explanatory_variables_t12.columns
]

In [68]:
# Keep every row, restricted to the columns listed in selected_columns_m2.
all_raw_explanatory_variables_t12_m2 = all_raw_explanatory_variables_t12.loc[:, selected_columns_m2]

In [69]:
# Run get_results_data_loo on the hmm2 feature subset of the t12 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t12_m2),
    response_variables=all_raw_response_variable_t12,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.340426,0.489362,0.382979,0.319149,0.531915
1,True Positive Rate,0.090909,0.5,0.318182,0.272727,0.0
2,False Positive Rate,0.44,0.52,0.56,0.64,0.0
3,AUROC,0.325455,0.49,0.379091,0.316364,0.5


### The features for HMM with 3 parameters

In [70]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm3_* columns.
selected_columns_m3 = [
    'MM_Intercept', 'MM_Hess',
    'average_peak_magnitude', 'average_peak_curvature',
    'hmm3_mean.1', 'hmm3_mean.2', 'hmm3_mean.3',
    'hmm3_sd.1', 'hmm3_sd.2', 'hmm3_sd.3',
    'hmm3_p1', 'hmm3_p2',
]
# Boolean mask over the columns of all_raw_explanatory_variables_t12:
# True where the column belongs to the subset above.
selected_columns_m3_array = [
    column_name in selected_columns_m3
    for column_name in all_raw_explanatory_variables_t12.columns
]

In [71]:
# Keep every row, restricted to the columns listed in selected_columns_m3.
all_raw_explanatory_variables_t12_m3 = all_raw_explanatory_variables_t12.loc[:, selected_columns_m3]

In [72]:
# Run get_results_data_loo on the hmm3 feature subset of the t12 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t12_m3),
    response_variables=all_raw_response_variable_t12,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.489362,0.574468,0.489362,0.489362,0.531915
1,True Positive Rate,0.545455,0.454545,0.409091,0.363636,0.0
2,False Positive Rate,0.56,0.32,0.44,0.4,0.0
3,AUROC,0.492727,0.567273,0.484545,0.481818,0.5


### The features for HMM with 4 parameters

In [99]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm4_* columns.
selected_columns_m4 = [
    'MM_Intercept', 'MM_Hess',
    'average_peak_magnitude', 'average_peak_curvature',
    'hmm4_mean.1', 'hmm4_mean.2', 'hmm4_mean.3', 'hmm4_mean.4',
    'hmm4_sd.1', 'hmm4_sd.2', 'hmm4_sd.3', 'hmm4_sd.4',
    'hmm4_p1', 'hmm4_p2', 'hmm4_p3',
]
# Boolean mask over the columns of all_raw_explanatory_variables_t12:
# True where the column belongs to the subset above.
selected_columns_m4_array = [
    column_name in selected_columns_m4
    for column_name in all_raw_explanatory_variables_t12.columns
]

In [100]:
# Keep every row, restricted to the columns listed in selected_columns_m4.
all_raw_explanatory_variables_t12_m4 = all_raw_explanatory_variables_t12.loc[:, selected_columns_m4]

In [101]:
# Run get_results_data_loo on the hmm4 feature subset of the t12 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t12_m4),
    response_variables=all_raw_response_variable_t12,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.297872,0.468085,0.531915,0.531915,0.361702
1,True Positive Rate,0.045455,0.409091,0.5,0.545455,0.0
2,False Positive Rate,0.48,0.48,0.44,0.48,0.32
3,AUROC,0.282727,0.464545,0.53,0.532727,0.34


### The features for HMM with 5 parameters

In [102]:
# Feature subset: the MM_* and average_peak_* columns plus the hmm5_* columns.
selected_columns_m5 = [
    'MM_Intercept', 'MM_Hess',
    'average_peak_magnitude', 'average_peak_curvature',
    'hmm5_mean.1', 'hmm5_mean.2', 'hmm5_mean.3', 'hmm5_mean.4', 'hmm5_mean.5',
    'hmm5_sd.1', 'hmm5_sd.2', 'hmm5_sd.3', 'hmm5_sd.4', 'hmm5_sd.5',
    'hmm5_p1', 'hmm5_p2', 'hmm5_p3', 'hmm5_p4',
]
# Boolean mask over the columns of all_raw_explanatory_variables_t12:
# True where the column belongs to the subset above.
selected_columns_m5_array = [
    column_name in selected_columns_m5
    for column_name in all_raw_explanatory_variables_t12.columns
]

In [169]:
# Subset of the t12 data restricted to the columns in selected_columns_m5.
# The target must be the t12 name: the evaluation call that follows reads
# all_raw_explanatory_variables_t12_m5, and assigning to the t1 name would
# overwrite the t1 subset with t12 data.
all_raw_explanatory_variables_t12_m5 = all_raw_explanatory_variables_t12[selected_columns_m5]

In [103]:
# Run get_results_data_loo on the hmm5 feature subset of the t12 data; the
# returned per-model statistics table is displayed as the cell output.
get_results_data_loo(
    explanatory_variables=np.array(all_raw_explanatory_variables_t12_m5),
    response_variables=all_raw_response_variable_t12,
    raw_data=True,
)

Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso
0,Accuracy,0.553191,0.574468,0.574468,0.489362,0.510638
1,True Positive Rate,0.272727,0.454545,0.5,0.409091,0.181818
2,False Positive Rate,0.2,0.32,0.36,0.44,0.2
3,AUROC,0.536364,0.567273,0.57,0.484545,0.490909
