# Project

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import uniform, randint, ttest_rel, ttest_ind

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

import warnings
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px

# from pandas_profiling import ProfileReport

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif 
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, SelectFromModel, RFE

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

#preprocessing:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, normalize, scale, Normalizer, MinMaxScaler, FunctionTransformer

# models:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import StackingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [45]:
cognitive_abilities = ["BarChartLit", "Meara", "VerbalWM_longest"]
best_window = {"BarChartLit": 3, "Meara": 4, "VerbalWM_longest": 2}

cognitive_ability = cognitive_abilities[1]
task_number = best_window[cognitive_ability]

# loading the dataset
path = ("Data/"+ str(cognitive_ability) + "/Task_" + str(task_number) + ".csv")
df = pd.read_csv(path)

display(df)
print(df.shape)

Unnamed: 0,abspathanglesrate,enddistance,endpupilsize,eyemovementvelocity,fixationrate,fixationsaccadetimeratio,longestsaccadedistance,longestsaccadeduration,maxdistance,maxpupilsize,maxpupilvelocity,maxsaccadespeed,meanabspathangles,meandistance,meanfixationduration,meanpathdistance,meanpupilsize,meanpupilvelocity,meanrelpathangles,meansaccadedistance,meansaccadeduration,meansaccadespeed,mindistance,minpupilsize,minpupilvelocity,minsaccadespeed,relpathanglesrate,startdistance,startpupilsize,stddevabspathangles,stddevdistance,stddevfixationduration,stddevpathdistance,stddevpupilsize,stddevpupilvelocity,stddevrelpathangles,stddevsaccadedistance,stddevsaccadeduration,stddevsaccadespeed,Relevant.bars_fixationrate,Relevant.bars_leftclicrate,Relevant.bars_longestfixation,Relevant.bars_maxdistance,Relevant.bars_maxpupilsize,Relevant.bars_maxpupilvelocity,Relevant.bars_meandistance,Relevant.bars_meanfixationduration,Relevant.bars_meanpupilsize,Relevant.bars_meanpupilvelocity,Relevant.bars_mindistance,Relevant.bars_minpupilsize,Relevant.bars_minpupilvelocity,Relevant.bars_numtransfrom_Non.relevant.bars,Relevant.bars_numtransfrom_Refs,Relevant.bars_numtransfrom_Relevant.bars,Relevant.bars_numtransfrom_Text,Relevant.bars_numtransfrom_Viz,Relevant.bars_numtransfrom_labels,Relevant.bars_numtransfrom_legend,Relevant.bars_proportionnum,Relevant.bars_proportiontime,Relevant.bars_proptransfrom_Non.relevant.bars,Relevant.bars_proptransfrom_Refs,Relevant.bars_proptransfrom_Relevant.bars,Relevant.bars_proptransfrom_Text,Relevant.bars_proptransfrom_Viz,Relevant.bars_proptransfrom_labels,Relevant.bars_proptransfrom_legend,Relevant.bars_stddevdistance,Relevant.bars_stddevfixationduration,Relevant.bars_stddevpupilsize,Relevant.bars_stddevpupilvelocity,Relevant.bars_timetofirstfixation,Non.relevant.bars_fixationrate,Non.relevant.bars_leftclicrate,Non.relevant.bars_longestfixation,Non.relevant.bars_maxdistance,Non.relevant.bars_maxpupilsize,Non.relevant.bars_maxpupilvelocity,Non.relevant.bars_meandistance,Non.relevant.bars_meanfixationduration,Non.relevant.bars_meanpupilsize,Non.relevant.bars_meanpupilvelocity,Non.relevant.bars_mindistance,Non.relevant.bars_minpupilsize,Non.relevant.bars_minpupilvelocity,Non.relevant.bars_numtransfrom_Non.relevant.bars,Non.relevant.bars_numtransfrom_Refs,Non.relevant.bars_numtransfrom_Relevant.bars,Non.relevant.bars_numtransfrom_Text,Non.relevant.bars_numtransfrom_Viz,Non.relevant.bars_numtransfrom_labels,Non.relevant.bars_numtransfrom_legend,Non.relevant.bars_proportionnum,Non.relevant.bars_proportiontime,Non.relevant.bars_proptransfrom_Non.relevant.bars,Non.relevant.bars_proptransfrom_Refs,Non.relevant.bars_proptransfrom_Relevant.bars,Non.relevant.bars_proptransfrom_Text,Non.relevant.bars_proptransfrom_Viz,Non.relevant.bars_proptransfrom_labels,Non.relevant.bars_proptransfrom_legend,Non.relevant.bars_stddevdistance,Non.relevant.bars_stddevfixationduration,Non.relevant.bars_stddevpupilsize,Non.relevant.bars_stddevpupilvelocity,Non.relevant.bars_timetofirstfixation,Text_fixationrate,Text_leftclicrate,Text_longestfixation,Text_maxdistance,Text_maxpupilsize,Text_maxpupilvelocity,Text_meandistance,Text_meanfixationduration,Text_meanpupilsize,Text_meanpupilvelocity,Text_mindistance,Text_minpupilsize,Text_minpupilvelocity,Text_numtransfrom_Non.relevant.bars,Text_numtransfrom_Refs,Text_numtransfrom_Relevant.bars,Text_numtransfrom_Text,Text_numtransfrom_Viz,Text_numtransfrom_labels,Text_numtransfrom_legend,Text_proportionnum,Text_proportiontime,Text_proptransfrom_Non.relevant.bars,Text_proptransfrom_Refs,Text_proptransfrom_Relevant.bars,Text_proptransfrom_Text,Text_proptransfrom_Viz,Text_proptransfrom_labels,Text_proptransfrom_legend,Text_stddevdistance,Text_stddevfixationduration,Text_stddevpupilsize,Text_stddevpupilvelocity,Text_timetofirstfixation,Refs_fixationrate,Refs_leftclicrate,Refs_longestfixation,Refs_maxdistance,Refs_maxpupilsize,Refs_maxpupilvelocity,Refs_meandistance,Refs_meanfixationduration,Refs_meanpupilsize,Refs_meanpupilvelocity,Refs_mindistance,Refs_minpupilsize,Refs_minpupilvelocity,Refs_numtransfrom_Non.relevant.bars,Refs_numtransfrom_Refs,Refs_numtransfrom_Relevant.bars,Refs_numtransfrom_Text,Refs_numtransfrom_Viz,Refs_numtransfrom_labels,Refs_numtransfrom_legend,Refs_proportionnum,Refs_proportiontime,Refs_proptransfrom_Non.relevant.bars,Refs_proptransfrom_Refs,Refs_proptransfrom_Relevant.bars,Refs_proptransfrom_Text,Refs_proptransfrom_Viz,Refs_proptransfrom_labels,Refs_proptransfrom_legend,Refs_stddevdistance,Refs_stddevfixationduration,Refs_stddevpupilsize,Refs_stddevpupilvelocity,Refs_timetofirstfixation,labels_fixationrate,labels_leftclicrate,labels_longestfixation,labels_maxdistance,labels_maxpupilsize,labels_maxpupilvelocity,labels_meandistance,labels_meanfixationduration,labels_meanpupilsize,labels_meanpupilvelocity,labels_mindistance,labels_minpupilsize,labels_minpupilvelocity,labels_numtransfrom_Non.relevant.bars,labels_numtransfrom_Refs,labels_numtransfrom_Relevant.bars,labels_numtransfrom_Text,labels_numtransfrom_Viz,labels_numtransfrom_labels,labels_numtransfrom_legend,labels_proportionnum,labels_proportiontime,labels_proptransfrom_Non.relevant.bars,labels_proptransfrom_Refs,labels_proptransfrom_Relevant.bars,labels_proptransfrom_Text,labels_proptransfrom_Viz,labels_proptransfrom_labels,labels_proptransfrom_legend,labels_stddevdistance,labels_stddevfixationduration,labels_stddevpupilsize,labels_stddevpupilvelocity,labels_timetofirstfixation,Viz_fixationrate,Viz_leftclicrate,Viz_longestfixation,Viz_maxdistance,Viz_maxpupilsize,Viz_maxpupilvelocity,Viz_meandistance,Viz_meanfixationduration,Viz_meanpupilsize,Viz_meanpupilvelocity,Viz_mindistance,Viz_minpupilsize,Viz_minpupilvelocity,Viz_numtransfrom_Non.relevant.bars,Viz_numtransfrom_Refs,Viz_numtransfrom_Relevant.bars,Viz_numtransfrom_Text,Viz_numtransfrom_Viz,Viz_numtransfrom_labels,Viz_numtransfrom_legend,Viz_proportionnum,Viz_proportiontime,Viz_proptransfrom_Non.relevant.bars,Viz_proptransfrom_Refs,Viz_proptransfrom_Relevant.bars,Viz_proptransfrom_Text,Viz_proptransfrom_Viz,Viz_proptransfrom_labels,Viz_proptransfrom_legend,Viz_stddevdistance,Viz_stddevfixationduration,Viz_stddevpupilsize,Viz_stddevpupilvelocity,Viz_timetofirstfixation,legend_fixationrate,legend_leftclicrate,legend_longestfixation,legend_maxdistance,legend_maxpupilsize,legend_maxpupilvelocity,legend_meandistance,legend_meanfixationduration,legend_meanpupilsize,legend_meanpupilvelocity,legend_mindistance,legend_minpupilsize,legend_minpupilvelocity,legend_numtransfrom_Non.relevant.bars,legend_numtransfrom_Refs,legend_numtransfrom_Relevant.bars,legend_numtransfrom_Text,legend_numtransfrom_Viz,legend_numtransfrom_labels,legend_numtransfrom_legend,legend_proportionnum,legend_proportiontime,legend_proptransfrom_Non.relevant.bars,legend_proptransfrom_Refs,legend_proptransfrom_Relevant.bars,legend_proptransfrom_Text,legend_proptransfrom_Viz,legend_proptransfrom_labels,legend_proptransfrom_legend,legend_stddevdistance,legend_stddevfixationduration,legend_stddevpupilsize,legend_stddevpupilvelocity,legend_timetofirstfixation,Meara
0,0.005253,584.27,0.242973,0.616038,0.004208,5.656897,801.322503,158,597.97,0.352859,0.04750,9.533353,1.257630,577.898311,193.704797,147.481525,0.141281,0.001564,1.468932,151.982941,44.272289,3.072011,562.18,-0.081281,0,1.005474,0.006090,564.22,0.324036,1.339614,4.998676,91.614244,129.867612,0.053090,0.001529,1.291865,127.501722,16.232477,1.554824,0.005569,0,408,587.15,0.246575,0.015000,575.552965,179,0.156026,0.001412,565.65,0.075442,0,3,1,11,0,22,7,1,0.070111,0.050232,0.066667,0.022222,0.244444,0.000000,0.488889,0.155556,0.022222,4.301809,70.289556,0.032816,0.001365,29862,0.003889,0.0,400,587.36,0.235767,0.023125,574.235015,257.0,0.152979,0.001467,565.92,0.026803,0,1,1,1,0,5,5,0,0.014760,0.015144,0.076923,0.076923,0.076923,0.000000,0.384615,0.384615,0.000000,5.489807,79.565786,0.052762,0.001808,29663,0.004754,0.0,858,597.97,0.342050,0.028750,578.654423,210,0.149280,0.001490,569.72,0.025002,0,0,21,2,60,1,1,0,0.162362,0.136272,0.000000,0.247059,0.023529,0.705882,0.011765,0.011765,0.000000,5.380431,117.116211,0.043289,0.001381,483,0.005456,0.0,491,596.71,0.293412,0.023333,580.031474,183,0.135458,0.001600,568.67,-0.005622,0,0,202,1,23,6,2,0,0.437269,0.319809,0.000000,0.863248,0.004274,0.098291,0.025641,0.008547,0.000000,4.364106,75.973708,0.054177,0.001495,11009,0.004606,0,433,587.21,0.347454,0.025000,575.235915,217,0.164458,0.001475,562.18,0.008789,0,3,1,8,1,28,19,1,0.079336,0.068732,0.049180,0.016393,0.131148,0.016393,0.459016,0.311475,0.016393,4.260119,79.841062,0.057084,0.001480,58,0.004977,0.0,658,587.21,0.352859,0.025000,575.261347,200,0.139940,0.001557,562.18,-0.023636,0,4,4,23,2,105,27,14,0.273063,0.218904,0.022346,0.022346,0.128492,0.011173,0.586592,0.150838,0.078212,3.999396,97.003892,0.057547,0.001481,58,0.005174,0,425,584.71,0.279001,0.010625,576.960161,193.0,0.142465,0.001549,566.98,0.026803,0,0,0,1,1,13,2,4,0.027675,0.021343,0.000000,0.000000,0.047619,0.047619,0.619048,0.095238,0.190476,2.360448,88.729927,0.054982,0.001421,26681,1
1,0.003738,627.74,0.004676,0.380553,0.003372,8.013937,900.926643,125,637.39,0.285134,0.06750,10.277079,1.113450,625.292156,260.786207,113.366435,0.051404,0.001517,1.631835,124.562240,37.373698,2.897559,598.58,-0.172828,0,1.175484,0.005453,608.90,0.096979,1.298567,6.201663,156.543351,114.573254,0.053242,0.001409,1.308442,116.253271,14.146733,1.424202,0.003998,0,491,635.21,0.166205,0.005000,628.134024,250,0.068362,0.001238,600.00,-1.000000,-1,1,1,9,1,10,7,1,0.025287,0.020769,0.033333,0.033333,0.300000,0.033333,0.333333,0.233333,0.033333,2.222891,103.503623,0.055203,0.001036,-1,0.004074,0.0,491,634.62,0.173306,0.010625,628.362383,245.0,0.077030,0.001272,600.00,-1.000000,-1,8,0,2,2,5,5,0,0.019540,0.015750,0.363636,0.000000,0.090909,0.090909,0.227273,0.227273,0.000000,2.089551,116.621718,0.040012,0.001182,-1,0.003911,0.0,1091,636.54,0.176856,0.022222,627.503324,255,0.054561,0.001392,600.00,-1.000000,-1,0,11,0,186,5,1,0,0.233333,0.195917,0.000000,0.054187,0.000000,0.916256,0.024631,0.004926,0.000000,3.299586,135.141785,0.038587,0.001175,-1,0.003417,0.0,1607,637.27,0.285134,0.017500,629.658376,292,0.041462,0.001520,600.00,-1.000000,-1,1,237,1,10,4,1,0,0.296552,0.284999,0.003937,0.933071,0.003937,0.039370,0.015748,0.003937,0.000000,1.942539,203.913625,0.056349,0.001297,-1,0.003506,0,1207,637.39,0.167980,0.008750,628.499394,285,0.064595,0.001284,600.00,-1.000000,-1,5,2,5,3,24,16,0,0.044828,0.041980,0.090909,0.036364,0.090909,0.054545,0.436364,0.290909,0.000000,2.079193,188.926329,0.045532,0.001068,-1,0.004168,0.0,1207,637.39,0.210582,0.052500,628.068812,239,0.059874,0.001357,600.00,-1.000000,-1,7,6,9,3,67,26,4,0.108046,0.085115,0.057377,0.049180,0.073770,0.024590,0.549180,0.213115,0.032787,2.329255,144.171493,0.052216,0.001457,-1,0.005828,0,250,633.58,0.120054,0.005000,630.013835,171.0,0.069985,0.001183,600.00,-1.000000,-1,0,0,0,0,5,2,1,0.005747,0.003238,0.000000,0.000000,0.000000,0.000000,0.625000,0.250000,0.125000,0.682840,50.027492,0.039429,0.000977,-1,0
2,0.005568,604.92,0.136492,0.582493,0.004269,5.702708,720.453525,100,638.75,0.215044,0.03500,9.614025,1.309664,611.338880,194.609312,137.005542,0.090810,0.001398,1.432051,140.904293,39.924350,3.197232,578.43,-0.047352,0,1.099640,0.006064,617.82,0.121451,1.332978,6.291579,84.829092,115.309742,0.031332,0.001341,1.280354,106.907355,12.019863,1.505335,0.005099,0,575,630.29,0.163233,0.013750,615.546475,196,0.076297,0.001298,604.34,-0.018940,0,5,5,18,0,17,5,3,0.046559,0.037287,0.094340,0.094340,0.339623,0.000000,0.320755,0.094340,0.056604,5.169629,99.624740,0.025906,0.001178,34226,0.005296,0.0,483,625.53,0.178275,0.006875,609.678379,188.0,0.086567,0.001305,600.00,-1.000000,-1,4,7,2,3,5,2,0,0.021255,0.016389,0.173913,0.304348,0.086957,0.130435,0.217391,0.086957,0.000000,4.505379,108.645064,0.026054,0.001123,-1,0.005162,0.0,616,628.71,0.206687,0.027500,610.482442,193,0.092502,0.001510,578.43,-0.017268,0,0,22,2,239,7,2,1,0.278340,0.220223,0.000000,0.080586,0.007326,0.875458,0.025641,0.007326,0.003663,7.801554,78.675904,0.029942,0.001475,741,0.005360,0.0,508,638.75,0.215044,0.027778,610.806490,186,0.094904,0.001375,587.88,-0.003898,0,1,426,6,23,7,4,1,0.476721,0.363189,0.002137,0.910256,0.012821,0.049145,0.014957,0.008547,0.002137,5.658883,68.481757,0.030406,0.001268,591,0.004324,0,466,630.47,0.146520,0.010000,612.373118,231,0.079533,0.001231,603.63,-0.047352,0,6,2,10,2,32,30,1,0.052632,0.049712,0.072289,0.024096,0.120482,0.024096,0.385542,0.361446,0.012048,5.100606,105.443322,0.037348,0.001096,34626,0.004446,0.0,799,630.47,0.181618,0.035000,613.247149,224,0.084316,0.001276,603.41,-0.047352,0,11,6,18,6,106,38,18,0.150810,0.138514,0.054187,0.029557,0.088670,0.029557,0.522167,0.187192,0.088670,4.837651,122.752635,0.034066,0.001195,33960,0.005570,0,450,628.56,0.146520,0.005625,617.433891,179.0,0.092642,0.001301,612.87,0.039556,0,0,2,2,1,18,2,11,0.023279,0.017067,0.000000,0.055556,0.055556,0.027778,0.500000,0.055556,0.305556,2.563003,76.790743,0.027494,0.001038,33960,1
3,0.004128,608.32,-0.023487,0.379858,0.003412,12.448308,499.449216,133,652.39,0.181460,0.05375,8.918736,1.217268,609.525874,158.880184,112.019377,0.023824,0.002654,1.570718,136.590231,43.864583,2.868412,564.52,-0.152082,0,1.233429,0.005293,601.06,0.143283,1.182150,7.777768,96.043249,121.029520,0.036924,0.002552,1.101999,110.134180,24.462652,1.286900,0.005127,0,616,641.94,0.117163,0.013750,607.478823,195,0.023512,0.001940,601.62,-0.051617,0,7,1,11,0,7,3,2,0.039939,0.025280,0.225806,0.032258,0.354839,0.000000,0.225806,0.096774,0.064516,6.476423,149.942789,0.031769,0.001671,33543,0.006290,0.0,366,642.01,0.125200,0.021250,608.566405,158.0,0.027166,0.002178,601.48,-0.079747,0,23,2,7,2,11,1,1,0.069124,0.035664,0.489362,0.042553,0.148936,0.042553,0.234043,0.021277,0.021277,6.939237,69.557694,0.029589,0.002035,35900,0.006185,0.0,683,650.04,0.181460,0.053750,610.948493,161,0.028939,0.002749,564.52,-0.152082,0,2,26,0,91,5,1,0,0.216590,0.113648,0.016000,0.208000,0.000000,0.728000,0.040000,0.008000,0.000000,8.239889,93.942460,0.045790,0.002704,208,0.006653,0.0,583,648.00,0.141274,0.042222,609.111355,150,0.022065,0.002753,600.27,-0.127970,0,2,251,0,22,1,0,0,0.471582,0.230053,0.007246,0.909420,0.000000,0.079710,0.003623,0.000000,0.000000,7.401913,87.768840,0.033920,0.002535,8794,0.005758,0,458,638.60,0.109125,0.017500,605.900612,173,0.034230,0.002346,601.66,-0.047599,0,1,0,1,0,4,1,0,0.009217,0.005195,0.142857,0.000000,0.142857,0.000000,0.571429,0.142857,0.000000,6.935181,148.062149,0.027832,0.002375,33860,0.005605,0.0,591,643.08,0.121181,0.017500,609.249664,178,0.021276,0.002230,601.66,-0.071710,0,10,1,8,1,40,1,8,0.101382,0.058701,0.144928,0.014493,0.115942,0.014493,0.579710,0.014493,0.115942,8.247453,123.917901,0.027065,0.002070,29871,0.007378,0,308,640.23,0.121181,0.008750,608.733515,135.0,0.020978,0.002034,602.44,-0.055636,0,1,0,2,0,7,0,6,0.016897,0.007433,0.062500,0.000000,0.125000,0.000000,0.437500,0.000000,0.375000,9.576531,0.000000,0.023641,0.001691,29871,0
4,0.004469,620.98,-0.012834,0.616210,0.004258,5.343540,809.217997,75,623.38,0.089868,0.02750,10.888936,1.061308,620.462383,199.901099,146.334538,-0.006185,0.000892,1.600592,156.357619,38.328612,3.625005,615.48,-0.084879,0,1.033238,0.006665,621.40,0.011692,1.270663,1.600162,89.907887,129.975614,0.026852,0.000857,1.343517,133.748630,10.443827,1.996898,0.003619,0,924,621.75,0.083736,0.003889,618.753633,276,-0.004657,0.000772,616.25,-0.078747,0,2,1,11,3,10,2,0,0.074176,0.081450,0.068966,0.034483,0.379310,0.103448,0.344828,0.068966,0.000000,1.607427,185.172165,0.031453,0.000675,16114,0.004380,0.0,625,621.32,0.089868,0.003750,619.273660,228.0,0.013782,0.000755,617.00,-0.060353,0,4,0,4,0,4,1,0,0.032967,0.029912,0.307692,0.000000,0.307692,0.000000,0.307692,0.076923,0.000000,0.921514,0.000000,0.032382,0.000664,16980,0.005404,0.0,566,623.38,0.071474,0.020000,621.438261,185,0.005751,0.000831,615.50,-0.068017,0,0,14,1,83,3,0,0,0.291209,0.214122,0.000000,0.138614,0.009901,0.821782,0.029703,0.000000,0.000000,1.089941,72.523756,0.026071,0.000846,5205,0.005008,0.0,458,622.76,0.062276,0.027500,620.964993,199,-0.009988,0.000919,615.48,-0.080280,0,1,108,0,11,0,0,0,0.370879,0.294306,0.008333,0.900000,0.000000,0.091667,0.000000,0.000000,0.000000,0.822880,61.238706,0.022024,0.000901,4264,0.004308,0,408,621.83,0.074539,0.004444,620.807471,232,-0.020847,0.000957,616.61,-0.066484,0,1,0,3,0,13,12,0,0.046703,0.043078,0.034483,0.000000,0.103448,0.000000,0.448276,0.413793,0.000000,0.955031,0.000000,0.025665,0.000801,-1,0.004969,0.0,483,622.04,0.074539,0.008750,619.010110,201,-0.021135,0.000958,615.90,-0.084879,0,4,1,9,1,37,13,0,0.153846,0.123021,0.061538,0.015385,0.138462,0.015385,0.569231,0.200000,0.000000,1.811607,77.219993,0.024742,0.000849,15689,0.000000,0,-1,622.04,0.037751,0.001875,619.708333,0.0,0.009648,0.001238,600.00,-1.000000,-1,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.350251,0.000000,0.021844,0.000575,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,0.004197,586.13,0.152576,0.335643,0.003837,9.146324,862.388871,100,631.19,0.317738,0.12750,10.390227,1.097428,582.827328,228.967769,87.755138,0.069363,0.001247,1.671567,93.114176,32.831126,2.427833,546.72,-0.381978,0,0.946625,0.006372,594.65,0.179215,1.237051,8.827773,134.318609,106.061331,0.046858,0.001875,1.244855,100.164795,12.884749,1.317761,0.004603,0,608,607.09,0.195199,0.082500,578.410241,217,0.071695,0.001197,551.43,-0.140451,0,5,1,48,1,58,21,5,0.094215,0.068938,0.035971,0.007194,0.345324,0.007194,0.417266,0.151079,0.035971,7.534927,125.095008,0.041604,0.002471,33969,0.003614,0.0,866,601.78,0.207630,0.040000,580.333961,276.0,0.088521,0.001179,600.00,-1.000000,-1,22,0,12,0,10,2,0,0.036364,0.033890,0.478261,0.000000,0.260870,0.000000,0.217391,0.043478,0.000000,4.561169,198.874625,0.052429,0.001827,-1,0.004429,0.0,758,625.61,0.317738,0.048750,588.129299,225,0.086158,0.001191,555.94,-0.097829,0,0,28,1,268,2,0,0,0.252066,0.191657,0.000000,0.093645,0.003344,0.896321,0.006689,0.000000,0.000000,8.977246,130.704833,0.051709,0.001495,333,0.004406,0.0,1083,615.73,0.216510,0.055000,581.874962,226,0.053309,0.001373,553.95,-0.119140,0,0,329,1,33,1,0,0,0.315702,0.241301,0.000000,0.903846,0.002747,0.090659,0.002747,0.000000,0.000000,7.551816,136.423572,0.042420,0.001652,10177,0.003926,0,933,599.35,0.207630,0.041250,576.101983,254,0.076122,0.001151,546.72,-0.136900,0,7,1,20,0,66,53,0,0.077686,0.066639,0.047619,0.006803,0.136054,0.000000,0.448980,0.360544,0.000000,9.346065,140.097662,0.044007,0.001760,36667,0.004336,0.0,933,607.46,0.218286,0.127500,581.214298,230,0.071554,0.001154,546.72,-0.381978,0,17,5,51,1,260,68,18,0.277686,0.215671,0.040476,0.011905,0.121429,0.002381,0.619048,0.161905,0.042857,8.983570,127.138542,0.041141,0.002174,28905,0.005249,0,375,592.44,0.196975,0.051250,571.274592,190.0,0.053663,0.001439,600.00,-1.000000,-1,2,0,2,0,19,0,14,0.019008,0.012197,0.054054,0.000000,0.054054,0.000000,0.513514,0.000000,0.378378,7.870279,78.255409,0.035128,0.003611,-1,0
49,0.004664,669.97,-0.043406,0.494179,0.003917,7.919922,928.434306,125,676.42,0.454334,0.07750,10.775385,1.196689,629.796446,186.325123,126.800908,0.017814,0.002318,1.581028,126.537680,38.186747,2.848540,534.21,-0.335050,0,0.789210,0.006131,612.90,0.100472,1.202019,14.378269,108.750782,141.057886,0.056758,0.003143,1.179814,133.683253,17.735347,1.485214,0.005638,0,433,674.47,0.123804,0.027500,638.624893,177,0.004324,0.001999,613.48,-0.121178,0,4,2,15,1,18,11,3,0.049261,0.027834,0.074074,0.037037,0.277778,0.018519,0.333333,0.203704,0.055556,11.752383,92.895999,0.037619,0.002173,31812,0.005447,0.0,433,665.59,0.102416,0.011111,635.577754,183.0,0.019672,0.001432,600.00,-1.000000,-1,9,3,3,0,9,3,0,0.029557,0.017285,0.333333,0.111111,0.111111,0.000000,0.333333,0.111111,0.000000,5.715124,110.440619,0.032325,0.001469,-1,0.005749,0.0,433,666.37,0.180188,0.021250,628.725319,173,0.040135,0.002130,553.85,-0.070626,0,0,18,0,49,2,1,0,0.089901,0.049810,0.000000,0.257143,0.000000,0.700000,0.028571,0.014286,0.000000,16.246231,93.044792,0.055353,0.002625,1033,0.005141,0.0,833,669.66,0.160745,0.022500,629.071075,194,0.009551,0.001947,537.10,-0.105623,0,0,178,2,19,8,6,1,0.263547,0.163303,0.000000,0.831776,0.009346,0.088785,0.037383,0.028037,0.004673,15.096005,112.083030,0.038905,0.001931,33,0.005222,0,758,676.42,0.158801,0.033750,637.547695,191,0.004304,0.001949,613.40,-0.187284,0,5,3,10,1,36,27,3,0.067734,0.041321,0.058824,0.035294,0.117647,0.011765,0.423529,0.317647,0.035294,10.723422,132.127604,0.043663,0.001964,30696,0.005070,0.0,758,676.42,0.174355,0.051250,636.023874,197,0.012251,0.001921,613.29,-0.187284,0,11,7,19,2,90,33,12,0.163793,0.102920,0.063218,0.040230,0.109195,0.011494,0.517241,0.189655,0.068966,11.573475,121.137162,0.043636,0.002052,27948,0.004707,0,450,676.08,0.147135,0.010000,643.774171,212.0,0.014384,0.002122,600.00,-1.000000,-1,1,0,2,0,13,1,7,0.019704,0.013334,0.041667,0.000000,0.083333,0.000000,0.541667,0.041667,0.291667,6.422749,122.680343,0.041813,0.001786,-1,1
50,0.003435,647.70,-0.020904,0.307631,0.002896,10.428310,881.292085,92,655.44,0.205512,0.12125,11.750561,1.191419,644.790581,315.404545,106.698228,-0.014763,0.001511,1.636148,115.926292,35.238866,2.835283,630.69,-0.378886,0,1.065377,0.004696,653.11,0.162677,1.243918,4.582827,195.367656,121.360740,0.041421,0.001763,1.223535,118.795623,13.302749,1.559938,0.003306,0,775,650.69,0.089245,0.067500,645.523984,302,-0.009286,0.001397,631.47,-0.204484,0,14,0,14,0,26,15,1,0.060227,0.050860,0.200000,0.000000,0.200000,0.000000,0.371429,0.214286,0.014286,3.965239,142.574744,0.036138,0.001838,5063,0.003196,0.0,1316,653.23,0.144319,0.016667,645.693182,312.0,-0.011302,0.001415,630.69,-0.117284,0,15,0,11,3,27,11,0,0.063636,0.055577,0.223881,0.000000,0.164179,0.044776,0.402985,0.164179,0.000000,3.274603,209.638650,0.044575,0.001258,600,0.003192,0.0,1407,654.76,0.148908,0.035556,646.326811,313,-0.001065,0.001361,633.26,-0.149410,0,3,21,0,198,6,2,0,0.264773,0.231538,0.013043,0.091304,0.000000,0.860870,0.026087,0.008696,0.000000,4.470448,205.317490,0.044727,0.001191,766,0.003066,0.0,1990,653.32,0.096894,0.040000,643.703624,326,-0.032706,0.001655,632.97,-0.201425,0,1,287,0,20,2,0,1,0.353409,0.321729,0.003215,0.922830,0.000000,0.064309,0.006431,0.000000,0.003215,3.881591,204.157924,0.030804,0.001499,1074,0.003231,0,858,650.77,0.069357,0.017500,644.932687,309,0.001259,0.001413,632.73,-0.095866,0,10,2,14,3,33,22,0,0.070455,0.060877,0.119048,0.023810,0.166667,0.035714,0.392857,0.261905,0.000000,5.090853,168.865986,0.027655,0.001286,6112,0.003157,0.0,1432,655.44,0.162677,0.031250,644.273781,316,-0.005198,0.001423,630.93,-0.146350,0,21,3,29,10,140,35,17,0.236364,0.208999,0.082353,0.011765,0.113725,0.039216,0.549020,0.137255,0.066667,5.447614,181.347546,0.040939,0.001309,2024,0.004020,0,475,652.07,0.101483,0.007500,643.643035,248.0,0.028637,0.001234,634.30,-0.075978,0,0,0,2,2,16,0,13,0.022727,0.015782,0.000000,0.000000,0.060606,0.060606,0.484848,0.000000,0.393939,4.999656,128.215485,0.026157,0.001019,2024,0
51,0.004258,621.97,0.000293,0.373186,0.003581,8.187138,652.467900,99,641.22,0.217075,0.02500,9.154320,1.193225,612.041242,249.814092,104.569533,0.044498,0.001070,1.564801,105.534867,33.857811,2.743302,586.89,-0.146809,0,0.958096,0.005565,610.83,-0.008997,1.313371,7.822530,132.316246,107.596752,0.050044,0.001050,1.314472,94.069298,11.563551,1.343627,0.003313,0,1208,632.78,0.200042,0.020625,612.693328,301,0.062542,0.001031,597.71,-0.108098,0,6,1,43,2,33,12,2,0.072156,0.072802,0.060606,0.010101,0.434343,0.020202,0.333333,0.121212,0.020202,7.503312,195.148356,0.060268,0.001032,47925,0.004137,0.0,708,638.52,0.144298,0.024375,612.184212,241.0,0.020993,0.001165,595.84,-0.146809,0,14,1,9,1,17,10,0,0.037351,0.030182,0.269231,0.019231,0.173077,0.019231,0.326923,0.192308,0.000000,11.105716,136.610123,0.059365,0.001426,48349,0.004223,0.0,741,639.55,0.217075,0.017222,611.420851,236,0.050140,0.001054,586.89,-0.126679,0,0,24,0,279,3,1,0,0.265705,0.210343,0.000000,0.078176,0.000000,0.908795,0.009772,0.003257,0.000000,6.547130,112.783773,0.042713,0.001006,466,0.004154,0.0,883,641.22,0.201591,0.021250,614.127621,240,0.049540,0.001024,590.07,-0.089516,0,1,371,0,19,4,2,1,0.346350,0.278686,0.002513,0.932161,0.000000,0.047739,0.010050,0.005025,0.002513,6.653053,119.377995,0.041194,0.000980,11308,0.003677,0,816,640.46,0.190752,0.012500,608.317564,271,0.050912,0.001030,596.19,-0.135970,0,10,1,15,1,77,67,1,0.088285,0.080253,0.058140,0.005814,0.087209,0.005814,0.447674,0.389535,0.005814,8.832449,137.142510,0.059928,0.000942,55736,0.003726,0.0,1216,640.53,0.190752,0.017778,609.513349,268,0.032831,0.001106,586.98,-0.135970,0,20,1,33,6,218,78,32,0.238540,0.213987,0.051546,0.002577,0.085052,0.015464,0.561856,0.201031,0.082474,8.370769,143.977280,0.056785,0.001043,48524,0.004288,0,733,624.98,0.187655,0.014375,611.033703,233.0,0.018601,0.001146,602.13,-0.129776,0,1,0,2,1,31,2,24,0.029711,0.023164,0.016393,0.000000,0.032787,0.016393,0.508197,0.032787,0.393443,6.948755,124.101240,0.053337,0.001166,48524,0


(53, 278)


In [46]:
target = cognitive_ability

X = df.drop(columns=[target])
y = df[target]

# baseline score
DC = DummyClassifier(strategy="most_frequent")
DC.fit(X, y)
DC.score(X, y)

0.5094339622641509

# Functions

In [5]:
#--------------------------------------------------------------------------------------

def results(y, y_pred):
    # confusion_matatrix
    # TN, FP, FN, TP = confusion_matatrix.ravel()
    confusion_matatrix = confusion_matrix(y, y_pred)
    report = classification_report(y, y_pred, digits=5, target_names=['Low', 'High'])

    class_accuracy = confusion_matatrix.diagonal()/confusion_matatrix.sum(axis=1)
    overall_accuracy = confusion_matatrix.diagonal().sum()/confusion_matatrix.sum()

    data = {'Overall': overall_accuracy, 'Low': class_accuracy[0], 'High': class_accuracy[1]}
    results = pd.DataFrame(data, index = ['Accuracy'])

    display(results)
    # print(report)
    # print(confusion_matatrix)
    
#--------------------------------------------------------------------------------------

def results2(name, y, y_pred):

    confusion_matatrix = confusion_matrix(y, y_pred)
    class_accuracy = confusion_matatrix.diagonal()/confusion_matatrix.sum(axis=1)
    overall_accuracy = confusion_matatrix.diagonal().sum()/confusion_matatrix.sum()

    data = {'Overall': overall_accuracy, 'Low': class_accuracy[0], 'High': class_accuracy[1]}
    results = pd.DataFrame(data, index = [name])

    return results
    
#--------------------------------------------------------------------------------------

def ttest_results(y, y_pred_before, y_pred_after):
    
    low_before     = (np.array(y[y==0]) == y_pred_before[y==0]).astype(int)
    high_before    = (np.array(y[y==1]) == y_pred_before[y==1]).astype(int)
    overall_before = (np.array(y) == y_pred_before).astype(int)

    low_after     = (np.array(y[y==0]) == y_pred_after[y==0]).astype(int)
    high_after    = (np.array(y[y==1]) == y_pred_after[y==1]).astype(int)
    overall_after = (np.array(y) == y_pred_after).astype(int)

    low_class  = ttest_rel(low_before, low_after)
    high_class = ttest_rel(high_before, high_after)
    overall    = ttest_rel(overall_before, overall_after)

    print("low    :", low_class)
    print("high   :", high_class)
    print("overall:", overall)
    
#--------------------------------------------------------------------------------------

# Classifiers/ hyper-parameter distributions

In [6]:
#--------------------------------------------------------------------------------------
class_weight = {0:1, 1:1}
seed = None

#--------------------------------------------------------------------------------------
# models:

classifiers = {
    #'rf'  : RandomForestClassifier(random_state=seed, max_depth=2),
    'lr'  : LogisticRegression(solver='liblinear'),
    'svm' : SVC(gamma='auto'),
    #'xgb' : GradientBoostingClassifier(),
    'knn' : KNeighborsClassifier(),
    #'gpc' : GaussianProcessClassifier(),
    #'ada' : AdaBoostClassifier(LogisticRegression(solver='liblinear'), n_estimators=100),
    #'gnb' : GaussianNB(),
    #'qda' : QuadraticDiscriminantAnalysis()
}

estimators = [
    #('rf', RandomForestClassifier()),
    ('lr', LogisticRegression(solver='liblinear', class_weight = {0:10, 1:1})),
    ('lr2', LogisticRegression(solver='liblinear', class_weight = {0:1, 1:10})),
    ('lr3', LogisticRegression(solver='liblinear', C=0.1)),
    ('lr4', LogisticRegression(solver='liblinear', C=1)),
    ('lr5', LogisticRegression(solver='liblinear', C=10)),
    ('knn', KNeighborsClassifier()),
    #('ada', classifiers['ada']),
    ('svm', SVC(gamma='auto', class_weight = {0:10, 1:1})),
    ('svm2', SVC(gamma='auto', C=1)),
    ('svm3', SVC(gamma='auto', C=10)),
    ('svm4', SVC(gamma='auto', C=0.1)),
]
              
ensemble = StackingClassifier(estimators=estimators, passthrough=True, n_jobs=-1,
                              final_estimator=LogisticRegression())

#--------------------------------------------------------------------------------------
# hyper-parameters

lr_dist = {
    'estimator__C'       : [0.1, 1, 10],
    'estimator__penalty' : ['l1', 'l2']}
    
rf_dist = {
    'estimator__n_estimators'   : randint(low=10, high=300),
    'estimator__max_depth'      : randint(low=10, high=30)}

knn_dist = {
    'estimator__n_neighbors' : [2, 3, 4, 5, 6, 7, 8, 9, 10]}

svm_dist = {
    'estimator__C' : [0.1, 1, 10]}

distributions = {
    'rf'  : rf_dist,
    'lr'  : lr_dist,
    'knn' : knn_dist,
    'svm' : svm_dist,
    'gpc' : None,
    'ada' : None
}

#--------------------------------------------------------------------------------------
model = 'lr'
estimator = classifiers[model]
# parameters = distributions[model]

estimator = ensemble

# Pipelines

In [7]:
#--------------------------------------------------------------------------------------
# pipeline

preprocessing = Pipeline([
    ('vart', VarianceThreshold(threshold=0.0)),
    #('normalize', Normalizer()),
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler())
])

feature_selection = Pipeline([
    ('sp', SelectPercentile(percentile=100)),
    ('kbest', SelectKBest(chi2, k=10)),
    #('rfe', RFE(RandomForestClassifier(), n_features_to_select=10, step=10))
    #('kbest', SelectKBest(mutual_info_classif, k=10)),
])

dim_reduction = Pipeline([
    ('pca', PCA(n_components=1)),
    #('fa', FeatureAgglomeration(n_clusters=2))
    #('kpca', KernelPCA(n_components=1, kernel='rbf'))
])

original = Pipeline([
    ('estimator', estimator)])

engineered = Pipeline([
    ('preprocessing', preprocessing),                              
    ('feature_selection', feature_selection),
    #('dim_reduction', dim_reduction),
    ('estimator', estimator)
])

In [319]:
loo = LeaveOneOut()
original_models = pd.DataFrame()
for name, estimator in classifiers.items():
    
    original = Pipeline([
    ('preprocessing', preprocessing),
    ('estimator', estimator)])
    
    y_pred  = cross_val_predict(original, X, y, cv=loo, n_jobs=-1, verbose=0)
    original_models = original_models.append(results2(name, y, y_pred))

display(original_models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elaps

Unnamed: 0,Overall,Low,High
rf,0.641509,0.653846,0.62963
lr,0.584906,0.538462,0.62963
svm,0.377358,0.0,0.740741
xgb,0.509434,0.538462,0.481481
knn,0.471698,0.538462,0.407407


In [321]:
engineered_models = pd.DataFrame()
for name, estimator in classifiers.items():
    
    engineered = Pipeline([
        ('preprocessing', preprocessing),                              
        ('feature_selection', feature_selection),
        #('dim_reduction', dim_reduction),
        ('estimator', estimator)])
    
    y_pred  = cross_val_predict(engineered, X, y, cv=loo, n_jobs=-1, verbose=0)
    engineered_models = engineered_models.append(results2(name, y, y_pred))
    
display(engineered_models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  53 | elapsed:    0.6s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38

Unnamed: 0,Overall,Low,High
rf,0.622642,0.653846,0.592593
lr,0.679245,0.692308,0.666667
svm,0.641509,0.730769,0.555556
xgb,0.528302,0.538462,0.518519
knn,0.641509,0.576923,0.703704


In [328]:
#--------------------------------------------------------------------------------------
# outer cross-validation on model, inner cross-validation on hyperparameters
#--------------------------------------------------------------------------------------
tuned_models = pd.DataFrame()
for name, estimator in classifiers.items():
    
    engineered = Pipeline([
        ('preprocessing', preprocessing),                              
        ('feature_selection', feature_selection),
        ('estimator', estimator)])
    
    parameters = distributions[name]
    
    inner_loop = RandomizedSearchCV(engineered, parameters, n_iter=10, cv=loo, refit=True, n_jobs=-1, verbose=False)
    y_pred  = cross_val_predict(inner_loop, X, y, cv=loo, n_jobs=-1, verbose=1)
    tuned_models = tuned_models.append(results2(name, y, y_pred))
    
display(tuned_models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:   51.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:   24.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:  1.3min finished


Unnamed: 0,Overall,Low,High
lr,0.622642,0.538462,0.703704
svm,0.641509,0.653846,0.62963
knn,0.603774,0.615385,0.592593


In [422]:
estimators = [
    ('lr', LogisticRegression(solver='liblinear', C=0.1, penalty='l1')),
    ('knn', KNeighborsClassifier()),
    ('knn3', KNeighborsClassifier(n_neighbors=3)),
    ('knn4', KNeighborsClassifier(n_neighbors=4)),
    ('svm', SVC(gamma='auto', class_weight = {0:10, 1:1})),
    ('svm2', SVC(gamma='auto', C=1)),
    ('svm3', SVC(gamma='auto', C=10)),
]
              
ensemble = StackingClassifier(estimators=estimators, passthrough=True, n_jobs=-1,
                              final_estimator=LogisticRegression())

estimator = ensemble

engineered = Pipeline([
        ('preprocessing', preprocessing),                              
        ('kbest', SelectKBest(chi2, k=10)),
        ('estimator', estimator)])

y_pred  = cross_val_predict(engineered, X, y, cv=loo, n_jobs=-1, verbose=1)
ensembled = (results2('ensemble', y, y_pred))
display(ensembled)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:    1.7s finished


Unnamed: 0,Overall,Low,High
ensemble,0.735849,0.730769,0.740741


In [157]:
loo = LeaveOneOut()

estimators = [
    #('rf', RandomForestClassifier(random_state=seed, max_depth=None)),
    #('xgb', GradientBoostingClassifier())
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('knn3', KNeighborsClassifier(n_neighbors=5)),
    #('knn4', KNeighborsClassifier(n_neighbors=4)),
    #('svm', SVC()),
    #('svm2', SVC(gamma='auto', C=1)),
    #('svm3', SVC(gamma='auto', C=10)),
    #('gpc', GaussianProcessClassifier()),
    #('ada', AdaBoostClassifier(LogisticRegression(solver='liblinear'), n_estimators=100)),
    #('gnb', GaussianNB()),
    #('qda', QuadraticDiscriminantAnalysis())
]
              
ensemble = StackingClassifier(estimators=estimators, passthrough=True, n_jobs=-1,
                              final_estimator=LogisticRegression())

estimator = ensemble

engineered = Pipeline([
        ('preprocessing', preprocessing),                              
        ('kbest', SelectKBest(chi2, k=10)),
        #('pca', PCA(n_components=10)),
        ('estimator', estimator)])

y_pred  = cross_val_predict(engineered, X, y, cv=loo, n_jobs=-1, verbose=1)
ensembled = (results2('ensemble', y, y_pred))
display(ensembled)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    1.6s finished


Unnamed: 0,Overall,Low,High
ensemble,0.703704,0.692308,0.714286


In [54]:
loo = LeaveOneOut()

estimators = [
    #('rf', RandomForestClassifier(random_state=seed, max_depth=None)),
    #('xgb', GradientBoostingClassifier()),
    ('lr', LogisticRegression(C=10)),
    ('lr2', LogisticRegression(C=1)),
    ('lr3', LogisticRegression(C=0.1)),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('knn3', KNeighborsClassifier(n_neighbors=5)),
    ('knn4', KNeighborsClassifier(n_neighbors=4)),
    ('svm', SVC(C=0.1)),
    ('svm2', SVC(gamma='auto', C=1)),
    ('svm3', SVC(gamma='auto', C=10)),
    #('gpc', GaussianProcessClassifier()),
    #('ada', AdaBoostClassifier(LogisticRegression(solver='liblinear'), n_estimators=100)),
    #('gnb', GaussianNB()),
    #('qda', QuadraticDiscriminantAnalysis())
]
              
ensemble = StackingClassifier(estimators=estimators, passthrough=True, n_jobs=-1,
                              final_estimator=LogisticRegression())

estimator = LogisticRegression()

parameters = {
    'C': Real(1e-6, 1e+6, prior='log-uniform'),
}

engineered = Pipeline([
        ('preprocessing', preprocessing),                              
        ('kbest', SelectKBest(chi2, k=10)),
        #('pca', PCA(n_components=10)),
        ('estimator', estimator)])

ensembled = pd.DataFrame()

for i in range(0):
    inner_loop = BayesSearchCV(estimator, parameters, n_iter=10, n_points=1, cv=10, refit=True, n_jobs=-1, verbose=False)
    y_pred  = cross_val_predict(inner_loop, X, y, cv=loo, n_jobs=-1, verbose=1)
    ensembled = ensembled.append(results2('ensemble', y, y_pred))
    

inner_loop = BayesSearchCV(estimator, parameters, n_iter=10, n_points=1, cv=10, refit=True, n_jobs=-1, verbose=False)
inner_loop.fit(X, y)
#outer_loop = cross_validate(inner_loop, X, y, cv=loo, return_train_score=True, 
#                            n_jobs=-1, verbose=1, return_estimator=True)

ensembled = ensembled.append(results2('ensemble', y, y_pred))

display(ensembled)

Unnamed: 0,Overall,Low,High
ensemble,0.54717,0.538462,0.555556


In [56]:
# display(ensembled.mean(axis=0))
#for i in range(53):
#   print(outer_loop['estimator'][i].best_estimator_.get_params()['C'])

inner_loop.cv_results_

defaultdict(list,
            {'split0_test_score': [0.6666666666666666,
              0.5,
              0.6666666666666666,
              0.6666666666666666,
              0.5,
              0.5,
              0.5,
              0.5,
              0.6666666666666666,
              0.6666666666666666],
             'split1_test_score': [0.5,
              0.5,
              0.5,
              0.5,
              0.3333333333333333,
              0.5,
              0.5,
              0.3333333333333333,
              0.3333333333333333,
              0.5],
             'split2_test_score': [0.5,
              0.5,
              0.5,
              0.5,
              0.5,
              0.5,
              0.5,
              0.5,
              0.5,
              0.5],
             'split3_test_score': [0.4,
              0.4,
              0.4,
              0.4,
              0.4,
              0.4,
              0.2,
              0.2,
              0.4,
              0.2],
             '

In [170]:
# loo = LeaveOneOut()

# inner_loop = GridSearchCV(pipeline, parameters, cv=inner_rkf, refit=True, n_jobs=-1, verbose=False)
# inner_loop = RandomizedSearchCV(pipeline, parameters, n_iter=10, cv=inner_rkf, refit=True, n_jobs=-1, verbose=False)
# inner_loop = BayesSearchCV(pipeline, parameters, n_iter=2, n_points=2, cv=10, refit=True, n_jobs=-1, verbose=False)

# outer_loop = cross_validate(pipeline, X, y, cv=loo, return_train_score=True, 
#                             n_jobs=-1, verbose=1, return_estimator=True)
# outer_loop['estimator'][0].best_estimator_

# y_pred_before = cross_val_predict(original, X, y, cv=loo, n_jobs=-1, verbose=1)
#y_pred_after  = cross_val_predict(engineered, X, y, cv=loo, n_jobs=-1, verbose=1)

#--------------------------------------------------------------------------------------

In [421]:
# print("train score: ", np.mean(outer_loop['train_score']))
# print("test score:  ", np.mean(outer_loop['test_score']))

# results(y, y_pred_before)
# results(y, y_pred_after)

# ttest_results(y, y_pred_before, y_pred_after)

# To Do:
2. model names
3. parmeter set for each model
4. statistical analysis

In [102]:
from sklearn.manifold import TSNE
# X_embedded = TSNE(n_components=2).fit_transform(X)

engineered = Pipeline([
    ('preprocessing', preprocessing),                              
    ('feature_selection', SelectKBest(chi2, k=10)),
    #('estimator', estimator)
])



X_embedded = engineered.fit_transform(X, y)
X_embedded = PCA(n_components=2).fit_transform(X)

# fig = go.Figure(data=[go.Scatter3d(x=X_embedded[:, 0], y=X_embedded[:, 1], z=X_embedded[:, 2],
#                                    mode='markers', marker=dict(size=5, color=y, opacity=0.8))])
fig = go.Figure(data=[go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1],
                                  mode='markers', marker=dict(size=5, color=y, opacity=0.8))])
# fig.show()

engineered = Pipeline([
    ('preprocessing', preprocessing),                              
    ('feature_selection', feature_selection),
    #('dim_reduction', dim_reduction)
])



In [60]:
d = pd.DataFrame()
d.loc['1', '3']=3
d

d.to_csv(r'C:\Users\Alire\Desktop\DF.csv', index=True)