# Project

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import uniform, randint, ttest_rel, ttest_ind

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px
import cufflinks as cf

# from pandas_profiling import ProfileReport

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif 
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, SelectFromModel, RFE

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import LeaveOneOut, LeaveOneGroupOut
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

#preprocessing:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, normalize, scale, Normalizer, MinMaxScaler, FunctionTransformer

# models:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import StackingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
# Run configuration -- adjust these switches before executing the notebook.

Across = True                           # (True = Across, False = Within)
CA = True                               # (True = Cognitive Ability, False = Task Performance)

run = True                              # whether to perform training and cross-validation
tune = False                            # whether to perform hyper-parameter tuning
n_rep = 1                               # repetition count read by the evaluation cells
cognitive_ability = 'Meara'             # choices: ('Meara', 'BarChartLit', 'VerbalWM_longest')
performance_measure = 'mmd_accuracy'    # choices: ('mmd_accuracy', 'mmd_task_time')

#------------------------------------------------------------------------------------------------
# Loading the datasets: one CSV per window, stored in `datasets` under the
# 1-based window number. The three scenarios differ only in the number of
# windows, the target column, the result prefix and the file location.

datasets = {}

if Across:
    # Across tasks (always a cognitive-ability target)
    n_windows = 15
    target = cognitive_ability
    result_path = "Results/A_" + str(target)
    _data_dir, _file_stem = "Data/Data_Across_CA/", "/Task_"

elif CA:
    # Within task, cognitive ability
    n_windows = 29
    target = cognitive_ability
    result_path = "Results/W_" + str(target)
    _data_dir, _file_stem = "Data/Data_Within_CA/", "/Cognitive_"

else:
    # Within task, task performance
    n_windows = 29
    target = performance_measure
    result_path = "Results/W_" + str(target)
    _data_dir, _file_stem = "Data/Data_Within_TP/", "/Time_"

for window in range(1, n_windows + 1):
    path = _data_dir + str(target) + _file_stem + str(window) + ".csv"
    datasets[window] = pd.read_csv(path)
        
#------------------------------------------------------------------------------------------------
# display(df)
# print(df.shape)

## Functions

In [3]:
#--------------------------------------------------------------------------------------

def result(df):
    """Display, for every classifier, its mean accuracies at its best window.

    The mean ``Overall_Acc`` is computed per (Classifier, Window) pair; for
    each classifier the rows belonging to the window(s) with the highest mean
    are kept and averaged. If several windows tie for the best mean, their
    rows are averaged together (so ``Window`` may come out fractional).

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with at least the columns ``Classifier``, ``Window``,
        ``Overall_Acc``, ``Low_Acc`` and ``High_Acc``. It is not modified.

    Returns
    -------
    None
        The summary table (indexed by classifier) is shown with ``display``.
    """
    shown_columns = ['Window', 'Overall_Acc', 'Low_Acc', 'High_Acc']

    # assign() returns a new frame, so the caller's table does not silently
    # gain a helper column.
    scored = df.assign(
        mean_Overall=df.groupby(['Classifier', 'Window'])['Overall_Acc'].transform('mean'))

    best_mean = scored.groupby('Classifier')['mean_Overall'].transform('max')
    at_best_window = scored['mean_Overall'] == best_mean

    # Pick the numeric columns *before* averaging: the table also carries
    # string columns (e.g. "CA/TP") that mean() cannot aggregate.
    summary = scored.loc[at_best_window].groupby('Classifier')[shown_columns].mean()

    display(summary)

#--------------------------------------------------------------------------------------

def save(phase, df):
    """Write a results table to ``<result_path>_<phase>.csv``, index included.

    The parent directory of the target file is created when it is missing, so
    a finished cross-validation run is not lost to an absent results folder.

    Parameters
    ----------
    phase : str
        Suffix identifying the experiment phase; appended to the module-level
        ``result_path``.
    df : pandas.DataFrame
        Table to write.
    """
    import os

    path = (result_path + "_" + phase + ".csv")

    parent = os.path.dirname(path)
    if parent:  # empty when result_path has no directory component
        os.makedirs(parent, exist_ok=True)

    df.to_csv(path, index=True)
    
def read(phase):
    """Load the results CSV of `phase`, using its first column as the index.

    The file name is the module-level ``result_path`` followed by
    ``_<phase>.csv``.
    """
    file_suffix = "_" + phase + ".csv"
    stored = pd.read_csv(result_path + file_suffix, index_col=0)
    return stored

#--------------------------------------------------------------------------------------

def plot_classifier(df, classifier):
    """Plot one classifier's accuracies against the window number.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with the columns ``Classifier``, ``Window``,
        ``Overall_Acc``, ``Low_Acc`` and ``High_Acc``.
    classifier : str
        Value of the ``Classifier`` column to plot; also used as the title.

    Notes
    -----
    Relies on the ``DataFrame.iplot`` method, which must already be available
    on the frame.
    """
    metrics = ['Overall_Acc', 'Low_Acc', 'High_Acc']
    rows = df.loc[df['Classifier'] == classifier]

    # Index by window so the axis labelled "Window" really shows the window
    # number instead of the row position in the results table; repetitions of
    # the same window are averaged into a single point.
    data = rows.groupby('Window')[metrics].mean()

    fig = data.iplot(asFigure=True, xTitle='Window', yTitle='Accuracy',
                     title=classifier, legend='top', theme='white')
    fig.show()
    
#--------------------------------------------------------------------------------------

## Classifiers

In [4]:
# models:

# Estimators under comparison, keyed by the short name that ends up in the
# "Classifier" column of the result tables. Insertion order is the order in
# which the evaluation loops visit them.
classifiers = dict(
    RF=RandomForestClassifier(n_jobs=-1),
    GB=GradientBoostingClassifier(),
    XGB=XGBClassifier(n_jobs=-1),
    LR=LogisticRegression(),
    SVM=LinearSVC(dual=False),
    KNN=KNeighborsClassifier(n_jobs=-1),
)

## Pipelines

In [5]:
# Shared preprocessing chain: drop zero-variance features, standardise, then
# rescale with MinMaxScaler. The step name 'vart' is looked up by name when
# feature names are recovered after fitting.
preprocessing = Pipeline(steps=[
    ('vart', VarianceThreshold(threshold=0.0)),
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler()),
])

# Feature selection: keep the top 40 percent of features as ranked by
# SelectPercentile's default scoring function.
feature_selection = Pipeline(steps=[
    ('sp', SelectPercentile(percentile=40)),
])

## Baseline

In [133]:
# baseline result
# Features and labels of the first window.
df = datasets[1]
y = df[target]
X = df.drop(columns=[target])

# Baseline score: accuracy of always predicting the most frequent class,
# measured on the same data the dummy model was fitted on.
DC = DummyClassifier(strategy="most_frequent").fit(X, y)
baseline = DC.score(X, y)
print(baseline)

0.5


## Original

In [139]:
# "Original" phase: every classifier is cross-validated on every window with
# no preprocessing beyond removing constant features.
phase = 'Original'

loo = LeaveOneOut()
logo = LeaveOneGroupOut()

columns = ["CA/TP", "Classifier", "Window", "Repetition", "Overall_Acc", "Low_Acc", "High_Acc"]
original = pd.DataFrame(columns=columns)

if (run):

    # Result rows are gathered in a plain list and turned into a frame once
    # at the end: DataFrame.append copied the whole frame on every call and
    # no longer exists in pandas 2.x.
    rows = []

    for name, estimator in classifiers.items():

        print(name)
        # RF and GB are evaluated n_rep times per window; every other
        # classifier once.
        repetitions = n_rep if name in ['RF', 'GB'] else 1

        for window in range(1, n_windows + 1):

            for rep in range(repetitions):

                print(window, end="\r", flush=True)

                df = datasets[window]

                pipeline = Pipeline([
                    ('vart', VarianceThreshold(threshold=0.0)),
                    ('estimator', estimator)])

                # evaluation
                if (Across):
                    # leave-one-out over the rows of the window's dataset
                    X = df.drop(columns=[target])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=loo, n_jobs=-1, verbose=0)

                else:
                    # leave-one-group-out, with rows grouped by the "Uid" column
                    groups = df["Uid"]
                    X = df.drop(columns=[target, "Sc_id", "Uid"])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=logo.split(X, y, groups=groups), n_jobs=-1, verbose=0)

                report  = classification_report(y, y_pred, output_dict=True)

                # results: per-class accuracy is the recall of that class
                # (assumes the target is encoded as 0 / 1 -- report keys are
                # the string form of the labels)
                rows.append({"CA/TP"       : target,
                             "Classifier"  : name,
                             "Window"      : window,
                             "Repetition"  : rep,
                             "Overall_Acc" : report['accuracy'],
                             "Low_Acc"     : report['0']['recall'],
                             "High_Acc"    : report['1']['recall']
                            })

    original = pd.DataFrame(rows, columns=columns)
    save(phase=phase, df=original)

In [140]:
# Load the stored results of the current phase and summarise them. Only a
# missing results file is reported as such; any other failure (e.g. inside
# result()) is allowed to surface instead of being mislabelled.
try:
    df = read(phase=phase)

except FileNotFoundError:
    print("File not found!")

else:
    # display(df.groupby(['Classifier', 'Window']).mean())
    # display(df.groupby(['Classifier', 'Window']).std())

    result(df)
    # plot_classifier(df, 'SVM')

File not found!


## Pre-Processing (PP)

In [141]:
# "PP" phase: every classifier is cross-validated on every window behind the
# shared preprocessing pipeline.
phase = 'PP'

loo = LeaveOneOut()
logo = LeaveOneGroupOut()

columns = ["CA/TP", "Classifier", "Window", "Repetition", "Overall_Acc", "Low_Acc", "High_Acc"]
pp = pd.DataFrame(columns=columns)

if (run):

    # Result rows are gathered in a plain list and turned into a frame once
    # at the end: DataFrame.append copied the whole frame on every call and
    # no longer exists in pandas 2.x.
    rows = []

    for name, estimator in classifiers.items():

        print(name)
        # RF and GB are evaluated n_rep times per window; every other
        # classifier once.
        repetitions = n_rep if name in ['RF', 'GB'] else 1

        for window in range(1, n_windows + 1):

            for rep in range(repetitions):

                print(window, end="\r", flush=True)

                df = datasets[window]

                pipeline = Pipeline([
                    ('PP', preprocessing),
                    ('estimator', estimator)])

                # evaluation
                if (Across):
                    # leave-one-out over the rows of the window's dataset
                    X = df.drop(columns=[target])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=loo, n_jobs=-1, verbose=0)

                else:
                    # leave-one-group-out, with rows grouped by the "Uid" column
                    groups = df["Uid"]
                    X = df.drop(columns=[target, "Sc_id", "Uid"])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=logo.split(X, y, groups=groups), n_jobs=-1, verbose=0)

                report  = classification_report(y, y_pred, output_dict=True)

                # results: per-class accuracy is the recall of that class
                # (assumes the target is encoded as 0 / 1 -- report keys are
                # the string form of the labels)
                rows.append({"CA/TP"       : target,
                             "Classifier"  : name,
                             "Window"      : window,
                             "Repetition"  : rep,
                             "Overall_Acc" : report['accuracy'],
                             "Low_Acc"     : report['0']['recall'],
                             "High_Acc"    : report['1']['recall']
                            })

    pp = pd.DataFrame(rows, columns=columns)
    save(phase=phase, df=pp)

In [142]:
# Load the stored results of the current phase and summarise them. Only a
# missing results file is reported as such; any other failure (e.g. inside
# result()) is allowed to surface instead of being mislabelled.
try:
    df = read(phase=phase)

except FileNotFoundError:
    print("File not found!")

else:
    # display(df.groupby(['Classifier', 'Window']).mean())
    # display(df.groupby(['Classifier', 'Window']).std())

    result(df)
    # plot_classifier(df, 'SVM')

File not found!


## Pre-Processing + Feature Selection (PP + FS)

In [143]:
# "PP+FS" phase: every classifier is cross-validated on every window behind
# the shared preprocessing and feature-selection pipelines.
phase = 'PP+FS'

loo = LeaveOneOut()
logo = LeaveOneGroupOut()

columns = ["CA/TP", "Classifier", "Window", "Repetition", "Overall_Acc", "Low_Acc", "High_Acc"]
ppfs = pd.DataFrame(columns=columns)

if (run):

    # Result rows are gathered in a plain list and turned into a frame once
    # at the end: DataFrame.append copied the whole frame on every call and
    # no longer exists in pandas 2.x.
    rows = []

    for name, estimator in classifiers.items():

        print(name)
        # RF and GB are evaluated n_rep times per window; every other
        # classifier once.
        repetitions = n_rep if name in ['RF', 'GB'] else 1

        for window in range(1, n_windows + 1):

            for rep in range(repetitions):

                print(window, end="\r", flush=True)

                df = datasets[window]

                pipeline = Pipeline([
                    ('PP', preprocessing),
                    ('FS', feature_selection),
                    ('estimator', estimator)])

                # evaluation
                if (Across):
                    # leave-one-out over the rows of the window's dataset
                    X = df.drop(columns=[target])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=loo, n_jobs=-1, verbose=0)

                else:
                    # leave-one-group-out, with rows grouped by the "Uid" column
                    groups = df["Uid"]
                    X = df.drop(columns=[target, "Sc_id", "Uid"])
                    y = df[target]
                    y_pred  = cross_val_predict(pipeline, X, y, cv=logo.split(X, y, groups=groups), n_jobs=-1, verbose=0)

                report  = classification_report(y, y_pred, output_dict=True)

                # results: per-class accuracy is the recall of that class
                # (assumes the target is encoded as 0 / 1 -- report keys are
                # the string form of the labels)
                rows.append({"CA/TP"       : target,
                             "Classifier"  : name,
                             "Window"      : window,
                             "Repetition"  : rep,
                             "Overall_Acc" : report['accuracy'],
                             "Low_Acc"     : report['0']['recall'],
                             "High_Acc"    : report['1']['recall']
                            })

    ppfs = pd.DataFrame(rows, columns=columns)
    save(phase=phase, df=ppfs)

In [144]:
# Load the stored results of the current phase and summarise them. Only a
# missing results file is reported as such; any other failure (e.g. inside
# result()) is allowed to surface instead of being mislabelled.
try:
    df = read(phase=phase)

except FileNotFoundError:
    print("File not found!")

else:
    # display(df.groupby(['Classifier', 'Window']).mean())
    # display(df.groupby(['Classifier', 'Window']).std())

    result(df)
    # plot_classifier(df, 'SVM')

File not found!


## Hyperparameter optimization

In [145]:
# hyper-parameter distributions

# Search spaces for BayesSearchCV. Keys carry the 'estimator__' prefix because
# the tuned object is a Pipeline whose final step is named 'estimator'.

RF_dist = {
    'estimator__n_estimators'  : Integer(10, 500),
    'estimator__max_depth'     : Integer(1, 15)}

# The solver is part of the search space because LogisticRegression's default
# solver ('lbfgs') rejects the 'l1' penalty; 'liblinear' and 'saga' both
# accept 'l1' and 'l2', so every sampled combination is valid.
LR_dist = {
    'estimator__C'             : Real(1e-6, 1e+6, prior='log-uniform'),
    'estimator__penalty'       : Categorical(['l1', 'l2']),
    'estimator__solver'        : Categorical(['liblinear', 'saga'])}


SVM_dist = {
    'estimator__C'             : Real(1e-6, 1e+6, prior='log-uniform')}

# 'gamma' is an XGBoost parameter; sklearn's GradientBoostingClassifier has no
# such parameter and set_params() would reject it, so it is not searched here.
# NOTE(review): newer scikit-learn releases name the 'deviance' loss
# 'log_loss' -- adjust to match the installed version.
GB_dist = {
    'estimator__loss'          : Categorical(['deviance', 'exponential']),
    'estimator__max_depth'     : Integer(1, 10),
    'estimator__learning_rate' : Real(0.05, 0.30)}

XGB_dist = {
    'estimator__max_depth'     : Integer(1, 10),
    'estimator__gamma'         : Real(0, 0.5),
    'estimator__learning_rate' : Real(0.05, 0.30)}

KNN_dist = {
    'estimator__n_neighbors'   : Integer(1,10)}

# Lookup by the same short names used as keys of `classifiers`.
distributions = {
    'RF'  : RF_dist,
    'LR'  : LR_dist,
    'SVM' : SVM_dist,
    'GB'  : GB_dist,
    'XGB' : XGB_dist,
    'KNN' : KNN_dist,}

#--------------------------------------------------------------------------------------

In [146]:
# "tuned" phase: nested cross-validation. The inner loop (BayesSearchCV,
# 10-fold) tunes the hyper-parameters; the outer loop (leave-one-out or
# leave-one-group-out) produces the predictions that are scored.
phase = 'tuned'

loo = LeaveOneOut()
logo = LeaveOneGroupOut()

repetitions = n_rep
columns = ["CA/TP", "Classifier", "Repetition", "Overall_Acc", "Low_Acc", "High_Acc"]
tuned = pd.DataFrame(columns=columns)

if (tune):

    # Result rows are gathered in a plain list and turned into a frame once
    # at the end: DataFrame.append copied the whole frame on every call and
    # no longer exists in pandas 2.x.
    rows = []

    for name, estimator in classifiers.items():

        # Only the random forest is tuned.
        if (name == "RF"):

            for rep in range(repetitions):

                print(rep, end="\r", flush=True)

                # Tuning is performed on a single, hard-coded window.
                best_window = 1
                df = datasets[best_window]

                pipeline = Pipeline([
                        ('PP', preprocessing),
                        ('FS', feature_selection),
                        ('estimator', estimator)])

                parameters = distributions[name]

                # evaluation
                inner_loop = BayesSearchCV(pipeline, parameters, n_iter=10, n_points=5, cv=10, refit=True, n_jobs=-1, verbose=0)

                if (Across):
                    # leave-one-out over the rows of the window's dataset
                    X = df.drop(columns=[target])
                    y = df[target]
                    y_pred  = cross_val_predict(inner_loop, X, y, cv=loo, n_jobs=-1, verbose=0)

                else:
                    # leave-one-group-out, with rows grouped by the "Uid" column
                    groups = df["Uid"]
                    X = df.drop(columns=[target, "Sc_id", "Uid"])
                    y = df[target]
                    y_pred  = cross_val_predict(inner_loop, X, y, cv=logo.split(X, y, groups=groups), n_jobs=-1, verbose=0)

                report  = classification_report(y, y_pred, output_dict=True)

                # results: per-class accuracy is the recall of that class
                # (assumes the target is encoded as 0 / 1 -- report keys are
                # the string form of the labels)
                rows.append({"CA/TP"       : target,
                             "Classifier"  : name,
                             "Repetition"  : rep,
                             "Overall_Acc" : report['accuracy'],
                             "Low_Acc"     : report['0']['recall'],
                             "High_Acc"    : report['1']['recall']
                            })

    tuned = pd.DataFrame(rows, columns=columns)
    save(phase=phase, df=tuned)

In [148]:
# Load and show the stored results of the tuning phase. Only a missing
# results file is reported as such; other errors are allowed to surface.
try:
    df = read(phase=phase)

except FileNotFoundError:
    print("File not found!")

else:
    # A bare expression inside a try block is never rendered, so the table is
    # displayed explicitly.
    display(df)

File not found!


## Feature importance

In [24]:
# Feature importance: fit one classifier (behind the preprocessing pipeline)
# on the full data of a single window and list its ten most influential
# features.

Across = True                           # (True = Across, False = Within)
CA = True                               # (True = Cognitive Ability, False = Task Performance)

cognitive_ability = 'Meara'             # choices: ('Meara', 'BarChartLit', 'VerbalWM_longest')
performance_measure = 'mmd_accuracy'    # choices: ('mmd_accuracy', 'mmd_task_time')

#-----------------------------------------------------------------------------------------------
name = 'RF'       # choices = ('RF', 'GB', 'XGB', 'LR', 'SVM')
best_window = 4

# Tree ensembles expose feature_importances_, linear models expose coef_.
# Anything else (e.g. 'KNN') offers neither, so fail early with a clear
# message instead of a NameError after the model has been fitted.
tree_models = ['RF', 'GB', 'XGB']
linear_models = ['LR', 'SVM']
if name not in tree_models + linear_models:
    raise ValueError(
        f"Feature importance is not available for classifier {name!r}; "
        f"choose one of {tree_models + linear_models}")

#-----------------------------------------------------------------------------------------------
estimator = classifiers[name]
pipeline = Pipeline([
                    ('PP', preprocessing),
                    ('estimator', estimator)])

if (Across):
    target = cognitive_ability
    scope = f"Across_{cognitive_ability}"
    df = pd.read_csv("Data/Data_Across_CA/"+ str(target) + "/Task_" + str(best_window) + ".csv")
    X = df.drop(columns=[target])
    y = df[target]

else:
    if (CA):
        target = cognitive_ability
        scope = f"Within_{cognitive_ability}"
        df = pd.read_csv("Data/Data_Within_CA/"+ str(target) + "/Cognitive_" + str(best_window) + ".csv")
    else:
        target = performance_measure
        scope = f"Within_{performance_measure}"
        df = pd.read_csv("Data/Data_Within_TP/"+ str(target) + "/Time_" + str(best_window) + ".csv")

    X = df.drop(columns=[target, "Sc_id", "Uid"])
    y = df[target]

pipeline.fit(X, y)

# The variance threshold may have dropped columns, so the fitted estimator's
# features are mapped back to the names of the columns that survived it.
index = X.columns[pipeline['PP']['vart'].get_support()]

if name in tree_models:
    data = pipeline['estimator'].feature_importances_
    importances = pd.DataFrame(data=data, index=index, columns=["Importance"])
    features = importances.sort_values(by="Importance", ascending=False)

else:  # linear models: rank by absolute coefficient
    data = pipeline['estimator'].coef_.ravel()
    coefficients = pd.DataFrame(data=data, index=index, columns=["coef"])
    coefficients["|coef|"] = coefficients["coef"].abs()
    features = coefficients.sort_values(by="|coef|", ascending=False)

print(f"Top features for ({scope}) \n\tClassifier: {name} \n\tWindow: {best_window}")

display(features.head(10))

Top features for (Aross_Meara) 
	Classifier: RF 
	Window: 4


Unnamed: 0,Importance
labels_fixationrate,0.026535
Relevant.bars_stddevfixationduration,0.022477
Refs_proportiontime,0.02072
labels_meanfixationduration,0.02027
Text_longestfixation,0.019693
meansaccadeduration,0.018248
endpupilsize,0.016816
Relevant.bars_meanpupilvelocity,0.016751
meanfixationduration,0.015649
Viz_meanpupilvelocity,0.015582
