# Initial set up

In [1]:
%pip install scikit-posthocs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-posthocs
  Downloading scikit_posthocs-0.7.0-py3-none-any.whl (38 kB)
Installing collected packages: scikit-posthocs
Successfully installed scikit-posthocs-0.7.0


In [128]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from google.colab import drive
from scipy import stats
import scikit_posthocs as sp
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')
# Switch the working directory to the project folder so that the relative
# paths used below resolve against it.
%cd '/content/drive/My Drive/Programming/Data Science/PES-IMD/Aprendizado de Máquina/Trabalho/Trab_AM/gato-cachorro_classificacao_de_racas/'

# Use seaborn's dark-grid theme for the matplotlib/seaborn figures.
sns.set_theme(style='darkgrid')

Mounted at /content/drive
/content/drive/My Drive/Programming/Data Science/PES-IMD/Aprendizado de Máquina/Trabalho/Trab_AM/gato-cachorro_classificacao_de_racas


In [3]:
# Folder (relative to the current working directory) that holds the
# *_accuracies.csv files read throughout this notebook.
path_to_accuracies = './accuracies/'

# Functions

In [154]:
def plot_avg_accuracies(filename, xlabel):
    """Draw a seaborn line plot of the per-column mean of an accuracies CSV.

    Parameters
    ----------
    filename : path of the CSV to read; it must contain the columns
        'dataset' and 'train/test_type', which are moved into the index.
    xlabel : label written on the x axis.
    """
    accuracies = pd.read_csv(filename).set_index(['dataset', 'train/test_type'])
    column_means = accuracies.describe().loc['mean', :]
    ax = sns.lineplot(x=column_means.index, y=column_means.values)
    ax.set_ylabel('acc')
    ax.set_xlabel(xlabel)
    
def plot_avg_df_accuracies(df_accuracies, xlabel, ylim=None):
    """Draw a seaborn line plot of the mean accuracy of every column.

    Parameters
    ----------
    df_accuracies : DataFrame whose columns are the configurations being
        compared; the innermost column level is used for the x axis.
    xlabel : label written on the x axis.
    ylim : optional (lower, upper) pair for the y axis. When omitted, the
        window (0.30, 0.60) is used and widened just enough to include every
        plotted mean, so that no point is drawn outside the axes.
    """
    means = df_accuracies.describe().loc['mean', :]

    if ylim is None:
        lower, upper = 0.30, 0.60
        margin = 0.02  # breathing room so a widened axis does not touch the line
        lowest, highest = float(means.min()), float(means.max())
        if lowest < lower:
            lower = lowest - margin
        if highest > upper:
            upper = highest + margin
        ylim = (lower, upper)

    plt.figure(figsize=(10, 6))
    ax = sns.lineplot(y=means.values, x=means.index.get_level_values(-1))
    ax.set_ylabel('Acurácia média')
    ax.set_xlabel(xlabel)
    ax.set_ylim(*ylim)

def plotly_avg_df_accuracies(df_accuracies, xlabel, yaxis_range=None):
    """Show an interactive plotly line chart of the mean accuracy per column.

    Parameters
    ----------
    df_accuracies : DataFrame whose columns are the configurations being
        compared; the innermost column level is used for the x axis.
    xlabel : title written on the x axis.
    yaxis_range : optional (lower, upper) pair for the y axis. When omitted,
        the window [0.3, 0.6] is used and widened just enough to include
        every plotted mean; a fixed window would otherwise hide means that
        fall outside it (e.g. values below 0.3).
    """
    df_means = (df_accuracies.describe().loc['mean', :]
                .to_frame()
                .reset_index(level=-1))
    # reset_index puts the former index level first; give it a stable name.
    df_means = df_means.rename(columns={df_means.columns[0]: 'values'})

    if yaxis_range is None:
        lower, upper = 0.3, 0.6
        margin = 0.02  # breathing room so a widened axis does not touch the line
        lowest = float(df_means['mean'].min())
        highest = float(df_means['mean'].max())
        if lowest < lower:
            lower = lowest - margin
        if highest > upper:
            upper = highest + margin
        yaxis_range = (lower, upper)

    fig = px.line(df_means, x='values', y='mean')
    fig.update_layout(yaxis_range=list(yaxis_range), xaxis_title=xlabel,
                      yaxis_title='Acurácia média')
    fig.show()

def friedmanchisquare(df_accuracies):
    """Run SciPy's Friedman test treating every column as one sample.

    Returns whatever `scipy.stats.friedmanchisquare` returns (statistic and
    p-value) for the columns of `df_accuracies`, passed in column order.
    """
    samples = (df_accuracies[name] for name in df_accuracies.columns)
    return stats.friedmanchisquare(*samples)
def posthoc_nemenyi_friedman(df_accuracies):
    """Run scikit-posthocs' Nemenyi post-hoc test on the columns.

    The columns of `df_accuracies` are stacked into a 2-D array with one
    column per configuration and handed to `sp.posthoc_nemenyi_friedman`.
    The returned table is relabelled, on both axes, with the innermost
    column level of `df_accuracies`.
    """
    one_row_per_column = np.array([df_accuracies[name]
                                   for name in df_accuracies.columns])
    p_values = sp.posthoc_nemenyi_friedman(one_row_per_column.T)

    labels = df_accuracies.columns.get_level_values(-1).tolist()
    p_values.index = labels
    p_values.columns = p_values.index
    return p_values

# Knn

In [95]:
# KNN accuracies: two header rows (model, configuration) and a two-level
# row index.
df_knn = pd.read_csv(
    f'{path_to_accuracies}KNN_accuracies.csv',
    header=[0, 1],
    index_col=[0, 1],
)
df_knn

Unnamed: 0_level_0,Unnamed: 1_level_0,KNN,KNN,KNN,KNN,KNN
Unnamed: 0_level_1,Unnamed: 1_level_1,n_neighbors=1,n_neighbors=2,n_neighbors=3,n_neighbors=4,n_neighbors=5
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
normal_d=1764,70/30-HOLDOUT,0.346667,0.29,0.313333,0.263333,0.28
normal_d=1764,80/20-HOLDOUT,0.34,0.325,0.315,0.27,0.275
normal_d=1764,90/10-HOLDOUT,0.32,0.34,0.36,0.34,0.33
normal_d=1764,10-FOLD,0.343,0.333,0.34,0.324,0.326
normal_d=900,70/30-HOLDOUT,0.393333,0.37,0.386667,0.363333,0.353333
normal_d=900,80/20-HOLDOUT,0.43,0.38,0.405,0.38,0.375
normal_d=900,90/10-HOLDOUT,0.48,0.38,0.41,0.5,0.44
normal_d=900,10-FOLD,0.401,0.395,0.372,0.366,0.354
pca_d=459,70/30-HOLDOUT,0.33,0.276667,0.3,0.303333,0.296667
pca_d=459,80/20-HOLDOUT,0.325,0.285,0.305,0.305,0.275


In [96]:
df_knn.describe()

Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN
Unnamed: 0_level_1,n_neighbors=1,n_neighbors=2,n_neighbors=3,n_neighbors=4,n_neighbors=5
count,16.0,16.0,16.0,16.0,16.0
mean,0.353021,0.328208,0.336917,0.330521,0.326583
std,0.049007,0.037662,0.038317,0.05724,0.045815
min,0.293333,0.276667,0.295,0.263333,0.275
25%,0.32375,0.2975,0.31025,0.300833,0.29
50%,0.338,0.328,0.325,0.317,0.327
75%,0.360833,0.3475,0.363,0.353333,0.3535
max,0.48,0.395,0.41,0.5,0.44


## Graphics


In [155]:
plotly_avg_df_accuracies(df_knn,'Número de vizinhos')

## Statistical tests

In [98]:
friedmanchisquare(df_knn)

FriedmanchisquareResult(statistic=17.756410256410255, pvalue=0.0013769638667562607)

The p-value is 0.138%: if the null hypothesis (all configurations come from the same population) were true, a Friedman statistic at least this large would be observed only 0.138% of the time. Since this is below our 5% significance threshold, there is enough evidence to reject the null hypothesis. Thus, we accept that at least one of the five tested configurations differs from the others.

In [99]:
posthoc_nemenyi_friedman(df_knn)

Unnamed: 0,n_neighbors=1,n_neighbors=2,n_neighbors=3,n_neighbors=4,n_neighbors=5
n_neighbors=1,1.0,0.048462,0.580482,0.025375,0.001687
n_neighbors=2,0.048462,1.0,0.675658,0.9,0.834289
n_neighbors=3,0.580482,0.675658,1.0,0.548754,0.14731
n_neighbors=4,0.025375,0.9,0.548754,1.0,0.9
n_neighbors=5,0.001687,0.834289,0.14731,0.9,1.0


Statistically similar pairs:
- (1,3); (2,3); (2,4); (2,5); (3,4); (3,5); (4,5);

Statistically different pairs:
- (1,2); (1,4); (1,5);

## Conclusion

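**Best configuration adopted:** n_neighbors=3 (not statistically different from n_neighbors=1, the configuration with the highest mean accuracy: Nemenyi p = 0.58)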

# Decision Tree

In [100]:
# Decision-tree accuracies: two header rows (model, configuration) and a
# two-level row index.
df_dt = pd.read_csv(
    f'{path_to_accuracies}DT_accuracies.csv',
    header=[0, 1],
    index_col=[0, 1],
)
df_dt

Unnamed: 0_level_0,Unnamed: 1_level_0,DT,DT,DT,DT,DT
Unnamed: 0_level_1,Unnamed: 1_level_1,max_depth=3,max_depth=4,max_depth=5,max_depth=6,max_depth=7
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
normal_d=1764,70/30-HOLDOUT,0.316667,0.316667,0.313333,0.323333,0.28
normal_d=1764,80/20-HOLDOUT,0.3,0.305,0.29,0.355,0.37
normal_d=1764,90/10-HOLDOUT,0.35,0.32,0.31,0.33,0.35
normal_d=1764,10-FOLD,0.309,0.319,0.31,0.311,0.334
normal_d=900,70/30-HOLDOUT,0.296667,0.35,0.346667,0.333333,0.31
normal_d=900,80/20-HOLDOUT,0.335,0.36,0.335,0.34,0.365
normal_d=900,90/10-HOLDOUT,0.31,0.34,0.4,0.44,0.47
normal_d=900,10-FOLD,0.311,0.347,0.377,0.384,0.366
pca_d=459,70/30-HOLDOUT,0.36,0.363333,0.326667,0.356667,0.313333
pca_d=459,80/20-HOLDOUT,0.325,0.335,0.375,0.315,0.315


In [101]:
df_dt.describe()

Unnamed: 0_level_0,DT,DT,DT,DT,DT
Unnamed: 0_level_1,max_depth=3,max_depth=4,max_depth=5,max_depth=6,max_depth=7
count,16.0,16.0,16.0,16.0,16.0
mean,0.325208,0.329333,0.331979,0.338042,0.333375
std,0.0232,0.020813,0.034499,0.035419,0.04601
min,0.296667,0.3,0.28,0.301,0.28
25%,0.30675,0.313,0.31,0.314,0.3115
50%,0.320833,0.325,0.328333,0.331667,0.3235
75%,0.3425,0.34775,0.35325,0.35125,0.35375
max,0.36,0.363333,0.4,0.44,0.47


## Graphics

In [145]:
plotly_avg_df_accuracies(df_dt,'Máxima profundidade da árvore')

## Statistical tests

In [103]:
friedmanchisquare(df_dt)

FriedmanchisquareResult(statistic=2.573248407643347, pvalue=0.6315692897770462)

The p-value is 63.16%: if the null hypothesis (all configurations come from the same population) were true, a Friedman statistic at least this large would be observed 63.16% of the time. Since this is above our 5% significance threshold, there is not enough evidence to reject the null hypothesis. This does not prove that the five tested configurations are the same, but for lack of evidence to the contrary we treat them as equivalent. The choice is therefore dictated by the greatest mean accuracy, which in this case is obtained with max_depth = 6.

## Conclusion

**Best configuration adopted:** max_depth = 6

# Gaussian Naive Bayes

In [104]:
# Gaussian Naive Bayes accuracies: only the first line is used as header and
# file lines 1 and 2 are skipped, leaving a single 'GNB' column.
df_gnb = pd.read_csv(
    f'{path_to_accuracies}GNB_accuracies.csv',
    header=[0],
    skiprows=[1, 2],
    index_col=[0, 1],
)
df_gnb

Unnamed: 0,Unnamed: 1,GNB
normal_d=1764,70/30-HOLDOUT,0.5
normal_d=1764,80/20-HOLDOUT,0.505
normal_d=1764,90/10-HOLDOUT,0.49
normal_d=1764,10-FOLD,0.507
normal_d=900,70/30-HOLDOUT,0.556667
normal_d=900,80/20-HOLDOUT,0.585
normal_d=900,90/10-HOLDOUT,0.63
normal_d=900,10-FOLD,0.537
pca_d=459,70/30-HOLDOUT,0.426667
pca_d=459,80/20-HOLDOUT,0.445


In [105]:
df_gnb.describe()

Unnamed: 0,GNB
count,16.0
mean,0.503896
std,0.054948
min,0.415
25%,0.485
50%,0.5
75%,0.5145
max,0.63


# MLP

## Activation

In [106]:
# MLP accuracies per activation function: two header rows and a two-level
# row index.
df_mlp_actv = pd.read_csv(
    f'{path_to_accuracies}MLP_actv_accuracies.csv',
    header=[0, 1],
    index_col=[0, 1],
)
df_mlp_actv

Unnamed: 0_level_0,Unnamed: 1_level_0,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,Unnamed: 1_level_1,activation=identity,activation=logistic,activation=tanh,activation=relu
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
normal_d=1764,70/30-HOLDOUT,0.493333,0.513333,0.5,0.533333
normal_d=1764,80/20-HOLDOUT,0.485,0.495,0.495,0.54
normal_d=1764,90/10-HOLDOUT,0.51,0.54,0.56,0.59
normal_d=1764,10-FOLD,0.505,0.539,0.542,0.563
normal_d=900,70/30-HOLDOUT,0.5,0.523333,0.506667,0.53
normal_d=900,80/20-HOLDOUT,0.505,0.55,0.545,0.605
normal_d=900,90/10-HOLDOUT,0.5,0.54,0.64,0.64
normal_d=900,10-FOLD,0.495,0.526,0.534,0.569
pca_d=459,70/30-HOLDOUT,0.43,0.456667,0.453333,0.463333
pca_d=459,80/20-HOLDOUT,0.42,0.46,0.445,0.465


In [107]:
df_mlp_actv.describe()

Unnamed: 0_level_0,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,activation=identity,activation=logistic,activation=tanh,activation=relu
count,16.0,16.0,16.0,16.0
mean,0.4765,0.501125,0.516625,0.529688
std,0.027996,0.032028,0.045172,0.053478
min,0.42,0.456667,0.445,0.463333
25%,0.455,0.47,0.49875,0.48425
50%,0.4825,0.494,0.509,0.5295
75%,0.5,0.52925,0.536,0.5645
max,0.51,0.55,0.64,0.64


### Graphics

In [146]:
plotly_avg_df_accuracies(df_mlp_actv,'Activation')

### Statistical test

In [109]:
friedmanchisquare(df_mlp_actv)

FriedmanchisquareResult(statistic=30.74999999999999, pvalue=9.595688624023438e-07)

In [110]:
posthoc_nemenyi_friedman(df_mlp_actv)

Unnamed: 0,activation=identity,activation=logistic,activation=tanh,activation=relu
activation=identity,1.0,0.021029,0.001,0.001
activation=logistic,0.021029,1.0,0.708703,0.077891
activation=tanh,0.001,0.708703,1.0,0.516551
activation=relu,0.001,0.077891,0.516551,1.0


Statistically similar pairs:
- (logistic,tanh); (logistic,relu); (tanh,relu);

Statistically different pairs:
- (identity,logistic); (identity,tanh); (identity,relu);

Thus, it makes no statistical difference whether logistic, tanh or relu is adopted, but the last one yielded the best average accuracy and will be chosen.

## Hidden layer sizes

In [111]:
# MLP accuracies per hidden-layer size.
df_mlp_hls = pd.read_csv(f'{path_to_accuracies}MLP_hls_accuracies.csv',
                         header=[0, 1], index_col=[0, 1])

# Break every second-level header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_mlp_hls.columns.get_level_values(1)]

# First part joins the model name on the top level; the second part, minus
# its 'hidden_layer_sizes=' prefix, becomes the new second level.
df_mlp_hls.columns = pd.MultiIndex.from_tuples([
    (f'{model},{parts[0]}', parts[1].replace('hidden_layer_sizes=', ''))
    for model, parts in zip(df_mlp_hls.columns.get_level_values(0),
                            second_level_split)
])

df_mlp_hls

Unnamed: 0_level_0,Unnamed: 1_level_0,"MLP,activation=relu","MLP,activation=relu","MLP,activation=relu","MLP,activation=relu","MLP,activation=relu"
Unnamed: 0_level_1,Unnamed: 1_level_1,O,OA,A,AT,T
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
normal_d=1764,70/30-HOLDOUT,0.436667,0.533333,0.516667,0.533333,0.513333
normal_d=1764,80/20-HOLDOUT,0.435,0.545,0.525,0.525,0.525
normal_d=1764,90/10-HOLDOUT,0.54,0.55,0.53,0.52,0.53
normal_d=1764,10-FOLD,0.463,0.57,0.559,0.56,0.563
normal_d=900,70/30-HOLDOUT,0.463333,0.59,0.556667,0.526667,0.546667
normal_d=900,80/20-HOLDOUT,0.52,0.57,0.625,0.6,0.605
normal_d=900,90/10-HOLDOUT,0.54,0.57,0.58,0.6,0.62
normal_d=900,10-FOLD,0.48,0.574,0.569,0.574,0.581
pca_d=459,70/30-HOLDOUT,0.323333,0.453333,0.463333,0.45,0.453333
pca_d=459,80/20-HOLDOUT,0.365,0.42,0.42,0.49,0.48


In [112]:
df_mlp_hls.describe()

Unnamed: 0_level_0,"MLP,activation=relu","MLP,activation=relu","MLP,activation=relu","MLP,activation=relu","MLP,activation=relu"
Unnamed: 0_level_1,O,OA,A,AT,T
count,16.0,16.0,16.0,16.0,16.0
mean,0.454646,0.524646,0.527667,0.533875,0.530833
std,0.057216,0.047149,0.04708,0.039045,0.044556
min,0.323333,0.42,0.42,0.45,0.453333
25%,0.43625,0.5,0.51,0.5155,0.51
50%,0.457,0.526167,0.5275,0.531667,0.5255
75%,0.4825,0.57,0.55725,0.55325,0.55075
max,0.54,0.59,0.625,0.6,0.62


### Graphics

In [147]:
plotly_avg_df_accuracies(df_mlp_hls,'Número de neurônios na camada escondida')

### Statistical tests

In [114]:
friedmanchisquare(df_mlp_hls)

FriedmanchisquareResult(statistic=25.440514469453397, pvalue=4.102274603642946e-05)

In [115]:
posthoc_nemenyi_friedman(df_mlp_hls)

Unnamed: 0,O,OA,A,AT,T
O,1.0,0.002591,0.001,0.001,0.001
OA,0.002591,1.0,0.9,0.9,0.9
A,0.001,0.9,1.0,0.9,0.9
AT,0.001,0.9,0.9,1.0,0.9
T,0.001,0.9,0.9,0.9,1.0


Statistically similar pairs:
- (OA,A); (OA,AT); (OA,T); (A,AT); (A,T); (AT,T);

Statistically different pairs:
- (O,OA); (O,A); (O,AT); (O,T);

Thus, it makes no statistical difference whether OA, A, AT or T is adopted, but the first one was the least time-consuming during the training phases, so it was chosen.

## Max Iterations

In [116]:
# MLP accuracies per maximum number of iterations.
df_mlp_max_it = pd.read_csv(f'{path_to_accuracies}MLP_it_accuracies.csv',
                            header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_mlp_max_it.columns.get_level_values(-1)]

# Everything but the last part joins the model name on the top level; the
# last part, minus its 'max_iter=' prefix, becomes the new second level.
df_mlp_max_it.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-1])}", parts[-1].replace('max_iter=', ''))
    for model, parts in zip(df_mlp_max_it.columns.get_level_values(0),
                            second_level_split)
])

df_mlp_max_it

Unnamed: 0_level_0,Unnamed: 1_level_0,"MLP,activation=relu,hidden_layer_sizes=OA","MLP,activation=relu,hidden_layer_sizes=OA","MLP,activation=relu,hidden_layer_sizes=OA"
Unnamed: 0_level_1,Unnamed: 1_level_1,100,1000,5000
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
normal_d=1764,70/30-HOLDOUT,0.52,0.506667,0.536667
normal_d=1764,80/20-HOLDOUT,0.495,0.515,0.545
normal_d=1764,90/10-HOLDOUT,0.54,0.54,0.54
normal_d=1764,10-FOLD,0.564,0.568,0.576
normal_d=900,70/30-HOLDOUT,0.523333,0.546667,0.556667
normal_d=900,80/20-HOLDOUT,0.6,0.605,0.605
normal_d=900,90/10-HOLDOUT,0.61,0.59,0.61
normal_d=900,10-FOLD,0.576,0.571,0.569
pca_d=459,70/30-HOLDOUT,0.366667,0.463333,0.423333
pca_d=459,80/20-HOLDOUT,0.475,0.455,0.5


In [117]:
df_mlp_max_it.describe()

Unnamed: 0_level_0,"MLP,activation=relu,hidden_layer_sizes=OA","MLP,activation=relu,hidden_layer_sizes=OA","MLP,activation=relu,hidden_layer_sizes=OA"
Unnamed: 0_level_1,100,1000,5000
count,16.0,16.0,16.0
mean,0.5205,0.523396,0.529396
std,0.057271,0.048643,0.048526
min,0.366667,0.43,0.423333
25%,0.49375,0.5005,0.4975
50%,0.521667,0.523333,0.528333
75%,0.5475,0.552,0.55975
max,0.61,0.605,0.61


### Graphics

In [148]:
plotly_avg_df_accuracies(df_mlp_max_it,'Número máximo de iterações')

### Statistical tests

In [119]:
friedmanchisquare(df_mlp_max_it)

FriedmanchisquareResult(statistic=0.75, pvalue=0.6872892787909721)

The configurations are not statistically different because p-value = 68.73% > 5%.
Thus, it is statistically indifferent which option is chosen. max_iter will be set to 1000 because, although max_iter=100 is the least time-consuming option during the training phases, it caused some convergence issues.

## Learning rate

In [120]:
# MLP accuracies per initial learning rate.
df_mlp_lr = pd.read_csv(f'{path_to_accuracies}MLP_lr_accuracies.csv',
                        header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_mlp_lr.columns.get_level_values(-1)]

# Everything but the last part joins the model name on the top level; the
# last part, minus its 'learning_rate_init=' prefix, becomes the new second
# level.
df_mlp_lr.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-1])}",
     parts[-1].replace('learning_rate_init=', ''))
    for model, parts in zip(df_mlp_lr.columns.get_level_values(0),
                            second_level_split)
])

df_mlp_lr

Unnamed: 0_level_0,Unnamed: 1_level_0,"MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000","MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000","MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000"
Unnamed: 0_level_1,Unnamed: 1_level_1,0.001,0.01,0.1
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
normal_d=1764,70/30-HOLDOUT,0.53,0.536667,0.486667
normal_d=1764,80/20-HOLDOUT,0.525,0.475,0.465
normal_d=1764,90/10-HOLDOUT,0.56,0.55,0.56
normal_d=1764,10-FOLD,0.577,0.552,0.521
normal_d=900,70/30-HOLDOUT,0.536667,0.526667,0.503333
normal_d=900,80/20-HOLDOUT,0.6,0.585,0.595
normal_d=900,90/10-HOLDOUT,0.58,0.62,0.61
normal_d=900,10-FOLD,0.574,0.558,0.523
pca_d=459,70/30-HOLDOUT,0.463333,0.486667,0.47
pca_d=459,80/20-HOLDOUT,0.425,0.52,0.45


In [121]:
df_mlp_lr.describe()

Unnamed: 0_level_0,"MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000","MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000","MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000"
Unnamed: 0_level_1,0.001,0.01,0.1
count,16.0,16.0,16.0
mean,0.517042,0.528583,0.489583
std,0.053714,0.037993,0.06567
min,0.425,0.475,0.36
25%,0.465333,0.505833,0.46125
50%,0.5275,0.521,0.484833
75%,0.5635,0.5505,0.5215
max,0.6,0.62,0.61


### Graphics

In [149]:
plotly_avg_df_accuracies(df_mlp_lr,'Taxa de aprendizado inicial')

### Statistical tests

In [123]:
friedmanchisquare(df_mlp_lr)

FriedmanchisquareResult(statistic=5.555555555555555, pvalue=0.06217652402211632)

There is no statistical difference (p-value = 6.22% > 5%). So, it is indifferent which value is chosen. Therefore, the default value (0.001) will be adopted.

## Conclusion
**Best configuration adopted:**
- Activation function: relu
- Hidden layer sizes: OA
- Max number of iterations: 1000
- Initial learning rate: 0.001

# Best simple model 

In [124]:
# Put the adopted configuration of each simple model side by side, one
# column per model, then give the columns short names.
df_best_simple_model = pd.concat(
    [
        df_knn[('KNN', 'n_neighbors=3')],
        df_dt[('DT', 'max_depth=6')],
        df_gnb['GNB'],
        df_mlp_lr[('MLP,activation=relu,hidden_layer_sizes=OA,max_iter=1000',
                   '0.001')],
    ],
    axis=1,
)
df_best_simple_model.columns = ['KNN', 'DT', 'GNB', 'MLP']
df_best_simple_model

Unnamed: 0,Unnamed: 1,KNN,DT,GNB,MLP
normal_d=1764,70/30-HOLDOUT,0.313333,0.323333,0.5,0.53
normal_d=1764,80/20-HOLDOUT,0.315,0.355,0.505,0.525
normal_d=1764,90/10-HOLDOUT,0.36,0.33,0.49,0.56
normal_d=1764,10-FOLD,0.34,0.311,0.507,0.577
normal_d=900,70/30-HOLDOUT,0.386667,0.333333,0.556667,0.536667
normal_d=900,80/20-HOLDOUT,0.405,0.34,0.585,0.6
normal_d=900,90/10-HOLDOUT,0.41,0.44,0.63,0.58
normal_d=900,10-FOLD,0.372,0.384,0.537,0.574
pca_d=459,70/30-HOLDOUT,0.3,0.356667,0.426667,0.463333
pca_d=459,80/20-HOLDOUT,0.305,0.315,0.445,0.425


## Graphic

In [156]:
plotly_avg_df_accuracies(df_best_simple_model,'Best models')

## Statistical tests

In [125]:
friedmanchisquare(df_best_simple_model)

FriedmanchisquareResult(statistic=39.0, pvalue=1.7356651904833323e-08)

In [126]:
posthoc_nemenyi_friedman(df_best_simple_model)

Unnamed: 0,KNN,DT,GNB,MLP
KNN,1.0,0.9,0.001,0.001
DT,0.9,1.0,0.001,0.001
GNB,0.001,0.001,1.0,0.9
MLP,0.001,0.001,0.9,1.0


Statistically similar pairs:
- (KNN,DT); (GNB,MLP);

Statistically different pairs:
- (KNN,GNB); (KNN,MLP); (DT,GNB); (DT,MLP);

There are basically two groups: (KNN,DT) and (GNB,MLP). The second one has the best accuracy, and MLP has the best mean accuracy of all.

## Conclusion

**Best simple model adopted:** MLP(relu,OA,1000,0.001)

# Bagging

## KNN

In [160]:
# Bagging-over-KNN accuracies.
df_bagging_knn = pd.read_csv(f'{path_to_accuracies}BAGGING_knn_accuracies.csv',
                             header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_bagging_knn.columns.get_level_values(-1)]

# All but the last two parts join the model name on the top level; the last
# two parts, stripped of their 'n_estimators=' / 'max_features=' prefixes,
# are joined with '/' to form the new second level.
df_bagging_knn.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-2])}",
     '/'.join(parts[-2:]).replace('n_estimators=', '')
                         .replace('max_features=', ''))
    for model, parts in zip(df_bagging_knn.columns.get_level_values(0),
                            second_level_split)
])

df_bagging_knn

Unnamed: 0_level_0,Unnamed: 1_level_0,"BAGGING,base_estimator=KNeighborsClassifier(n_neighbors=3)","BAGGING,base_estimator=KNeighborsClassifier(n_neighbors=3)","BAGGING,base_estimator=KNeighborsClassifier(n_neighbors=3)","BAGGING,base_estimator=KNeighborsClassifier(n_neighbors=3)"
Unnamed: 0_level_1,Unnamed: 1_level_1,10/1.0,10/0.5,20/1.0,20/0.5
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
normal_d=1764,70/30-HOLDOUT,0.29,0.316667,0.306667,0.333333
normal_d=1764,80/20-HOLDOUT,0.285,0.3,0.31,0.285
normal_d=1764,90/10-HOLDOUT,0.35,0.32,0.31,0.31
normal_d=1764,10-FOLD,0.335,0.342,0.34,0.328
normal_d=900,70/30-HOLDOUT,0.363333,0.366667,0.37,0.393333
normal_d=900,80/20-HOLDOUT,0.45,0.41,0.445,0.41
normal_d=900,90/10-HOLDOUT,0.48,0.5,0.52,0.47
normal_d=900,10-FOLD,0.381,0.378,0.386,0.399
pca_d=459,70/30-HOLDOUT,0.283333,0.303333,0.306667,0.283333
pca_d=459,80/20-HOLDOUT,0.32,0.325,0.285,0.32


### Graphics

In [161]:
plotly_avg_df_accuracies(df_bagging_knn,'n_estimators/max_features')

### Statistical tests

In [162]:
friedmanchisquare(df_bagging_knn)

FriedmanchisquareResult(statistic=4.006578947368397, pvalue=0.26075459825947866)

There is no statistical difference between the samples. Thus, the n_estimators/max_features=20/1.0 will be chosen, since it has the best mean accuracy.

### Conclusion

**Best configuration adopted:** n_estimators/max_features=20/1.0

## DT

In [163]:
# Bagging-over-decision-tree accuracies.
df_bagging_dt = pd.read_csv(f'{path_to_accuracies}BAGGING_dt_accuracies.csv',
                            header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_bagging_dt.columns.get_level_values(-1)]

# All but the last two parts join the model name on the top level; the last
# two parts, stripped of their 'n_estimators=' / 'max_features=' prefixes,
# are joined with '/' to form the new second level.
df_bagging_dt.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-2])}",
     '/'.join(parts[-2:]).replace('n_estimators=', '')
                         .replace('max_features=', ''))
    for model, parts in zip(df_bagging_dt.columns.get_level_values(0),
                            second_level_split)
])

df_bagging_dt

Unnamed: 0_level_0,Unnamed: 1_level_0,"BAGGING,base_estimator=DecisionTreeClassifier(max_depth=4)","BAGGING,base_estimator=DecisionTreeClassifier(max_depth=4)","BAGGING,base_estimator=DecisionTreeClassifier(max_depth=4)","BAGGING,base_estimator=DecisionTreeClassifier(max_depth=4)"
Unnamed: 0_level_1,Unnamed: 1_level_1,10/1.0,10/0.5,20/1.0,20/0.5
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
normal_d=1764,70/30-HOLDOUT,0.36,0.403333,0.416667,0.42
normal_d=1764,80/20-HOLDOUT,0.42,0.37,0.42,0.445
normal_d=1764,90/10-HOLDOUT,0.38,0.41,0.5,0.52
normal_d=1764,10-FOLD,0.396,0.417,0.422,0.433
normal_d=900,70/30-HOLDOUT,0.396667,0.413333,0.44,0.423333
normal_d=900,80/20-HOLDOUT,0.48,0.455,0.48,0.515
normal_d=900,90/10-HOLDOUT,0.53,0.49,0.44,0.47
normal_d=900,10-FOLD,0.418,0.413,0.448,0.44
pca_d=459,70/30-HOLDOUT,0.356667,0.37,0.403333,0.386667
pca_d=459,80/20-HOLDOUT,0.395,0.335,0.395,0.38


### Graphics

In [164]:
plotly_avg_df_accuracies(df_bagging_dt,'n_estimators/max_features')

### Statistical tests

In [165]:
friedmanchisquare(df_bagging_dt)

FriedmanchisquareResult(statistic=16.29936305732482, pvalue=0.0009844691861733649)

There is statistical difference between the samples.

In [166]:
posthoc_nemenyi_friedman(df_bagging_dt)

Unnamed: 0,10/1.0,10/0.5,20/1.0,20/0.5
10/1.0,1.0,0.631841,0.283091,0.077891
10/0.5,0.631841,1.0,0.017085,0.0021
20/1.0,0.283091,0.017085,1.0,0.9
20/0.5,0.077891,0.0021,0.9,1.0


Statistically similar pairs:
- (10/1.0,10/0.5); (10/1.0,20/1.0); (10/1.0,20/0.5); (20/1.0,20/0.5);

Statistically different pairs:
- (10/0.5,20/1.0); (10/0.5,20/0.5);

Thus, it's indifferent to adopt 10/1.0, 20/1.0 or 20/0.5, but this last one has the best mean accuracy.

### Conclusion

**Best configuration adopted:** n_estimators/max_features=20/0.5

## GNB

In [167]:
# Bagging-over-Gaussian-Naive-Bayes accuracies.
df_bagging_gnb = pd.read_csv(f'{path_to_accuracies}BAGGING_nb_accuracies.csv',
                             header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_bagging_gnb.columns.get_level_values(-1)]

# All but the last two parts join the model name on the top level; the last
# two parts, stripped of their 'n_estimators=' / 'max_features=' prefixes,
# are joined with '/' to form the new second level.
df_bagging_gnb.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-2])}",
     '/'.join(parts[-2:]).replace('n_estimators=', '')
                         .replace('max_features=', ''))
    for model, parts in zip(df_bagging_gnb.columns.get_level_values(0),
                            second_level_split)
])

df_bagging_gnb

Unnamed: 0_level_0,Unnamed: 1_level_0,"BAGGING,base_estimator=GaussianNB()","BAGGING,base_estimator=GaussianNB()","BAGGING,base_estimator=GaussianNB()","BAGGING,base_estimator=GaussianNB()"
Unnamed: 0_level_1,Unnamed: 1_level_1,10/1.0,10/0.5,20/1.0,20/0.5
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
normal_d=1764,70/30-HOLDOUT,0.513333,0.49,0.49,0.51
normal_d=1764,80/20-HOLDOUT,0.5,0.515,0.51,0.525
normal_d=1764,90/10-HOLDOUT,0.45,0.47,0.5,0.5
normal_d=1764,10-FOLD,0.507,0.5,0.512,0.515
normal_d=900,70/30-HOLDOUT,0.55,0.543333,0.54,0.55
normal_d=900,80/20-HOLDOUT,0.615,0.59,0.565,0.58
normal_d=900,90/10-HOLDOUT,0.63,0.65,0.64,0.63
normal_d=900,10-FOLD,0.534,0.526,0.533,0.53
pca_d=459,70/30-HOLDOUT,0.413333,0.403333,0.416667,0.393333
pca_d=459,80/20-HOLDOUT,0.44,0.445,0.445,0.39


### Graphics

In [168]:
plotly_avg_df_accuracies(df_bagging_gnb,'n_estimators/max_features')

### Statistical tests

In [169]:
friedmanchisquare(df_bagging_gnb)

FriedmanchisquareResult(statistic=0.12080536912749847, pvalue=0.989228825726344)

There is no statistical difference between the samples. Thus, the n_estimators/max_features=10/1.0 will be chosen, since it has the best mean accuracy.

### Conclusion

**Best configuration adopted:** n_estimators/max_features=10/1.0

## MLP

In [170]:
# Bagging-over-MLP accuracies.
df_bagging_mlp = pd.read_csv(f'{path_to_accuracies}BAGGING_mlp_accuracies.csv',
                             header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_bagging_mlp.columns.get_level_values(-1)]

# In this file the two varied parts sit at positions -4 and -3; the first
# part and the last two parts are constant and join the model name on the
# top level.
df_bagging_mlp.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[0:1] + parts[-2:])}",
     '/'.join(parts[-4:-2]).replace('n_estimators=', '')
                           .replace('max_features=', ''))
    for model, parts in zip(df_bagging_mlp.columns.get_level_values(0),
                            second_level_split)
])

df_bagging_mlp

Unnamed: 0_level_0,Unnamed: 1_level_0,"BAGGING,base_estimator=MLPClassifier(max_iter=1000),base_estimator__hidden_layer_sizes=OA,n_jobs=-1","BAGGING,base_estimator=MLPClassifier(max_iter=1000),base_estimator__hidden_layer_sizes=OA,n_jobs=-1","BAGGING,base_estimator=MLPClassifier(max_iter=1000),base_estimator__hidden_layer_sizes=OA,n_jobs=-1","BAGGING,base_estimator=MLPClassifier(max_iter=1000),base_estimator__hidden_layer_sizes=OA,n_jobs=-1"
Unnamed: 0_level_1,Unnamed: 1_level_1,10/1.0,10/0.5,20/1.0,20/0.5
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
normal_d=1764,70/30-HOLDOUT,0.523333,0.526667,0.53,0.54
normal_d=1764,80/20-HOLDOUT,0.565,0.525,0.55,0.545
normal_d=1764,90/10-HOLDOUT,0.51,0.52,0.53,0.54
normal_d=1764,10-FOLD,0.566,0.573,0.572,0.569
normal_d=900,70/30-HOLDOUT,0.583333,0.563333,0.573333,0.563333
normal_d=900,80/20-HOLDOUT,0.61,0.605,0.605,0.61
normal_d=900,90/10-HOLDOUT,0.63,0.61,0.62,0.59
normal_d=900,10-FOLD,0.585,0.579,0.569,0.588
pca_d=459,70/30-HOLDOUT,0.496667,0.456667,0.503333,0.47
pca_d=459,80/20-HOLDOUT,0.515,0.445,0.515,0.495


### Graphics

In [171]:
plotly_avg_df_accuracies(df_bagging_mlp,'n_estimators/max_features')

### Statistical tests

In [172]:
friedmanchisquare(df_bagging_mlp)

FriedmanchisquareResult(statistic=7.838709677419355, pvalue=0.04946539533596238)

There is statistical difference between the samples.

In [173]:
posthoc_nemenyi_friedman(df_bagging_mlp)

Unnamed: 0,10/1.0,10/0.5,20/1.0,20/0.5
10/1.0,1.0,0.107756,0.9,0.747133
10/0.5,0.107756,1.0,0.065622,0.554984
20/1.0,0.9,0.065622,1.0,0.631841
20/0.5,0.747133,0.554984,0.631841,1.0


Statistically similar pairs:
- (10/1.0,10/0.5); (10/1.0,20/1.0); (10/1.0,20/0.5); (10/0.5,20/1.0); (10/0.5,20/0.5); (20/1.0,20/0.5);

Thus, it's indifferent to adopt 10/1.0, 10/0.5, 20/1.0 or 20/0.5, but the first one has the best mean accuracy.

### Conclusion

**Best configuration adopted:** n_estimators/max_features=10/1.0

## Best Bagging

In [203]:
# Put the adopted bagging configuration of each base model side by side,
# one column per base model, then give the columns short names.
df_best_bagging = pd.concat(
    [
        df_bagging_knn[('BAGGING,base_estimator=KNeighborsClassifier(n_neighbors=3)',
                        '20/1.0')],
        df_bagging_dt[('BAGGING,base_estimator=DecisionTreeClassifier(max_depth=4)',
                       '20/0.5')],
        df_bagging_gnb[('BAGGING,base_estimator=GaussianNB()', '10/1.0')],
        df_bagging_mlp[('BAGGING,base_estimator=MLPClassifier(max_iter=1000),base_estimator__hidden_layer_sizes=OA,n_jobs=-1',
                        '10/1.0')],
    ],
    axis=1,
)
df_best_bagging.columns = ['KNN', 'DT', 'GNB', 'MLP']
df_best_bagging

Unnamed: 0_level_0,Unnamed: 1_level_0,KNN,DT,GNB,MLP
dataset,train/test_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
normal_d=1764,70/30-HOLDOUT,0.306667,0.42,0.513333,0.523333
normal_d=1764,80/20-HOLDOUT,0.31,0.445,0.5,0.565
normal_d=1764,90/10-HOLDOUT,0.31,0.52,0.45,0.51
normal_d=1764,10-FOLD,0.34,0.433,0.507,0.566
normal_d=900,70/30-HOLDOUT,0.37,0.423333,0.55,0.583333
normal_d=900,80/20-HOLDOUT,0.445,0.515,0.615,0.61
normal_d=900,90/10-HOLDOUT,0.52,0.47,0.63,0.63
normal_d=900,10-FOLD,0.386,0.44,0.534,0.585
pca_d=459,70/30-HOLDOUT,0.306667,0.386667,0.413333,0.496667
pca_d=459,80/20-HOLDOUT,0.285,0.38,0.44,0.515


In [204]:
plotly_avg_df_accuracies(df_best_bagging,'Best models')

In [205]:
friedmanchisquare(df_best_bagging)

FriedmanchisquareResult(statistic=41.9811320754717, pvalue=4.049286686227919e-09)

In [206]:
posthoc_nemenyi_friedman(df_best_bagging)

Unnamed: 0,KNN,DT,GNB,MLP
KNN,1.0,0.125527,0.001,0.001
DT,0.125527,1.0,0.145912,0.001
GNB,0.001,0.145912,1.0,0.283091
MLP,0.001,0.001,0.283091,1.0


Statistically similar pairs:
- (KNN,DT); (DT,GNB); (GNB,MLP);

Statistically different pairs:
- (KNN,GNB); (KNN,MLP); (DT,MLP);

There is no clearly distinct best bagging configuration, but MLP has the best mean accuracy.

## Conclusion

**Configuration adopted:** MLP

# Boosting

## DT

In [174]:
# Boosting-over-decision-tree accuracies.
df_boosting_dt = pd.read_csv(f'{path_to_accuracies}BOOSTING_dt_accuracies.csv',
                             header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_boosting_dt.columns.get_level_values(-1)]

# All but the last part join the model name on the top level; the last part,
# stripped of its parameter-name prefix, becomes the new second level.
df_boosting_dt.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-1])}",
     '/'.join(parts[-1:]).replace('n_estimators=', '')
                         .replace('max_features=', ''))
    for model, parts in zip(df_boosting_dt.columns.get_level_values(0),
                            second_level_split)
])

df_boosting_dt

Unnamed: 0_level_0,Unnamed: 1_level_0,"BOOSTING,base_estimator=DecisionTreeClassifier(max_depth=4)","BOOSTING,base_estimator=DecisionTreeClassifier(max_depth=4)"
Unnamed: 0_level_1,Unnamed: 1_level_1,10,20
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2
normal_d=1764,70/30-HOLDOUT,0.283333,0.326667
normal_d=1764,80/20-HOLDOUT,0.355,0.33
normal_d=1764,90/10-HOLDOUT,0.27,0.24
normal_d=1764,10-FOLD,0.33,0.319
normal_d=900,70/30-HOLDOUT,0.383333,0.296667
normal_d=900,80/20-HOLDOUT,0.3,0.355
normal_d=900,90/10-HOLDOUT,0.29,0.34
normal_d=900,10-FOLD,0.339,0.339
pca_d=459,70/30-HOLDOUT,0.3,0.28
pca_d=459,80/20-HOLDOUT,0.235,0.195


### Graphics

In [175]:
plotly_avg_df_accuracies(df_boosting_dt,'Número de classificadores')

### Statistical tests

In [176]:
posthoc_nemenyi_friedman(df_boosting_dt)

Unnamed: 0,10,20
10,1.0,0.627173
20,0.627173,1.0


There is no statistical difference but boosting with 10 estimators resulted in a better performance.

### Conclusion

**Best configuration adopted:** n_estimators=10

## GNB

In [177]:
# Boosting-over-Gaussian-Naive-Bayes accuracies.
df_boosting_gnb = pd.read_csv(f'{path_to_accuracies}BOOSTING_nb_accuracies.csv',
                              header=[0, 1], index_col=[0, 1])

# Break every innermost header into its comma-separated parts.
second_level_split = [label.split(',')
                      for label in df_boosting_gnb.columns.get_level_values(-1)]

# All but the last part join the model name on the top level; the last part,
# stripped of its parameter-name prefix, becomes the new second level.
df_boosting_gnb.columns = pd.MultiIndex.from_tuples([
    (f"{model},{','.join(parts[:-1])}",
     '/'.join(parts[-1:]).replace('n_estimators=', '')
                         .replace('max_features=', ''))
    for model, parts in zip(df_boosting_gnb.columns.get_level_values(0),
                            second_level_split)
])

df_boosting_gnb

Unnamed: 0_level_0,Unnamed: 1_level_0,"BOOSTING,base_estimator=GaussianNB()","BOOSTING,base_estimator=GaussianNB()"
Unnamed: 0_level_1,Unnamed: 1_level_1,10,20
dataset,train/test_type,Unnamed: 2_level_2,Unnamed: 3_level_2
normal_d=1764,70/30-HOLDOUT,0.316667,0.363333
normal_d=1764,80/20-HOLDOUT,0.32,0.22
normal_d=1764,90/10-HOLDOUT,0.27,0.16
normal_d=1764,10-FOLD,0.241,0.256
normal_d=900,70/30-HOLDOUT,0.34,0.216667
normal_d=900,80/20-HOLDOUT,0.33,0.29
normal_d=900,90/10-HOLDOUT,0.28,0.19
normal_d=900,10-FOLD,0.274,0.262
pca_d=459,70/30-HOLDOUT,0.216667,0.233333
pca_d=459,80/20-HOLDOUT,0.245,0.22


### Graphics

In [196]:
# Plot the per-column mean accuracy of the GaussianNB boosting runs; the
# string is passed as the helper's `xlabel` argument.
plotly_avg_df_accuracies(df_boosting_gnb,'Número de classificadores')

### Statistical tests

In [179]:
# Pairwise post-hoc comparison between the columns (10 vs 20 estimators).
# NOTE(review): the helper is defined outside this section; presumably the
# Nemenyi test from scikit-posthocs, returning a p-value matrix — confirm.
posthoc_nemenyi_friedman(df_boosting_gnb)

Unnamed: 0,10,20
10,1.0,0.9
20,0.9,1.0


There is no statistically significant difference, but boosting with 10 estimators resulted in better performance.

### Conclusion

**Best configuration adopted:** n_estimators=10

## Best Boosting

In [197]:
# Put the adopted boosting configuration of each base estimator (10
# estimators in both cases) side by side under short column names.
df_best_boosting = pd.concat(
    [
        df_boosting_dt[('BOOSTING,base_estimator=DecisionTreeClassifier(max_depth=4)', '10')],
        df_boosting_gnb[('BOOSTING,base_estimator=GaussianNB()', '10')],
    ],
    axis=1,
    keys=['DT', 'GNB'],  # column labels of the resulting frame
)
df_best_boosting

Unnamed: 0_level_0,Unnamed: 1_level_0,DT,GNB
dataset,train/test_type,Unnamed: 2_level_1,Unnamed: 3_level_1
normal_d=1764,70/30-HOLDOUT,0.283333,0.316667
normal_d=1764,80/20-HOLDOUT,0.355,0.32
normal_d=1764,90/10-HOLDOUT,0.27,0.27
normal_d=1764,10-FOLD,0.33,0.241
normal_d=900,70/30-HOLDOUT,0.383333,0.34
normal_d=900,80/20-HOLDOUT,0.3,0.33
normal_d=900,90/10-HOLDOUT,0.29,0.28
normal_d=900,10-FOLD,0.339,0.274
pca_d=459,70/30-HOLDOUT,0.3,0.216667
pca_d=459,80/20-HOLDOUT,0.235,0.245


In [198]:
# Summary statistics (count, mean, std, quartiles) of the accuracies per base estimator.
df_best_boosting.describe()

Unnamed: 0,DT,GNB
count,16.0,16.0
mean,0.311583,0.282458
std,0.039552,0.042205
min,0.235,0.216667
25%,0.288333,0.251
50%,0.305,0.277
75%,0.33225,0.3175
max,0.383333,0.36


In [199]:
# Pairwise post-hoc comparison between the DT and GNB boosting columns.
# NOTE(review): the helper is defined outside this section; presumably it
# returns a p-value matrix — confirm.
posthoc_nemenyi_friedman(df_best_boosting)

Unnamed: 0,DT,GNB
DT,1.0,0.024449
GNB,0.024449,1.0


There is a statistically significant difference, and DT has the best mean value.

## Conclusion
**Configuration adopted:** DT

# Random forest

In [209]:
# Load the random-forest accuracies (the CSV has two header rows and two
# index columns).
df_rf = pd.read_csv(
    path_to_accuracies + 'RANDOM_FOREST_accuracies.csv',
    header=[0, 1],
    index_col=[0, 1],
)

# Flatten the header to short "<n_estimators>/<criterion>" labels derived
# from the last header level.
df_rf.columns = [
    raw_label.replace('n_estimators=', '')
             .replace('criterion=', '')
             .replace(',', '/')
    for raw_label in df_rf.columns.get_level_values(-1)
]

df_rf

Unnamed: 0_level_0,Unnamed: 1_level_0,10/gini,10/entropy,100/gini,100/entropy
dataset,train/test_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
normal_d=1764,70/30-HOLDOUT,0.37,0.39,0.483333,0.48
normal_d=1764,80/20-HOLDOUT,0.385,0.355,0.53,0.57
normal_d=1764,90/10-HOLDOUT,0.37,0.42,0.59,0.52
normal_d=1764,10-FOLD,0.361,0.362,0.528,0.531
normal_d=900,70/30-HOLDOUT,0.356667,0.346667,0.51,0.51
normal_d=900,80/20-HOLDOUT,0.395,0.415,0.61,0.59
normal_d=900,90/10-HOLDOUT,0.43,0.4,0.6,0.58
normal_d=900,10-FOLD,0.404,0.409,0.542,0.532
pca_d=459,70/30-HOLDOUT,0.246667,0.283333,0.363333,0.326667
pca_d=459,80/20-HOLDOUT,0.305,0.25,0.4,0.405


In [210]:
# Summary statistics (count, mean, std, quartiles) of the accuracies per random-forest configuration.
df_rf.describe()

Unnamed: 0,10/gini,10/entropy,100/gini,100/entropy
count,16.0,16.0,16.0,16.0
mean,0.347729,0.347208,0.497854,0.491042
std,0.048227,0.055496,0.073649,0.070989
min,0.246667,0.24,0.363333,0.326667
25%,0.3225,0.32325,0.45,0.445
50%,0.357333,0.3515,0.505,0.51
75%,0.37375,0.3925,0.533,0.53125
max,0.43,0.42,0.61,0.59


## Graphics

In [211]:
# Plot the per-column mean accuracy of the random-forest runs; the string is
# passed as the helper's `xlabel` argument.
plotly_avg_df_accuracies(df_rf,'n_estimators/criterion')

## Statistical tests

In [212]:
# Omnibus test across the four random-forest configurations.
# NOTE(review): `friedmanchisquare` is defined outside this section;
# presumably a wrapper around scipy.stats.friedmanchisquare that accepts a
# DataFrame — confirm.
friedmanchisquare(df_rf)

FriedmanchisquareResult(statistic=39.38216560509556, pvalue=1.440444914047935e-08)

In [213]:
# Pairwise post-hoc comparison between the random-forest configurations.
# NOTE(review): the helper is defined outside this section; presumably it
# returns a p-value matrix — confirm.
posthoc_nemenyi_friedman(df_rf)

Unnamed: 0,10/gini,10/entropy,100/gini,100/entropy
10/gini,1.0,0.9,0.001,0.001
10/entropy,0.9,1.0,0.001,0.001
100/gini,0.001,0.001,1.0,0.9
100/entropy,0.001,0.001,0.9,1.0


Statistically similar pairs:
- (10/gini,10/entropy); (100/gini,100/entropy);

Statistically different pairs:
- (10/gini,100/gini); (10/gini,100/entropy); (10/entropy,100/gini); (10/entropy,100/entropy);

Thus, choosing between 100/gini and 100/entropy makes no statistically significant difference, but the first one has the best mean accuracy.

## Conclusion

**Best configuration adopted:** n_estimators/criterion=100/gini

# Stacking

In [218]:
# Load the stacking accuracies (the CSV has two header rows and two index
# columns).
df_stacking = pd.read_csv(
    path_to_accuracies + 'STACKING_accuracies.csv',
    header=[0, 1],
    index_col=[0, 1],
)

# Replace the two-level header with one short label per stacking
# configuration. The assignment is positional, so it assumes the CSV holds
# exactly these three columns in this order — verify against the file.
df_stacking.columns = [
    'MLP,KNN,GNB',
    '5MLP,5KNN',
    '9MLP,10KNN,GNB',
]

df_stacking

Unnamed: 0_level_0,Unnamed: 1_level_0,"MLP,KNN,GNB","5MLP,5KNN","9MLP,10KNN,GNB"
dataset,train/test_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal_d=1764,70/30-HOLDOUT,0.536667,0.51,0.503333
normal_d=1764,80/20-HOLDOUT,0.53,0.515,0.515
normal_d=1764,90/10-HOLDOUT,0.57,0.58,0.59
normal_d=1764,10-FOLD,0.562,0.581,0.552
normal_d=900,70/30-HOLDOUT,0.543333,0.583333,0.576667
normal_d=900,80/20-HOLDOUT,0.64,0.61,0.585
normal_d=900,90/10-HOLDOUT,0.65,0.65,0.67
normal_d=900,10-FOLD,0.587,0.587,0.577
pca_d=459,70/30-HOLDOUT,0.486667,0.466667,0.453333
pca_d=459,80/20-HOLDOUT,0.51,0.465,0.47


## Graphics

In [219]:
# Plot the per-column mean accuracy of the stacking runs; the string is
# passed as the helper's `xlabel` argument.
plotly_avg_df_accuracies(df_stacking,'Estimadores utilizados')

## Statistical tests

In [220]:
# Omnibus test across the three stacking configurations.
# NOTE(review): `friedmanchisquare` is defined outside this section;
# presumably a wrapper around scipy.stats.friedmanchisquare that accepts a
# DataFrame — confirm.
friedmanchisquare(df_stacking)

FriedmanchisquareResult(statistic=3.0, pvalue=0.22313016014842982)

There is no statistically significant difference. It's better to adopt the first configuration with 3 estimators, since it has fewer estimators and the best mean accuracy.

**Chosen**: MLP,KNN,GNB

## Conclusion

**Best configuration adopted:** estimators=MLP,KNN,GNB

# Best ensemble

In [223]:
# Collect the adopted configuration of every ensemble family into a single
# frame, one column per family.
df_best_ensemble = pd.concat(
    [
        df_best_bagging['MLP'],
        df_best_boosting['DT'],
        df_rf['100/gini'],
        df_stacking['MLP,KNN,GNB'],
    ],
    axis=1,
    keys=['bagging', 'boosting', 'random_forest', 'stacking'],  # column labels
)
df_best_ensemble

Unnamed: 0_level_0,Unnamed: 1_level_0,bagging,boosting,random_forest,stacking
dataset,train/test_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
normal_d=1764,70/30-HOLDOUT,0.523333,0.283333,0.483333,0.536667
normal_d=1764,80/20-HOLDOUT,0.565,0.355,0.53,0.53
normal_d=1764,90/10-HOLDOUT,0.51,0.27,0.59,0.57
normal_d=1764,10-FOLD,0.566,0.33,0.528,0.562
normal_d=900,70/30-HOLDOUT,0.583333,0.383333,0.51,0.543333
normal_d=900,80/20-HOLDOUT,0.61,0.3,0.61,0.64
normal_d=900,90/10-HOLDOUT,0.63,0.29,0.6,0.65
normal_d=900,10-FOLD,0.585,0.339,0.542,0.587
pca_d=459,70/30-HOLDOUT,0.496667,0.3,0.363333,0.486667
pca_d=459,80/20-HOLDOUT,0.515,0.235,0.4,0.51


In [227]:
# Summary statistics (count, mean, std, quartiles) of the accuracies per ensemble family.
df_best_ensemble.describe()

Unnamed: 0,bagging,boosting,random_forest,stacking
count,16.0,16.0,16.0,16.0
mean,0.549083,0.311583,0.497854,0.550938
std,0.038406,0.039552,0.073649,0.045269
min,0.496667,0.235,0.363333,0.486667
25%,0.51875,0.288333,0.45,0.5225
50%,0.5455,0.305,0.505,0.54
75%,0.570333,0.33225,0.533,0.564
max,0.63,0.383333,0.61,0.65


## Statistical tests

In [225]:
# Omnibus test across the four ensemble families.
# NOTE(review): `friedmanchisquare` is defined outside this section;
# presumably a wrapper around scipy.stats.friedmanchisquare that accepts a
# DataFrame — confirm.
friedmanchisquare(df_best_ensemble)

FriedmanchisquareResult(statistic=36.30573248407644, pvalue=6.452693752130029e-08)

In [226]:
# Pairwise post-hoc comparison between the ensemble families.
# NOTE(review): the helper is defined outside this section; presumably it
# returns a p-value matrix — confirm.
posthoc_nemenyi_friedman(df_best_ensemble)

Unnamed: 0,bagging,boosting,random_forest,stacking
bagging,1.0,0.001,0.125527,0.9
boosting,0.001,1.0,0.021029,0.001
random_forest,0.125527,0.021029,1.0,0.091664
stacking,0.9,0.001,0.091664,1.0


Statistically similar pairs:
- (bagging,random_forest); (bagging,stacking); (random_forest,stacking);

Statistically different pairs:
- (bagging,boosting); (boosting,random_forest); (boosting,stacking);

There are basically two groups: (bagging,random_forest,stacking) and (boosting). Stacking has the best mean accuracy of the first group.

## Conclusion

**Best configuration adopted:** Stacking(MLP,KNN,GNB)

# Best of all

In [230]:
# Compare the best single model against the best ensemble; the column names
# are taken from the two selected Series.
df_best_of_all = pd.concat(
    [
        df_best_simple_model['MLP'],
        df_best_ensemble['stacking'],
    ],
    axis=1,
)
df_best_of_all

Unnamed: 0,Unnamed: 1,MLP,stacking
normal_d=1764,70/30-HOLDOUT,0.53,0.536667
normal_d=1764,80/20-HOLDOUT,0.525,0.53
normal_d=1764,90/10-HOLDOUT,0.56,0.57
normal_d=1764,10-FOLD,0.577,0.562
normal_d=900,70/30-HOLDOUT,0.536667,0.543333
normal_d=900,80/20-HOLDOUT,0.6,0.64
normal_d=900,90/10-HOLDOUT,0.58,0.65
normal_d=900,10-FOLD,0.574,0.587
pca_d=459,70/30-HOLDOUT,0.463333,0.486667
pca_d=459,80/20-HOLDOUT,0.425,0.51


In [231]:
# Summary statistics (count, mean, std, quartiles) of the accuracies for the two finalists.
df_best_of_all.describe()

Unnamed: 0,MLP,stacking
count,16.0,16.0
mean,0.517042,0.550938
std,0.053714,0.045269
min,0.425,0.486667
25%,0.465333,0.5225
50%,0.5275,0.54
75%,0.5635,0.564
max,0.6,0.65


In [232]:
# Pairwise post-hoc comparison between the MLP and stacking columns.
# NOTE(review): the helper is defined outside this section; presumably it
# returns a p-value matrix — confirm.
posthoc_nemenyi_friedman(df_best_of_all)

Unnamed: 0,MLP,stacking
MLP,1.0,0.001
stacking,0.001,1.0


# Conclusion
**Best model**: Stacking(MLP,KNN,GNB)