# Arvore de Decisao

In [1]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [2]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('tae.data', sep=',',  index_col=0, na_values='?', header = None)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [3]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [4]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 5] 
y = np.array(dataset.loc[:, dataset.columns == 5]).ravel()

In [5]:
#Define os parametros do metodo de Arvore de Decisao
parameters= {'criterion':['gini'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150], 'random_state': [56]}

In [6]:
# importa o método do sklearn que separa em conjunto de treino e teste
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.26, stratify=y, random_state=56)

In [7]:
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV
# importa a arvore de decisão
from sklearn.tree import DecisionTreeClassifier
# instancia o classificador 
clf = DecisionTreeClassifier()
gs_tree = GridSearchCV(clf,parameters, cv = 10, scoring = 'accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_tree.fit(X,y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150],
                         'random_state': [56]},
             scoring='accuracy')

In [8]:
# coloca os resultados num Frame para melhor visulização
results = pd.DataFrame(gs_tree.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004383,0.000607,0.00256,0.000134,gini,4,56,"{'criterion': 'gini', 'max_depth': 4, 'random_...",0.5,0.466667,...,0.733333,0.466667,0.4,0.333333,0.4,0.333333,0.266667,0.456667,0.139881,18
1,0.004103,0.000173,0.003553,0.002774,gini,5,56,"{'criterion': 'gini', 'max_depth': 5, 'random_...",0.75,0.733333,...,0.733333,0.6,0.4,0.333333,0.266667,0.4,0.2,0.501667,0.195853,17
2,0.004218,0.000179,0.002951,0.001072,gini,6,56,"{'criterion': 'gini', 'max_depth': 6, 'random_...",0.6875,0.666667,...,0.8,0.666667,0.333333,0.266667,0.333333,0.4,0.266667,0.502083,0.191079,16
3,0.004097,0.000444,0.002883,0.001138,gini,7,56,"{'criterion': 'gini', 'max_depth': 7, 'random_...",0.875,0.666667,...,0.933333,0.6,0.4,0.333333,0.333333,0.4,0.2,0.540833,0.232989,15
4,0.003938,0.000151,0.002385,0.000142,gini,8,56,"{'criterion': 'gini', 'max_depth': 8, 'random_...",0.75,0.8,...,1.0,0.666667,0.533333,0.333333,0.4,0.266667,0.2,0.575,0.254433,14
5,0.004146,0.00047,0.002513,0.000364,gini,9,56,"{'criterion': 'gini', 'max_depth': 9, 'random_...",0.875,0.8,...,0.933333,0.733333,0.466667,0.333333,0.4,0.266667,0.2,0.580833,0.260929,13
6,0.004245,0.000481,0.002555,0.000294,gini,10,56,"{'criterion': 'gini', 'max_depth': 10, 'random...",0.875,0.933333,...,0.933333,0.8,0.466667,0.333333,0.4,0.266667,0.2,0.614167,0.291024,12
7,0.004325,0.000484,0.002638,0.000248,gini,11,56,"{'criterion': 'gini', 'max_depth': 11, 'random...",0.9375,1.0,...,1.0,0.733333,0.466667,0.4,0.466667,0.266667,0.2,0.640417,0.298852,11
8,0.005671,0.002554,0.002541,0.000257,gini,12,56,"{'criterion': 'gini', 'max_depth': 12, 'random...",0.9375,1.0,...,1.0,0.733333,0.466667,0.333333,0.533333,0.333333,0.2,0.647083,0.294121,10
9,0.003879,0.000213,0.002526,0.000737,gini,15,56,"{'criterion': 'gini', 'max_depth': 15, 'random...",1.0,1.0,...,1.0,0.8,0.466667,0.333333,0.533333,0.266667,0.266667,0.66,0.301773,1


In [9]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
9,"{'criterion': 'gini', 'max_depth': 15, 'random...",0.66,0.301773,1
17,"{'criterion': 'gini', 'max_depth': 150, 'rando...",0.653333,0.305214,2
15,"{'criterion': 'gini', 'max_depth': 90, 'random...",0.653333,0.305214,2
14,"{'criterion': 'gini', 'max_depth': 70, 'random...",0.653333,0.305214,2
13,"{'criterion': 'gini', 'max_depth': 50, 'random...",0.653333,0.305214,2
12,"{'criterion': 'gini', 'max_depth': 40, 'random...",0.653333,0.305214,2
11,"{'criterion': 'gini', 'max_depth': 30, 'random...",0.653333,0.305214,2
10,"{'criterion': 'gini', 'max_depth': 20, 'random...",0.653333,0.305214,2
16,"{'criterion': 'gini', 'max_depth': 120, 'rando...",0.653333,0.305214,2
8,"{'criterion': 'gini', 'max_depth': 12, 'random...",0.647083,0.294121,10


# Multilayer-Perceptron (redes neurais)

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [10]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('tae.data', sep=',',  index_col=0, na_values='?', header = None)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [11]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [12]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 5] 
y = np.array(dataset.loc[:, dataset.columns == 5]).ravel()

In [13]:
# importa o algoritmo de classificação Multilayer-Perceptron (redes neurais)
from sklearn.neural_network import MLPClassifier
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

# na MLP um parâmetro a ser testado é a quantidade de neurônios na camada escondida, 
# onde utilzamos uma tupla para representar a camada escondida.
# Por exemplo:
# (5) - Cinco neurônios e uma camada escondida
# (8, 5) - Oito neurônios na primeira camada escondida e cinco neurônios na segunda camada escondida.
# Interessante nesse projeto utilizar no máximo duas camadas para verificação 
# Quanto mais camadas e neurônios maior o tempo de processamento do algoritmo
parameters = {'hidden_layer_sizes' : [(5), (8), (15), (5, 3), (8, 5), (10, 5)],
              'max_iter' : [3000], 'random_state' : [56]}

# define o algoritmo de classificação que será usado
mlp = MLPClassifier()
gs_mlp = GridSearchCV(mlp, parameters, cv=10, scoring='accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_mlp.fit(X, y)

GridSearchCV(cv=10, estimator=MLPClassifier(),
             param_grid={'hidden_layer_sizes': [5, 8, 15, (5, 3), (8, 5),
                                                (10, 5)],
                         'max_iter': [3000], 'random_state': [56]},
             scoring='accuracy')

In [14]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_mlp.cv_results_)
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
4,"{'hidden_layer_sizes': (8, 5), 'max_iter': 300...",0.464167,0.109294,1
0,"{'hidden_layer_sizes': 5, 'max_iter': 3000, 'r...",0.46375,0.08478,2
2,"{'hidden_layer_sizes': 15, 'max_iter': 3000, '...",0.455833,0.097472,3
3,"{'hidden_layer_sizes': (5, 3), 'max_iter': 300...",0.404583,0.122339,4
1,"{'hidden_layer_sizes': 8, 'max_iter': 3000, 'r...",0.40375,0.112121,5
5,"{'hidden_layer_sizes': (10, 5), 'max_iter': 30...",0.330417,0.074303,6


# Naive Bayes

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [15]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('tae.data', sep=',',  index_col=0, na_values='?', header = None)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [16]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [17]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 5] 
y = np.array(dataset.loc[:, dataset.columns == 5]).ravel()

In [18]:
#Define os parametros do metodo Naive Bayes
param = {'var_smoothing': np.logspace(0,-9, num=100)}

In [19]:
# importa o método do sklearn que separa em conjunto de treino e teste
from sklearn.model_selection import train_test_split
# importa o algoritmo Naive Bayes 
from sklearn.naive_bayes import GaussianNB
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=56)
# instancia o classificador
gnb = GaussianNB()
gs_NB = GridSearchCV(gnb, param, cv = 10, scoring = 'accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_NB.fit(X,y)

GridSearchCV(cv=10, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.3...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='accuracy')

In [20]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_NB.cv_results_)
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
35,{'var_smoothing': 0.0006579332246575676},0.515417,0.110806,1
36,{'var_smoothing': 0.0005336699231206307},0.508750,0.107472,2
33,{'var_smoothing': 0.001},0.502083,0.111854,3
34,{'var_smoothing': 0.0008111308307896872},0.502083,0.111854,3
39,{'var_smoothing': 0.0002848035868435802},0.495833,0.111009,5
...,...,...,...,...
5,{'var_smoothing': 0.3511191734215131},0.356667,0.078951,96
0,{'var_smoothing': 1.0},0.350417,0.120710,97
8,{'var_smoothing': 0.1873817422860384},0.344167,0.063689,98
7,{'var_smoothing': 0.23101297000831597},0.344167,0.063689,98


# K Neighbors

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [21]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('tae.data', sep=',',  index_col=0, na_values='?', header = None)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [22]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,23,3,1,19,3
2,15,3,1,17,3
1,23,3,2,49,3
1,5,2,2,33,3
2,7,11,2,55,3
...,...,...,...,...,...
2,3,2,2,26,1
2,10,3,2,12,1
1,18,7,2,48,1
2,22,1,2,51,1


In [23]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 5] 
y = np.array(dataset.loc[:, dataset.columns == 5]).ravel()

In [24]:
# importa o algoritmo de classifcaçaõ k-vizinhos
from sklearn.neighbors import KNeighborsClassifier
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

# no grid coloca os hiperparâmetros do algoritmo que serão testados (dicionário)
# testará o valor de k com os seguintes valores e o hiperparâmetros weights com os valores 'uniform' e 'distance'
parameters = {'n_neighbors' : [1, 3, 5, 7, 9, 11, 13], 'weights' : ['uniform', 'distance']}

# define o algoritmo de classificação que será usado
knn = KNeighborsClassifier()
''' configura o GridSearch com o algoritmo de classificação = knn (instanciado), os parâmetros testados serão os 
    definidos em parameters, usa cross-validation = 10 e a medida de avaliação é a acurácia.'''  
gs = GridSearchCV(knn, parameters, cv=10, scoring='accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [25]:
# coloca os resultados num Frame para melhor visulização
results = pd.DataFrame(gs.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00265,0.001865,0.002843,0.000782,1,uniform,"{'n_neighbors': 1, 'weights': 'uniform'}",0.9375,1.0,0.933333,1.0,0.8,0.466667,0.4,0.4,0.266667,0.266667,0.647083,0.297128,6
1,0.001848,3.3e-05,0.002025,8.8e-05,1,distance,"{'n_neighbors': 1, 'weights': 'distance'}",0.9375,1.0,0.933333,1.0,0.8,0.466667,0.4,0.4,0.266667,0.266667,0.647083,0.297128,6
2,0.002255,0.000656,0.00298,0.000913,3,uniform,"{'n_neighbors': 3, 'weights': 'uniform'}",0.5625,0.733333,0.733333,0.466667,0.666667,0.333333,0.266667,0.266667,0.333333,0.266667,0.462917,0.186231,9
3,0.002243,0.000525,0.002354,0.000415,3,distance,"{'n_neighbors': 3, 'weights': 'distance'}",1.0,1.0,0.933333,0.933333,0.8,0.533333,0.4,0.333333,0.333333,0.2,0.646667,0.301183,8
4,0.00215,0.000397,0.002933,0.00072,5,uniform,"{'n_neighbors': 5, 'weights': 'uniform'}",0.5625,0.533333,0.6,0.4,0.666667,0.333333,0.266667,0.266667,0.266667,0.2,0.409583,0.158794,14
5,0.001804,3.6e-05,0.002006,0.000101,5,distance,"{'n_neighbors': 5, 'weights': 'distance'}",1.0,1.0,0.933333,1.0,0.866667,0.533333,0.333333,0.266667,0.333333,0.266667,0.653333,0.316649,4
6,0.00201,0.000155,0.002589,0.000257,7,uniform,"{'n_neighbors': 7, 'weights': 'uniform'}",0.5,0.466667,0.6,0.533333,0.466667,0.4,0.4,0.4,0.2,0.333333,0.43,0.105883,12
7,0.001861,4.8e-05,0.002056,7.1e-05,7,distance,"{'n_neighbors': 7, 'weights': 'distance'}",1.0,1.0,0.933333,1.0,0.8,0.466667,0.4,0.4,0.333333,0.2,0.653333,0.305214,4
8,0.002628,0.000801,0.003193,0.000746,9,uniform,"{'n_neighbors': 9, 'weights': 'uniform'}",0.4375,0.466667,0.4,0.533333,0.6,0.266667,0.466667,0.4,0.266667,0.466667,0.430417,0.099409,11
9,0.002253,0.000543,0.002721,0.000624,9,distance,"{'n_neighbors': 9, 'weights': 'distance'}",1.0,1.0,0.933333,1.0,0.8,0.466667,0.4,0.333333,0.4,0.266667,0.66,0.295823,3


In [26]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
11,"{'n_neighbors': 11, 'weights': 'distance'}",0.68,0.288752,1
13,"{'n_neighbors': 13, 'weights': 'distance'}",0.68,0.290287,1
9,"{'n_neighbors': 9, 'weights': 'distance'}",0.66,0.295823,3
5,"{'n_neighbors': 5, 'weights': 'distance'}",0.653333,0.316649,4
7,"{'n_neighbors': 7, 'weights': 'distance'}",0.653333,0.305214,4
0,"{'n_neighbors': 1, 'weights': 'uniform'}",0.647083,0.297128,6
1,"{'n_neighbors': 1, 'weights': 'distance'}",0.647083,0.297128,6
3,"{'n_neighbors': 3, 'weights': 'distance'}",0.646667,0.301183,8
2,"{'n_neighbors': 3, 'weights': 'uniform'}",0.462917,0.186231,9
12,"{'n_neighbors': 13, 'weights': 'uniform'}",0.43125,0.106511,10


# Calculo da Diferenca Absoluta

Observando os resultados, podemos ver que os metodos MLP e Arvore de Decisao tem as melhores performances, logo temos que calcular a diferenca absoluta para ver qual dos dois e de fato o melhor

In [None]:
#Calculo do desvio padrao
import math
Dp = math.sqrt(((0.121518)*(0.121518) + (0.059363)*(0.059363))/2)
#Calculo da media
media = (0.813421 - 0.815385)
#calcula da Diferenca Absoluta
DA = media/Dp
print(DA)
#MLP >performance Tree

-0.020537265746999902


Conclusao: O MLP tem uma melhor performance do que a Arvore de decisao e o restante dos algoritmos

