# Arvore de Decisao

In [1]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [2]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('transfusion.data', sep=',',  index_col=0, na_values='?')
dataset

Unnamed: 0_level_0,Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
Recency (months),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,50,12500,98,1
0,13,3250,28,1
1,16,4000,35,1
2,20,5000,45,1
1,24,6000,77,0
...,...,...,...,...
23,2,500,38,0
21,2,500,52,0
23,3,750,62,0
39,1,250,39,0


In [3]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0_level_0,Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
Recency (months),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,50,12500,98,1
0,13,3250,28,1
1,16,4000,35,1
2,20,5000,45,1
1,24,6000,77,0
...,...,...,...,...
23,2,500,38,0
21,2,500,52,0
23,3,750,62,0
39,1,250,39,0


In [4]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 'whether he/she donated blood in March 2007'] 
y = np.array(dataset.loc[:, dataset.columns == 'whether he/she donated blood in March 2007']).ravel()

In [5]:
#Define os parametros do metodo de Arvore de Decisao
parameters= {'criterion':['gini'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150], 'random_state': [56]}

In [6]:
# importa o método do sklearn que separa em conjunto de treino e teste
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.26, stratify=y, random_state=56)

In [7]:
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV
# importa a arvore de decisão
from sklearn.tree import DecisionTreeClassifier
# instancia o classificador 
clf = DecisionTreeClassifier()
gs_tree = GridSearchCV(clf,parameters, cv = 10, scoring = 'accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_tree.fit(X,y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150],
                         'random_state': [56]},
             scoring='accuracy')

In [8]:
# coloca os resultados num Frame para melhor visulização
results = pd.DataFrame(gs_tree.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002413,0.000351,0.00138,0.000196,gini,4,56,"{'criterion': 'gini', 'max_depth': 4, 'random_...",0.4,0.813333,...,0.76,0.746667,0.76,0.666667,0.826667,0.77027,0.77027,0.728721,0.116783,3
1,0.002724,0.000567,0.001597,0.000443,gini,5,56,"{'criterion': 'gini', 'max_depth': 5, 'random_...",0.533333,0.813333,...,0.746667,0.746667,0.746667,0.746667,0.826667,0.743243,0.77027,0.744685,0.075892,1
2,0.002305,5.7e-05,0.001286,1.1e-05,gini,6,56,"{'criterion': 'gini', 'max_depth': 6, 'random_...",0.52,0.773333,...,0.733333,0.746667,0.72,0.72,0.826667,0.743243,0.77027,0.731351,0.076479,2
3,0.002391,5.7e-05,0.001394,0.000207,gini,7,56,"{'criterion': 'gini', 'max_depth': 7, 'random_...",0.506667,0.786667,...,0.666667,0.68,0.706667,0.72,0.813333,0.72973,0.77027,0.715333,0.082787,4
4,0.002481,5.9e-05,0.001302,3.4e-05,gini,8,56,"{'criterion': 'gini', 'max_depth': 8, 'random_...",0.52,0.733333,...,0.666667,0.706667,0.706667,0.76,0.773333,0.743243,0.783784,0.712703,0.072164,6
5,0.003002,0.001069,0.001407,9.7e-05,gini,9,56,"{'criterion': 'gini', 'max_depth': 9, 'random_...",0.52,0.773333,...,0.666667,0.706667,0.706667,0.76,0.786667,0.72973,0.783784,0.714018,0.074881,5
6,0.002655,7.5e-05,0.001367,3.2e-05,gini,10,56,"{'criterion': 'gini', 'max_depth': 10, 'random...",0.506667,0.773333,...,0.653333,0.72,0.693333,0.746667,0.76,0.716216,0.783784,0.704667,0.076331,7
7,0.002651,0.000117,0.001427,0.000274,gini,11,56,"{'criterion': 'gini', 'max_depth': 11, 'random...",0.493333,0.746667,...,0.653333,0.693333,0.72,0.746667,0.786667,0.689189,0.783784,0.703297,0.080391,8
8,0.002668,0.000128,0.001342,4.7e-05,gini,12,56,"{'criterion': 'gini', 'max_depth': 12, 'random...",0.52,0.72,...,0.653333,0.68,0.72,0.76,0.8,0.675676,0.783784,0.700613,0.075518,9
9,0.002871,0.000793,0.001462,0.000355,gini,15,56,"{'criterion': 'gini', 'max_depth': 15, 'random...",0.506667,0.72,...,0.613333,0.693333,0.72,0.76,0.8,0.702703,0.783784,0.699315,0.081486,10


In [9]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
1,"{'criterion': 'gini', 'max_depth': 5, 'random_...",0.744685,0.075892,1
2,"{'criterion': 'gini', 'max_depth': 6, 'random_...",0.731351,0.076479,2
0,"{'criterion': 'gini', 'max_depth': 4, 'random_...",0.728721,0.116783,3
3,"{'criterion': 'gini', 'max_depth': 7, 'random_...",0.715333,0.082787,4
5,"{'criterion': 'gini', 'max_depth': 9, 'random_...",0.714018,0.074881,5
4,"{'criterion': 'gini', 'max_depth': 8, 'random_...",0.712703,0.072164,6
6,"{'criterion': 'gini', 'max_depth': 10, 'random...",0.704667,0.076331,7
7,"{'criterion': 'gini', 'max_depth': 11, 'random...",0.703297,0.080391,8
8,"{'criterion': 'gini', 'max_depth': 12, 'random...",0.700613,0.075518,9
9,"{'criterion': 'gini', 'max_depth': 15, 'random...",0.699315,0.081486,10


# Multilayer-Perceptron (redes neurais)

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [11]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('transfusion.data', sep=',', na_values='?')
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [12]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [13]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 'whether he/she donated blood in March 2007'] 
y = np.array(dataset.loc[:, dataset.columns == 'whether he/she donated blood in March 2007']).ravel()

In [14]:
# importa o algoritmo de classificação Multilayer-Perceptron (redes neurais)
from sklearn.neural_network import MLPClassifier
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

# na MLP um parâmetro a ser testado é a quantidade de neurônios na camada escondida, 
# onde utilzamos uma tupla para representar a camada escondida.
# Por exemplo:
# (5) - Cinco neurônios e uma camada escondida
# (8, 5) - Oito neurônios na primeira camada escondida e cinco neurônios na segunda camada escondida.
# Interessante nesse projeto utilizar no máximo duas camadas para verificação 
# Quanto mais camadas e neurônios maior o tempo de processamento do algoritmo
parameters = {'hidden_layer_sizes' : [(5), (8), (15), (5, 3), (8, 5), (10, 5)],
              'max_iter' : [3000], 'random_state' : [56]}

# define o algoritmo de classificação que será usado
mlp = MLPClassifier()
gs_mlp = GridSearchCV(mlp, parameters, cv=10, scoring='accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_mlp.fit(X, y)

GridSearchCV(cv=10, estimator=MLPClassifier(),
             param_grid={'hidden_layer_sizes': [5, 8, 15, (5, 3), (8, 5),
                                                (10, 5)],
                         'max_iter': [3000], 'random_state': [56]},
             scoring='accuracy')

In [15]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_mlp.cv_results_)
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
1,"{'hidden_layer_sizes': 8, 'max_iter': 3000, 'r...",0.791423,0.046356,1
3,"{'hidden_layer_sizes': (5, 3), 'max_iter': 300...",0.762054,0.004108,2
5,"{'hidden_layer_sizes': (10, 5), 'max_iter': 30...",0.758036,0.014869,3
2,"{'hidden_layer_sizes': 15, 'max_iter': 3000, '...",0.754054,0.130449,4
4,"{'hidden_layer_sizes': (8, 5), 'max_iter': 300...",0.706198,0.11514,5
0,"{'hidden_layer_sizes': 5, 'max_iter': 3000, 'r...",0.237946,0.004108,6


# Naive Bayes

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [16]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('transfusion.data', sep=',', na_values='?')
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [17]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [18]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 'whether he/she donated blood in March 2007'] 
y = np.array(dataset.loc[:, dataset.columns == 'whether he/she donated blood in March 2007']).ravel()

In [19]:
#Define os parametros do metodo Naive Bayes
param = {'var_smoothing': np.logspace(0,-9, num=100)}

In [21]:
# importa o método do sklearn que separa em conjunto de treino e teste
from sklearn.model_selection import train_test_split
# importa o algoritmo Naive Bayes 
from sklearn.naive_bayes import GaussianNB
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=56)
# instancia o classificador
gnb = GaussianNB()
gs_NB = GridSearchCV(gnb, param, cv = 10, scoring = 'accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs_NB.fit(X,y)

GridSearchCV(cv=10, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.3...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='accuracy')

In [22]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results = pd.DataFrame(gs_NB.cv_results_)
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
1,{'var_smoothing': 0.8111308307896871},0.768721,0.021462,1
2,{'var_smoothing': 0.657933224657568},0.768703,0.021661,2
3,{'var_smoothing': 0.533669923120631},0.768703,0.021661,2
5,{'var_smoothing': 0.3511191734215131},0.767369,0.021745,4
0,{'var_smoothing': 1.0},0.766054,0.022361,5
...,...,...,...,...
78,{'var_smoothing': 8.111308307896873e-08},0.747351,0.034813,77
77,{'var_smoothing': 1e-07},0.747351,0.034813,77
65,{'var_smoothing': 1.232846739442066e-06},0.747351,0.031601,77
86,{'var_smoothing': 1.519911082952933e-08},0.747351,0.034813,77


# K Neighbors

In [None]:
#Importa as bibliotecas Pandas e Numpy
import pandas as pd
import numpy as np

In [23]:
# na_values = substitui o valor '?' (dado faltante) para Nan
dataset = pd.read_csv('transfusion.data', sep=',', na_values='?')
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [24]:

# elimina as linhas com dados faltantes
dataset = dataset.dropna(axis=0)
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [25]:
#Separando as variáveis independentes (conjunto X) e a variável dependente y (que neste caso é a coluna 'status')
X = dataset.loc[:, dataset.columns != 'whether he/she donated blood in March 2007'] 
y = np.array(dataset.loc[:, dataset.columns == 'whether he/she donated blood in March 2007']).ravel()

In [26]:
# importa o algoritmo de classifcaçaõ k-vizinhos
from sklearn.neighbors import KNeighborsClassifier
# importa o GridSearchCV
from sklearn.model_selection import GridSearchCV

# no grid coloca os hiperparâmetros do algoritmo que serão testados (dicionário)
# testará o valor de k com os seguintes valores e o hiperparâmetros weights com os valores 'uniform' e 'distance'
parameters = {'n_neighbors' : [1, 3, 5, 7, 9, 11, 13], 'weights' : ['uniform', 'distance']}

# define o algoritmo de classificação que será usado
knn = KNeighborsClassifier()
''' configura o GridSearch com o algoritmo de classificação = knn (instanciado), os parâmetros testados serão os 
    definidos em parameters, usa cross-validation = 10 e a medida de avaliação é a acurácia.'''  
gs = GridSearchCV(knn, parameters, cv=10, scoring='accuracy')
# o grid search treinará todos os modelos conforme a parametrização acima
gs.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [27]:
# coloca os resultados num Frame para melhor visulização
results = pd.DataFrame(gs.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002607,0.001366,0.004397,0.001244,1,uniform,"{'n_neighbors': 1, 'weights': 'uniform'}",0.333333,0.32,0.493333,0.533333,0.546667,0.653333,0.613333,0.866667,0.635135,0.797297,0.579243,0.166782,13
1,0.002012,6.7e-05,0.002064,4.7e-05,1,distance,"{'n_neighbors': 1, 'weights': 'distance'}",0.333333,0.32,0.493333,0.533333,0.546667,0.653333,0.613333,0.866667,0.635135,0.797297,0.579243,0.166782,13
2,0.002098,7.2e-05,0.003922,8.1e-05,3,uniform,"{'n_neighbors': 3, 'weights': 'uniform'}",0.426667,0.373333,0.52,0.72,0.76,0.746667,0.706667,0.72,0.72973,0.797297,0.650036,0.143468,9
3,0.002107,0.000136,0.002221,0.000277,3,distance,"{'n_neighbors': 3, 'weights': 'distance'}",0.373333,0.36,0.493333,0.653333,0.693333,0.746667,0.733333,0.8,0.689189,0.783784,0.632631,0.155655,12
4,0.002047,8.5e-05,0.003892,7.4e-05,5,uniform,"{'n_neighbors': 5, 'weights': 'uniform'}",0.413333,0.453333,0.533333,0.746667,0.773333,0.76,0.706667,0.813333,0.72973,0.77027,0.67,0.138444,5
5,0.002038,0.0002,0.002173,0.000296,5,distance,"{'n_neighbors': 5, 'weights': 'distance'}",0.386667,0.36,0.52,0.68,0.72,0.746667,0.746667,0.813333,0.702703,0.756757,0.643279,0.153373,11
6,0.002166,0.000202,0.004007,0.000185,7,uniform,"{'n_neighbors': 7, 'weights': 'uniform'}",0.413333,0.48,0.586667,0.733333,0.746667,0.76,0.653333,0.773333,0.72973,0.77027,0.664667,0.122965,6
7,0.002358,0.000593,0.002388,0.000553,7,distance,"{'n_neighbors': 7, 'weights': 'distance'}",0.413333,0.373333,0.56,0.706667,0.706667,0.746667,0.733333,0.813333,0.716216,0.756757,0.652631,0.143442,8
8,0.002231,0.000296,0.004252,0.000489,9,uniform,"{'n_neighbors': 9, 'weights': 'uniform'}",0.48,0.48,0.573333,0.733333,0.746667,0.773333,0.733333,0.76,0.716216,0.77027,0.676649,0.112236,4
9,0.002135,0.000202,0.002311,0.000196,9,distance,"{'n_neighbors': 9, 'weights': 'distance'}",0.426667,0.373333,0.56,0.68,0.72,0.746667,0.706667,0.813333,0.716216,0.756757,0.649964,0.139763,10


In [28]:
#criar uma visão para facilitar a visualização dos resultados, e ordenar os resultados conforme os resultados.
view = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results[view].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
12,"{'n_neighbors': 13, 'weights': 'uniform'}",0.715333,0.0948,1
10,"{'n_neighbors': 11, 'weights': 'uniform'}",0.688667,0.104777,2
13,"{'n_neighbors': 13, 'weights': 'distance'}",0.681982,0.110661,3
8,"{'n_neighbors': 9, 'weights': 'uniform'}",0.676649,0.112236,4
4,"{'n_neighbors': 5, 'weights': 'uniform'}",0.67,0.138444,5
6,"{'n_neighbors': 7, 'weights': 'uniform'}",0.664667,0.122965,6
11,"{'n_neighbors': 11, 'weights': 'distance'}",0.655297,0.14181,7
7,"{'n_neighbors': 7, 'weights': 'distance'}",0.652631,0.143442,8
2,"{'n_neighbors': 3, 'weights': 'uniform'}",0.650036,0.143468,9
9,"{'n_neighbors': 9, 'weights': 'distance'}",0.649964,0.139763,10


# Calculo da Diferenca Absoluta

Observando os resultados, podemos ver que os metodos MLP e Arvore de Decisao tem as melhores performances, logo temos que calcular a diferenca absoluta para ver qual dos dois e de fato o melhor

In [None]:
#Calculo do desvio padrao
import math
Dp = math.sqrt(((0.121518)*(0.121518) + (0.059363)*(0.059363))/2)
#Calculo da media
media = (0.813421 - 0.815385)
#calcula da Diferenca Absoluta
DA = media/Dp
print(DA)
#MLP >performance Tree

-0.020537265746999902


Conclusao: O MLP tem uma melhor performance do que a Arvore de decisao e o restante dos algoritmos

