# Bibliotecas

In [1]:
import numpy as np
from scipy import stats
from sklearn import datasets
from sklearn.model_selection import cross_val_predict 
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.utils import resample
from collections import Counter
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, clone
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier



# Classe - Heterogeneous Polling

In [2]:
def mostCommon(estimatorsPredict):
  
    return [Counter(col).most_common() for col in zip(*estimatorsPredict)]
  

def votingClass(predMatrix, y_train):

    saida = np.array([])
    
    valuesFrequency = mostCommon(predMatrix)
    for j in range(predMatrix.shape[1]):
      if len(valuesFrequency[j])>1 and valuesFrequency[j][0][1] == valuesFrequency[j][1][1]:
        listaElementsTie = np.array([])
        for k in range(len(valuesFrequency[j])-1):
          if valuesFrequency[j][k][1] == valuesFrequency[j][k+1][1]:
            if k == 0 :
              listaElementsTie = np.append(listaElementsTie, valuesFrequency[j][k][0])
              listaElementsTie = np.append(listaElementsTie, valuesFrequency[j][k+1][0])
            else:
              listaElementsTie = np.append(listaElementsTie, valuesFrequency[j][k+1][0])
        # Retorna a classe mais votada mais frequente na base de treino
        saida = np.append(saida, compareFrequencyValues(y_train, listaElementsTie))
      else:
        # Retorna a classe mais votada
        saida = np.append(saida, valuesFrequency[j][0][0])
    return saida

def compareFrequencyValues(y_train, elementsTie):
      indexElements = []
      orderArray = getMostfrequentValues(y_train)
      listValues = list(orderArray)
      for i in range(len(elementsTie)):
        indexElements.append(listValues.index(elementsTie[i]))
      mostFreq = min(indexElements)
      return orderArray[mostFreq]

def getMostfrequentValues(a):

    from collections import Counter
    mostfrequentValues = np.array([])
    b = Counter(a)
    arrayValoresFreq = b.most_common()
    for i in range(len(b)):
      mostfrequentValues = np.append(mostfrequentValues,arrayValoresFreq[i][0])
    return mostfrequentValues   

class HeterogeneousClassifier(BaseEstimator):

  estimatorBase = list()
  estimatorBase.append(KNeighborsClassifier(n_neighbors=1))
  estimatorBase.append(DecisionTreeClassifier())
  estimatorBase.append(GaussianNB())

  def __init__(self, base_estimator = estimatorBase,n_samples=None):
    
    self.base_estimator = base_estimator
    self.n_samples = n_samples
    self.ord = []
    self.estimators = []
    self.KNNclassifier = KNeighborsClassifier(n_neighbors=1)
    self.DTclassifier = DecisionTreeClassifier()
    self.NBclassifier = GaussianNB()
 

  def fit(self, X, y):

    self.ord = y
    for i in range(self.n_samples):
      if self.n_samples == 1:
        self.estimators.append(self.KNNclassifier.fit(X, y))
        self.estimators.append(self.DTclassifier.fit(X, y))
        self.estimators.append(self.NBclassifier.fit(X, y))
      else:
        X, y = resample(X,y, replace=True)
        self.estimators.append(self.KNNclassifier.fit(X, y))
        self.estimators.append(self.DTclassifier.fit(X, y))
        self.estimators.append(self.NBclassifier.fit(X, y))
    
    return self.estimators


  def predict(self, X):

      y_pred = []
      for i in range(self.n_samples):
        y_knn = np.array([self.estimators[0+(3*i)].predict(X)])
        y_DTree = np.array([self.estimators[1+(3*i)].predict(X)])
        y_naive = np.array([self.estimators[2+(3*i)].predict(X)])
        if i == 0:
          y_pred = np.vstack((y_knn, y_DTree, y_naive))
        else:
          y_pred = np.vstack((y_pred, y_knn, y_DTree, y_naive))
      y_predVot = votingClass(y_pred,self.ord)
      
      return y_predVot


Os resultados de cada classificador devem ser apresentados numa tabela contendo a média das acurácias obtidas em cada fold do ciclo externo, o desvio padrão e o intervalo de confiança a 95% de significância dos resultados, e também através do boxplot dos resultados de cada classificador em cada fold.

# Functions Grid and Results

Os dados utilizados no conjunto de treino em cada rodada de teste são padronizados (normalizados o com z-score). Os valores de padronização obtidos nos dados de treino são utilizados para padronizar os dados do respectivo conjunto de teste.
O procedimento experimental de treinamento, validação e teste é realizado através de 3 rodadas de ciclos aninhados de validação e teste, com o ciclo interno de validação contendo 4 folds e o externo de teste com 10 folds. A busca em grade (grid search) do ciclo interno considera os os valores de hiperparâmetros definidos para cada técnica de aprendizado.

In [3]:
def GridTestModel(dataBase, model, grid):

  # Data Base
  X = dataBase.data
  y = dataBase.target

  # Z-score
  scalar = StandardScaler()

  # Pipeline
  pipeline = Pipeline([('transformer', scalar), ('estimator', model)])

  # configure the cross-validation procedure
  rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

  # configure Grid Search
  gs = GridSearchCV(estimator=pipeline, param_grid = grid, 
                    scoring='accuracy', cv = 4, verbose=0, refit=True)

  # Results
  scores = cross_val_score(gs, X, y, scoring='accuracy', 
                          cv = rkf)

  print('Done')
  return scores

In [4]:
def getResults(scores):
  print (scores)

  mean = scores.mean()
  std = scores.std()
  inf, sup = stats.norm.interval(0.95, loc=mean, 
                                scale=std/np.sqrt(len(scores)))

  print("\nMean Accuracy: %0.3f Standard Deviation: %0.3f" % (mean, std))
  print ("Accuracy Confidence Interval (95%%): (%0.2f, %0.2f)\n" % 
        (inf, sup)) 
  return mean, std, inf, sup

In [5]:
dfResultClassifier = pd.DataFrame(columns=['Métodos', 'Média', 'STD', 'Limite Inferior', 'Limite Superior'])
dfResultClassifier['Métodos'] = ['Bagging', 'AdaBoost', 'RandomForest', 'HeterogeneousPooling']

# Bases de Dados - Digits

## Classificadores

### Bagging

In [6]:
# Define DataBase
dataBase = datasets.load_digits()
dataBase.DESCR[4:18]

'digits_dataset'

In [7]:
# define the model
bagging = BaggingClassifier()

In [8]:
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = bagging
print('Model Evaluate... BaggingClassifier')
bagScores = GridTestModel(dataBase, model, grid)

Model Evaluate... BaggingClassifier
Done


In [9]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:17], '\n')
bag_mean, bag_std, bag_inf, bag_sup = getResults(bagScores)
# dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
# dfResultClassifier

****** Resultados ******

Base de Dados:  digits_dataset
Modelo -  BaggingClassifier 

[0.95       0.96111111 0.93888889 0.97222222 0.93333333 0.95
 0.95       0.96089385 0.96089385 0.97765363 0.95       0.96111111
 0.96111111 0.94444444 0.96111111 0.97222222 0.95       0.94413408
 0.94972067 0.94972067 0.95       0.93888889 0.94444444 0.94444444
 0.92777778 0.97222222 0.96111111 0.97765363 0.93296089 0.96089385]

Mean Accuracy: 0.954 Standard Deviation: 0.013
Accuracy Confidence Interval (95%): (0.95, 0.96)



In [10]:
dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.953632,0.0129415,0.949001,0.958263
1,AdaBoost,,,,
2,RandomForest,,,,
3,HeterogeneousPooling,,,,


### AdaBoost

In [11]:
# Define DataBase
dataBase = datasets.load_digits()
dataBase.DESCR[4:18]

'digits_dataset'

In [12]:
# define the Model
adaBoost = AdaBoostClassifier()

In [13]:
# Z-score, Train, Test, gridSearch, CrossValidation
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = adaBoost
print('Model Evaluate... AdaBoostClassifier')
adaScores = GridTestModel(dataBase, model, grid)

Model Evaluate... AdaBoostClassifier
Done


In [14]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:18], '\n')
ada_mean, ada_std, ada_inf, ada_sup = getResults(adaScores)

****** Resultados ******

Base de Dados:  digits_dataset
Modelo -  AdaBoostClassifier 

[0.26666667 0.26111111 0.26666667 0.34444444 0.25555556 0.28333333
 0.27222222 0.26256983 0.24022346 0.25139665 0.28333333 0.25555556
 0.24444444 0.26111111 0.26666667 0.27222222 0.27222222 0.23463687
 0.26256983 0.29050279 0.26666667 0.25       0.26111111 0.32222222
 0.30555556 0.27222222 0.27222222 0.26815642 0.27374302 0.25698324]

Mean Accuracy: 0.270 Standard Deviation: 0.022
Accuracy Confidence Interval (95%): (0.26, 0.28)



In [15]:
dfResultClassifier.iloc[1] = ['Adaboost', ada_mean, ada_std, ada_inf, ada_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.953632,0.0129415,0.949001,0.958263
1,Adaboost,0.269878,0.0222605,0.261912,0.277844
2,RandomForest,,,,
3,HeterogeneousPooling,,,,


### RandomForest

In [16]:
# Define DataBase
dataBase = datasets.load_digits()
dataBase.DESCR[4:18]

'digits_dataset'

In [17]:
# define the Model
randomForest = RandomForestClassifier()

In [18]:
# Model Evaluate
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = randomForest
print('Model Evaluate... RandomForestClassifier')
rfScores = GridTestModel(dataBase, model, grid)

Model Evaluate... RandomForestClassifier
Done


In [19]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:22], '\n')
rf_mean, rf_std, rf_inf, rf_sup = getResults(rfScores)

****** Resultados ******

Base de Dados:  digits_dataset
Modelo -  RandomForestClassifier 

[0.98333333 0.98333333 0.96666667 0.96666667 0.97777778 0.98888889
 0.96666667 0.96648045 0.96089385 0.98882682 0.97777778 0.98333333
 0.97777778 0.97222222 0.98888889 0.99444444 0.97222222 0.97206704
 0.95530726 0.97765363 0.96666667 0.96111111 0.96666667 0.98333333
 0.98333333 0.97777778 0.98888889 0.98324022 0.97206704 0.97206704]

Mean Accuracy: 0.976 Standard Deviation: 0.010
Accuracy Confidence Interval (95%): (0.97, 0.98)



In [20]:
dfResultClassifier.iloc[2] = ['RandomForest', rf_mean, rf_std, rf_inf, rf_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.953632,0.0129415,0.949001,0.958263
1,Adaboost,0.269878,0.0222605,0.261912,0.277844
2,RandomForest,0.975879,0.00969284,0.972411,0.979348
3,HeterogeneousPooling,,,,


### HeterogeneousPooling

In [21]:
# Define DataBase
dataBase = datasets.load_digits()
dataBase.DESCR[4:18]

'digits_dataset'

In [22]:
# define the model
HeterogeneousModel = HeterogeneousClassifier()

In [23]:
# Model Evaluate
grid = {'estimator__n_samples': [1,3,5,7]}
model = HeterogeneousModel
print('Model Evaluate... Heterogeneous Classifier')
hpScores = GridTestModel(dataBase, model, grid)

Model Evaluate... Heterogeneous Classifier
Done


In [24]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:23], '\n')
hp_mean, hp_std, hp_inf, hp_sup = getResults(hpScores)

****** Resultados ******

Base de Dados:  digits_dataset
Modelo -  HeterogeneousClassifier 

[0.93333333 0.93888889 0.95       0.91666667 0.91111111 0.93888889
 0.92777778 0.93854749 0.91620112 0.96089385 0.95       0.91666667
 0.92777778 0.91666667 0.93333333 0.95       0.91666667 0.9273743
 0.96648045 0.94413408 0.93888889 0.92222222 0.91111111 0.95
 0.93333333 0.92222222 0.95       0.95530726 0.94972067 0.89944134]

Mean Accuracy: 0.934 Standard Deviation: 0.016
Accuracy Confidence Interval (95%): (0.93, 0.94)



## Resultados Classificadores Acurácia

In [25]:
dfResultClassifier.iloc[3] = ['Heterogeneous', hp_mean, hp_std, hp_inf, hp_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.953632,0.0129415,0.949001,0.958263
1,Adaboost,0.269878,0.0222605,0.261912,0.277844
2,RandomForest,0.975879,0.00969284,0.972411,0.979348
3,Heterogeneous,0.933789,0.0164458,0.927904,0.939673


In [26]:
import plotly.graph_objects as go
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
fig = go.Figure()
for i in range(len(scores)):
  fig.add_trace(go.Box(y=scores[i], name=scoresNames[i]))
fig.update_layout(
    yaxis_title='Acc',
    xaxis_title='Modelos',
    title='Digits - Desempenho dos Modelos  - Acurácia',
)
fig.show()

## Teste Pareado

In [27]:
from scipy.stats import ttest_rel, wilcoxon
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
dfPairTest = pd.DataFrame(columns=[0,1,2,3])
for i in range(len(scores)):
  for j in range(len(scores)):
    if j == i:
       dfPairTest.at[i, j] = scoresNames[i]
    
    if j > i:
      print('Paired T Test', scoresNames[i], scoresNames[j])
      s,p = ttest_rel(scores[i],scores[j])
      print("t: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
    # print("t:",(s,p))

    if j < i :
      print ('Wilcoxon Test', scoresNames[i], scoresNames[j])
      s,p = wilcoxon (scores[i],scores[j])
      print("w: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
# print("w: ",  (s,p))
dfPairTest.columns = ['T1','T2','T3','T4']
dfPairTest.index = ['w1', 'w2', 'w3', 'w4']
dfPairTest

Paired T Test Bagging AdaBoost
t: 141.28 p-value: 0.00000000

Paired T Test Bagging RandomForest
t: -8.85 p-value: 0.00000000

Paired T Test Bagging Heterogeneous Polling
t: 5.43 p-value: 0.00000778

Wilcoxon Test AdaBoost Bagging
w: 0.00 p-value: 0.00000171

Paired T Test AdaBoost RandomForest
t: -166.14 p-value: 0.00000000

Paired T Test AdaBoost Heterogeneous Polling
t: -139.82 p-value: 0.00000000

Wilcoxon Test RandomForest Bagging
w: 1.50 p-value: 0.00000293

Wilcoxon Test RandomForest AdaBoost
w: 0.00 p-value: 0.00000172

Paired T Test RandomForest Heterogeneous Polling
t: 13.51 p-value: 0.00000000

Wilcoxon Test Heterogeneous Polling Bagging
w: 32.00 p-value: 0.00005984

Wilcoxon Test Heterogeneous Polling AdaBoost
w: 0.00 p-value: 0.00000172

Wilcoxon Test Heterogeneous Polling RandomForest
w: 1.00 p-value: 0.00000190



Unnamed: 0,T1,T2,T3,T4
w1,Bagging,1.02385e-42,9.7954e-10,7.78092e-06
w2,1.71053e-06,AdaBoost,9.36674e-45,1.38397e-42
w3,2.93191e-06,1.7246e-06,RandomForest,4.86504e-14
w4,5.98376e-05,1.72026e-06,1.89707e-06,Heterogeneous Polling


# Bases de Dados - Wine

## Classificadores

### Bagging

In [6]:
# Define DataBase
dataBase = datasets.load_wine()
dataBase.DESCR[4:18]

'wine_dataset:\n'

In [7]:
# define the model
bagging = BaggingClassifier()

In [8]:
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = bagging
print('Model Evaluate... BaggingClassifier')
bagScores = GridTestModel(dataBase, model, grid)

Model Evaluate... BaggingClassifier
Done


In [9]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:17], '\n')
bag_mean, bag_std, bag_inf, bag_sup = getResults(bagScores)
# dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
# dfResultClassifier

****** Resultados ******

Base de Dados:  wine_dataset:

Modelo -  BaggingClassifier 

[1.         1.         0.94444444 1.         0.77777778 1.
 1.         1.         0.88235294 0.94117647 1.         1.
 1.         1.         1.         0.77777778 1.         0.94444444
 0.94117647 0.94117647 0.94444444 1.         1.         0.94444444
 1.         0.88888889 1.         0.94444444 1.         1.        ]

Mean Accuracy: 0.962 Standard Deviation: 0.060
Accuracy Confidence Interval (95%): (0.94, 0.98)



In [10]:
dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.962418,0.0601091,0.940909,0.983928
1,AdaBoost,,,,
2,RandomForest,,,,
3,HeterogeneousPooling,,,,


### AdaBoost

In [11]:
# Define DataBase
dataBase = datasets.load_wine()
dataBase.DESCR[4:18]

'wine_dataset:\n'

In [12]:
# define the Model
adaBoost = AdaBoostClassifier()

In [13]:
# Z-score, Train, Test, gridSearch, CrossValidation
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = adaBoost
print('Model Evaluate... AdaBoostClassifier')
adaScores = GridTestModel(dataBase, model, grid)

Model Evaluate... AdaBoostClassifier
Done


In [14]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:18], '\n')
ada_mean, ada_std, ada_inf, ada_sup = getResults(adaScores)

****** Resultados ******

Base de Dados:  wine_dataset:

Modelo -  AdaBoostClassifier 

[0.88888889 0.88888889 0.94444444 1.         0.88888889 0.94444444
 0.83333333 1.         0.94117647 0.94117647 0.72222222 0.83333333
 0.88888889 0.94444444 1.         0.88888889 0.88888889 0.88888889
 0.94117647 1.         1.         1.         0.83333333 0.94444444
 0.94444444 0.83333333 0.77777778 0.94444444 0.82352941 1.        ]

Mean Accuracy: 0.912 Standard Deviation: 0.072
Accuracy Confidence Interval (95%): (0.89, 0.94)



In [15]:
dfResultClassifier.iloc[1] = ['Adaboost', ada_mean, ada_std, ada_inf, ada_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.962418,0.0601091,0.940909,0.983928
1,Adaboost,0.912309,0.0715136,0.886719,0.9379
2,RandomForest,,,,
3,HeterogeneousPooling,,,,


### RandomForest

In [16]:
# Define DataBase
dataBase = datasets.load_wine()
dataBase.DESCR[4:18]

'wine_dataset:\n'

In [17]:
# define the Model
randomForest = RandomForestClassifier()

In [18]:
# Model Evaluate
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = randomForest
print('Model Evaluate... RandomForestClassifier')
rfScores = GridTestModel(dataBase, model, grid)

Model Evaluate... RandomForestClassifier
Done


In [19]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:22], '\n')
rf_mean, rf_std, rf_inf, rf_sup = getResults(rfScores)

****** Resultados ******

Base de Dados:  wine_dataset:

Modelo -  RandomForestClassifier 

[1.         1.         1.         1.         0.88888889 1.
 1.         0.94444444 1.         0.94117647 1.         1.
 1.         1.         1.         0.88888889 1.         1.
 1.         1.         0.94444444 1.         1.         1.
 1.         1.         1.         0.94444444 0.94117647 1.        ]

Mean Accuracy: 0.983 Standard Deviation: 0.033
Accuracy Confidence Interval (95%): (0.97, 0.99)



In [20]:
dfResultClassifier.iloc[2] = ['RandomForest', rf_mean, rf_std, rf_inf, rf_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.962418,0.0601091,0.940909,0.983928
1,Adaboost,0.912309,0.0715136,0.886719,0.9379
2,RandomForest,0.983115,0.0328219,0.971371,0.99486
3,HeterogeneousPooling,,,,


### HeterogeneousPooling

In [21]:
# Define DataBase
dataBase = datasets.load_wine()
dataBase.DESCR[4:18]

'wine_dataset:\n'

In [22]:
# define the model
HeterogeneousModel = HeterogeneousClassifier()

In [23]:
# Model Evaluate
grid = {'estimator__n_samples': [1,3,5,7]}
model = HeterogeneousModel
print('Model Evaluate... Heterogeneous Classifier')
hpScores = GridTestModel(dataBase, model, grid)

Model Evaluate... Heterogeneous Classifier
Done


In [24]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:23], '\n')
hp_mean, hp_std, hp_inf, hp_sup = getResults(hpScores)

****** Resultados ******

Base de Dados:  wine_dataset:

Modelo -  HeterogeneousClassifier 

[1.         0.94444444 1.         1.         0.83333333 1.
 1.         1.         1.         0.88235294 1.         0.94444444
 1.         1.         1.         0.83333333 0.94444444 1.
 1.         0.94117647 0.94444444 1.         0.94444444 0.94444444
 1.         0.94444444 1.         0.94444444 0.94117647 1.        ]

Mean Accuracy: 0.966 Standard Deviation: 0.047
Accuracy Confidence Interval (95%): (0.95, 0.98)



## Resultados Classificadores Acurácia

In [25]:
dfResultClassifier.iloc[3] = ['Heterogeneous', hp_mean, hp_std, hp_inf, hp_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.962418,0.0601091,0.940909,0.983928
1,Adaboost,0.912309,0.0715136,0.886719,0.9379
2,RandomForest,0.983115,0.0328219,0.971371,0.99486
3,Heterogeneous,0.966231,0.0471867,0.949346,0.983116


In [28]:
import plotly.graph_objects as go
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
fig = go.Figure()
for i in range(len(scores)):
  fig.add_trace(go.Box(y=scores[i], name=scoresNames[i]))
fig.update_layout(
    yaxis_title='Acurácia',
    xaxis_title='Modelos',
    title='Wine - Desempenho dos Modelos  - Acurácia',
)
fig.show()

## Teste Pareado

In [27]:
from scipy.stats import ttest_rel, wilcoxon
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
dfPairTest = pd.DataFrame(columns=[0,1,2,3])
for i in range(len(scores)):
  for j in range(len(scores)):
    if j == i:
       dfPairTest.at[i, j] = scoresNames[i]
    
    if j > i:
      print('Paired T Test', scoresNames[i], scoresNames[j])
      s,p = ttest_rel(scores[i],scores[j])
      print("t: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
    # print("t:",(s,p))

    if j < i :
      print ('Wilcoxon Test', scoresNames[i], scoresNames[j])
      s,p = wilcoxon (scores[i],scores[j])
      print("w: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
# print("w: ",  (s,p))
dfPairTest.columns = ['T1','T2','T3','T4']
dfPairTest.index = ['w1', 'w2', 'w3', 'w4']
dfPairTest

Paired T Test Bagging AdaBoost
t: 2.87 p-value: 0.00758514

Paired T Test Bagging RandomForest
t: -2.48 p-value: 0.01933337

Paired T Test Bagging Heterogeneous Polling
t: -0.50 p-value: 0.62416289

Wilcoxon Test AdaBoost Bagging
w: 38.00 p-value: 0.01210153

Paired T Test AdaBoost RandomForest
t: -4.77 p-value: 0.00004747

Paired T Test AdaBoost Heterogeneous Polling
t: -3.59 p-value: 0.00121271

Wilcoxon Test RandomForest Bagging
w: 8.50 p-value: 0.02818936

Wilcoxon Test RandomForest AdaBoost
w: 8.00 p-value: 0.00016908

Paired T Test RandomForest Heterogeneous Polling
t: 3.08 p-value: 0.00449968

Wilcoxon Test Heterogeneous Polling Bagging
w: 42.00 p-value: 0.79917162

Wilcoxon Test Heterogeneous Polling AdaBoost
w: 32.00 p-value: 0.00120347

Wilcoxon Test Heterogeneous Polling RandomForest
w: 6.00 p-value: 0.01341754



Unnamed: 0,T1,T2,T3,T4
w1,Bagging,0.00758514,0.0193334,0.624163
w2,0.0121015,AdaBoost,4.74684e-05,0.00121271
w3,0.0281894,0.000169075,RandomForest,0.00449968
w4,0.799172,0.00120347,0.0134175,Heterogeneous Polling


# Bases de Dados - Breast Cancer

## Classificadores

### Bagging

In [29]:
# Define DataBase
dataBase = datasets.load_breast_cancer()
dataBase.DESCR[4:18]

'breast_cancer_'

In [30]:
# define the model
bagging = BaggingClassifier()

In [31]:
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = bagging
print('Model Evaluate... BaggingClassifier')
bagScores = GridTestModel(dataBase, model, grid)

Model Evaluate... BaggingClassifier
Done


In [32]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:17], '\n')
bag_mean, bag_std, bag_inf, bag_sup = getResults(bagScores)
# dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
# dfResultClassifier

****** Resultados ******

Base de Dados:  breast_cancer_
Modelo -  BaggingClassifier 

[0.98245614 0.92982456 0.94736842 0.96491228 1.         0.89473684
 0.96491228 0.96491228 0.96491228 1.         0.96491228 0.94736842
 1.         0.92982456 0.94736842 0.98245614 0.94736842 0.96491228
 0.92982456 0.94642857 0.96491228 0.94736842 0.9122807  0.94736842
 0.96491228 0.98245614 0.96491228 0.98245614 0.96491228 0.92857143]

Mean Accuracy: 0.958 Standard Deviation: 0.025
Accuracy Confidence Interval (95%): (0.95, 0.97)



In [33]:
dfResultClassifier.iloc[0] = ['Baggind', bag_mean, bag_std, bag_inf, bag_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.957822,0.0247072,0.94898,0.966663
1,Adaboost,0.912309,0.0715136,0.886719,0.9379
2,RandomForest,0.983115,0.0328219,0.971371,0.99486
3,Heterogeneous,0.966231,0.0471867,0.949346,0.983116


### AdaBoost

In [34]:
# Define DataBase
dataBase = datasets.load_breast_cancer()
dataBase.DESCR[4:18]

'breast_cancer_'

In [35]:
# define the Model
adaBoost = AdaBoostClassifier()

In [36]:
# Z-score, Train, Test, gridSearch, CrossValidation
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = adaBoost
print('Model Evaluate... AdaBoostClassifier')
adaScores = GridTestModel(dataBase, model, grid)

Model Evaluate... AdaBoostClassifier
Done


In [37]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:18], '\n')
ada_mean, ada_std, ada_inf, ada_sup = getResults(adaScores)

****** Resultados ******

Base de Dados:  breast_cancer_
Modelo -  AdaBoostClassifier 

[0.98245614 0.98245614 0.96491228 0.96491228 1.         0.9122807
 1.         0.96491228 0.92982456 0.98214286 0.98245614 0.98245614
 1.         0.94736842 1.         0.98245614 0.92982456 0.96491228
 0.92982456 0.96428571 0.96491228 0.94736842 0.94736842 0.94736842
 0.94736842 1.         0.98245614 0.98245614 0.98245614 0.96428571]

Mean Accuracy: 0.968 Standard Deviation: 0.024
Accuracy Confidence Interval (95%): (0.96, 0.98)



In [38]:
dfResultClassifier.iloc[1] = ['Adaboost', ada_mean, ada_std, ada_inf, ada_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.957822,0.0247072,0.94898,0.966663
1,Adaboost,0.967784,0.0235731,0.959349,0.976219
2,RandomForest,0.983115,0.0328219,0.971371,0.99486
3,Heterogeneous,0.966231,0.0471867,0.949346,0.983116


### RandomForest

In [39]:
# Define DataBase
dataBase = datasets.load_breast_cancer()
dataBase.DESCR[4:18]

'breast_cancer_'

In [40]:
# define the Model
randomForest = RandomForestClassifier()

In [41]:
# Model Evaluate
grid = {'estimator__n_estimators': [10, 25, 50, 100]}
model = randomForest
print('Model Evaluate... RandomForestClassifier')
rfScores = GridTestModel(dataBase, model, grid)

Model Evaluate... RandomForestClassifier
Done


In [42]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:22], '\n')
rf_mean, rf_std, rf_inf, rf_sup = getResults(rfScores)

****** Resultados ******

Base de Dados:  breast_cancer_
Modelo -  RandomForestClassifier 

[0.98245614 0.92982456 0.94736842 0.96491228 0.98245614 0.92982456
 0.98245614 0.98245614 0.92982456 0.92857143 0.96491228 0.96491228
 1.         0.89473684 0.96491228 0.98245614 0.92982456 0.96491228
 0.92982456 0.96428571 0.98245614 0.94736842 0.96491228 0.9122807
 0.98245614 0.96491228 0.94736842 0.96491228 0.96491228 0.92857143]

Mean Accuracy: 0.956 Standard Deviation: 0.025
Accuracy Confidence Interval (95%): (0.95, 0.96)



In [43]:
dfResultClassifier.iloc[2] = ['RandomForest', rf_mean, rf_std, rf_inf, rf_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.957822,0.0247072,0.94898,0.966663
1,Adaboost,0.967784,0.0235731,0.959349,0.976219
2,RandomForest,0.956036,0.0247908,0.947165,0.964907
3,Heterogeneous,0.966231,0.0471867,0.949346,0.983116


### HeterogeneousPooling

In [44]:
# Define DataBase
dataBase = datasets.load_breast_cancer()
dataBase.DESCR[4:18]

'breast_cancer_'

In [45]:
# define the model
HeterogeneousModel = HeterogeneousClassifier()

In [46]:
# Model Evaluate
grid = {'estimator__n_samples': [1,3,5,7]}
model = HeterogeneousModel
print('Model Evaluate... Heterogeneous Classifier')
hpScores = GridTestModel(dataBase, model, grid)

Model Evaluate... Heterogeneous Classifier
Done


In [47]:
# Resultados
print('****** Resultados ******\n')
print('Base de Dados: ', dataBase.DESCR[4:18])
print('Modelo - ', str(model)[0:23], '\n')
hp_mean, hp_std, hp_inf, hp_sup = getResults(hpScores)

****** Resultados ******

Base de Dados:  breast_cancer_
Modelo -  HeterogeneousClassifier 

[1.         0.92982456 0.94736842 0.9122807  0.96491228 0.92982456
 0.96491228 0.98245614 0.94736842 0.96428571 0.94736842 1.
 0.96491228 0.92982456 0.98245614 0.98245614 0.92982456 0.92982456
 0.87719298 0.94642857 0.94736842 0.98245614 0.92982456 0.96491228
 0.94736842 0.98245614 0.94736842 0.94736842 0.96491228 0.91071429]

Mean Accuracy: 0.952 Standard Deviation: 0.027
Accuracy Confidence Interval (95%): (0.94, 0.96)



## Resultados Classificadores Acurácia

In [48]:
dfResultClassifier.iloc[3] = ['Heterogeneous', hp_mean, hp_std, hp_inf, hp_sup]
dfResultClassifier

Unnamed: 0,Métodos,Média,STD,Limite Inferior,Limite Superior
0,Baggind,0.957822,0.0247072,0.94898,0.966663
1,Adaboost,0.967784,0.0235731,0.959349,0.976219
2,RandomForest,0.956036,0.0247908,0.947165,0.964907
3,Heterogeneous,0.951942,0.0272276,0.942199,0.961685


In [49]:
import plotly.graph_objects as go
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
fig = go.Figure()
for i in range(len(scores)):
  fig.add_trace(go.Box(y=scores[i], name=scoresNames[i]))
fig.update_layout(
    yaxis_title='Acc',
    xaxis_title='Modelos',
    title='Breast Cancer - Desempenho dos Modelos  - Acurácia',
)
fig.show()

## Teste Pareado

In [50]:
from scipy.stats import ttest_rel, wilcoxon
scores = [bagScores, adaScores, rfScores, hpScores]
scoresNames = ['Bagging', 'AdaBoost', 'RandomForest', 'Heterogeneous Polling']
dfPairTest = pd.DataFrame(columns=[0,1,2,3])
for i in range(len(scores)):
  for j in range(len(scores)):
    if j == i:
       dfPairTest.at[i, j] = scoresNames[i]
    
    if j > i:
      print('Paired T Test', scoresNames[i], scoresNames[j])
      s,p = ttest_rel(scores[i],scores[j])
      print("t: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
    # print("t:",(s,p))

    if j < i :
      print ('Wilcoxon Test', scoresNames[i], scoresNames[j])
      s,p = wilcoxon (scores[i],scores[j])
      print("w: %0.2f p-value: %0.8f\n" % (s,p))
      dfPairTest.at[i, j] = p
      # dfPairTest.iloc[i][j] = p
# print("w: ",  (s,p))
dfPairTest.columns = ['T1','T2','T3','T4']
dfPairTest.index = ['w1', 'w2', 'w3', 'w4']
dfPairTest

Paired T Test Bagging AdaBoost
t: -2.66 p-value: 0.01248827

Paired T Test Bagging RandomForest
t: 0.41 p-value: 0.68393338

Paired T Test Bagging Heterogeneous Polling
t: 1.21 p-value: 0.23631385

Wilcoxon Test AdaBoost Bagging
w: 35.50 p-value: 0.02848511

Paired T Test AdaBoost RandomForest
t: 2.82 p-value: 0.00854613

Paired T Test AdaBoost Heterogeneous Polling
t: 3.47 p-value: 0.00165218

Wilcoxon Test RandomForest Bagging
w: 78.50 p-value: 0.75835888

Wilcoxon Test RandomForest AdaBoost
w: 39.00 p-value: 0.01312233

Paired T Test RandomForest Heterogeneous Polling
t: 0.81 p-value: 0.42258783

Wilcoxon Test Heterogeneous Polling Bagging
w: 87.00 p-value: 0.19538198

Wilcoxon Test Heterogeneous Polling AdaBoost
w: 63.50 p-value: 0.00239813

Wilcoxon Test Heterogeneous Polling RandomForest
w: 99.50 p-value: 0.37714359



Unnamed: 0,T1,T2,T3,T4
w1,Bagging,0.0124883,0.683933,0.236314
w2,0.0284851,AdaBoost,0.00854613,0.00165218
w3,0.758359,0.0131223,RandomForest,0.422588
w4,0.195382,0.00239813,0.377144,Heterogeneous Polling


*Primeiro Trabalho de Inteligência Artificial e Sistemas Inteligentes*

*André Paulo F. Machado*

Este trabalho consiste em realizar uma comparação experimental entre um conjunto pré-definido de técnicas de aprendizado para classificação automática, baseadas na ideia de combinados de classificadores, aplicadas a alguns problemas de classificação. As técnicas escolhidas são: Bagging, AdaBoost, RandomForest e HeterogeneousPooling.

As bases de dados utilizadas são digits, wine e breast cancer.