In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [2]:
#import da base
digits = datasets.load_digits()
X = digits.data
y = digits.target

print(X.shape)
print(y.shape)

(1797, 64)
(1797,)


### Declarando funções que serão usadas

In [3]:
#Declarando o conteúdo da atividade em funções (KNN, decision tree e random forest)
def knn_bagging(Xtrain,ytrain,Xtest):
    #dados sem reducao
    start = time.time()
    model = KNeighborsClassifier()
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de KNeighbors simples:', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_1 = model.predict(Xtest)
    
    start = time.time()
    model = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, random_state = 42)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de KNeighbors Bagging (c/ 10 estimators):', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_2 = model.predict(Xtest)
    
    start = time.time()
    model = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, n_estimators=100, random_state = 42)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de KNeighbors Bagging (c/ 100 estimators):', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_3 = model.predict(Xtest)
    
    print('-------------------------------------')
    print('Teste do McNemar')
    testaMcNemar(ytrain,y_pred_1,y_pred_2)
    testaMcNemar(ytrain,y_pred_1,y_pred_3)
    testaMcNemar(ytrain,y_pred_2,y_pred_3)
    
    return y_pred_1,y_pred_2,y_pred_3
#################################################################################################
    
def decision_tree(Xtrain,ytrain,Xtest):
    start = time.time()
    model = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de Decision Tree puro:', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_1 = model.predict(Xtest)


    start = time.time()
    model = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de Random Forest:', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_2 = model.predict(Xtest)


    start = time.time()
    model = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)
    print('Acurácia de Extreme Randomized Trees:', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_3 = model.predict(Xtest)

    print('-------------------------------------')
    print('Teste do McNemar')
    testaMcNemar(ytrain,y_pred_1,y_pred_2)
    testaMcNemar(ytrain,y_pred_1,y_pred_3)
    testaMcNemar(ytrain,y_pred_2,y_pred_3)
    
    return y_pred_1,y_pred_2,y_pred_3
########################################################################################################

def multiples_vote(Xtrain,ytrain,Xtest):
    start = time.time()
    #Obs aumentei a max_inter da regressão logistica para tirar os warnings (deixou bem lento, tem a ver com a escala dos dados)
    clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',max_iter = 10000, random_state=1)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=1)
    clf3 = GaussianNB()

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb',clf3)], voting='hard')

    for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        print("Acurácia: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    print('-'*20)

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='hard')

    for clf, label in zip([clf1, clf2, eclf], ['Logistic Regression', 'Random Forest', 'Ensemble']):
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        print("Acurácia: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    end = time.time()
    print('Tempo:',end - start)

#######################################################################################################

def gradient_boost(Xtrain,ytrain,Xtest):
    
    start = time.time()
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.3, max_depth=2, random_state=0)
    scores = cross_val_score(model, Xtrain, ytrain, cv=5)

    print('Acurácia de Gradient Boosting Tree:', scores.mean())
    end = time.time()
    print('Tempo:',end - start)
    model.fit(Xtrain, ytrain)
    y_pred_1 = model.predict(Xtest)

    return y_pred_1
    

In [4]:
#Fazer a parte do mc nemmar: A ideia é fazer 1 comparação com cada tipo (originais,redução e seleção):

#essa primeira função cria a contigence table a partir do y_test,y_pred1 e y_pred2 (resultados que vc quer comparar)
def build_contingence_table(Y, Y_pred_1, Y_pred_2):
    y1_and_y2 = 0
    y1_and_not_y2 = 0
    y2_and_not_y1 = 0
    not_y1_and_not_y2 = 0
    for y, y1, y2 in zip(Y, Y_pred_1, Y_pred_2):
        if y == y1 == y2:
            y1_and_y2 += 1
        elif y != y1 and y != y2:
            not_y1_and_not_y2 += 1
        elif y == y1 and y != y2:
            y1_and_not_y2 += 1
        elif y != y1 and y == y2:
            y2_and_not_y1 += 1
            
    contingency_table = [[y1_and_y2, y1_and_not_y2], 
                         [y2_and_not_y1, not_y1_and_not_y2]]
    
    return contingency_table
    

In [5]:
def testaMcNemar(train, pred1, pred2):
    contingence_table = build_contingence_table(train, pred1, pred2)

    import pprint

    pprint.pprint(contingence_table)

    result = mcnemar(contingence_table, exact=True)


    if result.pvalue >= 0.001:
        print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    else:
        print('statistic=%.3f, p-value=%.3e' % (result.statistic, result.pvalue))

    # interpretando o p-value
    alpha = 0.05
    if result.pvalue > alpha:
        print('Mesma proporção de erros (falhou em rejeitar H0)')
    else:
        print('Proporções de erros diferentes (rejeitou H0)')

### Dividindo em treino e teste para cada tipo (original, redução e seleção)

In [6]:
# Aplicando Train test split nos dados originais
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print('Número original de atributos:', X.shape[1])

#----------------------------------------------------------------------------------------------------------

# Aplicando Train test split nos dados com reduçao de atributos (PCA)

#realizando a redução
pca = PCA(n_components=0.90, whiten=True)

X_pca = pca.fit_transform(X)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.33, random_state=42)

print('Número reduzido de atributos:', X_pca.shape[1])

#----------------------------------------------------------------------------------------------------------

# Aplicando Train test split na seleção de atributos
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X, y, test_size=0.33, random_state=42)


fvalue_selector = SelectKBest(f_classif, k=20)
X_kbest = fvalue_selector.fit_transform(X_train_sel, y_train_sel)
X_kbest2= fvalue_selector.fit_transform(X_test_sel, y_test_sel)
print('Número de atributos de Seleçao:', X_kbest.shape[1])


Número original de atributos: 64
Número reduzido de atributos: 21
Número de atributos de Seleçao: 20


  f = msb / msw
  f = msb / msw


### Colocando tudo dentro de um dicionário e rodando com um loop

In [7]:
d = {'originais':[X_train,y_train,X_test],'redução':[X_train_pca,y_train_pca,X_test_pca],'seleção':[X_kbest,y_train_sel,X_kbest2]}

In [8]:
#essa parte é só um teste
# observação d[k][0] é X e d[k][1] é y
for k in d.keys():
    print(k)

originais
redução
seleção


In [9]:
#o d2 é mais um dicionário para armazenar todos os y_pred de cada modelo (ainda tenho que organiza-lo melhor)

#OBS: FALTA FAZER A FUNÇÃO DO MULTIPLE VOTES JOGAR O Y_PRED NA VARIÁVEL (AINDA NÃO CONSEGUI)

d2={}
for k in d.keys():
    print('==========================================================================')
    print('Tempos e acurácias dos modelos com os dados',k,':')
    print('==========================================================================')
    y_pred_b1,y_pred_b2,y_pred_b3 = knn_bagging(d[k][0],d[k][1],d[k][2])  
    d2[k + '_bagging'] = y_pred_b1,y_pred_b2,y_pred_b3
    print('_____________________')
    y_pred_t1,y_pred_t2,y_pred_t3 = decision_tree(d[k][0],d[k][1],d[k][2])
    d2[k + '_tree'] = y_pred_t1,y_pred_t2,y_pred_t3
    print('_____________________')
    multiples_vote(d[k][0],d[k][1],d[k][2])
    print('_____________________')
    y_pred_g1 = gradient_boost(d[k][0],d[k][1],d[k][2])
    d2[k + '_boost'] = y_pred_g1
    print('\n')

Tempos e acurácias dos modelos com os dados originais :
Acurácia de KNeighbors simples: 0.9767150760719225
Tempo: 0.226454496383667
Acurácia de KNeighbors Bagging (c/ 10 estimators): 0.9709163208852004
Tempo: 0.5030057430267334
Acurácia de KNeighbors Bagging (c/ 100 estimators): 0.9717461964038726
Tempo: 4.986911296844482
-------------------------------------
Teste do McNemar
[[53, 2], [3, 536]]
statistic=2.000, p-value=1.000
Mesma proporção de erros (falhou em rejeitar H0)
[[55, 0], [1, 538]]
statistic=0.000, p-value=1.000
Mesma proporção de erros (falhou em rejeitar H0)
[[54, 2], [2, 536]]
statistic=2.000, p-value=1.000
Mesma proporção de erros (falhou em rejeitar H0)
_____________________
Acurácia de Decision Tree puro: 0.8204426002766253
Tempo: 0.04686737060546875
Acurácia de Random Forest: 0.9592842323651452
Tempo: 0.5819847583770752
Acurácia de Extreme Randomized Trees: 0.971753112033195
Tempo: 0.4498403072357178
-------------------------------------
Teste do McNemar
[[49, 6], [8