In [78]:
# Importa pacotes necessários via kernel do Jupyter
# %pip install numpy seaborn scipy sklearn matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_rel, wilcoxon
from sklearn.datasets import load_digits, load_iris, load_wine, load_breast_cancer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

def create_stats_header():
    return '\\textbf{Método} & \\textbf{Média} & \\textbf{Desvio Padrão} & \\textbf{Limite Inferior} & \\textbf{Limite Superior} \\ \hline \n'

def create_stats_table_line(method_name, mean, std, lower, upper):
    return '{} & {:.2f} & {:.2f} & {:.2f} & {:.2f} \\ \hline \n'.format(method_name, mean, std, lower, upper)

# Load digits dataset from scikit-learn; split into data and target/label
data, labels = load_breast_cancer(return_X_y=True)
# Get the number of samples (lines) and features (columns); also get the number of unique labels, i.e. the number of classes
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
# print(f"- digits: {n_digits}; - samples: {n_samples}; - features: {n_features}")

# Initialize a dummy classifier (zeroR), and the Naive Bayes classifier (GaussianNB)
zR = DummyClassifier(strategy='most_frequent')
NBG = GaussianNB()

# Initialize a scalar pipeline to scale the data before feeding it to the classifier
scalar = StandardScaler()

pipeline_zR = Pipeline([('transformer', scalar), ('estimator', zR)])
pipeline_NBG = Pipeline([('transformer', scalar), ('estimator', NBG)])

# Initialize a stratified k-fold cross-validation object with seed provided by the professor
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Use zeroR (DummyClassifier) and naiveBayes to predict the class of the data with stratified cross-validation (10-fold), repeated 3 times
scores_zR = cross_val_score(pipeline_zR, data, labels, scoring='accuracy', cv=RSKF)
scores_NBG = cross_val_score(pipeline_NBG, data, labels, scoring='accuracy', cv=RSKF)

# Create a dataframe where the columns are the methods and the rows are the scores
df_scores = pd.DataFrame(data={'zR': scores_zR, 'NBG': scores_NBG})
print(df_scores)

# Print the accuracy scores for the classifiers
# NBG
mean_NBG = scores_NBG.mean()
std_NBG = scores_NBG.std()
lower_NBG, upper_NBG = stats.norm.interval(0.95, loc=mean_NBG, scale=std_NBG/np.sqrt(len(scores_NBG)))
     
# print("NBG score:\n", scores_NBG)
# print("NBG: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_NBG, std_NBG))
# print("NBG: Accuracy Confidence Interval (95%%): (%0.2f, %0.2f)\n" % (lower_NBG, upper_NBG)) 
       
# zR                          
mean_zR = scores_zR.mean()
std_zR = scores_zR.std()
lower_zR, upper_zR = stats.norm.interval(0.95, loc=mean_zR, 
                               scale=std_zR/np.sqrt(len(scores_zR)))
                            

# print("zR score:\n", scores_zR)
# print("zR: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_zR, std_zR))
# print("zR: Accuracy Confidence Interval (95\%%): (%0.2f, %0.2f)\n" % (lower_zR, upper_zR)) 


# ZR
_, pTValueZrNBG = ttest_rel(scores_zR, scores_NBG)
print(f'zR-NBG Paired T Test:\np-value: {pTValueZrNBG}')

_, pWValueZrNBG = wilcoxon(scores_zR, scores_NBG)
print(f'zR-NBG Wilcoxon Test:\n p-value: {pWValueZrNBG}')

table = create_stats_header()
table += create_stats_table_line('ZR', mean_zR, std_zR, lower_zR, upper_zR)
table += create_stats_table_line('NGB', mean_NBG, std_NBG, lower_NBG, upper_NBG)
print(table)

# conf_mat = confusion_matrix(labels, score_NBG)
# print(conf_mat)

# plt.matshow(conf_mat, cmap=plt.cm.Blues)
# for i in range(len(conf_mat)):
#     for j in range(len(conf_mat)):
#         plt.text(i, j, conf_mat[i][j], va="center", ha="center")

# plt.show()

          zR       NBG
0   0.614035  0.964912
1   0.614035  0.912281
2   0.631579  0.947368
3   0.631579  0.929825
4   0.631579  0.947368
5   0.631579  0.877193
6   0.631579  0.947368
7   0.631579  0.947368
8   0.631579  0.912281
9   0.625000  0.910714
10  0.614035  0.929825
11  0.614035  0.964912
12  0.631579  0.982456
13  0.631579  0.929825
14  0.631579  0.964912
15  0.631579  0.947368
16  0.631579  0.929825
17  0.631579  0.877193
18  0.631579  0.877193
19  0.625000  0.928571
20  0.614035  0.929825
21  0.614035  0.964912
22  0.631579  0.929825
23  0.631579  0.929825
24  0.631579  0.929825
25  0.631579  0.912281
26  0.631579  0.929825
27  0.631579  0.964912
28  0.631579  0.964912
29  0.625000  0.928571
zR-NBG Paired T Test:
p-value: 1.3542754092366858e-31
zR-NBG Wilcoxon Test:
 p-value: 1.595254879875626e-06
\textbf{Método} & \textbf{Média} & \textbf{Desvio Padrão} & \textbf{Limite Inferior} & \textbf{Limite Superior} \ \hline 
ZR & 0.63 & 0.01 & 0.62 & 0.63 \ \hline 
NGB & 0.93 & 0.0

In [66]:
# ETAPA 2:
# A segunda etapa consiste no treino, validação e teste dos classificadores que precisam de ajuste de hiperparâmetros, isto é, os classificadores KMC, KNN e AD
from sklearn.base import BaseEstimator
from sklearn.utils import check_X_y
from sklearn.cluster import KMeans

# The KMC classifier uses a clustering algorithm to define K groups of examples of each class in the training base.
class KMeansCentroidsClassifier(BaseEstimator):
    def __init__(self, k=1):
        super().__init__()
        self.k = k
        self.centroids = []
        # self.groups = self.k * self.nClasses

    def fit(self, x_train, y_train):
        x_train, y_train = check_X_y(x_train, y_train)

        for _class in np.unique(y_train):
            # Initialize KMeans:
            km = KMeans(n_clusters=self.k, random_state=36851234)
            # Fit KMeans:
            km.fit(x_train[y_train == _class], y_train[y_train == _class])
            # Append centroids to centroids list:   
            self.centroids.append({"clusters": km.cluster_centers_, "class": _class})

    def predict(self, x_test):
        classes = []
        for x in x_test:
            min_dist = np.Inf
            
            for centroid in self.centroids:
                for cluster in centroid["clusters"]:
                    dist = np.linalg.norm(x - cluster)
                    if dist < min_dist:
                        min_dist = dist
                        _class = centroid["class"]

            classes.append(_class)
            
        return classes
                    

dKNN = KNeighborsClassifier(weights='distance')
pipeline_kNN = Pipeline([('transformer', scalar), ('estimator', dKNN)])
KMC = KMeansCentroidsClassifier()
pipeline_KMC = Pipeline([('transformer', scalar), ('estimator', KMC)])
AD = DecisionTreeClassifier()
pipeline_AD = Pipeline([('transformer', scalar), ('estimator', AD)])

# Neste caso, o procedimento de treinamento, validação e teste será realizado através de 3 rodadas de ciclos aninhados de validação e teste,
# com o ciclo interno de validação contendo 4 folds e o externo de teste com 10 folds.
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

# A busca em grade (grid search) do ciclo interno deve considerar os seguintes valores de hiperparâmetros de cada técnica de aprendizado:
# KMC: [k = 1, 3, 5, 7]
# KNN: [n_neighbors = 1, 3, 5, 7]
# AD: [max_depth = None, 3, 5, 10]
grade_KMC = {'estimator__k': [1, 3, 5, 7]}
grade_kNN = {'estimator__n_neighbors': [1, 3, 5, 7]}
grade_AD = {'estimator__max_depth': [None, 3, 5, 10]}
grid_search_KMC = GridSearchCV(estimator=pipeline_KMC, param_grid=grade_KMC, scoring='accuracy', cv=4)
grid_search_kNN = GridSearchCV(estimator=pipeline_kNN, param_grid=grade_kNN, scoring='accuracy', cv=4)
grid_search_AD = GridSearchCV(estimator=pipeline_AD, param_grid=grade_AD, scoring='accuracy', cv=4)

# Os dados utilizados no conjunto de treino em cada rodada de teste devem ser padronizados (normalização com z-score).
# Os valores de padronização obtidos nos dados de treino devem ser utilizados para padronizar os dados do respectivo conjunto de teste.
# ?

scores_kNN = cross_val_score(grid_search_kNN, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
scores_AD = cross_val_score(grid_search_AD, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
scores_KMC = cross_val_score(grid_search_KMC, data, labels, scoring='accuracy', cv=RSKF)

mean_KMC = scores_KMC.mean()
std_KMC = scores_KMC.std()
lower_KMC, upper_KMC = stats.norm.interval(0.95, loc=mean_KMC, scale=std_KMC/np.sqrt(len(scores_KMC)))

print("KMC score:\n", scores_KMC)
print("KMC: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_KMC, std_KMC))
print("KMC: Accuracy Confidence Interval (95\%%): (%0.2f, %0.2f)\n" % (lower_KMC, upper_KMC)) 

# Create T-test and wilcoxon test for each pair of classifiers: zR, gNB, KNN, AD, KMC
# ZR TESTS:
# zR-gNB
_, pTValueZrGnb = ttest_rel(scores_zR, scores_gNB)
print(f'zR-gNB Paired T Test:\np-value: {pTValueZrGnb}')
_, pWValueZrGnb = wilcoxon(scores_zR, scores_gNB)
print(f'zR-gNB Wilcoxon Test:\n p-value: {pWValueZrGnb}')
# zR-KNN
_, pTValueZrKnn = ttest_rel(scores_zR, scores_kNN)
print(f'zR-KNN Paired T Test:\np-value: {pTValueZrKnn}')
_, pWValueZrKnn = wilcoxon(scores_zR, scores_kNN)
print(f'zR-KNN Wilcoxon Test:\n p-value: {pWValueZrKnn}')
# zR-AD
_, pTValueZrAd = ttest_rel(scores_zR, scores_AD)
print(f'zR-AD Paired T Test:\np-value: {pTValueZrAd}')
_, pWValueZrAd = wilcoxon(scores_zR, scores_AD)
print(f'zR-AD Wilcoxon Test:\n p-value: {pWValueZrAd}')
# zR-KMC
_, pTValueZrKmc = ttest_rel(scores_zR, scores_KMC)
print(f'zR-KMC Paired T Test:\np-value: {pTValueZrKmc}')
_, pWValueZrKmc = wilcoxon(scores_zR, scores_KMC)
print(f'zR-KMC Wilcoxon Test:\n p-value: {pWValueZrKmc}')

# gNB TESTS:
# gNB-KNN
_, pTValueGnbKnn = ttest_rel(scores_gNB, scores_kNN)
print(f'gNB-KNN Paired T Test:\np-value: {pTValueGnbKnn}')
_, pWValueGnbKnn = wilcoxon(scores_gNB, scores_kNN)
print(f'gNB-KNN Wilcoxon Test:\n p-value: {pWValueGnbKnn}')
# gNB-AD
_, pTValueGnbAd = ttest_rel(scores_gNB, scores_AD)
print(f'gNB-AD Paired T Test:\np-value: {pTValueGnbAd}')
_, pWValueGnbAd = wilcoxon(scores_gNB, scores_AD)
print(f'gNB-AD Wilcoxon Test:\n p-value: {pWValueGnbAd}')
# gNB-KMC
_, pTValueGnbKmc = ttest_rel(scores_gNB, scores_KMC)
print(f'gNB-KMC Paired T Test:\np-value: {pTValueGnbKmc}')

# KNN TESTS:
# KNN-AD
_, pTValueKnnAd = ttest_rel(scores_kNN, scores_AD)
print(f'KNN-AD Paired T Test:\np-value: {pTValueKnnAd}')
_, pWValueKnnAd = wilcoxon(scores_kNN, scores_AD)
print(f'KNN-AD Wilcoxon Test:\n p-value: {pWValueKnnAd}')
# KNN-KMC
_, pTValueKnnKmc = ttest_rel(scores_kNN, scores_KMC)
print(f'KNN-KMC Paired T Test:\np-value: {pTValueKnnKmc}')
_, pWValueKnnKmc = wilcoxon(scores_kNN, scores_KMC)
print(f'KNN-KMC Wilcoxon Test:\n p-value: {pWValueKnnKmc}')

# Os resultados de cada classificador devem ser apresentados numa tabela contendo a média das acurácias obtidas em cada fold, o desvio padrão e o intervalo de confiança a 95% de significância dos resultados,

KMC score:
 [0.96666667 0.93333333 0.97777778 0.93888889 0.98333333 0.92222222
 0.96111111 0.94972067 0.94413408 0.93854749 0.98333333 0.95
 0.93888889 0.95555556 0.95       0.96111111 0.93333333 0.94972067
 0.96089385 0.93854749 0.9        0.96666667 0.97222222 0.94444444
 0.94444444 0.95       0.93333333 0.95530726 0.97206704 0.92178771]
KMC: Mean Accuracy: 0.95 Standard Deviation: 0.02
KMC: Accuracy Confidence Interval (95\%): (0.94, 0.96)

zR-gNB Paired T Test:
p-value: 4.230984524662128e-41
zR-gNB Wilcoxon Test:
 p-value: 1.7083716990437344e-06
zR-KNN Paired T Test:
p-value: 3.462949310618454e-60
zR-KNN Wilcoxon Test:
 p-value: 1.6363965084210647e-06
zR-AD Paired T Test:
p-value: 6.756231505708888e-44
zR-AD Wilcoxon Test:
 p-value: 1.7224282827430733e-06
zR-KMC Paired T Test:
p-value: 2.335308158986351e-49
zR-KMC Wilcoxon Test:
 p-value: 1.7170105183845441e-06
gNB-KNN Paired T Test:
p-value: 3.952790045634852e-25
gNB-KNN Wilcoxon Test:
 p-value: 1.7180929312456739e-06
gNB-AD Paire