In [142]:
# Install the required packages through the Jupyter kernel (uncomment the next line if needed)
# %pip install numpy pandas seaborn scipy scikit-learn matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_rel, wilcoxon
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

def create_stats_header():
    """Return the header row of the LaTeX summary table.

    The row has five bold column titles (method, mean, standard deviation,
    lower bound, upper bound) and is terminated by the LaTeX row break
    followed by a horizontal rule.

    Returns:
        str: one LaTeX tabular row ending in a newline.
    """
    # Every backslash is escaped explicitly: the row break must be a literal
    # double backslash, and a bare "\h" is an invalid escape sequence.
    return (
        '\\textbf{Método} & \\textbf{Média} & \\textbf{Desvio Padrão} & '
        '\\textbf{Limite Inferior} & \\textbf{Limite Superior} \\\\ \\hline \n'
    )

def create_stats_table_line(method_name, mean, std, lower, upper):
    """Return one data row of the LaTeX summary table.

    Args:
        method_name: label shown in the first column.
        mean: mean accuracy.
        std: standard deviation of the accuracy.
        lower: lower bound of the confidence interval.
        upper: upper bound of the confidence interval.

    Returns:
        str: the row, with every number formatted to two decimals, terminated
        by the LaTeX row break, a horizontal rule and a newline.
    """
    # The row break must be a literal double backslash; "\h" on its own is an
    # invalid escape sequence, so the rule is escaped as well.
    return '{} & {:.2f} & {:.2f} & {:.2f} & {:.2f} \\\\ \\hline \n'.format(
        method_name, mean, std, lower, upper
    )

# Load the digits dataset from scikit-learn, already split into features and labels
data, labels = load_digits(return_X_y=True)
# Number of samples (rows) and features (columns), plus the number of distinct classes
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size

# Baseline (ZeroR: always predicts the most frequent class) and Gaussian Naive Bayes
zR = DummyClassifier(strategy='most_frequent')
NBG = GaussianNB()

# z-score scaler used as the first step of every pipeline. The same instance is
# handed to several pipelines; scikit-learn's cross-validation helpers clone the
# estimator for each split, so the shared object itself is never fitted.
scalar = StandardScaler()

pipeline_zR = Pipeline([('transformer', scalar), ('estimator', zR)])
pipeline_NBG = Pipeline([('transformer', scalar), ('estimator', NBG)])

# Stratified 10-fold cross-validation repeated 3 times, seeded with the value
# provided by the professor so the 30 folds are reproducible
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# One accuracy per fold (30 values) for each classifier
scores_zR = cross_val_score(pipeline_zR, data, labels, scoring='accuracy', cv=RSKF)
scores_NBG = cross_val_score(pipeline_NBG, data, labels, scoring='accuracy', cv=RSKF)

# One column per method, one row per fold
df_scores = pd.DataFrame(data={'zR': scores_zR, 'NBG': scores_NBG})
print(df_scores)

# Mean, standard deviation (NumPy's default ddof=0) and a normal-approximation
# 95% confidence interval of the mean accuracy, per classifier
# NBG
mean_NBG = scores_NBG.mean()
std_NBG = scores_NBG.std()
lower_NBG, upper_NBG = stats.norm.interval(0.95, loc=mean_NBG, scale=std_NBG/np.sqrt(len(scores_NBG)))

# zR
mean_zR = scores_zR.mean()
std_zR = scores_zR.std()
lower_zR, upper_zR = stats.norm.interval(0.95, loc=mean_zR,
                               scale=std_zR/np.sqrt(len(scores_zR)))

# Paired tests between the two classifiers (same folds, thanks to the fixed seed)
_, pTValueZrNBG = ttest_rel(scores_zR, scores_NBG)
print(f'zR-NBG Paired T Test:\np-value: {pTValueZrNBG}')

_, pWValueZrNBG = wilcoxon(scores_zR, scores_NBG)
print(f'zR-NBG Wilcoxon Test:\n p-value: {pWValueZrNBG}')

# LaTeX summary table; the Naive Bayes row is labelled 'NBG' to match the
# variable, DataFrame column and dictionary key used elsewhere
table = create_stats_header()
table += create_stats_table_line('ZR', mean_zR, std_zR, lower_zR, upper_zR)
table += create_stats_table_line('NBG', mean_NBG, std_NBG, lower_NBG, upper_NBG)
print(table)

# TODO: confusion matrix for NBG. It needs per-sample predictions rather than
# the per-fold accuracies computed above, which is why the earlier draft
# (it referenced an undefined `score_NBG`) was dropped.

          zR       NBG
0   0.100000  0.777778
1   0.100000  0.805556
2   0.100000  0.805556
3   0.100000  0.750000
4   0.100000  0.722222
5   0.100000  0.766667
6   0.100000  0.766667
7   0.100559  0.815642
8   0.106145  0.821229
9   0.106145  0.826816
10  0.100000  0.777778
11  0.100000  0.783333
12  0.100000  0.783333
13  0.100000  0.777778
14  0.100000  0.744444
15  0.100000  0.761111
16  0.100000  0.755556
17  0.100559  0.770950
18  0.106145  0.798883
19  0.106145  0.810056
20  0.100000  0.811111
21  0.100000  0.738889
22  0.100000  0.761111
23  0.100000  0.800000
24  0.100000  0.838889
25  0.100000  0.777778
26  0.100000  0.777778
27  0.100559  0.854749
28  0.106145  0.754190
29  0.106145  0.793296
zR-NBG Paired T Test:
p-value: 4.230984524662128e-41
zR-NBG Wilcoxon Test:
 p-value: 1.7083716990437344e-06
\textbf{Método} & \textbf{Média} & \textbf{Desvio Padrão} & \textbf{Limite Inferior} & \textbf{Limite Superior} \ \hline 
ZR & 0.10 & 0.00 & 0.10 & 0.10 \ \hline 
NGB & 0.78 & 0.0

In [143]:
# ETAPA 2:
# A segunda etapa consiste no treino, validação e teste dos classificadores que precisam de ajuste de hiperparâmetros, isto é, os classificadores KMC, KNN e AD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.utils import check_X_y

# The KMC classifier uses a clustering algorithm to define K groups of examples of each class in the training base.
class KMeansCentroidsClassifier(ClassifierMixin, BaseEstimator):
    """Nearest-centroid classifier whose centroids come from per-class K-Means.

    During ``fit`` the training samples of each class are clustered into ``k``
    groups with K-Means. ``predict`` assigns every sample the class that owns
    the closest (Euclidean) centroid.

    Inheriting from ``ClassifierMixin`` marks the estimator as a classifier,
    which lets scikit-learn utilities treat it like the built-in classifiers
    (for example when an integer ``cv`` is turned into a splitter).

    Args:
        k: number of K-Means clusters built for each class.
        random_state: forwarded to ``KMeans`` so the clustering can be made
            reproducible; ``None`` keeps the previous unseeded behavior.

    Attributes:
        centroids: list with one ``{"clusters": ndarray, "class": label}``
            entry per class; empty until ``fit`` is called.
        classes_: sorted unique labels seen by ``fit``.
    """

    def __init__(self, k=1, random_state=None):
        super().__init__()
        self.k = k
        self.random_state = random_state
        # Present before fitting so existing code reading `centroids` keeps working
        self.centroids = []

    def fit(self, x_train, y_train):
        """Cluster the samples of each class and store the resulting centroids.

        Args:
            x_train: 2-D array-like of training samples.
            y_train: 1-D array-like of labels, one per sample.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        x_train, y_train = check_X_y(x_train, y_train)

        # Build into a fresh list: appending to the attribute would keep the
        # centroids of a previous fit and corrupt later predictions.
        centroids = []
        self.classes_ = np.unique(y_train)
        for _class in self.classes_:
            km = KMeans(n_clusters=self.k, random_state=self.random_state)
            # K-Means is unsupervised, so only the samples of this class are passed
            km.fit(x_train[y_train == _class])
            centroids.append({"clusters": km.cluster_centers_, "class": _class})

        self.centroids = centroids
        return self

    def predict(self, x_test):
        """Return, for each sample, the class of its nearest centroid.

        Args:
            x_test: 2-D array-like of samples with the same number of features
                used in ``fit``.

        Returns:
            numpy.ndarray: one predicted label per row of ``x_test``.

        Raises:
            ValueError: if called before ``fit``.
        """
        if not self.centroids:
            raise ValueError("KMeansCentroidsClassifier must be fitted before calling predict")

        # Convert first so DataFrames are iterated row by row, not by column label
        x_test = np.asarray(x_test)

        # Flatten the per-class centroids into one matrix plus a parallel label
        # vector; the class/cluster order is preserved, so ties resolve to the
        # first centroid exactly as a sequential strict "<" scan would.
        centers = np.vstack([entry["clusters"] for entry in self.centroids])
        owners = np.array(
            [entry["class"] for entry in self.centroids for _ in entry["clusters"]]
        )

        nearest = [np.linalg.norm(centers - x, axis=1).argmin() for x in x_test]
        return owners[np.asarray(nearest, dtype=np.intp)]
                    

# Stage-2 classifiers, each behind the z-score scaler defined in the first cell
dKNN = KNeighborsClassifier(weights='distance')
pipeline_kNN = Pipeline([('transformer', scalar), ('estimator', dKNN)])
KMC = KMeansCentroidsClassifier()
pipeline_KMC = Pipeline([('transformer', scalar), ('estimator', KMC)])
AD = DecisionTreeClassifier()
pipeline_AD = Pipeline([('transformer', scalar), ('estimator', AD)])

# Training, validation and test use 3 rounds of nested cross-validation:
# an inner validation loop with 4 folds and an outer test loop with 10 folds.
# The splitter is seeded with the same value as in stage 1: without a seed every
# cross_val_score call would draw different folds, and the paired t-test and
# Wilcoxon test applied later require all score vectors to share the same folds.
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Hyperparameter grids searched by the inner loop:
# KMC: [k = 1, 3, 5, 7]
# KNN: [n_neighbors = 1, 3, 5, 7]
# AD: [max_depth = None, 3, 5, 10]
grade_KMC = {'estimator__k': [1, 3, 5, 7]}
grade_kNN = {'estimator__n_neighbors': [1, 3, 5, 7]}
grade_AD = {'estimator__max_depth': [None, 3, 5, 10]}
grid_search_KMC = GridSearchCV(estimator=pipeline_KMC, param_grid=grade_KMC, scoring='accuracy', cv=4)
grid_search_kNN = GridSearchCV(estimator=pipeline_kNN, param_grid=grade_kNN, scoring='accuracy', cv=4)
grid_search_AD = GridSearchCV(estimator=pipeline_AD, param_grid=grade_AD, scoring='accuracy', cv=4)

# The training data of every test round must be standardized (z-score), and the
# statistics learned on the training data must be reused on the matching test set.
# This is what the pipelines provide: the scaler is a pipeline step, so it is
# fitted on the training portion of each split and only applied to the held-out portion.

# Outer loop: one accuracy per outer fold (30 values) for each tuned classifier
scores_kNN = cross_val_score(grid_search_kNN, data, labels, scoring='accuracy', cv=RSKF)
scores_AD = cross_val_score(grid_search_AD, data, labels, scoring='accuracy', cv=RSKF)
scores_KMC = cross_val_score(grid_search_KMC, data, labels, scoring='accuracy', cv=RSKF)

# Mean, standard deviation and normal-approximation 95% interval for KMC
mean_KMC = scores_KMC.mean()
std_KMC = scores_KMC.std()
lower_KMC, upper_KMC = stats.norm.interval(0.95, loc=mean_KMC, scale=std_KMC/np.sqrt(len(scores_KMC)))

print("KMC score:\n", scores_KMC)
print("KMC: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_KMC, std_KMC))
# "\\%%" prints a backslash followed by a percent sign (LaTeX-ready), same output as before
print("KMC: Accuracy Confidence Interval (95\\%%): (%0.2f, %0.2f)\n" % (lower_KMC, upper_KMC))

KMC score:
 [0.88888889 0.95555556 0.97222222 0.95       0.92777778 0.95555556
 0.96111111 0.96089385 0.94972067 0.97765363 0.95       0.95555556
 0.96666667 0.92777778 0.98333333 0.96666667 0.93333333 0.96648045
 0.94972067 0.96648045 0.93888889 0.98888889 0.97222222 0.95
 0.96111111 0.94444444 0.95       0.93854749 0.97206704 0.94972067]
KMC: Mean Accuracy: 0.95 Standard Deviation: 0.02
KMC: Accuracy Confidence Interval (95\%): (0.95, 0.96)



In [181]:
from pprint import pprint

def test_two_models(scores1, scores2):
    pTValue = ttest_rel(scores1, scores2)
    pWValue = wilcoxon(scores1, scores2)
    return pTValue, pWValue


def test_models(scores):
    """Run the paired tests for every unordered pair of models.

    Each model is compared only with the models that come after it in
    ``scores``, so every pair is tested exactly once and the last model gets
    an empty list.

    Args:
        scores: mapping from model name to its sequence of per-fold scores.

    Returns:
        dict: ``{model: [{'p': paired t-test p-value, 'w': Wilcoxon p-value}, ...]}``
        with one entry per later model, in the order of ``scores``.
        The dictionary is also pretty-printed.
    """
    # Materialize the key order once instead of rebuilding the list per pair
    model_names = list(scores)

    testsDict = {}
    for i, model in enumerate(model_names):
        testsDict[model] = []
        for other in model_names[i + 1:]:
            t_result, w_result = test_two_models(scores[model], scores[other])
            testsDict[model].append({'p': t_result.pvalue, 'w': w_result.pvalue})

    pprint(testsDict)
    return testsDict


def get_statistical_tests_matrix(statisticalTestsDict, decimals=8):
    """Arrange pairwise test results in a square matrix.

    The diagonal holds the model names. For a model at row ``i``, its k-th
    entry is written to column ``i + k``: the ``'p'`` value goes above the
    diagonal and the ``'w'`` value goes to the mirrored cell below it.
    Cells that receive no value stay ``None``.

    Args:
        statisticalTestsDict: mapping ``model -> list of {'p': ..., 'w': ...}``
            where the list of the i-th model describes the models after it.
        decimals: number of decimals kept for each value. The default of 8
            turns very small p-values into 0.0; pass ``None`` to keep the raw
            values.

    Returns:
        list[list]: the square matrix described above.
    """
    size = len(statisticalTestsDict)
    statisticalTestMatrix = [[None] * size for _ in range(size)]

    print("Statistical tests:")

    def _rounded(value):
        # Optional rounding keeps the default output unchanged
        return value if decimals is None else round(value, decimals)

    for i, (model, tests) in enumerate(statisticalTestsDict.items()):
        # Model name on the diagonal
        statisticalTestMatrix[i][i] = model
        for j, test in enumerate(tests, start=i + 1):
            statisticalTestMatrix[i][j] = _rounded(test['p'])
            statisticalTestMatrix[j][i] = _rounded(test['w'])

    return statisticalTestMatrix


def create_matrix_table(statisticalTestMatrix, alpha=0.05):
    """Print the test matrix as the rows of a LaTeX tabular.

    Cells are separated by `` & `` and every row ends with the LaTeX row break
    followed by a horizontal rule. Strings and numbers above ``alpha`` are
    printed as they are; numbers less than or equal to ``alpha`` are wrapped
    in a bold command. ``None`` cells are printed as empty cells.

    Args:
        statisticalTestMatrix: list of rows; each cell is a string, a number
            or ``None``.
        alpha: threshold at or below which a number is printed in bold.

    Returns:
        None. The table is written to standard output.
    """
    # Backslashes are escaped explicitly; a bare "\h" is an invalid escape sequence
    print("\\hline")
    for row in statisticalTestMatrix:
        last = len(row) - 1
        for i, cell in enumerate(row):
            ending = ' \\\\ \\hline\n' if i == last else ' & '

            if cell is None:
                # Unfilled cell: comparing None with a number would raise TypeError
                text = ''
            elif isinstance(cell, str) or cell > alpha:
                text = cell
            else:
                text = f'\\textbf{{{cell}}}'

            print(text, end=ending)


# Per-fold accuracies of every evaluated model, keyed by the label used in the report
scoresDict = dict(
    zR=scores_zR,
    NBG=scores_NBG,
    kNN=scores_kNN,
    AD=scores_AD,
    KMC=scores_KMC,
)

# Pairwise paired t-test and Wilcoxon p-values for every pair of models
statisticalTestsDict = test_models(scoresDict)

# Matrix layout: model names on the diagonal, paired t-test p-values above it
# and Wilcoxon p-values below it. It is shown as a raw list, as LaTeX rows and
# as a compact pretty-print.
statisticalTestMatrix = get_statistical_tests_matrix(statisticalTestsDict)
print(statisticalTestMatrix)
create_matrix_table(statisticalTestMatrix)
pprint(statisticalTestMatrix, width=40, indent=0, compact=True)

# Extend the score table built in the first cell with the stage-2 classifiers
df_scores['KMC'] = scores_KMC
df_scores['KNN'] = scores_kNN
df_scores['AD'] = scores_AD


{'AD': [{'p': 1.0796784545713246e-17, 'w': 1.7224282827430733e-06}],
 'KMC': [],
 'NBG': [{'p': 3.7223941692129973e-25, 'w': 1.7224282827430733e-06},
         {'p': 2.585786419750758e-10, 'w': 2.1122012156048867e-06},
         {'p': 3.3241854029509423e-21, 'w': 1.7289484360195754e-06}],
 'kNN': [{'p': 2.849455595993415e-24, 'w': 1.7213435893906393e-06},
         {'p': 4.085040755090073e-05, 'w': 0.00011836231046420427}],
 'zR': [{'p': 4.230984524662128e-41, 'w': 1.7083716990437344e-06},
        {'p': 1.557666013355992e-54, 'w': 1.6858673414334802e-06},
        {'p': 1.937295275155675e-47, 'w': 1.711606992411454e-06},
        {'p': 1.762581599344587e-49, 'w': 1.7105279932400777e-06}]}
Statistical tests:
[['zR', 0.0, 0.0, 0.0, 0.0], [1.71e-06, 'NBG', 0.0, 0.0, 0.0], [1.69e-06, 1.72e-06, 'kNN', 0.0, 4.085e-05], [1.71e-06, 2.11e-06, 1.72e-06, 'AD', 0.0], [1.71e-06, 1.73e-06, 0.00011836, 1.72e-06, 'KMC']]
\hline
zR & \textbf{0.0} & \textbf{0.0} & \textbf{0.0} & \textbf{0.0} \\ \hline
\textb