In [10]:
# Instala os pacotes necessários via kernel do Jupyter
# %pip install numpy seaborn scipy scikit-learn matplotlib

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.datasets import load_digits, load_iris, load_wine, load_breast_cancer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

# Load the digits dataset from scikit-learn as a (features, labels) pair.
digits_data, digits_labels = load_digits(return_X_y=True)
# Rows of the data matrix are samples and columns are features; the number of
# distinct labels is the number of classes.
(n_samples, n_features), n_digits = digits_data.shape, np.unique(digits_labels).size
print(f"- digits: {n_digits}; - samples: {n_samples}; - features: {n_features}")

# Baseline ZeroR classifier (always predicts the most frequent class) and Gaussian Naive Bayes.
zR = DummyClassifier(strategy='most_frequent')
gNB = GaussianNB()

# Z-score scaler used as the first step of each pipeline, so the data is
# standardised before it reaches the classifier.
scalar = StandardScaler()

pipeline_gNB = Pipeline([('transformer', scalar), ('estimator', gNB)])
pipeline_zR = Pipeline([('transformer', scalar), ('estimator', zR)])

# 10-fold stratified cross-validation repeated 3 times (30 scores per model),
# seeded with the value provided by the professor so the splits are reproducible.
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Per-fold accuracy of each pipeline under the repeated stratified cross-validation.
score_gNB = cross_val_score(pipeline_gNB, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
score_zR = cross_val_score(pipeline_zR, digits_data, digits_labels, scoring='accuracy', cv=RSKF)


def summarize_accuracy(name, scores, confidence=0.95):
    """Print and return the summary statistics of a set of per-fold accuracies.

    Parameters
    ----------
    name : str
        Label printed in front of every line (e.g. ``"gNB"``).
    scores : numpy.ndarray
        One accuracy value per cross-validation fold.
    confidence : float, default=0.95
        Confidence level of the normal-approximation interval for the mean.

    Returns
    -------
    tuple of float
        ``(mean, std, lower, upper)`` where ``std`` is the population standard
        deviation (``ddof=0``) and ``(lower, upper)`` is the confidence interval
        of the mean accuracy.
    """
    mean = scores.mean()
    std = scores.std()
    if std > 0:
        # Standard error of the mean = std / sqrt(number of folds).
        lower, upper = stats.norm.interval(confidence, loc=mean, scale=std / np.sqrt(len(scores)))
    else:
        # All folds scored the same: a zero scale is not a valid normal
        # distribution, so the interval collapses to the mean itself.
        lower = upper = mean

    print(f"{name} score:\n", scores)
    print(f"{name}: Mean Accuracy: {mean:0.2f} Standard Deviation: {std:0.2f}")
    print(f"{name}: Accuracy Confidence Interval ({confidence:.0%}): ({lower:0.2f}, {upper:0.2f})\n")
    return mean, std, lower, upper


# Report both classifiers through the same helper so their output format cannot drift apart.
mean_gNB, std_gNB, lower_gNB, upper_gNB = summarize_accuracy("gNB", score_gNB)
mean_zR, std_zR, lower_zR, upper_zR = summarize_accuracy("zR", score_zR)

# conf_mat = confusion_matrix(digits_labels, score_gNB)
# print(conf_mat)

# plt.matshow(conf_mat, cmap=plt.cm.Blues)
# for i in range(len(conf_mat)):
#     for j in range(len(conf_mat)):
#         plt.text(i, j, conf_mat[i][j], va="center", ha="center")

# plt.show()

- digits: 2; - samples: 569; - features: 30
gNB score:
 [0.96491228 0.9122807  0.94736842 0.92982456 0.94736842 0.87719298
 0.94736842 0.94736842 0.9122807  0.91071429 0.92982456 0.96491228
 0.98245614 0.92982456 0.96491228 0.94736842 0.92982456 0.87719298
 0.87719298 0.92857143 0.92982456 0.96491228 0.92982456 0.92982456
 0.92982456 0.9122807  0.92982456 0.96491228 0.96491228 0.92857143]
gNB: Mean Accuracy: 0.93 Standard Deviation: 0.03
gNB: Accuracy Confidence Interval (95%): (0.92, 0.94)

zR score:
 [0.61403509 0.61403509 0.63157895 0.63157895 0.63157895 0.63157895
 0.63157895 0.63157895 0.63157895 0.625      0.61403509 0.61403509
 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895
 0.63157895 0.625      0.61403509 0.61403509 0.63157895 0.63157895
 0.63157895 0.63157895 0.63157895 0.63157895 0.63157895 0.625     ]
zR: Mean Accuracy: 0.63 Standard Deviation: 0.01
zR: Accuracy Confidence Interval (95\%): (0.62, 0.63)



In [23]:
# ETAPA 2:
# A segunda etapa consiste no treino, validação e teste dos classificadores que precisam de ajuste de hiperparâmetros, isto é, os classificadores KMC, KNN e AD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.utils import check_X_y, check_array
from sklearn.utils.validation import check_is_fitted

class KMeansCentroidsClassifier(ClassifierMixin, BaseEstimator):
    """K-Means-Centroids (KMC) classifier.

    Training runs one KMeans clustering per class on the training examples of
    that class, producing up to ``k`` centroids per class; every centroid is
    tagged with the class it was generated from. Prediction assigns each sample
    the class of its nearest centroid (Euclidean distance).

    Parameters
    ----------
    k : int, default=1
        Number of KMeans clusters (centroids) requested for each class. If a
        class has fewer than ``k`` training samples, one cluster per sample is
        used for that class instead.
    nClasses : int, default=1
        Kept for backward compatibility. ``fit`` overwrites it with the number
        of distinct labels found in the training targets.

    Attributes
    ----------
    classes_ : numpy.ndarray
        Distinct class labels seen during ``fit``, sorted.
    centroids : list of numpy.ndarray
        One ``(n_clusters, n_features)`` array per class, in ``classes_`` order.
    centroid_matrix_ : numpy.ndarray
        All centroids stacked into a single ``(n_centroids, n_features)`` array.
    centroid_labels_ : numpy.ndarray
        Class label of each row of ``centroid_matrix_``.
    """

    def __init__(self, k=1, nClasses=1):
        self.k = k
        self.nClasses = nClasses
        self.centroids = []

    def fit(self, x_train, y_train):
        """Cluster each class separately and store the labelled centroids.

        Parameters
        ----------
        x_train : array-like of shape (n_samples, n_features)
            Training samples.
        y_train : array-like of shape (n_samples,)
            Class label of each training sample.

        Returns
        -------
        self : KMeansCentroidsClassifier
            The fitted estimator.
        """
        x_train, y_train = check_X_y(x_train, y_train)

        # Use the labels actually present instead of assuming they are 0..n-1.
        self.classes_ = np.unique(y_train)
        self.nClasses = len(self.classes_)

        # Start from an empty list on every call so that refitting the same
        # instance does not accumulate centroids from previous fits.
        self.centroids = []
        for label in self.classes_:
            members = x_train[y_train == label]
            # KMeans cannot build more clusters than there are samples.
            n_clusters = min(self.k, members.shape[0])
            km = KMeans(n_clusters=n_clusters, random_state=36851234)
            km.fit(members)
            self.centroids.append(km.cluster_centers_)

        # Flat views used by predict: one row per centroid plus its class label.
        self.centroid_matrix_ = np.vstack(self.centroids)
        self.centroid_labels_ = np.repeat(
            self.classes_, [len(class_centroids) for class_centroids in self.centroids]
        )
        return self

    def predict(self, x_test):
        """Return, for each sample, the class of its nearest centroid.

        Parameters
        ----------
        x_test : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted class labels. When two centroids are equally close, the
            one that comes first in ``centroid_matrix_`` wins.
        """
        check_is_fitted(self, "centroid_labels_")
        x_test = check_array(x_test)

        # Pairwise Euclidean distances, shape (n_samples, n_centroids).
        differences = x_test[:, np.newaxis, :] - self.centroid_matrix_[np.newaxis, :, :]
        distances = np.linalg.norm(differences, axis=2)
        nearest = distances.argmin(axis=1)
        return self.centroid_labels_[nearest]


# Candidate models for stage 2. Each pipeline gets its own StandardScaler so that
# fitting one pipeline directly can never overwrite the scaling statistics of another.
dKNN = KNeighborsClassifier(weights='distance')
pipeline_kNN = Pipeline([('transformer', StandardScaler()), ('estimator', dKNN)])
KMC = KMeansCentroidsClassifier()
pipeline_KMC = Pipeline([('transformer', StandardScaler()), ('estimator', KMC)])
# NOTE(review): the decision tree is left unseeded as in the original; pass
# random_state here as well if fully reproducible tree scores are required.
AD = DecisionTreeClassifier()
pipeline_AD = Pipeline([('transformer', StandardScaler()), ('estimator', AD)])

# Training, validation and testing are done with 3 rounds of nested cycles:
# the inner validation cycle has 4 folds and the outer test cycle has 10 folds.
# The outer splitter uses the same seed as stage 1 so the test folds are reproducible.
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# The inner-cycle grid search considers these hyperparameter values per technique:
# KMC: [k = 1, 3, 5, 7]
# KNN: [n_neighbors = 1, 3, 5, 7]
# AD: [max_depth = None, 3, 5, 10]
# The 'estimator__' prefix routes each parameter to the 'estimator' step of the pipeline.
grade_KMC = {'estimator__k': [1, 3, 5, 7]}
grade_kNN = {'estimator__n_neighbors': [1, 3, 5, 7]}
grade_AD = {'estimator__max_depth': [None, 3, 5, 10]}
grid_search_KMC = GridSearchCV(estimator=pipeline_KMC, param_grid=grade_KMC, scoring='accuracy', cv=4)
grid_search_kNN = GridSearchCV(estimator=pipeline_kNN, param_grid=grade_kNN, scoring='accuracy', cv=4)
grid_search_AD = GridSearchCV(estimator=pipeline_AD, param_grid=grade_AD, scoring='accuracy', cv=4)

# The training data of each test round must be standardised (z-score), and the
# statistics obtained on the training data must be reused to standardise the
# matching test set. No extra code is needed for that: the scaler is a pipeline
# step, and both cross_val_score and GridSearchCV fit a fresh clone of the whole
# pipeline on the training part of every split, then apply it to the held-out part.

# Outer cycle: each grid search is itself cross-validated, giving one test accuracy per outer fold.
scores_kNN = cross_val_score(grid_search_kNN, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
scores_AD = cross_val_score(grid_search_AD, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
scores_KMC = cross_val_score(grid_search_KMC, digits_data, digits_labels, scoring='accuracy', cv=RSKF)

# Os resultados de cada classificador devem ser apresentados numa tabela contendo a média das acurácias obtidas em cada fold, o desvio padrão e o intervalo de confiança de 95% dos resultados,

# e também através do boxplot dos resultados de cada classificador em cada fold.
# Os gráficos boxplot requeridos no treino e no teste devem ser gerados usando a função específica do pacote seaborn.
def example1():
    """Draw a vertical seaborn boxplot of a small hard-coded sample and display it."""
    sample_values = [1, 2, 3, 4, 5, 6, 12]
    # seaborn takes plain Python lists here as well as numpy arrays.
    sns.boxplot(y=sample_values)
    plt.show()


def example2():
    """Load seaborn's 'iris' example dataset and display one boxplot per column."""
    # load_dataset returns a DataFrame; per the original note this dataset has 150 examples.
    iris_frame = sns.load_dataset('iris')
    # .loc[:, :] selects every row and every column of the frame.
    all_rows_all_columns = iris_frame.loc[:, :]
    # One box is drawn for each group (column) of the selection.
    sns.boxplot(data=all_rows_all_columns)
    plt.show()


# example1()
# example2()

# Será necessário apresentar também a tabela pareada dos resultados (p-values) dos testes de hipótese entre os pares de métodos.
# Na matriz triangular superior devem ser apresentados os resultados do teste t pareado (amostras dependentes).
# Na matriz triangular inferior devem ser apresentados os resultados do teste não paramétrico de Wilcoxon.
# Os valores das células da tabela que rejeitarem a hipótese nula a um nível de significância de 5% (95% de confiança) devem ser escritos em negrito.



SyntaxError: invalid syntax (3986472868.py, line 44)