In [9]:
'''
Comparar o desempenho do meu modelo com outro usando apenas N 
componentes por classe e um componente geral para todas as classes.
'''
import numpy as np
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture
from sklearn.decomposition import PCA

def mahalanobis_distance(x, mean, precision):
    """
    Mahalanobis distance from a point to a Gaussian, given its precision matrix.

    Evaluates sqrt((x - mean)^T @ precision @ (x - mean)).

    Args:
    - x: Point (1D array).
    - mean: Mean of the Gaussian (1D array).
    - precision: Precision matrix (2D array); this is used directly in the
      quadratic form, so it must already be the inverse of the covariance.

    Returns:
    - Mahalanobis distance (float).
    """
    offset = x - mean
    # Quadratic form, evaluated left to right: (offset^T P) offset.
    quadratic_form = offset.T.dot(precision).dot(offset)
    return np.sqrt(quadratic_form)

def reduce_dim_PCA(train_data, threshold):
    """
    Fit a PCA on the given data and return both the model and the projection.

    Args:
    - train_data: Data matrix the PCA is fitted on and then projected.
    - threshold: Value forwarded to PCA as `n_components`.
      NOTE(review): callers in this file pass a fraction such as 0.90,
      presumably meaning "share of variance to retain" -- confirm against
      the scikit-learn PCA documentation.

    Returns:
    - (pca, transformed_data): the fitted PCA object and the projected data.
    """
    # random_state is pinned to 2 so repeated runs give the same fit.
    reducer = PCA(n_components=threshold, random_state=2)
    projected = reducer.fit_transform(train_data)
    return reducer, projected

def load_data(file_path):
    """
    Read a CSV file and split it into a feature matrix and a label vector.

    The first column is returned as the labels; every remaining column is
    returned as the features. The shapes of both arrays are printed.

    Args:
    - file_path: Location of the CSV, passed straight to `pd.read_csv`.

    Returns:
    - (train_data, labels): 2D array with all columns except the first,
      and 1D array holding the first column.
    """
    print("loading data")
    all_data = pd.read_csv(file_path)
    # Convert the frame to NumPy once and slice it, instead of materialising
    # the whole table twice (once for the features, once for the labels).
    values = all_data.to_numpy()
    train_data = values[:, 1:]
    labels = values[:, 0]
    print(f"train shape: {train_data.shape}")
    print(f"labels shape: {labels.shape}")

    return train_data, labels

# --- Experiment configuration -------------------------------------------
n_componentes = 10    # number of mixture components given to the GMM below
# NOTE(review): `threshold` is not read anywhere in this cell (the PCA call
# uses `pca_variance`). It is kept unchanged in case other cells rely on it.
threshold = 0.98
pca_variance = 0.90   # value handed to reduce_dim_PCA (previously a bare literal)

# --- Load the data and reduce its dimensionality ------------------------
train_data, labels = load_data('../data/digit-recognizer/train.csv')
pca, transformed_data = reduce_dim_PCA(train_data, pca_variance)
print(pca.n_components_)

# --- Fit the mixture model on the PCA-projected data --------------------
gmm = BayesianGaussianMixture(
    n_components=n_componentes,  # use the declared constant instead of a duplicated 10
    max_iter=100,
    covariance_type='full',
    weight_concentration_prior_type='dirichlet_process',
    tol=1e-4,
    random_state=2,
    verbose=2, 
    init_params='k-means++',
    n_init=5,
    # NOTE(review): a fresh estimator is built every time this cell runs, so
    # warm_start presumably only matters if `gmm.fit` is called again on this
    # same object elsewhere -- confirm whether it is still needed.
    warm_start=True,
)

gmm.fit(transformed_data)

# Per-component parameters, exposed as module-level names for later cells.
means = gmm.means_
precisions = gmm.precisions_
weights = gmm.weights_



loading data
train shape: (42000, 784)
labels shape: (42000,)
87
Initialization 0
  Iteration 10	 time lapse 9.66098s	 ll change 25884.52550
  Iteration 20	 time lapse 9.49646s	 ll change 5826.71384
  Iteration 30	 time lapse 9.42574s	 ll change 2914.21152
  Iteration 40	 time lapse 9.44223s	 ll change 1880.29980
  Iteration 50	 time lapse 9.15935s	 ll change 538.91391
  Iteration 60	 time lapse 9.42686s	 ll change 162.53658
  Iteration 70	 time lapse 9.72263s	 ll change 64.35270
  Iteration 80	 time lapse 9.95019s	 ll change 125.52381
  Iteration 90	 time lapse 9.66600s	 ll change 117.77738
  Iteration 100	 time lapse 9.78365s	 ll change 199.35508
Initialization did not converge. time lapse 95.73409s	 lower bound -18205879.37713.
Initialization 1
  Iteration 10	 time lapse 9.69538s	 ll change 33330.23404
  Iteration 20	 time lapse 9.41597s	 ll change 2010.26793
  Iteration 30	 time lapse 9.34249s	 ll change 738.52776
  Iteration 40	 time lapse 8.92955s	 ll change 4286.91990
  Iteratio



TypeError: 'int' object is not iterable

In [19]:
def classify(point):
    """
    Return the index of the mixture component closest to `point`.

    Closeness is the Mahalanobis distance computed from the module-level
    `means` and `precisions` arrays. Ties go to the lowest index, because a
    later component only wins when its distance is strictly smaller.

    Args:
    - point: Point to classify (1D array, same space as `means`).

    Returns:
    - Index of the nearest component, or -1 if no component produced a
      distance below infinity.
    """
    nearest_idx = -1
    nearest_dist = float('inf')

    for component_idx, component_mean in enumerate(means):
        candidate = mahalanobis_distance(
            x=point,
            mean=component_mean,
            precision=precisions[component_idx],
        )

        # Skip anything that does not strictly improve on the best so far.
        if not (candidate < nearest_dist):
            continue

        nearest_dist = candidate
        nearest_idx = component_idx

    return nearest_idx

# Project the first `num_samples` rows of the training matrix with the fitted
# PCA and keep their labels alongside.
# NOTE(review): these rows are a slice of the same data the mixture was fitted
# on, so the figures printed below are not a held-out evaluation.
num_samples = 10000
validation_data = pca.transform(train_data[:num_samples,:])
labels_v = labels[:num_samples]

pontos_representados = 0

# One row per sample: [index of nearest component, true label].
classifications_pairs = np.array([
    [classify(point), label]
    for point, label in zip(validation_data, labels_v)
])

# For every digit class, report which component captured most of its samples
# and the fraction of the class that landed in that component.
for class_idx in range(10):
    belongs_to_class = classifications_pairs[:, 1] == class_idx
    class_pairs = classifications_pairs[belongs_to_class]
    coluna = class_pairs[:, 0]
    valor, contagem = np.unique(coluna, return_counts=True)
    top_position = np.argmax(contagem)
    mais_frequente = valor[top_position]
    print(f'mais frequente classe {class_idx}: {mais_frequente}')
    print(contagem[top_position] / len(coluna))
    
        

mais frequente classe 0: 8
0.9495459132189707
mais frequente classe 1: 9
0.382648401826484
mais frequente classe 2: 4
0.7301435406698564
mais frequente classe 3: 2
0.5242814667988107
mais frequente classe 4: 3
0.6008273009307136
mais frequente classe 5: 7
0.38741721854304634
mais frequente classe 6: 1
0.5752741774675972
mais frequente classe 7: 5
0.4273339749759384
mais frequente classe 8: 7
0.4168421052631579
mais frequente classe 9: 3
0.47839195979899496
