In [1]:
'''
  In this cell we pre-process the data using the output of our previous model
  and build the dataset on which our neural network will be trained
  (originally described as a csv file; in this notebook it is kept in memory as X_train / X_eval).
  We use the saved model selected by MODEL_NAME. The original note referred to the model with the
  best performance on the training data, the 97%-confidence model; check that MODEL_NAME matches it.

  The only change here is that, instead of returning only the smallest Mahalanobis distance,
  we compute and store the distances to the K_NEAREST_GAUSSIANS gaussians closest to each point
  (the original note used 16).
  Each resulting tensor contains, in this order:
    - the point already transformed by the model's PCA (216 dims in the original note; the actual size is set by the model)
    - (K_NEAREST_GAUSSIANS dims) Mahalanobis distances
    - (K_NEAREST_GAUSSIANS dims) Euclidean distances
    - (K_NEAREST_GAUSSIANS * N_CLASSES dims) one-hot encoding of the class each of those gaussians represents
'''

import numpy as np
import pandas as pd
from tqdm import tqdm
from search_optimization import find_closest_gaussians_and_distances
from distance import mahalanobis_distance 
from tensorflow.keras.utils import to_categorical

# Number of target classes; used as the width of every one-hot encoding below.
N_CLASSES = 10
# How many nearest gaussians are queried per point (written into meta_params.k_near_centroids).
K_NEAREST_GAUSSIANS = 5
# Base name (without the .npy extension) of the saved model loaded from ../models_storage/.
MODEL_NAME = 'mahalanobis_class-specific_60.0'
# Split index: rows [0, N_SAMPLES) feed X_train, the remaining rows feed X_eval.
N_SAMPLES = 40000

def load_data(file_path):
    """Read a CSV file and split it into a feature matrix and a label vector.

    The first column of the file is taken as the label; every remaining
    column is treated as a feature.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        A ``(train_data, labels)`` tuple of NumPy arrays: ``train_data`` holds
        every column but the first, ``labels`` holds the first column.
    """
    print("loading data")
    # Convert the frame once and slice the same array for both outputs.
    matrix = pd.read_csv(file_path).to_numpy()
    features, targets = matrix[:, 1:], matrix[:, 0]
    print(f"train shape: {features.shape}")
    print(f"labels shape: {targets.shape}")

    return features, targets

def calc_mahalanobis_distances(model, point, indices):
    """Mahalanobis distance from ``point`` to each selected gaussian of ``model``.

    Args:
        model: Object exposing ``all_means`` and ``all_precisions``, both
            indexable by the entries of ``indices``.
        point: The point to measure from; forwarded as ``x`` to
            ``mahalanobis_distance``.
        indices: Iterable of gaussian indices.

    Returns:
        A list with one distance per entry of ``indices``, in the same order.
    """
    return [
        mahalanobis_distance(
            x=point,
            mean=model.all_means[gaussian_idx],
            precision=model.all_precisions[gaussian_idx],
        )
        for gaussian_idx in indices
    ]

def build_feature_tensors(points, model, hnsw, metaparams):
    """Build one flat feature vector per PCA-transformed point.

    For every point the nearest gaussians are looked up through
    ``find_closest_gaussians_and_distances``; the resulting vector is the
    concatenation, in this order, of:
      1. the point itself,
      2. its Mahalanobis distance to each returned gaussian,
      3. the distances returned by the nearest-gaussian search,
      4. the flattened one-hot encoding (``N_CLASSES`` wide per gaussian) of
         the class each gaussian belongs to, computed as
         ``index // model.k_segments_per_class``.

    Args:
        points: 2-D array of points already transformed by ``model.pca``.
        model: Loaded model; must expose ``all_means``, ``all_precisions``
            and ``k_segments_per_class``.
        hnsw: Search index passed as ``hnsw`` to the nearest-gaussian search.
        metaparams: Meta-parameters passed as ``metaparams`` to the search.

    Returns:
        A list with one 1-D NumPy array per input point.
    """
    features = []
    for p in tqdm(points, desc="", ncols=80, colour="blue"):
        # The search returns its distances wrapped in a one-element sequence.
        indices, [distances] = find_closest_gaussians_and_distances(hnsw=hnsw, point=p, metaparams=metaparams)
        mahalanobis_distances = calc_mahalanobis_distances(model=model, point=p, indices=indices)
        classes_int = (indices // model.k_segments_per_class).astype(int)
        classes_categ = to_categorical(classes_int, N_CLASSES).reshape(-1)
        features.append(np.concatenate([p, mahalanobis_distances, distances, classes_categ]))
    return features


# Relative path, consistent with the '../data/...' and '../models_storage/...'
# paths used elsewhere in this notebook (the previous absolute 'C:/src/...'
# path only resolved on one machine).
train_data_file_path = '../data/digit-recognizer/train.csv'
train_data, labels = load_data(train_data_file_path)
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, which
# can execute code. Only load model files that come from a trusted source.
loaded_arr = np.load(f'../models_storage/{MODEL_NAME}.npy', allow_pickle=True)

# The fitted model object is the first element of the stored array.
kgmm = loaded_arr[0]

# The first N_SAMPLES rows feed X_train (the historical name `validation_data`
# is kept); the remaining rows are held out for evaluation.
labels_v = labels[:N_SAMPLES]
validation_data = train_data[:N_SAMPLES, :]
eval_data = train_data[N_SAMPLES:, :]
eval_labels = labels[N_SAMPLES:]

meta_params = kgmm.meta_params
search_index = kgmm.hnsw
transformed_points = kgmm.pca.transform(validation_data)
transformed_eval = kgmm.pca.transform(eval_data)
# Overrides the value stored on the loaded model's meta-parameters object.
meta_params.k_near_centroids = K_NEAREST_GAUSSIANS

print(transformed_points.shape)
X_train = build_feature_tensors(transformed_points, model=kgmm, hnsw=search_index, metaparams=meta_params)
X_eval = build_feature_tensors(transformed_eval, model=kgmm, hnsw=search_index, metaparams=meta_params)
  

KeyboardInterrupt: 

In [143]:
# Stack the per-sample feature vectors into arrays (one row per sample).
X_train = np.array(X_train)
X_eval = np.array(X_eval)

# One-hot encode the integer labels of both splits.
y_train = np.array(to_categorical(labels_v, N_CLASSES))
y_eval = np.array(to_categorical(eval_labels, N_CLASSES))

print(X_train.shape)

(40000, 137)


In [145]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply those same
# statistics to the evaluation features. Refitting on X_eval (the previous
# fit_transform call) would scale it with different means/variances than the
# data the network is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_eval = scaler.transform(X_eval)

NUM_CLASSES = 10  # Number of classes to predict

# Build the model. The input width is declared with an explicit Input layer
# rather than `input_shape=` on the first Dense layer, which Keras warns about
# for Sequential models.
n_features = X_train.shape[1]
model = Sequential([
    Input(shape=(n_features,)),
    Dense(n_features, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, activation='tanh'),  # Additional layer for more depth
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),  # Additional layer for more depth
    BatchNormalization(),
    Dropout(0.2),

    Dense(NUM_CLASSES, activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model; Keras holds out the last 20% of the training rows for validation.
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Evaluate the model on the held-out split
loss, accuracy = model.evaluate(X_eval, y_eval)

print("Precisão: ", accuracy)
# Bare string kept as the cell's last expression, so the notebook echoes it as the cell output.
''' Rede resultante: '''
# model.summary()

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7539 - loss: 0.8206 - val_accuracy: 0.9205 - val_loss: 0.2830
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9038 - loss: 0.3421 - val_accuracy: 0.9258 - val_loss: 0.2548
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9130 - loss: 0.3056 - val_accuracy: 0.9285 - val_loss: 0.2393
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9179 - loss: 0.2730 - val_accuracy: 0.9311 - val_loss: 0.2197
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9232 - loss: 0.2509 - val_accuracy: 0.9325 - val_loss: 0.2150
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9256 - loss: 0.2402 - val_accuracy: 0.9346 - val_loss: 0.2074
Epoch 7/10
[1m500/500[0m [32m━━━━━━━

' Rede resultante: '

In [None]:
# Build the features for the submission set.
file_path = '../data/digit-recognizer/test.csv'
test_data = pd.read_csv(file_path)
data_points = test_data.to_numpy()
points_pca = kgmm.pca.transform(data_points)
print(points_pca.shape)
X_test = []

for p in tqdm(points_pca, desc="", ncols=80, colour="blue"):
    # The test set has no labels, so nothing is looked up in labels_v here.
    indices, [distances] = find_closest_gaussians_and_distances(hnsw=search_index, point=p, metaparams=meta_params)
    mahalanobis_distances = calc_mahalanobis_distances(model=kgmm, point=p, indices=indices)
    classes_int = (indices // kgmm.k_segments_per_class).astype(int)
    classes_categ = to_categorical(classes_int, N_CLASSES).reshape(-1)
    # Same layout as the training tensors: point, Mahalanobis distances,
    # search distances, one-hot classes. Leaving `distances` out would give a
    # feature width different from the one the network was trained on.
    tensor = np.concatenate([p, mahalanobis_distances, distances, classes_categ])
    X_test.append(tensor)
  





(28000, 213)


100%|[34m███████████████████████████████████[0m| 28000/28000 [00:22<00:00, 1238.54it/s][0m


In [78]:
# Scale with the statistics learned from the training features. Refitting the
# scaler on the test set would shift the inputs relative to what the network
# saw during training. The result goes to a new name so that re-running this
# cell does not scale the data twice.
X_test_scaled = scaler.transform(np.array(X_test))
predictions = model.predict(X_test_scaled)
# Predicted class = index of the highest softmax probability.
y_int = np.argmax(predictions, axis=1)

# Submission table with 1-based ImageId, built directly so test_data is left untouched.
answer = pd.DataFrame({
    "ImageId": range(1, len(y_int) + 1),
    "Label": y_int,
})

# Save to CSV
output_file = "mlp_end.csv"
answer.to_csv(output_file, index=False)

[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
