Building AE using Pytorch

Let's start building a very simple autoencoder for the MNIST dataset using Pytorch. The MNIST dataset is eidely used for becnhmark dataset in machine learning and computer vision. It consists of a collection of 28x28 grayscale images of handwritten digits (0-9). The dataset is divided into a training set with 60.000 images and a test set with 10.000 images. It's often employed to evaluate the perfomance of various neural network architectures and algorithms for digit recognition tasks.

First of all, we will import libraries like torch, numpy, and matplotlib.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import time
import torch.nn.functional as F
import torchvision
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import SubsetRandomSampler
from torch.utils.data import sampler
import numpy as np
import matplotlib.colors as mcolors
from skimage.metrics import structural_similarity as ssim

Carregando e Preparando o Dataset

In [2]:
# Transformação: Converte para o tensor e normaliza
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Carregar MNIST
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,shuffle=True)

testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

Criando o Autoencoder usando Pytorch

In [3]:
class Autoencoder(nn.Module):
    def __init__(self, latent_dim=16):
        super(Autoencoder, self).__init__()

        #Encoder
        self.encoder = nn.Sequential(
            nn.Linear(28*28, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim)
        )

        #Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 28*28),
            nn.Sigmoid()    # Para normalizar saída entre 0 e 1
        )

    def forward(self, x):
        encoded  = self.encoder(x)
        decoded  = self.decoder(encoded)
        return decoded

Treinando Modelo

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Autoencoder(latent_dim=16).to(device)

criterion = nn.MSELoss() # Usamos erro quadrático médio
optimizer = optim.Adam(model.parameters(), lr=0.001)

Treinando o Autoencoder

In [8]:
num_epochs = 20

for epoch in range(num_epochs):
    running_loss = 0.0

    for data in trainloader:
        inputs, _ = data
        inputs = inputs.view(inputs.size(0), -1).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Época {epoch+1}/{num_epochs}, Loss: {running_loss/len(trainloader)}")

Época 1/20, Loss: 0.9253655924980067
Época 2/20, Loss: 0.9253647869456806
Época 3/20, Loss: 0.925366462230174
Época 4/20, Loss: 0.9253684303907952
Época 5/20, Loss: 0.9253702311754735
Época 6/20, Loss: 0.9253676271896119
Época 7/20, Loss: 0.9253701251834187
Época 8/20, Loss: 0.9253684921559494
Época 9/20, Loss: 0.9253708082221465
Época 10/20, Loss: 0.9253715102606491
Época 11/20, Loss: 0.9253697082050828
Época 12/20, Loss: 0.9253710921385141
Época 13/20, Loss: 0.9253731702309428
Época 14/20, Loss: 0.9253694587933229
Época 15/20, Loss: 0.9253674183191776
Época 16/20, Loss: 0.925366715009787
Época 17/20, Loss: 0.9253690126481087
Época 18/20, Loss: 0.9253699523426576
Época 19/20, Loss: 0.9253658905212305
Época 20/20, Loss: 0.9253667473538852


Avaliação do Modelo

In [9]:
model.eval()
test_images, _ = next(iter(testloader))
test_images = test_images.view(test_images.size(0), -1).to(device)
reconstructed = model(test_images)

#Converter para CPU para visualização
test_images = test_images.cpu().detach().numpy()
reconstructed = reconstructed.cpu().detach.numpy()

AttributeError: 'builtin_function_or_method' object has no attribute 'numpy'

Plotando a recosntrução

In [None]:
fig, axes = plt.subplot(2, 10, figsize=(10, 2))
for i in range(10):
    axes[0, i].imshow(test_images[i].reshape(28, 28), cmap='gray')
    axes[0, i].axis('off')
    axes[1, i].imshow(reconstructed[i].reshape(28, 28), cmap='gray')
    axes[1, i].axis('off')
plt.show()

Calculando MSE E SSIM

In [None]:
mse = np.mean((test_images - reconstructed) ** 2)
ssim_value = ssim(test_images[0].reshape(28, 28), reconstructed[0].reshape(28, 28))

print(f"MSE: {mse:.6f}")
print(f"SSIM: {ssim_value:.6f}")

Testando Diferentes Configurações do Espaço Latente

In [None]:
#Variando latent_dim para 8, 16, 32, 64 e comparar os resultados
for latent_dim in [8, 16, 32, 64]:
    model = Autoencoder(latent_dim=latent_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

print(f"Treinando para latent_dim={latent_dim}...")
for epoch in range(10):
    for data in trainloader:
        inputs,_ = data
        inputs = inputs.view(inputs.size(0), -1).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()