### Autoencoder intuition

In [86]:
from examples.empenhos_df import EMPENHOS
    
ds_train = EMPENHOS(
    train=True, testing_mode=True
)  # training dataset
ds_val = EMPENHOS(
    train=False, testing_mode=True
)  # evaluation dataset

In [87]:
from ptsdae.sdae import StackedDenoisingAutoEncoder


# dimensão de input: 387
# dimensão latente: 10
autoencoder = StackedDenoisingAutoEncoder(
        [387, 1000, 2000, 2000, 10], final_activation=None
)

In [24]:
autoencoder.encoder

Sequential(
  (0): Sequential(
    (linear): Linear(in_features=387, out_features=1000, bias=True)
    (activation): ReLU()
  )
  (1): Sequential(
    (linear): Linear(in_features=1000, out_features=2000, bias=True)
    (activation): ReLU()
  )
  (2): Sequential(
    (linear): Linear(in_features=2000, out_features=2000, bias=True)
    (activation): ReLU()
  )
  (3): Sequential(
    (linear): Linear(in_features=2000, out_features=10, bias=True)
  )
)

In [10]:
autoencoder.decoder

Sequential(
  (0): Sequential(
    (linear): Linear(in_features=10, out_features=2000, bias=True)
    (activation): ReLU()
  )
  (1): Sequential(
    (linear): Linear(in_features=2000, out_features=2000, bias=True)
    (activation): ReLU()
  )
  (2): Sequential(
    (linear): Linear(in_features=2000, out_features=1000, bias=True)
    (activation): ReLU()
  )
  (3): Sequential(
    (linear): Linear(in_features=1000, out_features=387, bias=True)
  )
)

In [None]:
for param in autoencoder.encoder.parameters():
    print(param.shape) # temos 8 tensors para um encoder que possui 4 camadas: [387, 1000, 2000, 2000, 10]
    
# formato:
# Weight shape: [out_features, in_features]
# Bias shape: [out_features]

torch.Size([1000, 387])
torch.Size([1000])
torch.Size([2000, 1000])
torch.Size([2000])
torch.Size([2000, 2000])
torch.Size([2000])
torch.Size([10, 2000])
torch.Size([10])


In [None]:
#next(autoencoder.parameters())[0] 

# weights e bias inicializados da seguinte forma:

# nn.init.xavier_uniform_(weight, gain) -> Fills the weight tensor with values sampled from a uniform distribution (gain é um fator escalar: sqrt[2])
# nn.init.constant_(bias, 0) -> Seta todos os bias com valor 0

In [88]:
from torch.utils.data import DataLoader, TensorDataset

dataloader = DataLoader(
        ds_train,
        batch_size=10,
        shuffle=True
    )

validation_loader = DataLoader(
        ds_val,
        batch_size=10,
        shuffle=False,
    )

### Função de Predict (SAE)

In [89]:
# PREDICT FUNCTION
import torch
import tqdm

def predict(
    dataset: torch.utils.data.Dataset,
    model: torch.nn.Module,
    batch_size: int,
    cuda: bool = True,
    silent: bool = False,
    encode: bool = True,
) -> torch.Tensor:

    dataloader = DataLoader(
        dataset, batch_size=batch_size, pin_memory=False, shuffle=False
    )
    data_iterator = tqdm(dataloader, leave=False, unit="batch", disable=silent)
    features = []
    
    
    if isinstance(model, torch.nn.Module): # se existir model, True
        model.eval() # porque isso?
        
        
        
    for batch in data_iterator:
        if isinstance(batch, tuple) or isinstance(batch, list) and len(batch) in [1, 2]:
            batch = batch[0]

        
        batch = batch.squeeze(1).view(batch.size(0), -1)
        if encode:
            output = model.encode(batch)
        else:
            output = model(batch)
        features.append(
            output.detach().cpu()
        )  # move to the CPU to prevent out of memory on the GPU
    return torch.cat(features)

### Função de Train (SAE)

In [98]:
from typing import Any, Callable, Optional
import torch
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader



def train(
    dataset: torch.utils.data.Dataset,
    autoencoder: torch.nn.Module,
    epochs: int,
    batch_size: int,
    optimizer: torch.optim.Optimizer,
    scheduler: Any = None,
    validation: Optional[torch.utils.data.Dataset] = None,
    corruption: Optional[float] = None,
    cuda: bool = True,
    sampler: Optional[torch.utils.data.sampler.Sampler] = None,
    silent: bool = False,
    update_freq: Optional[int] = 1,
    update_callback: Optional[Callable[[float, float], None]] = None,
    num_workers: Optional[int] = None,
    epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None,
) -> None:
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=False,
        sampler=sampler,
        shuffle=True if sampler is None else False,
        num_workers=num_workers if num_workers is not None else 0,
    )
    if validation is not None:
        validation_loader = DataLoader(
            validation,
            batch_size=batch_size,
            shuffle=False,
        )
    else:
        validation_loader = None
    loss_function = nn.MSELoss()
    validation_loss_value = -1
    loss_value = 0
    
    
    for epoch in range(epochs):
        if scheduler is not None:
            scheduler.step()
        data_iterator = tqdm(
            dataloader,
            leave=True,
            unit="batch",
            postfix={"epo": epoch, "lss": "%.6f" % 0.0, "vls": "%.6f" % -1,},
            disable=silent,
        )
        for index, batch in enumerate(data_iterator):
            autoencoder.train()
            if (
                isinstance(batch, tuple)
                or isinstance(batch, list)
                and len(batch) in [1, 2]
            ):
                batch = batch[0] # retornar sem as labels (se existirem)
                
            # run the batch through the autoencoder and obtain the output
            if corruption is not None:
                output = autoencoder(F.dropout(batch, corruption))
            else:
                output = autoencoder(batch)
                
            # Somente o primeiro batch
            if index == 0 and epoch==0:
                print(f"\nOutput of AE (shape: {output.shape})") # o output do autoencoder deve ter as mesmas dimensões que a de entrada
            
            
            loss = loss_function(output, batch)

            loss_value = float(loss.item())
            optimizer.zero_grad() # clears (resets) all the gradients of the model's parameters that were accumulated in the previous backward pass. 
            loss.backward()
            optimizer.step()
            
            data_iterator.set_postfix( # adicionando os stats de treinamento para visualização em tempo real
                epo=epoch, lss="%.6f" % loss_value, vls="%.6f" % validation_loss_value,
            )
        
        
        # AVALIAÇÃO DO MODELO
        autoencoder.eval()
        val_losses = []
        with torch.no_grad():
            for val_batch in validation_loader:
                if isinstance(val_batch, (tuple, list)) and len(val_batch) in [1, 2]:
                    val_batch = val_batch[0]
                val_output = autoencoder(val_batch)
                val_loss = loss_function(val_output, val_batch)
                val_losses.append(val_loss.item())
        mean_val_loss = sum(val_losses) / len(val_losses)
        print(f"Validation loss: {mean_val_loss:.6f}")
                


### Função de pre-training (SAE)

Nessa etapa, fazemos uma iteração para cada camada do SAE e treinamos o modelo a codificar e decodificar em cada camada. Em seguida, os parâmetros são copiados para o SAE original.

In [92]:
import torch.nn as nn
import torch
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset
from ptsdae.dae import DenoisingAutoencoder
from ptsdae.sdae import StackedDenoisingAutoEncoder


# PRETRAIN FUNC

corruption = 0.2
optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9) #  it means the current update is made of 90% of the previous update (momentum) and 10% of the new gradient
scheduler=lambda x: StepLR(x, 100, gamma=0.1)
current_dataset = ds_train
current_validation = ds_val
number_of_subautoencoders = len(autoencoder.dimensions) - 1

for index in range(number_of_subautoencoders): # 4 subautoencoders
    
    encoder, decoder = autoencoder.get_stack(index)
    # index = 0: [encoder from input_dim → 1000] e [decoder from 1000 → input_dim]
    # index = 1: [encoder from 1000 → 2000] e [decoder from 2000 → 1000]
    
    # autoencoder.dimensions = [input_dim, 1000, 2000, 2000, num_clusters]
    # index = 0: embedding_dimension: [input_dim] e hidden_dimension: [1000]
    embedding_dimension = autoencoder.dimensions[index]
    hidden_dimension = autoencoder.dimensions[index + 1]
    
    
    # manual override to prevent corruption for the last subautoencoder
    if index == (number_of_subautoencoders - 1):
        corruption = None
        
        
    # initialise the subautoencoder
    # Representa uma camada do SAE
    sub_autoencoder = DenoisingAutoencoder(
        embedding_dimension=embedding_dimension,
        hidden_dimension=hidden_dimension,
        activation=torch.nn.ReLU() if index != (number_of_subautoencoders - 1) else None,
        corruption=nn.Dropout(corruption) if corruption is not None else None,
    )
    
    # Corruption serve para o modelo aprender a reconstruir as features e não memorizar inputs
    # assim, durante o treinamento, o modelo reconstruindo os dados originais e não corrompidos.

    if index == 0:
        print(f"index: {index}")
        print(embedding_dimension)
        print(hidden_dimension)
        print(sub_autoencoder)
        print(sub_autoencoder.encoder_weight.shape)
        print(sub_autoencoder.decoder_weight.shape)
    
    ae_optimizer = optimizer(sub_autoencoder)
    ae_scheduler = scheduler(ae_optimizer) if scheduler is not None else scheduler
    
    

index: 0
387
1000
DenoisingAutoencoder(
  (activation): ReLU()
  (corruption): Dropout(p=0.2, inplace=False)
)
torch.Size([1000, 387])
torch.Size([387, 1000])


#### Pretrain completa:
treinamento de 20 épocas para cada camada do autoencoder (sub_autoencoder)

In [None]:
import torch.nn as nn
import torch
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset
from ptsdae.dae import DenoisingAutoencoder


# PRETRAIN FUNC

corruption = 0.2
optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9) #  it means the current update is made of 90% of the previous update (momentum) and 10% of the new gradient
scheduler=lambda x: StepLR(x, 100, gamma=0.1)
current_dataset = ds_train
current_validation = ds_val
number_of_subautoencoders = len(autoencoder.dimensions) - 1

for index in range(number_of_subautoencoders): # 4 subautoencoders
    
    encoder, decoder = autoencoder.get_stack(index)
    # index = 0: [encoder from input_dim → 1000] e [decoder from 1000 → input_dim]
    # index = 1: [encoder from 1000 → 2000] e [decoder from 2000 → 1000]
    
    # autoencoder.dimensions = [input_dim, 1000, 2000, 2000, num_clusters]
    # index = 0: embedding_dimension: [input_dim] e hidden_dimension: [1000]
    embedding_dimension = autoencoder.dimensions[index]
    hidden_dimension = autoencoder.dimensions[index + 1]
    
    
    # manual override to prevent corruption for the last subautoencoder
    if index == (number_of_subautoencoders - 1):
        corruption = None
        
        
    # initialise the subautoencoder
    # retorna o objeto sub_autoencoder
    # [Input] → 1000 → 2000 → 2000 → [num_clusters] --> SÃO 4 TRANSIÇÕES
    # sub-autoencoder = one encoder layer + one decoder layer trained to reconstruct its input
    sub_autoencoder = DenoisingAutoencoder( # 
        embedding_dimension=embedding_dimension,
        hidden_dimension=hidden_dimension,
        activation=torch.nn.ReLU() if index != (number_of_subautoencoders - 1) else None,
        corruption=nn.Dropout(corruption) if corruption is not None else None,
    )

    ae_optimizer = optimizer(sub_autoencoder)
    ae_scheduler = scheduler(ae_optimizer) if scheduler is not None else scheduler
    
    # TREINAMENTO POR CAMADA do SAE
    # no index = 0, modelo irá aprender a codificar: [n_samples, 387] -> [n_samples, 1000]
    # e a decodificar: [n_samples, 1000] -> [n_samples, 387]
    train(
        current_dataset,
        sub_autoencoder,
        epochs=20,
        batch_size=10,
        optimizer=ae_optimizer,
        validation=current_validation,
        corruption=None,  # already have dropout in the DAE !! (mas porque?)
        scheduler=ae_scheduler,
        cuda=False,
        sampler=None,
        silent=True,
    )
    # copiar os pesos adquiridos no treinamento acima para o encoder e decoder do SAE instanciado anteriormente
    sub_autoencoder.copy_weights(encoder, decoder)
    
    print(f"Treinamento feito para camada {index}")
    
    # current_dataset e current_validation são substituídos completamente pelo que o sub_autoencoder[index] prediziu
    # note que: ele está fazendo somente o encode iterativamente.
    # note também que: dentro de predict(), muda-se o modelo para modo .eval()
    if index != (number_of_subautoencoders - 1):
        current_dataset = TensorDataset(
            predict(
                current_dataset,
                sub_autoencoder,
                batch_size=10,
                cuda=False,
                silent=False,
            )
        )
        if current_validation is not None:
            current_validation = TensorDataset(
                predict(
                    current_validation,
                    sub_autoencoder,
                    batch_size=10,
                    cuda=False,
                    silent=False,
                )
            )
    else: # Não entendi porque é designado None, no final de contas
        current_dataset = None
        current_validation = None

index: 0
387
1000
DenoisingAutoencoder(
  (activation): ReLU()
  (corruption): Dropout(p=0.2, inplace=False)
)

Output of AE (shape: torch.Size([10, 387]))
Validation loss: 0.024787
Validation loss: 0.018079




Validation loss: 0.013258
Validation loss: 0.010336
Validation loss: 0.008591
Validation loss: 0.007457
Validation loss: 0.006591
Validation loss: 0.006010
Validation loss: 0.005489
Validation loss: 0.005096
Validation loss: 0.004747
Validation loss: 0.004451
Validation loss: 0.004246
Validation loss: 0.004014
Validation loss: 0.003837
Validation loss: 0.003668
Validation loss: 0.003526
Validation loss: 0.003396
Validation loss: 0.003269
Validation loss: 0.003157
Treinamento feito para camada 0


                                        


Output of AE (shape: torch.Size([10, 1000]))




Validation loss: 0.007170
Validation loss: 0.006500
Validation loss: 0.005854
Validation loss: 0.005273
Validation loss: 0.004801
Validation loss: 0.004431
Validation loss: 0.004109
Validation loss: 0.003847
Validation loss: 0.003622
Validation loss: 0.003440
Validation loss: 0.003267
Validation loss: 0.003128
Validation loss: 0.002997
Validation loss: 0.002879
Validation loss: 0.002784
Validation loss: 0.002692
Validation loss: 0.002607
Validation loss: 0.002532
Validation loss: 0.002457
Validation loss: 0.002394
Treinamento feito para camada 1


                                        


Output of AE (shape: torch.Size([10, 2000]))




Validation loss: 0.002791
Validation loss: 0.002669
Validation loss: 0.002541
Validation loss: 0.002418
Validation loss: 0.002304
Validation loss: 0.002202
Validation loss: 0.002105
Validation loss: 0.002023
Validation loss: 0.001948
Validation loss: 0.001878
Validation loss: 0.001814
Validation loss: 0.001755
Validation loss: 0.001702
Validation loss: 0.001651
Validation loss: 0.001607
Validation loss: 0.001565
Validation loss: 0.001526
Validation loss: 0.001489
Validation loss: 0.001455
Validation loss: 0.001423
Treinamento feito para camada 2


                                        


Output of AE (shape: torch.Size([10, 2000]))
Validation loss: 0.000590
Validation loss: 0.000585
Validation loss: 0.000579
Validation loss: 0.000573
Validation loss: 0.000568
Validation loss: 0.000562
Validation loss: 0.000557
Validation loss: 0.000551
Validation loss: 0.000546
Validation loss: 0.000541




Validation loss: 0.000537
Validation loss: 0.000532
Validation loss: 0.000528
Validation loss: 0.000524
Validation loss: 0.000520
Validation loss: 0.000516
Validation loss: 0.000512
Validation loss: 0.000509
Validation loss: 0.000505
Validation loss: 0.000502
Treinamento feito para camada 3


### Após o Pretraining, aplicamos o Train de fato

Vamos reaproveitar a função train já implementada anteriormente para fazer um treinamento de 20 épocas sobre o objeto SAE (StackedAutoEncoder) completo (não será feito treinamento por camadas como no pretreino)

In [97]:
train(
    ds_train,
    autoencoder,
    cuda=False,
    validation=ds_val,
    epochs=20,
    batch_size=10,
    optimizer=ae_optimizer,
    scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
    corruption=0.2,
    silent=True
)

# Como o dataset é minúsculo, o modelo está em regime de overfitting e possui validation loss muito pequeno e constante
    

0.2

Output of AE (shape: torch.Size([10, 387]))
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
Validation loss: 0.009188
