In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

# MNIST images converted to tensors; files are downloaded into ./data when missing.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
# The split selected by train=False is used as the validation set.
val_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)

# Batches of 100 samples; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=100,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=100,
    shuffle=False,
)

In [3]:
from lightning_extensions import BaseModule
from models import VAE

def kl_loss(z_mean, z_log_var):
    """Closed-form KL divergence of a diagonal Gaussian from a standard normal.

    Args:
        z_mean: tensor of latent means.
        z_log_var: tensor of latent log-variances, combined element-wise
            with ``z_mean``.

    Returns:
        A scalar tensor: the divergence summed over every element of the
        inputs (no averaging is applied).
    """
    # Per-element term 1 + log(sigma^2) - mu^2 - sigma^2.
    per_element = 1 + z_log_var - z_mean.pow(2) - z_log_var.exp()
    return -0.5 * torch.sum(per_element)
    
def recon_loss(inputs, outputs):
    """Summed squared error between ``inputs`` and ``outputs``.

    Args:
        inputs: tensor passed as the ``input`` argument of ``F.mse_loss``.
        outputs: tensor passed as the ``target`` argument of ``F.mse_loss``.

    Returns:
        A scalar tensor holding the sum (not the mean) of squared differences.
    """
    return F.mse_loss(input=inputs, target=outputs, reduction='sum')

class VAEModule(BaseModule):
    """Training module that optimises a ``VAE`` with an unweighted loss.

    The objective is the plain sum of the reconstruction term and the KL term.
    """

    def __init__(self):
        # The freshly built VAE is handed to BaseModule; forward() later
        # reads it back as self.model (presumably stored by BaseModule --
        # TODO confirm).
        super().__init__(VAE())
        self.save_hyperparameters()

    def forward(self, x, y):
        """Call the wrapped model as ``model(x, None, y)``.

        NOTE(review): the meaning of the ``None`` second argument is defined
        by ``VAE`` and is not visible here.
        """
        return self.model(x, None, y)

    def step(self, batch, batch_idx, mode = 'train'):
        """Compute, log and return the loss for one batch.

        Args:
            batch: an ``(x, y)`` pair.
            batch_idx: index of the batch (unused here).
            mode: prefix applied to every logged metric name.

        Returns:
            The sum of the reconstruction and KL terms.
        """
        images, labels = batch
        outputs, _outputs_masked, _z, z_mean, z_log_var = self(images, labels)

        # Only the first entry of `outputs` is compared against the input.
        reconstruction = recon_loss(images, outputs[0])
        divergence = kl_loss(z_mean, z_log_var)
        total = reconstruction + divergence
        loss = {
            'recon_loss_0': reconstruction,
            'kl_loss': divergence,
            'loss': total,
        }

        # Metrics are logged as Python floats under "<mode>_<name>".
        logged = {}
        for key, val in loss.items():
            logged[f"{mode}_{key}"] = val.item()
        self.log_dict(logged, sync_dist=True, prog_bar=True)
        return total

from softadapt import SoftAdapt, NormalizedSoftAdapt, LossWeightedSoftAdapt
class VAEModuleSoftAdapt(BaseModule):
    """Training module for a ``VAE`` whose two loss terms are re-weighted by SoftAdapt.

    During training the reconstruction and KL values of every step are
    recorded. Once more than 100 values have been collected they are passed
    to ``LossWeightedSoftAdapt.get_component_weights`` to obtain new weights,
    and the history is cleared. The returned (optimised) loss is the weighted
    sum; the logged ``loss`` metric stays the unweighted sum.
    """

    def __init__(self):
        model = VAE()
        # Initialise the parent before assigning any attribute: torch.nn.Module
        # requires the parent __init__ call to come first (BaseModule is
        # presumably a LightningModule -- TODO confirm).
        super().__init__(model)
        self.softadapt_object = LossWeightedSoftAdapt(beta=0.001)
        # Per-step loss history, stored as plain Python floats.
        self.reconstruction_losses = []
        self.kl_losses = []
        # Both terms start with weight 1 until the first SoftAdapt update.
        self.adapt_weights = torch.tensor([1,1])
        self.save_hyperparameters()

    def forward(self, x, y):
        """Call the wrapped model as ``model(x, None, y)``.

        NOTE(review): the meaning of the ``None`` second argument is defined
        by ``VAE`` and is not visible here.
        """
        return self.model(x, None, y)

    def step(self, batch, batch_idx, mode = 'train'):
        """Compute, log and return the SoftAdapt-weighted loss for one batch.

        Args:
            batch: an ``(x, y)`` pair.
            batch_idx: index of the batch (unused here).
            mode: prefix applied to every logged metric name; the loss
                history and the weights are only updated when it is
                ``'train'``.

        Returns:
            ``adapt_weights[0] * recon_loss + adapt_weights[1] * kl_loss``.
        """
        x, y = batch
        outputs, outputs_masked, z, z_mean, z_log_var = self(x, y)
        loss = {}
        loss['recon_loss_0'] = recon_loss(x, outputs[0])
        loss['kl_loss'] = kl_loss(z_mean, z_log_var)
        loss['loss'] = loss['recon_loss_0'] + loss['kl_loss']

        # Convert each term to a Python float once; the same values feed both
        # the logger and the SoftAdapt history.
        scalars = {key: val.item() for key, val in loss.items()}

        if mode == 'train':
            # Store floats rather than the loss tensors themselves: keeping the
            # tensors would keep every step's autograd graph alive until the
            # history is cleared.
            self.reconstruction_losses.append(scalars['recon_loss_0'])
            self.kl_losses.append(scalars['kl_loss'])

            if len(self.reconstruction_losses) > 100:
                first = torch.tensor(self.reconstruction_losses, dtype=torch.float64)
                second = torch.tensor(self.kl_losses, dtype=torch.float64)

                self.adapt_weights = self.softadapt_object.get_component_weights(first, second, verbose=False)

                # Start a fresh window for the next weight update.
                self.reconstruction_losses = []
                self.kl_losses = []

        self.log_dict({f"{mode}_{key}": val for key, val in scalars.items()}, sync_dist=True, prog_bar=True)
        return self.adapt_weights[0]  * loss['recon_loss_0'] + self.adapt_weights[1] * loss['kl_loss']

In [3]:
from lightning_extensions import ExtendedTrainer

# Baseline run: VAE trained on the unweighted reconstruction + KL loss.
model_name = "VAE-convolutional"
model = VAEModule()
trainer = ExtendedTrainer(
    project_name="MTVAEs_SoftAdapt",
    max_epochs=30,
    model_name=model_name,
)
trainer.fit(model, train_loader, val_loader)

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/pyt ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable cod

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:630: Checkpoint directory /Home/siv34/edzak2974/projects/MastersThesis/src/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:452: A layer with UninitializedParameter was found. Thus, the total number of parameters detected may be inaccurate.

  | Name  | Type | Params
-------------------------------
0 | model | VAE  | 811 K 
-------------------------------
811 K     Trainable params
0         Non-trainable params
811 K     Total params
3.245     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=255` in the `DataLoader` to improve performance.


                                                                           

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=255` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 600/600 [00:10<00:00, 56.59it/s, v_num=i3qz, train_recon_loss_0=2.92e+3, train_kl_loss=568.0, train_loss=3.49e+3, val_recon_loss_0=2.91e+3, val_kl_loss=548.0, val_loss=3.46e+3]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 600/600 [00:10<00:00, 56.58it/s, v_num=i3qz, train_recon_loss_0=2.92e+3, train_kl_loss=568.0, train_loss=3.49e+3, val_recon_loss_0=2.91e+3, val_kl_loss=548.0, val_loss=3.46e+3]




In [4]:
from lightning_extensions import ExtendedTrainer

# SoftAdapt run: same trainer settings, loss terms re-weighted during training.
model_name = "VAE-convolutional-softadapt"
model = VAEModuleSoftAdapt()
trainer = ExtendedTrainer(
    project_name="MTVAEs_SoftAdapt",
    max_epochs=30,
    model_name=model_name,
)
trainer.fit(model, train_loader, val_loader)

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/pyt ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable cod

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:630: Checkpoint directory /Home/siv34/edzak2974/projects/MastersThesis/src/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:452: A layer with UninitializedParameter was found. Thus, the total number of parameters detected may be inaccurate.

  | Name  | Type | Params
-------------------------------
0 | model | VAE  | 811 K 
-------------------------------
811 K     Trainable params
0         Non-trainable params
811 K     Total params
3.245     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=255` in the `DataLoader` to improve performance.


                                                                           

/Home/siv34/edzak2974/.conda/envs/pytorch2.1/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=255` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 600/600 [00:10<00:00, 56.59it/s, v_num=jczd, train_recon_loss_0=2.65e+3, train_kl_loss=727.0, train_loss=3.37e+3, val_recon_loss_0=2.84e+3, val_kl_loss=716.0, val_loss=3.55e+3]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 600/600 [00:10<00:00, 56.58it/s, v_num=jczd, train_recon_loss_0=2.65e+3, train_kl_loss=727.0, train_loss=3.37e+3, val_recon_loss_0=2.84e+3, val_kl_loss=716.0, val_loss=3.55e+3]


