In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import lib

import torch
import torchvision
from torchvision import transforms as T
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from torchvision.datasets import ImageFolder
from torchsummary import summary
from pytorch_lightning.loggers import WandbLogger
from torchmetrics import Accuracy
import wandb

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from math import floor, ceil
from sklearn.model_selection import train_test_split
from scipy import stats
import plotly.express as px
import seaborn as sns

import shutil
import requests
import functools
import pathlib
from pathlib import Path
import shutil
from tqdm.auto import tqdm
import os
from collections import defaultdict
import pickle

from IPython.display import clear_output

# Notebook-wide plotting defaults: apply seaborn's theme first, then set a
# wide (30 x 5 inch) default figure size on matplotlib's global rcParams.
sns.set_theme()
matplotlib.rcParams.update({'figure.figsize': (30, 5)})

In [3]:
# Optimisation settings.
# NOTE(review): n_epochs, learning_rate, momentum and log_interval are not
# referenced by any cell visible in this notebook -- confirm before relying on them.
n_epochs = 3
learning_rate = 1e-2
momentum = 0.5

# Mini-batch sizes consumed by the training / evaluation data loaders.
batch_size_train = 64
batch_size_test = 1_000

log_interval = 10

# Reproducibility: switch cuDNN off and seed torch's global RNG.
# The seeding call is kept as the last expression so the cell still echoes
# the returned Generator object.
random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7ff3a80fd8d0>

In [4]:
# Preprocessing shared by both MNIST splits: convert the image to a tensor,
# then normalise its single channel with mean 0.1307 and std 0.3081
# (presumably the MNIST training-set statistics -- confirm if the data changes).
mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split: reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('mnist/', train=True, download=True,
                               transform=mnist_transform),
    batch_size=batch_size_train,
    shuffle=True,
)

# Held-out split, used as the validation loader in `trainer.fit`.
# Evaluation data is deliberately NOT shuffled: ordering has no effect on the
# metrics, and Lightning warns when a validation loader has shuffling enabled.
test_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('mnist/', train=False, download=True,
                               transform=mnist_transform),
    batch_size=batch_size_test,
    shuffle=False,
)

In [5]:
class Model(pl.LightningModule):
    """Tiny convolutional classifier producing 10 class logits.

    The network takes single-channel images of shape (N, 1, H, W): two
    3x3 convolutions (stride 1, padding 1) with batch norm and ReLU, a global
    average pool to 1x1, and a linear layer mapping the 6 pooled features to
    10 logits. Training uses cross-entropy with label smoothing 0.1.

    NOTE(review): the `training_epoch_end` / `validation_epoch_end` hooks used
    here were removed in Lightning 2.0; migrate to `on_train_epoch_end` /
    `on_validation_epoch_end` before upgrading.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 3, 3, 1, 1),
            nn.BatchNorm2d(3),
            nn.ReLU(),
            nn.Conv2d(3, 6, 3, 1, 1),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(6, 10)
        )
        self.loss = nn.CrossEntropyLoss(label_smoothing=0.1)
        self.accuracy = Accuracy(task='multiclass', num_classes=10)
        self.save_hyperparameters()

    def forward(self, x):
        """Return the class logits for a batch of images `x`."""
        return self.layers(x)

    def _shared_step(self, batch):
        """Compute loss and accuracy for one `(inputs, targets)` batch.

        Returns a dict with the batch `loss`, the batch `acc`, and `n_samples`
        (the batch size), which the epoch-end hooks use as averaging weights.
        """
        x, y = batch
        pred = self(x)
        return {
            'loss': self.loss(pred, y),
            'acc': self.accuracy(pred, y),
            'n_samples': x.size(0),
        }

    @staticmethod
    def _weighted_epoch_mean(outputs, key):
        """Average `outputs[i][key]` over the epoch, weighted by batch size.

        Each step reports a per-batch mean, so a plain mean of those values
        over-weights a short final batch; weighting by `n_samples` gives the
        per-sample mean instead.
        """
        values = torch.stack([out[key] for out in outputs])
        weights = torch.tensor(
            [out['n_samples'] for out in outputs],
            dtype=values.dtype,
            device=values.device,
        )
        return (values * weights).sum() / weights.sum()

    def training_step(self, batch, batch_idx):
        """Run one training batch; `loss` is the value that gets optimised."""
        return self._shared_step(batch)

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and return its loss and accuracy."""
        return self._shared_step(batch)

    def configure_optimizers(self):
        """ Define optimizers and LR schedulers. """
        optimizer = torch.optim.Adam([
            {'params': self.layers.parameters(), 'lr': 3e-4},
        ], weight_decay=3e-4)

        # Multiply the LR by 0.2 once the monitored metric has stopped
        # increasing (mode='max') for 5 consecutive epochs.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.2,
            patience=5,
            verbose=True)

        # The scheduler is stepped once per epoch on the logged 'val/acc'.
        lr_dict = {
            "scheduler": lr_scheduler,
            "interval": "epoch",
            "frequency": 1,
            "monitor": "val/acc"
        }

        return [optimizer], [lr_dict]

    # OPTIONAL
    def training_epoch_end(self, outputs):
        """log and display average train loss and accuracy across epoch"""
        if not outputs:
            # Nothing was collected this epoch, so there is nothing to average.
            return

        avg_loss = self._weighted_epoch_mean(outputs, 'loss')
        avg_acc = self._weighted_epoch_mean(outputs, 'acc')

        self.print(f"| TRAIN loss: {avg_loss:.2f}, acc: {avg_acc:.2f}" )

        self.log('train/loss', avg_loss, prog_bar=True, on_epoch=True, on_step=False)
        self.log('train/acc', avg_acc, prog_bar=True, on_epoch=True, on_step=False)

    # OPTIONAL
    def validation_epoch_end(self, outputs):
        """log and display average val loss and accuracy"""
        if not outputs:
            # Nothing was collected this epoch, so there is nothing to average.
            return

        avg_loss = self._weighted_epoch_mean(outputs, 'loss')
        avg_acc = self._weighted_epoch_mean(outputs, 'acc')

        # end=" " keeps the cursor on this line so the TRAIN summary printed
        # by `training_epoch_end` follows it.
        self.print(f"[Epoch {self.trainer.current_epoch:3}] VALID loss: {avg_loss:.2f}, acc: {avg_acc:.2f}", end= " ")

        # 'val/acc' is the key monitored by the LR scheduler configured above.
        self.log('val/loss', avg_loss, prog_bar=True, on_epoch=True, on_step=False)
        self.log('val/acc', avg_acc, prog_bar=True, on_epoch=True, on_step=False)

In [13]:
# Preview of the date suffix that gets appended to checkpoint filenames.
'_date={}'.format(lib.today())

'_date=2023-01-15_15:43'

In [10]:
# Close any W&B run still open from a previous execution of this cell, then
# start a fresh run for this experiment.
# NOTE(review): the project name contains a typo ("pytorhc"). It is left
# unchanged because renaming it would send runs to a different W&B project.
wandb.finish()
wandb.init(
    project='MNIST example with pytorhc lightning',
    name='save hyperparameters at the end #2',
    notes='tried to save hp at end')

wandb_logger = WandbLogger()

# Keep only the single best checkpoint (weights only), ranked by 'val/acc'.
# The filename template references the metrics this model actually logs
# ('val/acc', 'val/loss'). Because those keys contain '/', the automatic
# "name=value" insertion is disabled and the labels are written explicitly,
# so no '/' ends up in the checkpoint path.
MyModelCheckpoint = ModelCheckpoint(dirpath='checkpoints/mnist/',
                                    filename='epoch={epoch}_val_acc={val/acc:.3f}_val_loss={val/loss:.3f}' + f'_date={lib.today()}',
                                    auto_insert_metric_name=False,
                                    monitor='val/acc',
                                    mode='max',
                                    save_top_k=1,
                                    save_weights_only=True,
                                    verbose=False)

# Stop training once 'val/acc' has not improved for 15 validation checks.
MyEarlyStopping = EarlyStopping(monitor = "val/acc",
                                mode = "max",
                                patience = 15,
                                verbose = True)

trainer = pl.Trainer(
    logger=wandb_logger,
    max_epochs=100,
    accelerator='cpu',
    # devices=[1],
    callbacks=[MyEarlyStopping, MyModelCheckpoint],
    log_every_n_steps=1,
)

model = Model()

0,1
epoch,▁▁
train/acc,▁
train/loss,▁
trainer/global_step,▁▁
val/acc,▁
val/loss,▁

0,1
epoch,0.0
train/acc,0.15495
train/loss,2.27823
trainer/global_step,937.0
val/acc,0.2003
val/loss,2.24086


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669346099994677, max=1.0…

  rank_zero_warn(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
trainer.fit(model, train_loader, test_loader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name     | Type               | Params
------------------------------------------------
0 | layers   | Sequential         | 288   
1 | loss     | CrossEntropyLoss   | 0     
2 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
288       Trainable params
0         Non-trainable params
288       Total params
0.001     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


[Epoch   0] VALID loss: 2.33, acc: 0.11 

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val/acc improved. New best score: 0.197


[Epoch   0] VALID loss: 2.27, acc: 0.20 | TRAIN loss: 2.31, acc: 0.17


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
