In [1]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision import transforms as T
from torchvision.transforms import ToTensor, Lambda
import numpy as np
from tqdm import tqdm
from torchvision.models import resnet18

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from pytorch_lightning.loggers import TensorBoardLogger
import matplotlib.pyplot as plt

from utils import LitModel

import tensorboard

In [2]:
##################
learning_rate = 1e-3
batch_size = 64
epochs = 10
num_tta = 5
##################

In [3]:
train_augmentation = T.Compose([T.ToTensor(),
                                T.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),
                                T.RandomHorizontalFlip(),
                                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

test_augmentation = T.Compose([T.ToTensor(),
                                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

In [4]:
training_data = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=train_augmentation
)

test_data = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=test_augmentation
)

test_data_tta = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=train_augmentation
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [5]:
train_dataset, val_dataset = random_split(training_data, [45000, 5000])

In [6]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=20)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=20)
test_dataloader = DataLoader(test_data, batch_size=batch_size, num_workers=20)
test_dataloader_tta = DataLoader(test_data_tta, batch_size=batch_size, num_workers=20)

In [7]:
logger = TensorBoardLogger("tb_logs", name="my_model")
early_stop_callback = EarlyStopping(monitor="val_loss", patience=3, verbose=True, mode="min")
checkpoint_callback = ModelCheckpoint('models', save_top_k=1, monitor='val_loss', verbose=True, mode='min')

In [8]:
from pickletools import optimize
from pytorch_lightning.core.lightning import LightningModule
from torchmetrics import functional as FM
from torch import nn
from torch.nn import functional as F
import torch
import numpy as np


class LitModel(LightningModule):
    def __init__(self, model, lr):
        super().__init__()
        self.model = model
        self.model.fc = nn.Linear(512, 10)
        self.lr = lr
        self.result_dict = {'val_loss':[]}

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        acc = FM.accuracy(y_hat, y)
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        self.log("train_acc", acc, on_step=False, on_epoch=True)
        return loss

    def training_epoch_end(self, outputs):
        epoch = self.trainer.current_epoch
        train_loss, train_acc = self.trainer.callback_metrics['train_loss'], self.trainer.callback_metrics['train_acc']
        val_loss, val_acc = self.trainer.callback_metrics['val_loss'], self.trainer.callback_metrics['val_acc']
        print(f'epoch {epoch:2d} train_loss: {train_loss:0.4f} val_loss: {val_loss:0.4f} train_acc: {train_acc:0.4f} val_acc: {val_acc:0.4f}')  


    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        acc = FM.accuracy(y_hat, y)
        self.log("val_loss", loss, on_step=False, on_epoch=True)
        self.log("val_acc", acc, on_step=False, on_epoch=True)
        return {'val_loss': loss, 'val_acc': acc} 

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = FM.accuracy(logits, y)
        loss = F.cross_entropy(logits, y)
        self.log("test_loss", loss, on_step=False, on_epoch=True)
        self.log("test_acc", acc, on_step=False, on_epoch=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x, y = batch
        y_hat = self.model(x)
        return y_hat

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
        # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, mode='min', verbose=True)
        # return [optimizer], [lr_scheduler]

In [9]:
architecture = resnet18(pretrained=True)
model = LitModel(architecture, learning_rate)
trainer = Trainer(max_epochs=5, gpus=1, logger=logger, callbacks=[early_stop_callback, checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
trainer.fit(model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 11.2 M
---------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.727    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.150
Epoch 0, global step 703: val_loss reached 1.15026 (best 1.15026), saving model to "/home/kang/torch_lightening/models/epoch=0-step=703.ckpt" as top 1


epoch  0 train_loss: 1.2781 train_acc: 0.5548 val_loss: 1.1503 val_acc: 0.6082


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.150 >= min_delta = 0.0. New best score: 1.000
Epoch 1, global step 1407: val_loss reached 0.99977 (best 0.99977), saving model to "/home/kang/torch_lightening/models/epoch=1-step=1407-v1.ckpt" as top 1


epoch  1 train_loss: 0.9889 train_acc: 0.6615 val_loss: 0.9998 val_acc: 0.6644


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.137 >= min_delta = 0.0. New best score: 0.862
Epoch 2, global step 2111: val_loss reached 0.86234 (best 0.86234), saving model to "/home/kang/torch_lightening/models/epoch=2-step=2111.ckpt" as top 1


epoch  2 train_loss: 0.9057 train_acc: 0.6891 val_loss: 0.8623 val_acc: 0.7008


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.092 >= min_delta = 0.0. New best score: 0.770
Epoch 3, global step 2815: val_loss reached 0.76992 (best 0.76992), saving model to "/home/kang/torch_lightening/models/epoch=3-step=2815-v1.ckpt" as top 1


epoch  3 train_loss: 0.8202 train_acc: 0.7172 val_loss: 0.7699 val_acc: 0.7350


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 3519: val_loss was not in top 1


epoch  4 train_loss: 0.7643 train_acc: 0.7368 val_loss: 0.7705 val_acc: 0.7378


In [14]:
trainer.callback_metrics

{'val_loss': tensor(1.0815, device='cuda:0'),
 'val_acc': tensor(0.6246, device='cuda:0')}

In [18]:
trainer.test(test_dataloaders=test_dataloader)

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /home/kang/torch_lightening/models/epoch=8-step=6335.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/kang/torch_lightening/models/epoch=8-step=6335.ckpt


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8069000244140625, 'test_loss': 0.5629028081893921}
--------------------------------------------------------------------------------


[{'test_loss': 0.5629028081893921, 'test_acc': 0.8069000244140625}]

AttributeError: 'Trainer' object has no attribute 'log'

In [19]:
tta_pred_list = []
for _ in tqdm(range(num_tta)):
    y_hat = torch.vstack(trainer.predict(model=model, dataloaders=test_dataloader_tta))
    tta_pred_list.append(y_hat)
tta_pred_mean = torch.stack(tta_pred_list).mean(0)

  0%|          | 0/5 [00:00<?, ?it/s]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 704it [00:00, ?it/s]

 20%|██        | 1/5 [00:01<00:07,  1.88s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 704it [00:00, ?it/s]

 40%|████      | 2/5 [00:03<00:05,  1.91s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 704it [00:00, ?it/s]

 60%|██████    | 3/5 [00:05<00:03,  1.91s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 704it [00:00, ?it/s]

 80%|████████  | 4/5 [00:07<00:01,  1.91s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 704it [00:00, ?it/s]

100%|██████████| 5/5 [00:09<00:00,  1.91s/it]


In [20]:
np.mean(tta_pred_mean.argmax(1).numpy() == np.array(test_data.targets))

0.8075

In [None]:
data augmentation
TTA
# print log per epoch
# save & load model (checkpoint)
early stopping 
# lr scheduler
tensorboard
test example
