In [14]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision import transforms as T
from torchvision.transforms import ToTensor, Lambda
import numpy as np
from tqdm import tqdm
from torchvision.models import resnet18

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from pytorch_lightning.loggers import TensorBoardLogger
import matplotlib.pyplot as plt

from utils import LitModel
import argparse

import tensorboard


In [15]:

# argpaser
parser = argparse.ArgumentParser(description='PyTorch Lightning Example')
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--num_tta', type=int, default=10)
parser.add_argument('--es_patience', type=int, default=10)
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--progress_bar', type=bool, default=False)
parser.add_argument('--checkpoint_verbose', type=bool, default=False)
parser.add_argument('--earlystopping_verbose', type=bool, default=False)
args = parser.parse_args([]) # default settings

seed = args.seed


In [16]:
architecture = resnet18(pretrained=True)

In [17]:

augmentation = T.Compose([T.ToTensor(),
                                T.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),
                                T.RandomHorizontalFlip(),
                                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

no_augmentation = T.Compose([T.ToTensor(),
                                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])


In [18]:
training_data = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=augmentation
)

test_data = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=no_augmentation
)

test_data_tta = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=augmentation
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [19]:

train_dataset, val_dataset = random_split(training_data, [45000, 5000])

train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False)
test_dataloader_tta = DataLoader(test_data_tta, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False)

logger = TensorBoardLogger("tb_logs", name="my_model")
early_stop_callback = EarlyStopping(monitor="val_loss", patience=args.es_patience, verbose=args.earlystopping_verbose, mode="min")
checkpoint_callback = ModelCheckpoint('models', save_top_k=1, monitor='val_loss', verbose=args.checkpoint_verbose, mode='min')

model = LitModel(architecture, args.learning_rate)
trainer = Trainer(max_epochs=args.epochs, 
                gpus=1,
                enable_progress_bar=args.progress_bar,
                logger=logger, 
                callbacks=[early_stop_callback, checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 11.2 M
---------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.727    Total estimated model params size (MB)


epoch:  0 [train_loss: 1.2845 val_loss: 1.1349] [train_acc: 0.5568 val_acc: 0.6136]


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
Restoring states from the checkpoint path at /home/kang/pytorch_lightning_pipeline/models/epoch=0-step=703.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/kang/pytorch_lightning_pipeline/models/epoch=0-step=703.ckpt


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6662999987602234, 'test_loss': 1.0146721601486206}
--------------------------------------------------------------------------------


Traceback (most recent call last):
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/kang/anaconda3/envs/torch/lib/python3.7/mult

[{'test_loss': 1.0146721601486206, 'test_acc': 0.6662999987602234}]

In [None]:
trainer.fit(model, train_dataloader, val_dataloader)

trainer.test(test_dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 11.2 M
---------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.727    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


epoch:  0 [train_loss: 1.2776 val_loss: 1.1083] [train_acc: 0.5564 val_acc: 0.6204]
epoch:  1 [train_loss: 0.9953 val_loss: 0.9392] [train_acc: 0.6601 val_acc: 0.6750]
epoch:  2 [train_loss: 0.8981 val_loss: 0.8492] [train_acc: 0.6922 val_acc: 0.7008]
epoch:  3 [train_loss: 0.8332 val_loss: 0.8260] [train_acc: 0.7152 val_acc: 0.7158]
epoch:  4 [train_loss: 0.8103 val_loss: 0.8082] [train_acc: 0.7233 val_acc: 0.7182]
epoch:  5 [train_loss: 0.7589 val_loss: 0.8088] [train_acc: 0.7404 val_acc: 0.7272]
epoch:  6 [train_loss: 0.7475 val_loss: 0.6881] [train_acc: 0.7443 val_acc: 0.7610]
epoch:  7 [train_loss: 0.7145 val_loss: 0.8290] [train_acc: 0.7546 val_acc: 0.7322]
epoch:  8 [train_loss: 0.6736 val_loss: 0.7183] [train_acc: 0.7682 val_acc: 0.7526]
epoch:  9 [train_loss: 0.6496 val_loss: 0.6776] [train_acc: 0.7768 val_acc: 0.7706]
epoch: 10 [train_loss: 0.6192 val_loss: 0.6668] [train_acc: 0.7867 val_acc: 0.7686]
epoch: 11 [train_loss: 0.6167 val_loss: 0.6596] [train_acc: 0.7872 val_acc: 

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /home/kang/pytorch_lightning_pipeline/models/epoch=29-step=21119.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/kang/pytorch_lightning_pipeline/models/epoch=29-step=21119.ckpt


epoch: 39 [train_loss: 0.3434 val_loss: 0.5290] [train_acc: 0.8783 val_acc: 0.8290]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8431000113487244, 'test_loss': 0.47976747155189514}
--------------------------------------------------------------------------------


[{'test_loss': 0.47976747155189514, 'test_acc': 0.8431000113487244}]

In [None]:
tta_pred_list = []
for _ in tqdm(range(args.num_tta)):
    y_hat = torch.vstack(trainer.predict(model=model, dataloaders=test_dataloader_tta))
    tta_pred_list.append(y_hat)
tta_pred_mean = torch.stack(tta_pred_list).mean(0)

tta_acc = np.mean(tta_pred_mean.argmax(1).numpy() == np.array(test_data.targets))
print(f"TTA accuracy: {tta_acc}")

  0%|          | 0/10 [00:00<?, ?it/s]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 10%|█         | 1/10 [00:01<00:12,  1.38s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 20%|██        | 2/10 [00:02<00:10,  1.35s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 30%|███       | 3/10 [00:04<00:09,  1.34s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 40%|████      | 4/10 [00:05<00:08,  1.35s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 50%|█████     | 5/10 [00:06<00:06,  1.34s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 60%|██████    | 6/10 [00:08<00:05,  1.34s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 70%|███████   | 7/10 [00:09<00:04,  1.35s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 80%|████████  | 8/10 [00:10<00:02,  1.35s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 90%|█████████ | 9/10 [00:12<00:01,  1.35s/it]LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
100%|██████████| 10/10 [00:13<00:00,  1.35s/it]

TTA accuracy: 0.8527



