In [10]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision as tv
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchmetrics import Accuracy

from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from tqdm.autonotebook import tqdm
from sklearn.metrics import classification_report

SEED = 7
pl.seed_everything(SEED)

Global seed set to 7


7

Параметры обучения

In [11]:
BATCH_SIZE = 30
EPOCHS = 20
NUM_WORKERS = 4
LR = 1e-3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Параметры модели

In [12]:
IN_CHANALS = 1
OUT_CLASSES = 10

Создание DataModule

In [13]:
class MNISTDataModule(pl.LightningDataModule):
  def __init__(self,
               batch_size = 1,
               num_workers = 2,
               train_transforms=None,
               val_transforms=None,
               ):
    super().__init__()
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.train_transforms = train_transforms
    self.val_transforms = val_transforms

  def setup(self, stage=None):
    self.train_set = tv.datasets.MNIST('./Datasets/', download=True, transform=self.train_transforms, train=True)
    val_test = tv.datasets.MNIST('./Datasets/', download=True, transform=self.val_transforms, train=False)
    self.val_set, self.test_set = torch.utils.data.random_split(val_test, [int(0.5*len(val_test)), int(0.5*len(val_test))])

  def train_dataloader(self):
    return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

  def val_dataloader(self):
    return DataLoader(self.val_set, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

  def test_dataloader(self):
    return DataLoader(self.test_set, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

Создание LightningModule

In [14]:
class LeNetClassifier(pl.LightningModule):
    def __init__(self, lr=1e-3, in_channels=1, out_classes=10):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.in_channels = in_channels
        self.out_classes = out_classes
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.model_part_1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=self.in_channels,
                            out_channels=6,
                            kernel_size=5,
                            padding=2), torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=6,
                            out_channels=16,
                            kernel_size=5,
                            padding=0), torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2, stride=2))
        self.model_part_2 = torch.nn.Sequential(
            torch.nn.Linear(5 * 5 * 16, 120),
            torch.nn.Tanh(),
            torch.nn.Linear(120, 84),
            torch.nn.Tanh(),
            torch.nn.Linear(84, self.out_classes),
        )

        self.loss = torch.nn.CrossEntropyLoss()
        self.metrics = {
            "accuracy":
            Accuracy(task="multiclass",
                     num_classes=self.out_classes).to(device)
        }
        self.preds_stage = {
            "train": {
                "loss": [],
                "accuracy": []
            },
            "valid": {
                "loss": [],
                "accuracy": []
            },
            "test": {
                "loss": [],
                "accuracy": []
            }
        }

    def forward(self, x):
        out = self.model_part_1(x)
        out = out.view(out.size(0), out.size(1) * out.size(2) * out.size(3))
        out = self.model_part_2(out)
        return out

    def shared_step(self, sample, stage):
        x, y = sample
        preds = self.forward(x)
        loss = self.loss(preds, y)
        self.preds_stage[stage]['loss'].append(loss.detach().cpu())
        self.preds_stage[stage]['accuracy'].append(self.metrics["accuracy"](
            preds.argmax(dim=1), y).detach().cpu())
        return loss

    def shared_epoch_end(self, stage):
        loss = self.preds_stage[stage]['loss']
        loss = torch.stack(loss)
        loss = np.mean([x.item() for x in loss])

        acc = self.preds_stage[stage]['accuracy']
        acc = torch.stack(acc)
        acc = np.mean([x.item() for x in acc])

        metrics = {f"{stage}_loss": loss, f"{stage}_acc": acc}

        self.log_dict(metrics, prog_bar=True)

        self.preds_stage[stage]['loss'].clear()
        self.preds_stage[stage]['accuracy'].clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

        scheduler_dict = {
            "scheduler":
            torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5),
            "interval":
            "epoch",
            "monitor":
            "valid_loss"
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, "train")

    def on_training_epoch_end(self):
        return self.shared_epoch_end("train")

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, "valid")

    def on_validation_epoch_end(self):
        return self.shared_epoch_end("valid")

    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, "test")

    def on_test_epoch_end(self):
        return self.shared_epoch_end("test")


Обучение

In [15]:
train_transforms = tv.transforms.Compose([
    tv.transforms.RandomHorizontalFlip(p=0.5),
    tv.transforms.ToTensor()    
])

val_transforms = tv.transforms.Compose([
    tv.transforms.ToTensor()    
])

In [16]:
dm = MNISTDataModule(BATCH_SIZE, NUM_WORKERS, train_transforms, val_transforms)
dm.setup()

In [17]:
model = LeNetClassifier(LR, IN_CHANALS, OUT_CLASSES)

callbacks = [
    ModelCheckpoint(dirpath='models',
                    filename='{epoch}_{valid_acc:.2f}_{valid_loss:.2f}',
                    save_top_k=2,
                    monitor='valid_loss',
                    mode='min'),
    LearningRateMonitor(logging_interval="step"),
    EarlyStopping(monitor="valid_loss",
                  min_delta=2e-4,
                  patience=5,
                  verbose=False,
                  mode="min")
]

TENSOR = "./logs"
logger = TensorBoardLogger(TENSOR, name="LeNET")

trainer = pl.Trainer(accelerator= "gpu", max_epochs=15, logger=logger, callbacks=callbacks)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
CHECKPOINT = None
trainer.fit(model, dm, ckpt_path=CHECKPOINT)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type             | Params
--------------------------------------------------
0 | model_part_1 | Sequential       | 2.6 K 
1 | model_part_2 | Sequential       | 59.1 K
2 | loss         | CrossEntropyLoss | 0     
--------------------------------------------------
61.7 K    Trainable params
0         Non-trainable params
61.7 K    Total params
0.247     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=15` reached.


Тестирование

In [19]:
trainer.test(model, dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.05567945263656637, 'test_acc': 0.9814371166828864}]

In [20]:
%load_ext tensorboard
%tensorboard --logdir ./logs/LeNET

Reusing TensorBoard on port 6006 (pid 13412), started 0:35:44 ago. (Use '!kill 13412' to kill it.)