### Training a Network with Ignite

In [1]:
import os
ROOT_PATH = os.path.abspath(os.pardir)

import torchvision
import torchvision.transforms.transforms as T

import torch
import torch.nn as nn
from torch.utils.data  import DataLoader

import ignite
import ignite.engine as ign
import ignite.metrics as igm

import segmentation_models_pytorch as smp

from segwork.config.config import get_experiment_cfg
from segwork.data import DroneDataset, ToTensor
from segwork.registry import ConfigurableRegistry
from segwork.registry import models_reg

#### Read configuration

In [2]:
experiment_name = 'ExperimentA'
dir_path = os.path.join(ROOT_PATH, 'config')
file_path = os.path.join(dir_path, f'{experiment_name}.yaml')

cfg = get_experiment_cfg(file_path)
dict(cfg)


{'NAME': 'ExperimentoA',
 'DATASET': CfgNode({'KEY': 'drone', 'KEY_VAL': 'drone', 'TRANSFORM': CfgNode({'TRANSFORM': 'transformationA', 'TRANSFORM_ARGS': CfgNode({'height': 256, 'width': 512}), 'TARGET_TRANSFORM': 'transformationA', 'TARGET_TRANSFORM_ARGS': CfgNode({'height': 256, 'width': 512})})}),
 'DATALOADER': CfgNode({'BATCH_SIZE': 2, 'VAL_BATCH_SIZE': 2, 'WORKERS': 0}),
 'MODEL': CfgNode({'KEY': 'unet', 'ARGS': CfgNode({'encoder_name': 'resnet34', 'classes': 24}), 'DEVICE': 'cuda'}),
 'OPTIM': CfgNode({'KEY': 'sgd', 'ARGS': CfgNode({'lr': 0.1, 'momentum': 0.9}), 'LOSS': CfgNode({'KEY': 'crossentropyloss', 'ARGS': CfgNode({})})}),
 'LOGGER': CfgNode({'LOG_INTERVAL': 10}),
 'TRAIN': CfgNode({'MAX_EPOCHS': 100}),
 'VALIDATION': CfgNode({'METRICS_KEYS': ['loss'], 'ARGS': CfgNode({'loss': CfgNode({})})})}

In [3]:
# Configuration file
print(f"Config file for experiment: {cfg.get('NAME', None)}")

Config file for experiment: ExperimentoA


#### Data pipeline
Define modular components.
Components can be added when they are defined or later.

In [4]:
data_cfg = cfg.DATASET

dataset_reg = ConfigurableRegistry(
    class_key='dataset',
    unique = True,
    additional_args=['transform', 'target_transform'],
    initial_registry=dict(

        # Aerial drone dataset
        drone = dict(
            dataset = DroneDataset,
            _default_kwargs = dict(
                root = os.path.join(ROOT_PATH, 'data', 'semantic_drone_dataset'),
                pil_target = False,
                transform = T.ToTensor(),
                target_transform = T.ToTensor()
            )
        ), 
    ),
)

In [5]:
dataset_reg['voc'] = dict(
    dataset = torchvision.datasets.VOCSegmentation,
    _default_kwargs = dict(
        root = os.path.join(ROOT_PATH, 'data', 'PASCALVOC'),
        transform = nn.Sequential(ToTensor(), T.Resize(size=(288, 512)))
        
    )
)

In [6]:
dataset_reg['voc']

{'dataset': torchvision.datasets.voc.VOCSegmentation,
 '_default_kwargs': {'root': 'f:\\Desarrollo software\\segwork\\data\\PASCALVOC',
  'transform': Sequential(
    (0): ToTensor()
    (1): Resize(size=(288, 512), interpolation=bilinear, max_size=None, antialias=None)
  )}}

No additional modular components

In [7]:

train_dataset = dataset_reg.get_instance(data_cfg.KEY)
val_dataset = dataset_reg.get_instance(data_cfg.KEY_VAL)

img, label = train_dataset[0]
print(f'Image has type: {type(img)}. With size {img.size()}')
print(f'Label has type: {type(label)}. With size {label.size()}')


Image has type: <class 'torch.Tensor'>. With size torch.Size([3, 4000, 6000])
Label has type: <class 'torch.Tensor'>. With size torch.Size([1, 4000, 6000])


With different transformations. Define your transformation repository.

In [8]:
transformation_reg = ConfigurableRegistry(
    class_key='transformation',
    unique = True,
    initial_registry=dict(

        # Basic transformation
        totensor = dict(
            transformation = ToTensor,
        ), 
    )
)

In [9]:
@transformation_reg.register
class Transformation_A(nn.Sequential):
    """Transformation composed by ToTensor and Resize"""

    _register_name = 'transformationA'

    _default_kwargs = {
        "height" : 256,
        "width" : 512
    }

    def __init__(self, height:int, width:int):
        super().__init__()
        self.size = (height, width)
        self.add_module('totensor', ToTensor())
        self.add_module('resize', T.Resize(size=self.size))

In [10]:
transformation_reg

ConfigurableRegistry
	attr_name: _register_name
	unique: True
	Number of registered classes: 2 
	Registered classes: ['totensor', 'transformationA']
	Class key: transformation
	Attribute args: _default_args
	Attribute kwargs: _default_kwargs
	Additional info from attributes: []

In [11]:
transform_cfg = cfg.DATASET.TRANSFORM

transform = transformation_reg.get_instance(transform_cfg.TRANSFORM, **transform_cfg.TRANSFORM_ARGS) 
target_transform = transformation_reg.get_instance(transform_cfg.TARGET_TRANSFORM, **transform_cfg.TARGET_TRANSFORM_ARGS)

train_dataset = dataset_reg.get_instance(data_cfg.KEY, transform=transform, target_transform=target_transform)
val_dataset = dataset_reg.get_instance(data_cfg.KEY)

img, label = train_dataset[0]
print(f'Image has type: {type(img)}. With size {img.size()}')
print(f'Label has type: {type(label)}. With size {label.size()}')

Image has type: <class 'torch.Tensor'>. With size torch.Size([3, 256, 512])
Label has type: <class 'torch.Tensor'>. With size torch.Size([1, 256, 512])


In [12]:
train_dataset

Dataset DroneDataset
    Number of datapoints: 400
    Root location: f:\Desarrollo software\segwork\data\semantic_drone_dataset
    StandardTransform
Transform: Transformation_A(
             (totensor): ToTensor()
             (resize): Resize(size=(256, 512), interpolation=bilinear, max_size=None, antialias=None)
           )
Target transform: Transformation_A(
                    (totensor): ToTensor()
                    (resize): Resize(size=(256, 512), interpolation=bilinear, max_size=None, antialias=None)
                  )

In [13]:
dataloader_cfg = cfg.DATALOADER

# Parameters can be passed through this method **cfg.DATALOADER.ARGS
def get_data_loaders(batch_size:int, val_batch_size:int, num_workers:int, *args, **kwargs):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, *args, **kwargs)
    val_loader = DataLoader(val_dataset, batch_size=val_batch_size, num_workers=num_workers)
    return (train_loader, val_loader)


#### Define other registers for modular components
When you want to test other component, just add it to the initial registry or use the decorator when defining the class

In [14]:
metrics_reg = ConfigurableRegistry(
    class_key='metric',
    unique = True,
    initial_registry=dict(

        # Accuracy class from Ignite
        cm = dict(
            metric = igm.ConfusionMatrix,
            _default_kwargs = {
                "num_classes" : train_dataset.num_classes
            }
        ), 
        
        # Other metrics...
    ),
)

In [15]:
loss_reg = ConfigurableRegistry(
    class_key='loss',
    unique = True,
    initial_registry=dict(

        # Loss from ignite
        crossentropyloss = dict(
            loss = torch.nn.CrossEntropyLoss,
        ),
        
        # Dice loss from smp
        diceloss = dict(
            loss = smp.losses.DiceLoss,
            _default_kwargs = {
                "mode" : 'multiclass'
            }
        ), 
    ),
)

In [16]:
optim_reg = ConfigurableRegistry(
    class_key='algorithm',
    unique = True,
    additional_args=['required_args'],
    initial_registry=dict(
        
        sgd = dict(
            algorithm = torch.optim.SGD,
            _default_kwargs = {
                "lr" : 0.01,
                "momentum" : 0.9
            },
            required_args = ['Model parameters']
        ), 
    ),
)

#### Training step

##### Training step - Custom

In [17]:
# Get model and place it in device
print(f'Getting model...')
model = models_reg.get_instance(cfg.MODEL.KEY, **cfg.MODEL.ARGS)
device = cfg.MODEL.DEVICE
print(f'Moving model to {device}...')
model.to(device)

# Data loaders
print(f'Getting data loaders...')
train_loader = DataLoader(train_dataset, batch_size=dataloader_cfg.BATCH_SIZE, num_workers=dataloader_cfg.WORKERS)
val_loader = DataLoader(val_dataset, batch_size=dataloader_cfg.VAL_BATCH_SIZE)

# Optimizer - Can be easily include in a custom registry
print(f'Getting optimizer and loss function...')
optimizer = optim_reg.get_instance(cfg.OPTIM.KEY, model.parameters(), **cfg.OPTIM.ARGS)
criterion = loss_reg.get_instance(cfg.OPTIM.LOSS.KEY, **cfg.OPTIM.LOSS.ARGS)

# Val metrics
print(f'Getting validation metrics...')
cm = igm.ConfusionMatrix(num_classes=train_dataset.num_classes)
val_metrics = dict(
    cm = cm,
    mIoU = igm.mIoU(cm, ignore_index=0),
    loss = igm.Loss(criterion)
)

Getting model...
Moving model to cuda...
Getting data loaders...
Getting optimizer and loss function...
Getting validation metrics...


In [18]:
def train_step(engine, batch):
    model.train()
    optimizer.zero_grad()
    x, y = batch[0].to(device), batch[1].to(device)
    y_pred = model(x)
    y = y.squeeze(1).long() # Minimum change
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = ign.Engine(train_step)

def validation_step(engine, batch):
    model.eval()
    with torch.no_grad():
        x, y = batch[0].to(device), batch[1].to(device)
        y_pred = model(x)
        y = y.squeeze(1).long()
        return y_pred, y

evaluator = ign.Engine(validation_step)

# Attach metrics to the evaluators
for name, metric in val_metrics.items():
    metric.attach(evaluator, name)

for name, metric in val_metrics.items():
    metric.attach(evaluator, name)

In [19]:
@trainer.on(ign.Events.ITERATION_COMPLETED(every=cfg.LOGGER.LOG_INTERVAL))
def log_training_loss(engine):
    print(f"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f}")

@trainer.on(ign.Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(train_loader)
    metrics = evaluator.state.metrics
    print(f"Training Results - Epoch: {trainer.state.epoch}  mIoU: {metrics['mIoU']:.2f} Avg loss: {metrics['loss']:.2f}")

@trainer.on(ign.Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    evaluator.run(val_loader)
    metrics = evaluator.state.metrics
    print(f"Validation Results - Epoch: {trainer.state.epoch}  mIoU: {metrics['mIoU']:.2f} Avg loss: {metrics['loss']:.2f}")

In [20]:
# Score function to return current value of any metric we defined above in val_metrics
def score_function(engine):
    return engine.state.metrics["mIoU"]

# Checkpoint to store n_saved best models wrt score function
model_checkpoint = ignite.handlers.ModelCheckpoint(
    "checkpoint",
    n_saved=2,
    filename_prefix="best",
    score_function=score_function,
    score_name="mIoU",
    global_step_transform=ignite.handlers.global_step_from_engine(trainer), # helps fetch the trainer's state
)
  
# Save the model after every epoch of val_evaluator is completed
evaluator.add_event_handler(ign.Events.COMPLETED, model_checkpoint, {"model": model})

<ignite.engine.events.RemovableEventHandle at 0x15d07a73f70>

In [22]:
trainer.run(train_loader, max_epochs=cfg.TRAIN.MAX_EPOCHS)

Epoch[1], Iter[10] Loss: 1.88
Epoch[1], Iter[20] Loss: 1.50
Epoch[1], Iter[30] Loss: 1.65
Epoch[1], Iter[40] Loss: 1.12
Epoch[1], Iter[50] Loss: 2.06
Epoch[1], Iter[60] Loss: 1.99
Epoch[1], Iter[70] Loss: 1.38
Epoch[1], Iter[80] Loss: 0.97
Epoch[1], Iter[90] Loss: 1.60
Epoch[1], Iter[100] Loss: 1.15
Epoch[1], Iter[110] Loss: 0.75
Epoch[1], Iter[120] Loss: 1.04
Epoch[1], Iter[130] Loss: 0.93
Epoch[1], Iter[140] Loss: 0.44
Epoch[1], Iter[150] Loss: 0.92
Epoch[1], Iter[160] Loss: 5.43
Epoch[1], Iter[170] Loss: 1.24
Epoch[1], Iter[180] Loss: 1.02
Epoch[1], Iter[190] Loss: 1.02
Epoch[1], Iter[200] Loss: 1.13
Training Results - Epoch: 1  mIoU: 0.12 Avg loss: 1.86
Validation Results - Epoch: 1  mIoU: 0.12 Avg loss: 1.86
Epoch[2], Iter[210] Loss: 1.39
Epoch[2], Iter[220] Loss: 1.09
Epoch[2], Iter[230] Loss: 1.10
Epoch[2], Iter[240] Loss: 0.71
Epoch[2], Iter[250] Loss: 1.43
Epoch[2], Iter[260] Loss: 1.07
Epoch[2], Iter[270] Loss: 0.55
Epoch[2], Iter[280] Loss: 0.79
Epoch[2], Iter[290] Loss: 0.9

Engine run is terminating due to exception: 
Engine run is terminating due to exception: 


KeyboardInterrupt: 

In [21]:
logger = ign.utils.setup_logger()

AttributeError: module 'ignite.engine.utils' has no attribute 'setup_logger'

In [21]:
cm = ign.metrics.ConfusionMatrix(num_classes=20)

In [43]:
cm.update((label, label2))

ValueError: y_pred does not have correct number of classes: 256 vs 20