## Using latency optimization with custom operations

Here we describe the additional steps required to enable latency optimization in your model.

### This notebook consists of the following main stages:
1. Set up the environment
1. Add a custom non-searchable operation/module with latency to use with the search space
1. Add a custom searchable operation with latency to use with the search space
1. Build a model with the custom operation
1. Check the pretrain, search and tune phases

## 1. Set up the environment
First, let's set up the environment and common imports.

In [None]:
import os

# Select GPUs by PCI bus id and expose only device '0' to this process.
# Change CUDA_VISIBLE_DEVICES to the index of a free GPU on your machine.
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES='0',
)

In [None]:
from pathlib import Path

import torch
import torch.nn as nn

from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch_optimizer import RAdam
from torchvision.models.mobilenet import ConvBNReLU

from enot.models import BaseSearchableOperation
from enot.models import build_simple_block_model
from enot.models import register_searchable_op
from enot.models import SearchSpaceModel
from enot.phases import pretrain
from enot.phases import search
from enot.phases import train
from enot.utils.latency import conv_mac_count
from enot.utils.latency import LatencyMixin

from enot_utils.metric_utils import accuracy
from enot_utils.schedulers import WarmupScheduler

from tutorial_utils.dataset import create_imagenette_dataloaders

## 2. Add a custom non-searchable operation/module with latency to use with the search space

To add latency support to your custom regular operation/module, you need to implement latency calculation.

Adding latency calculation is done in two steps:
1. Make your class a child of `enot.utils.latency.LatencyMixin`. `LatencyMixin` provides the interface that `SearchSpaceModel` uses to calculate latency; it relies on user-defined latency calculation methods (see the next step).
2. Add a method with a signature `latency_<name>(self, spatial_size) -> float`, which calculates latency.

At the moment, only `'mmac'` (millions of multiply-accumulates) is supported as the latency `<name>`.

### Head and stem of your model

In [None]:
class MyStem(ConvBNReLU, LatencyMixin):
    """Model stem: a ``ConvBNReLU`` block that also reports its latency.

    The block itself is built entirely by ``ConvBNReLU``; this class only
    remembers the convolution hyper-parameters and exposes ``latency_mmac``
    so that the latency of the stem can be queried through ``LatencyMixin``.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels of the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        groups: Number of convolution groups.
        norm_layer: Normalization layer forwarded to ``ConvBNReLU``
            (``None`` selects the parent's default).
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            groups=1,
            norm_layer=None,
    ):
        # Optional arguments are forwarded by keyword so they cannot be bound
        # to the wrong parameter if the parent's positional order differs
        # between torchvision releases (some releases insert a ``padding``
        # parameter before ``groups``).
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            groups=groups,
            norm_layer=norm_layer,
        )

        # Hyper-parameters needed later by latency_mmac().
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.groups = groups

    def latency_mmac(self, spatial_size):
        """Return the cost of the stem convolution in millions of multiply-accumulates.

        Only the convolution is counted; the normalization and activation
        layers are not included in the estimate.

        Args:
            spatial_size: Spatial size of the input, forwarded unchanged to
                ``conv_mac_count``.

        Returns:
            The first element returned by ``conv_mac_count`` (the MMAC count).
        """
        # torchvision's ConvBNReLU pads its convolution with
        # (kernel_size - 1) // 2, so the estimate must use the same padding to
        # describe the convolution that is actually executed.
        # NOTE(review): confirm against the installed torchvision version.
        padding = (self.kernel_size - 1) // 2

        mmac, _ = conv_mac_count(
            spatial_size=spatial_size,
            kernel_size=self.kernel_size,
            stride=self.stride,
            in_channels=self.in_channels,
            padding=padding,
            out_channels=self.out_channels,
            groups=self.groups,
        )

        return mmac
    
    
class MyHead(nn.Sequential, LatencyMixin):
    """Classification head with a latency estimate.

    Layers, in order: 1x1 ``ConvBNReLU`` (``in_channels`` -> ``hidden_channels``),
    global average pooling to 1x1, flatten, dropout and a linear classifier
    (``hidden_channels`` -> ``num_classes``).
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        num_classes,
        dropout_rate=0.2
    ):
        # Sizes required by latency_mmac().
        self.in_channels, self.hidden_channels, self.num_classes = (
            in_channels,
            hidden_channels,
            num_classes,
        )

        layers = [
            ConvBNReLU(in_channels, hidden_channels, kernel_size=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_channels, num_classes),
        ]
        super().__init__(*layers)

    def latency_mmac(self, spatial_size):
        """Return the cost of the head in millions of multiply-accumulates.

        The total is the sum of three terms: the 1x1 convolution (from
        ``conv_mac_count``), one operation per pooled element
        (``h_out * w_out * hidden_channels``) and the linear layer
        (``hidden_channels * num_classes``).
        """
        million = 10**6

        # Positional arguments: presumably (spatial_size, kernel_size, stride,
        # in_channels, padding, out_channels) -- confirm against conv_mac_count.
        conv_mmac, (h_out, w_out) = conv_mac_count(
            spatial_size, 1, 1, self.in_channels, 0, self.hidden_channels
        )
        pooling_mmac = h_out * w_out * self.hidden_channels / million
        linear_mmac = self.hidden_channels * self.num_classes / million

        return conv_mmac + pooling_mmac + linear_mmac

## 3. Add a custom searchable operation with latency to use with search space

We will re-implement our custom operation with latency support. All the details of adding a custom operation can be found in "Tutorial - adding custom ops".

**IMPORTANT**:<br>
`BaseSearchableOperation` is a child of `LatencyMixin`, so you only need to add a method with the signature `latency_<name>(self, spatial_size) -> float`, which calculates latency.

In [None]:
# Parsing rules for the short parameter names used in operation strings
# such as 'MyOp_k=3'.
# Layout: {short_param_name: (original_param_name, parser)}
short_args = {'k': ('kernel_size', int)}


@register_searchable_op('MyOp', short_args)
class MyOperation(BaseSearchableOperation):
    """Custom searchable operation: Conv2d -> ReLU -> BatchNorm2d, with latency support.

    Registered under the name ``'MyOp'``. When ``padding`` is ``None`` it
    defaults to ``(kernel_size - 1) // 2``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        kernel_size=3,
        padding=None,
        use_skip_connection=True
    ):
        super().__init__(in_channels, out_channels, use_skip_connection)

        # Convolution hyper-parameters, reused by latency_mmac().
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = (kernel_size - 1) // 2 if padding is None else padding

        convolution = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=self.padding,
        )
        # The batch norm must stay last: the *_last_batch_norm helpers below
        # address it as self.operation[-1].
        self.operation = nn.Sequential(
            convolution,
            nn.ReLU(),
            nn.BatchNorm2d(out_channels),
        )

    def get_last_batch_norm(self) -> nn.BatchNorm2d:
        """Return the final layer of ``self.operation`` (the batch norm)."""
        return self.operation[-1]

    def replace_last_batch_norm(self, new_last_batch_norm: nn.BatchNorm2d) -> None:
        """Put ``new_last_batch_norm`` in place of the final layer of ``self.operation``."""
        self.operation[-1] = new_last_batch_norm

    def operation_forward(self, x):
        """Apply the Conv2d -> ReLU -> BatchNorm2d stack to ``x``."""
        return self.operation(x)

    def latency_mmac(self, spatial_size):
        """Return the convolution cost in millions of multiply-accumulates.

        ``spatial_size`` is unpacked as a ``(height, width)`` pair. Only the
        convolution is counted: output positions * kernel_size**2 *
        in_channels * out_channels, divided by 10**6.
        """
        # Number of kernel positions along each axis:
        # ceil((side + 2 * padding - (kernel_size - 1)) / stride).
        h_steps, w_steps = (
            (side + 2 * self.padding - (self.kernel_size - 1) + self.stride - 1) // self.stride
            for side in spatial_size
        )

        macs_per_position = self.kernel_size**2 * self.in_channels * self.out_channels
        return h_steps * w_steps * macs_per_position / 10**6

## 4. Build model with custom operation

In [None]:
# Candidate operations for every searchable block: two 'MIB' variants and
# the custom 'MyOp' operation.
SEARCH_OPS = ['MIB_k=3_t=6', 'MIB_k=5_t=6', 'MyOp_k=3']

blocks_in_channels, blocks_out_channels = 32, 320
head_hidden_channels = 1280
num_classes = 10

# Assemble the block model around the custom stem and head.
model = build_simple_block_model(
    stem=MyStem(3, blocks_in_channels, stride=2),
    head=MyHead(blocks_out_channels, head_hidden_channels, num_classes),
    in_channels=blocks_in_channels,
    search_ops=SEARCH_OPS,
    blocks_out_channels=[24, 32, 64, 96, 160, blocks_out_channels],
    blocks_count=[2, 2, 2, 1, 2, 1],
    blocks_stride=[2, 2, 2, 1, 2, 1],
)

# Wrap the model into a search space and move it to the GPU.
search_space = SearchSpaceModel(model)
search_space = search_space.cuda()

## 5. Check the pretrain, search and tune phases

Let's check that everything works.

**IMPORTANT**:<br>
To turn on latency optimization, you must set the `latency_loss_weight` parameter of `search` to a value greater than 0.

In [None]:
# All artifacts live under ~/.enot: datasets in one subdirectory, this
# tutorial's outputs in another.
ENOT_HOME_DIR = Path.home().joinpath('.enot')
ENOT_DATASETS_DIR = ENOT_HOME_DIR.joinpath('datasets')
PROJECT_DIR = ENOT_HOME_DIR.joinpath('using_latency_optimization')

# Create the parent first: mkdir() is called without parents=True.
ENOT_HOME_DIR.mkdir(exist_ok=True)
ENOT_DATASETS_DIR.mkdir(exist_ok=True)
PROJECT_DIR.mkdir(exist_ok=True)

In [None]:
# Dataloaders for the 'imagenette2-320' dataset kind; the returned mapping is
# indexed below by keys such as 'pretrain_train_dataloader'.
dataloaders = create_imagenette_dataloaders(
    imagenette_kind='imagenette2-320',
    dataset_root_dir=ENOT_DATASETS_DIR,
    project_dir=PROJECT_DIR,
    input_size=(224, 224),
    batch_size=32,
)

In [None]:
# Directory for the text logs and tensorboard logs of the pretrain phase.
pretrain_dir = PROJECT_DIR.joinpath('pretrain')
pretrain_dir.mkdir(exist_ok=True)

N_EPOCHS, N_WARMUP_EPOCHS = 3, 1
# Length of the pretrain training dataloader; used to size the schedule.
len_train = len(dataloaders['pretrain_train_dataloader'])

optimizer = SGD(
    params=search_space.model_parameters(),
    lr=0.06,
    momentum=0.9,
    weight_decay=1e-4,
)
# Cosine annealing with T_max = N_EPOCHS * len_train, wrapped in a warmup
# scheduler with warmup_steps = N_WARMUP_EPOCHS * len_train.
scheduler = WarmupScheduler(
    CosineAnnealingLR(optimizer, T_max=N_EPOCHS * len_train, eta_min=1e-8),
    warmup_steps=N_WARMUP_EPOCHS * len_train,
)
loss_function = nn.CrossEntropyLoss().cuda()

pretrain(
    search_space=search_space,
    exp_dir=pretrain_dir,
    epochs=N_EPOCHS,
    train_loader=dataloaders['pretrain_train_dataloader'],
    valid_loader=dataloaders['pretrain_validation_dataloader'],
    optimizer=optimizer,
    scheduler=scheduler,
    loss_function=loss_function,
    metric_function=accuracy,
)

In [None]:
# Directory for the text logs and tensorboard logs of the search phase.
search_dir = PROJECT_DIR.joinpath('search')
search_dir.mkdir(exist_ok=True)

# The search phase optimizes the architecture parameters of the search space.
optimizer = RAdam(search_space.architecture_parameters(), lr=0.01)

search(
    search_space=search_space,
    exp_dir=search_dir,
    epochs=3,
    # A positive weight turns latency optimization on.
    latency_loss_weight=2e-3,
    search_loader=dataloaders['search_train_dataloader'],
    valid_loader=dataloaders['search_validation_dataloader'],
    optimizer=optimizer,
    loss_function=loss_function,
    metric_function=accuracy,
)

In [None]:
# Extract a regular model with the best architecture from the search space,
# then move it to the GPU.
best_model = search_space.get_network_with_best_arch()
best_model = best_model.cuda()

In [None]:
# Directory for the text logs and tensorboard logs of the tune phase.
tune_dir = PROJECT_DIR.joinpath('tune')
tune_dir.mkdir(exist_ok=True)

# Train the extracted best model.
optimizer = RAdam(best_model.parameters(), lr=5e-3, weight_decay=4e-5)

train(
    model=best_model,
    exp_dir=tune_dir,
    epochs=3,
    train_loader=dataloaders['tune_train_dataloader'],
    valid_loader=dataloaders['tune_validation_dataloader'],
    optimizer=optimizer,
    loss_function=loss_function,
    metric_function=accuracy,
)