# Training and Exporting Final Model

Now that we have chosen a good final model from the ones we trained and tuned, it's time to export and save it. SuperGradients allows models to be saved as ONNX models, which are easy to deploy and run inference with, and that is what we do here.

To condense the exporting notebook, I have omitted explanations of the steps that are already explained in the `03_supergradients_model.ipynb` notebook. Please refer back to it for further details.

***NOTE! This notebook and all of the model training were run on Kaggle and/or SaturnCloud with heavy GPU acceleration, and took up to two hours to run. I would recommend loading the notebooks onto SaturnCloud to run them.***

In [2]:
import os
import numpy as np 
import pandas as pd
import torch
import torchvision
import super_gradients
from pathlib import Path, PurePath

from PIL import Image
import pprint
import matplotlib.pyplot as plt
from torchvision.io import read_image
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torchvision.transforms import Resize
from torchvision import transforms

import albumentations as A
from albumentations.pytorch import ToTensorV2

from super_gradients import init_trainer, Trainer
from super_gradients.common import MultiGPUMode
from super_gradients.training.utils.distributed_training_utils import setup_gpu_mode
from super_gradients.training import Trainer
from super_gradients.training import training_hyperparams

from super_gradients.training import models
from super_gradients import Trainer

import bentoml
import onnx

You did not mention an AWS environment.You can set the environment variable ENVIRONMENT_NAME with one of the values: development,staging,production
env_sanity_check -INFO- ** A sanity check is done when importing super_gradients for the first time. **
-> You can see the details by setting the env variable DISPLAY_SANITY_CHECK=True prior to import.


In [3]:
class config:
    """Namespace of constants shared by the cells of this notebook."""

    # --- experiment, architecture and training recipe ---
    EXPERIMENT_NAME = "kitchenware_classification"
    MODEL_NAME = "vit_large"
    CHECKPOINT_DIR = "checkpoints"
    WEIGHTS = "imagenet"
    TRAINING_PARAMS = "training_hyperparams/imagenet_vit_train_params"
    NUM_CLASSES = 6
    BATCH_SIZE = 16

    # --- locations of the image folder and the train/test CSV files ---
    IMAGE_PATH = "./data/images"
    TRAIN_DATA = "./data/train.csv"
    TEST_DATA = "./data/test.csv"

    # --- spatial size every image is resized to before entering the model ---
    INPUT_HEIGHT = 224
    INPUT_WIDTH = 224

    # --- per-channel mean / std handed to the normalisation transform ---
    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]

    # CPU count reported by the OS (os.cpu_count() may return None).
    # NOTE(review): not passed to the DataLoaders visible in this notebook.
    NUM_WORKERS = os.cpu_count()

    # Use the GPU whenever torch can see one, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Trainer for this experiment; its checkpoints are rooted at config.CHECKPOINT_DIR.
trainer = Trainer(
    experiment_name=config.EXPERIMENT_NAME,
    ckpt_root_dir=config.CHECKPOINT_DIR,
)

# Request the configured architecture with config.NUM_CLASSES output classes
# and the pretrained weights named by config.WEIGHTS.
model = models.get(
    model_name=config.MODEL_NAME,
    num_classes=config.NUM_CLASSES,
    pretrained_weights=config.WEIGHTS,
)

In [5]:
# Load the training hyperparameter recipe named by config.TRAINING_PARAMS;
# individual entries are overridden in the next cell.
training_params =  training_hyperparams.get(config.TRAINING_PARAMS)

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize_config_dir(config_dir=pkg_resources.resource_filename("super_gradients.recipes", "")):


In [6]:
# Top-level recipe entries to override, applied in the order listed.
_overrides = {
    "max_epochs": 5,                              # the log reports "training for 5 epochs"
    "zero_weight_decay_on_bias_and_bn": True,
    "train_metrics_list": ['Accuracy'],
    "valid_metrics_list": ['Accuracy'],
    "ema": True,                                  # the log reports "Using EMA"
    "criterion_params": {'smooth_eps': 0.1},
    "average_best_models": True,                  # the log shows an extra test on the averaged model
}
for _name, _setting in _overrides.items():
    training_params[_name] = _setting

# Nested logger option: do not launch TensorBoard from the trainer.
training_params["sg_logger_params"]["launch_tensorboard"] = False

In [7]:
def add_image_col(df):
    """Add an 'image' column to ``df`` in place, holding ``<Id>.jpg`` for each row.

    The frame is modified directly and nothing is returned.
    """
    def _to_filename(image_id):
        return image_id + '.jpg'

    df['image'] = df['Id'].map(_to_filename)
    
# Read both CSV files, keeping every column as a string.
labels_df, test_df = (
    pd.read_csv(csv_path, dtype='str')
    for csv_path in (config.TRAIN_DATA, config.TEST_DATA)
)

# Derive the image filename column (xxxx.jpg) for each frame.
add_image_col(labels_df)
add_image_col(test_df)

# Encode the string labels as integer targets; `le` keeps the mapping.
le = LabelEncoder()
labels_df['targets'] = le.fit_transform(labels_df['label'])

# Hold out 10% of the labelled rows for validation, stratified by class,
# with a fixed seed so the split is reproducible.
train_df, val_df = train_test_split(
    labels_df,
    test_size=.10,
    stratify=labels_df['targets'],
    shuffle=True,
    random_state=42,
)

# Row counts of the train / validation / test frames (shown as cell output).
len(train_df), len(val_df), len(test_df)

(5003, 556, 3808)

In [8]:
class KitchwareDataset(Dataset):
    """Dataset that loads kitchenware images listed in a dataframe.

    Rows are read by column *position*, so the dataframe layout matters:

    * ``split`` in ``('train', 'val')``: column 2 holds the image filename and
      column 3 the label; ``__getitem__`` returns ``(image, label)``.
    * any other ``split`` (e.g. ``'test'``): column 1 holds the image filename;
      ``__getitem__`` returns the image only.

    Args:
        dataframe: Table describing the samples (see layout above).
        img_dir: Directory containing the image files.
        split: Name of the split; controls the layout and the return value.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping with the result under the ``'image'`` key.
    """

    def __init__(self, dataframe, img_dir, split, transform=None):
        self.img_labels = dataframe
        self.img_dir = img_dir
        self.split = split
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load sample ``idx``, apply the transform, and return it."""
        has_label = self.split in ['train', 'val']
        if has_label:
            filename = self.img_labels.iloc[idx, 2]
            label = self.img_labels.iloc[idx, 3]
        else:
            filename = self.img_labels.iloc[idx, 1]
        img_path = os.path.join(self.img_dir, filename)

        # Close the file handle deterministically, and force three channels so
        # grayscale / RGBA / CMYK files still match the 3-channel normalisation.
        with Image.open(img_path) as original_image:
            image = np.array(original_image.convert('RGB'))

        # Compare against None explicitly: a transform object may define its
        # own truthiness (e.g. via __len__), which must not disable it.
        if self.transform is not None:
            image = self.transform(image=image)['image']

        if has_label:
            return image, label
        return image

In [9]:
# --- building blocks shared by the training and validation pipelines ---
make_tensor = ToTensorV2()
normalize = A.Normalize(mean=config.IMAGENET_MEAN, std=config.IMAGENET_STD)
resize = A.Resize(height=config.INPUT_HEIGHT, width=config.INPUT_WIDTH)

# --- geometric augmentations ---
horizontal_flip = A.HorizontalFlip(p=0.5)
flip = A.Flip(p=0.5)
random_ninety = A.RandomRotate90()
# NOTE(review): the crop size equals the size produced by `resize`, and
# `resize` runs first in train_transforms, so this crop cannot change the
# image there -- confirm whether a larger resize was intended.
random_crop = A.RandomCrop(height=config.INPUT_HEIGHT, width=config.INPUT_WIDTH, p=0.75)

# --- colour / texture augmentations ---
hue_saturation = A.HueSaturationValue(p=0.5)
iso_noise = A.ISONoise(p=0.5)
color_jitter = A.ColorJitter(p=0.5)
emboss = A.Emboss(p=0.5)
channel_shuffle = A.ChannelShuffle(p=0.5)

# Candidate transforms for a single OneOf step that is itself applied with p=0.5.
randomly_choose_one = A.OneOf(
    [
        flip,
        random_ninety,
        iso_noise,
        color_jitter,
        emboss,
        hue_saturation,
        channel_shuffle,
    ],
    p=0.5,
)

# Training pipeline: resize, augment, then normalise and convert to a tensor.
train_transforms = A.Compose(
    [resize, horizontal_flip, random_crop, randomly_choose_one, normalize, make_tensor]
)

# Validation/test pipeline: deterministic preprocessing only.
val_transforms = A.Compose([resize, normalize, make_tensor])

In [10]:
# Wrap each dataframe in a dataset: augmentations for training, plain
# preprocessing for validation and test.
train_data = KitchwareDataset(train_df, config.IMAGE_PATH, 'train', transform=train_transforms)
val_data = KitchwareDataset(val_df, config.IMAGE_PATH, 'val', transform=val_transforms)
test_data = KitchwareDataset(test_df, config.IMAGE_PATH, 'test', transform=val_transforms)

# Only the training loader shuffles. Validation and test are iterated in a
# fixed order so evaluation is deterministic and does not draw from the RNG.
train_dataloader = DataLoader(train_data, batch_size=config.BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=config.BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=config.BATCH_SIZE, shuffle=False)

In [11]:
# Train `model` with the overridden hyperparameters, using the training loader
# for optimisation and the validation loader for the per-epoch evaluation.
# Per the log below, the best checkpoint is written to
# checkpoints/kitchenware_classification/ckpt_best.pth.
trainer.train(model=model, 
              training_params=training_params, 
              train_loader=train_dataloader,
              valid_loader=val_dataloader)

sg_trainer -INFO- Using EMA with params {'decay': 0.9999, 'beta': 15, 'exp_activation': True}
"events.out.tfevents.1671025833.w-aaish-kitchenware-classifica-972fef4f0a844bf6b8722c2a82avvgsx.1904.1" will not be deleted
"events.out.tfevents.1671026630.w-aaish-kitchenware-classifica-972fef4f0a844bf6b8722c2a82asqvbk.124.0" will not be deleted
"events.out.tfevents.1671025887.w-aaish-kitchenware-classifica-972fef4f0a844bf6b8722c2a82avvgsx.1904.2" will not be deleted
"events.out.tfevents.1671024953.w-aaish-kitchenware-classifica-972fef4f0a844bf6b8722c2a82avvgsx.1904.0" will not be deleted


sg_trainer -INFO- Started training for 5 epochs (0/4)



Train epoch 0: 100%|██████████| 313/313 [09:59<00:00,  1.91s/it, Accuracy=0.957, LabelSmoothingCrossEntropyLoss=0.53, gpu_mem=11.3] 
Validation epoch 0: 100%|██████████| 35/35 [00:26<00:00,  1.32it/s]


SUMMARY OF EPOCH 0
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.5297
└── Validation
    ├── Accuracy = 0.9838
    └── Labelsmoothingcrossentropyloss = 0.4708

base_sg_logger -INFO- Checkpoint saved in checkpoints/kitchenware_classification/ckpt_best.pth
sg_trainer -INFO- Best checkpoint overriden: validation Accuracy: 0.9838129281997681


Train epoch 1: 100%|██████████| 313/313 [10:05<00:00,  1.94s/it, Accuracy=0.985, LabelSmoothingCrossEntropyLoss=0.464, gpu_mem=11.3]
Validation epoch 1: 100%|██████████| 35/35 [00:26<00:00,  1.32it/s]


SUMMARY OF EPOCH 1
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.4645
│       ├── Best until now = 0.5297 ([32m↘ -0.0652[0m)
│       └── Epoch N-1      = 0.5297 ([32m↘ -0.0652[0m)
└── Validation
    ├── Accuracy = 0.982
    │   ├── Best until now = 0.9838 ([31m↘ -0.0018[0m)
    │   └── Epoch N-1      = 0.9838 ([31m↘ -0.0018[0m)
    └── Labelsmoothingcrossentropyloss = 0.4699
        ├── Best until now = 0.4708 ([32m↘ -0.0009[0m)
        └── Epoch N-1      = 0.4708 ([32m↘ -0.0009[0m)



Train epoch 2: 100%|██████████| 313/313 [10:02<00:00,  1.92s/it, Accuracy=0.991, LabelSmoothingCrossEntropyLoss=0.446, gpu_mem=11.3]
Validation epoch 2: 100%|██████████| 35/35 [00:26<00:00,  1.32it/s]


SUMMARY OF EPOCH 2
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.446
│       ├── Best until now = 0.4645 ([32m↘ -0.0185[0m)
│       └── Epoch N-1      = 0.4645 ([32m↘ -0.0185[0m)
└── Validation
    ├── Accuracy = 0.9838
    │   ├── Best until now = 0.9838 ([32m↘ 0.0[0m)
    │   └── Epoch N-1      = 0.982  ([32m↗ 0.0018[0m)
    └── Labelsmoothingcrossentropyloss = 0.4665
        ├── Best until now = 0.4699 ([32m↘ -0.0034[0m)
        └── Epoch N-1      = 0.4699 ([32m↘ -0.0034[0m)



Train epoch 3: 100%|██████████| 313/313 [10:00<00:00,  1.92s/it, Accuracy=0.996, LabelSmoothingCrossEntropyLoss=0.433, gpu_mem=11.3]
Validation epoch 3: 100%|██████████| 35/35 [00:25<00:00,  1.37it/s]


SUMMARY OF EPOCH 3
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.4326
│       ├── Best until now = 0.446  ([32m↘ -0.0134[0m)
│       └── Epoch N-1      = 0.446  ([32m↘ -0.0134[0m)
└── Validation
    ├── Accuracy = 0.9838
    │   ├── Best until now = 0.9838 ([32m↘ 0.0[0m)
    │   └── Epoch N-1      = 0.9838 ([32m↘ 0.0[0m)
    └── Labelsmoothingcrossentropyloss = 0.466
        ├── Best until now = 0.4665 ([32m↘ -0.0005[0m)
        └── Epoch N-1      = 0.4665 ([32m↘ -0.0005[0m)



Train epoch 4: 100%|██████████| 313/313 [09:58<00:00,  1.91s/it, Accuracy=0.997, LabelSmoothingCrossEntropyLoss=0.428, gpu_mem=11.3]
Validation epoch 4: 100%|██████████| 35/35 [00:25<00:00,  1.37it/s]


SUMMARY OF EPOCH 4
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.4284
│       ├── Best until now = 0.4326 ([32m↘ -0.0042[0m)
│       └── Epoch N-1      = 0.4326 ([32m↘ -0.0042[0m)
└── Validation
    ├── Accuracy = 0.9838
    │   ├── Best until now = 0.9838 ([32m↘ 0.0[0m)
    │   └── Epoch N-1      = 0.9838 ([32m↘ 0.0[0m)
    └── Labelsmoothingcrossentropyloss = 0.4657
        ├── Best until now = 0.466  ([32m↘ -0.0003[0m)
        └── Epoch N-1      = 0.466  ([32m↘ -0.0003[0m)

sg_trainer -INFO- RUNNING ADDITIONAL TEST ON THE AVERAGED MODEL...


Validation epoch 5: 100%|██████████| 35/35 [00:24<00:00,  1.41it/s]


SUMMARY OF EPOCH 5
├── Training
│   └── Labelsmoothingcrossentropyloss = 0.4284
│       ├── Best until now = 0.4326 ([32m↘ -0.0042[0m)
│       └── Epoch N-1      = 0.4326 ([32m↘ -0.0042[0m)
└── Validation
    ├── Accuracy = 0.9856
    │   ├── Best until now = 0.9838 ([32m↗ 0.0018[0m)
    │   └── Epoch N-1      = 0.9838 ([32m↗ 0.0018[0m)
    └── Labelsmoothingcrossentropyloss = 0.465
        ├── Best until now = 0.4657 ([32m↘ -0.0007[0m)
        └── Epoch N-1      = 0.4657 ([32m↘ -0.0007[0m)



In [12]:
# Location of the best checkpoint saved by the trainer for this experiment.
best_ckpt_path = os.path.join(trainer.checkpoints_dir_path, "ckpt_best.pth")

# Rebuild the same architecture and load the trained weights from that checkpoint.
# NOTE(review): the training log reports 0.9856 validation accuracy for the
# averaged model versus 0.9838 for the best single checkpoint -- confirm that
# ckpt_best.pth is the intended export target.
best_model = models.get(
    config.MODEL_NAME,
    num_classes=config.NUM_CLASSES,
    checkpoint_path=best_ckpt_path,
)

### Saving the model

This is the step that saves the best model that we loaded previously as an ONNX model. It writes a separate file (`kitchenware_model.onnx`) into the directory, where it can be accessed by the other files.


In [14]:
# Shape of the single-image batch used for the export. The spatial size comes
# from config so it stays in sync with the preprocessing pipeline.
export_input_size = [1, 3, config.INPUT_HEIGHT, config.INPUT_WIDTH]
onnx_path = "kitchenware_model.onnx"

# Switch to inference mode and let the model prepare itself for conversion.
best_model.eval()
best_model.prep_model_for_conversion(input_size=export_input_size)

# Random input on the same device as the model's parameters.
dummy_input = torch.randn(export_input_size, device=next(best_model.parameters()).device)

torch.onnx.export(
    best_model,
    dummy_input,
    onnx_path,
    export_params=True,         # store the trained weights inside the file
    do_constant_folding=True,   # pre-compute constant sub-expressions
    input_names=['input'],
    output_names=['output'],
)

# Fail fast if the written file is not a well-formed ONNX model. The path is
# passed (not a loaded proto) so large models are checked from disk.
onnx.checker.check_model(onnx_path)