## Setup

In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import pandas as pd
import numpy as np
from numpy import random as np_rnd
import random as rnd
import gc
import datetime
import copy
import pickle
from sklearn.metrics import accuracy_score, f1_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import transforms
from torch.optim import AdamW
from transformers import get_polynomial_decay_schedule_with_warmup

device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except:
        pass

def pickleIO(obj, src, op="w"):
    if op=="w":
        with open(src, op + "b") as f:
            pickle.dump(obj, f)
    elif op=="r":
        with open(src, op + "b") as f:
            tmp = pickle.load(f)
        return tmp
    else:
        print("unknown operation")
        return obj
    
def findIdx(data_x, col_names):
    return [int(i) for i, j in enumerate(data_x) if j in col_names]

def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)

In [4]:
class CFG:
    local = False
    debug = False
    
    epochs = 20
    early_stopping_rounds = max(10, epochs // 5)
    batch_size = 128
    number_of_labels = 10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    eta = 5e-4
    weight_decay = 1e-4

if CFG.debug:
    CFG.epochs = 5

## Loading CIFAR10 Dataset

In [5]:
# Create an instance for training. 
# When we run this code for the first time, the CIFAR10 train dataset will be downloaded locally. 
train_ds = datasets.CIFAR10(root="./data", train=True, download=True)
train_mean = [train_ds.data[:,:,:,i].mean() for i in range(train_ds.data.shape[-1])]
del train_ds; gc.collect()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data


120

**INFO: Use MinMax Normalization 0~1 (different from the original paper)**

In [6]:
# Create Train dataset loader
train_ft = transforms.Compose([
    transforms.ToTensor(),
    transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(0, 255),
    # transforms.Normalize(train_mean, 1),
    transforms.RandomHorizontalFlip(),
])
train_dl = DataLoader(
    datasets.CIFAR10(root="./data", train=True, transform=train_ft ,download=True),
    batch_size=CFG.batch_size, shuffle=True, drop_last=True
)

# Create Test dataset loader
test_ft = transforms.Compose([
    transforms.ToTensor(),
    transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(0, 255),
    # transforms.Normalize(train_mean, 1),
    # transforms.RandomHorizontalFlip(),
])
test_dl = DataLoader(
    datasets.CIFAR10(root="./data", train=False, transform=test_ft, download=True),
    batch_size=CFG.batch_size, shuffle=False, drop_last=False
)

print("The number of images in a test set is: ", len(test_dl) * CFG.batch_size)
print("The number of iteration is: ", len(train_dl))

Files already downloaded and verified
Files already downloaded and verified
The number of images in a test set is:  10112
The number of iteration is:  390


## Create Model & Define helper functions

In [7]:
def get_optimizer_params(model, eta, weight_decay):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        # apply weight decay
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'lr': eta, 'weight_decay': weight_decay},
        # don't apply weight decay for LayerNormalization layer
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'lr': eta, 'weight_decay': 0.0},
    ]
    return optimizer_parameters


def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=0.5, lr_end=1e-7
    )
    return scheduler


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

**INFO: The model is simplified architecture from original paper**


In [8]:
base_hidden_layers = 16

class VGGNet(nn.Module):
    def __init__(self, ):
        super(VGGNet,self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(3, base_hidden_layers, kernel_size=(3, 3), padding="same"),
            nn.ReLU(),
            nn.Conv2d(base_hidden_layers, base_hidden_layers, kernel_size=(3, 3), padding="same"),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),
            nn.Conv2d(base_hidden_layers, base_hidden_layers * 2, kernel_size=(3, 3), padding="same"),
            nn.ReLU(),
            nn.Conv2d(base_hidden_layers * 2, base_hidden_layers * 2, kernel_size=(3, 3), padding="same"),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),
            nn.Flatten(),
        )

        self.fc = nn.Sequential(
            nn.Dropout(),
            nn.Linear(base_hidden_layers * 2 * 8 * 8, base_hidden_layers * 2 * 8 * 8 // 4),
            nn.ReLU(),
        )

        self.classifier = nn.Linear(base_hidden_layers * 2 * 8 * 8 // 4, CFG.number_of_labels)

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return self.classifier(x)

## Training

In [9]:
def fn_training():
    return_score_dic = []
    best_score = np.inf
    
    seed_everything()
    for epoch in range(CFG.epochs):
        train_loss = AverageMeter('Loss', ':.4e')
        valid_loss = AverageMeter('Loss', ':.4e')
        train_accuracy = AverageMeter('Accuracy', ':.4e')
        valid_accuracy = AverageMeter('Accuracy', ':.4e')
        train_f1 = AverageMeter('F1', ':.4e')
        valid_f1 = AverageMeter('F1', ':.4e')

        model.train()
        for feature, label in train_dl:
            with torch.cuda.amp.autocast():
                feature = feature.to(device)
                label = label.to(device)
                output = model(feature)
                loss = criterion(output, label)

            # initialization gradients to zero
            optimizer.zero_grad()
            # get scaled gradients by float16 (default)
            grad_scaler.scale(loss).backward()
            # apply original gradients (unscaling) to parameters
            # if these gradients do not contain infs or NaNs, optimizer.step() is then called.
            # otherwise, optimizer.step() is skipped.
            grad_scaler.step(optimizer)
            grad_scaler.update()
            scheduler.step()
            
            train_loss.update(loss.item())

            y_pred = output.argmax(axis=-1).detach().cpu().numpy()
            y_true = label.detach().cpu().numpy()

            train_accuracy.update(accuracy_score(y_true, y_pred))        
            train_f1.update(f1_score(y_true, y_pred, average="macro"))

        model.eval()
        for feature, label in test_dl:
            with torch.no_grad():
                feature = feature.to(device)
                label = label.to(device)
                output = model(feature)
                loss = criterion(output, label)

            valid_loss.update(loss.item())  

            y_pred = output.argmax(axis=-1).detach().cpu().numpy()
            y_true = label.detach().cpu().numpy()

            valid_accuracy.update(accuracy_score(y_true, y_pred))        
            valid_f1.update(f1_score(y_true, y_pred, average="macro"))

        score = valid_loss.avg
        return_score_dic.append({
            "epoch": epoch,
            "train_loss": train_loss.avg,
            "valid_loss": valid_loss.avg,
            "train_accuracy": train_accuracy.avg,
            "valid_accuracy": valid_accuracy.avg,
            "train_f1": train_f1.avg,
            "valid_f1": valid_f1.avg,  
        })

        if score < best_score:
            best_score = copy.deepcopy(score)
            early_stopping_cnt = 0 
        else:
            early_stopping_cnt += 1

        if early_stopping_cnt == CFG.early_stopping_rounds:
            print("INFO : Early Stopped ! (Epoch[{0}/{1}])".format(epoch+1, CFG.epochs))  
            break

        print(f'[{epoch+1:02d}/{CFG.epochs}]:  * Train Loss {train_loss.avg:.3f} * Train Accuracy {train_accuracy.avg:.3f} * Train F1 {train_f1.avg:.3f} * Valid Loss {valid_loss.avg:.3f} * Valid Accuracy {valid_accuracy.avg:.3f} * Valid F1 {valid_f1.avg:.3f}')

    return return_score_dic

In [10]:
# model
model = VGGNet()
model.to(device)

# optimizer & scheduler
optimizer_parameters = get_optimizer_params(
    model,
    eta=CFG.eta,
    weight_decay=CFG.weight_decay
)
optimizer = AdamW(optimizer_parameters, lr=CFG.eta, weight_decay=CFG.weight_decay)
scheduler = get_scheduler(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dl) * CFG.epochs)
)

# loss function
criterion = nn.CrossEntropyLoss()

# gradient scaler for fast operation with float16
grad_scaler = torch.cuda.amp.GradScaler()

In [11]:
%%time

score_dic = fn_training()

[01/20]:  * Train Loss 2.303 * Train Accuracy 0.098 * Train F1 0.024 * Valid Loss 2.303 * Valid Accuracy 0.099 * Valid F1 0.018
[02/20]:  * Train Loss 2.303 * Train Accuracy 0.098 * Train F1 0.023 * Valid Loss 2.303 * Valid Accuracy 0.101 * Valid F1 0.018
[03/20]:  * Train Loss 2.303 * Train Accuracy 0.096 * Train F1 0.021 * Valid Loss 2.303 * Valid Accuracy 0.099 * Valid F1 0.018
[04/20]:  * Train Loss 2.229 * Train Accuracy 0.150 * Train F1 0.087 * Valid Loss 1.959 * Valid Accuracy 0.305 * Valid F1 0.286
[05/20]:  * Train Loss 1.890 * Train Accuracy 0.323 * Train F1 0.308 * Valid Loss 1.775 * Valid Accuracy 0.367 * Valid F1 0.352
[06/20]:  * Train Loss 1.763 * Train Accuracy 0.373 * Train F1 0.359 * Valid Loss 1.656 * Valid Accuracy 0.413 * Valid F1 0.396
[07/20]:  * Train Loss 1.681 * Train Accuracy 0.401 * Train F1 0.387 * Valid Loss 1.601 * Valid Accuracy 0.431 * Valid F1 0.416
[08/20]:  * Train Loss 1.632 * Train Accuracy 0.420 * Train F1 0.405 * Valid Loss 1.554 * Valid Accuracy

In [12]:
pd.DataFrame(score_dic).mean().iloc[1:]

train_loss        1.679539
valid_loss        1.615322
train_accuracy    0.388708
valid_accuracy    0.415546
train_f1          0.362206
valid_f1          0.386823
dtype: float64