## Model Training:
> #### Main ideas:
* StratifiedKFold cross validation strategy.
* Bi-Tempered loss function to handle noisy labels.
* Self-distillation using soft labels.

More info about the model training: [How to train ML models with mislabeled data](https://aminey.medium.com/how-to-train-ml-models-with-mislabeled-data-cf4bb353b3d9?sk=9f4ce905cd5c4f2d86ec3bf7b93d024c).


## Datasets
You can either download the datasets below and train the model locally or create a kaggle notebook and attach the datasets to the notebook.


* The datasets are available at https://www.kaggle.com/c/cassava-leaf-disease-classification/data

* Soft labels dataset: https://www.kaggle.com/nickuzmenkov/cassava-leaf-disease-soft-targets-09-model

* The training notebook was forked and modified from https://www.kaggle.com/yasufuminakama/cassava-resnext50-32x4d-starter-training


# Import libraries

In [1]:
import os
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # specify GPUs locally
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

import albumentations as A
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reading the data

In [2]:
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


#Path to image folder
TRAIN_PATH = '../input/cassava-leaf-disease-classification/train_images'

#Train dataset
train = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train= pd.get_dummies(train, columns=['label'])

#Soft labels dataset
soft_labels='../input/cassava-leaf-disease-soft-targets-09-model/soft_targets_2020.csv'

# Configuration parameters

In [3]:
class CFG:
    debug=False
    apex=False
    print_freq=100
    num_workers=4
    model_name='resnext50_32x4d'
    size=328
    epochs=10
    T_0=10 # CosineAnnealingWarmRestarts
    lr=1e-4
    min_lr=1e-6
    batch_size=32
    weight_decay=1e-6
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    target_size=5
    target_col='label'
    n_fold=5
    trn_fold=[0]
    train=True
    smoothing=0.05
    t1=0.3 # bi-tempered-loss temperature 1 parameter
    t2=1.0 # bi-tempered-loss temperature 2 parameter
    solf_label=True

# Utils

In [4]:
# helper functions
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

def init_logger(log_file=OUTPUT_DIR+'train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

# Cross-Validation
StratifiedKFold cross validation to split the 5 classes equally between the folds

In [5]:
folds = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)

# Dataset
Dataset with the `soft_labels_filename` parameter for self-distillation. To blend the original one hot encoded labels with the soft labels (out of folds predictions).

In [6]:
class TrainDataset(Dataset):
    def __init__(self, df, soft_labels_filename=None, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform
        if soft_labels_filename == "":
            print("soft_labels is None")
            self.soft_labels = None
        else:
            self.soft_labels = pd.read_csv(soft_labels_filename)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        #Read image
        file_name = self.file_names[index]
        file_path = f'{TRAIN_PATH}/{file_name}'
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        # Soft label for self-distillation
        if self.soft_labels is not None:
            label = torch.FloatTensor(
                (self.df.iloc[index, 1:].values * 0.7).astype(np.float16)
                + (self.soft_labels.iloc[index, 1:].values * 0.3).astype(np.float16)
            )
        else:
            label = torch.FloatTensor(self.df.iloc[index, 1:].values.astype(np.float16))

        return image, label

# Data augmentations:

In [7]:
def get_transforms(*, data):
    
    if data == 'train':
        return A.Compose([
            #Resize(CFG.size, CFG.size),
            A.RandomResizedCrop(CFG.size, CFG.size),
            A.Transpose(p=0.5),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.ShiftScaleRotate(p=0.5),
            A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            A.RandomBrightnessContrast(brightness_limit=(-0.1,0.1), contrast_limit=(-0.1, 0.1), p=0.5),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
            A.CoarseDropout(p=0.5),
            A.Cutout(p=0.5),
            ToTensorV2(),
        ])

    elif data == 'valid':
        return A.Compose([
            A.Resize(CFG.size, CFG.size),
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

# MODEL

In [8]:
class CustomResNext(nn.Module):
    def __init__(self, model_name='resnext50_32x4d', pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        n_features = self.model.fc.in_features
        self.model.fc = nn.Linear(n_features, CFG.target_size)

    def forward(self, x):
        x = self.model(x)
        return x

# Bi-Tempered-Loss
Bi-Tempered loss function to train ML models with noisy data, introduced by google AI in their blog: https://ai.googleblog.com/2019/08/bi-tempered-logistic-loss-for-training.html

Code available in the google-ai github: https://github.com/google/bi-tempered-loss

In [9]:
def log_t(u, t):
    """Compute log_t for `u'."""
    if t==1.0:
        return u.log()
    else:
        return (u.pow(1.0 - t) - 1.0) / (1.0 - t)

def exp_t(u, t):
    """Compute exp_t for `u'."""
    if t==1:
        return u.exp()
    else:
        return (1.0 + (1.0-t)*u).relu().pow(1.0 / (1.0 - t))

def compute_normalization_fixed_point(activations, t, num_iters):

    """Returns the normalization value for each example (t > 1.0).
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same shape as activation with the last dimension being 1.
    """
    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations_step_0 = activations - mu

    normalized_activations = normalized_activations_step_0

    for _ in range(num_iters):
        logt_partition = torch.sum(
                exp_t(normalized_activations, t), -1, keepdim=True)
        normalized_activations = normalized_activations_step_0 * \
                logt_partition.pow(1.0-t)

    logt_partition = torch.sum(
            exp_t(normalized_activations, t), -1, keepdim=True)
    normalization_constants = - log_t(1.0 / logt_partition, t) + mu

    return normalization_constants

def compute_normalization_binary_search(activations, t, num_iters):

    """Returns the normalization value for each example (t < 1.0).
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """

    mu, _ = torch.max(activations, -1, keepdim=True)
    normalized_activations = activations - mu

    effective_dim = \
        torch.sum(
                (normalized_activations > -1.0 / (1.0-t)).to(torch.int32),
            dim=-1, keepdim=True).to(activations.dtype)

    shape_partition = activations.shape[:-1] + (1,)
    lower = torch.zeros(shape_partition, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0/effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        logt_partition = (upper + lower)/2.0
        sum_probs = torch.sum(
                exp_t(normalized_activations - logt_partition, t),
                dim=-1, keepdim=True)
        update = (sum_probs < 1.0).to(activations.dtype)
        lower = torch.reshape(
                lower * update + (1.0-update) * logt_partition,
                shape_partition)
        upper = torch.reshape(
                upper * (1.0 - update) + update * logt_partition,
                shape_partition)

    logt_partition = (upper + lower)/2.0
    return logt_partition + mu

class ComputeNormalization(torch.autograd.Function):
    """
    Class implementing custom backward pass for compute_normalization. See compute_normalization.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        if t < 1.0:
            normalization_constants = compute_normalization_binary_search(activations, t, num_iters)
        else:
            normalization_constants = compute_normalization_fixed_point(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t=t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        t = ctx.t
        normalized_activations = activations - normalization_constants 
        probabilities = exp_t(normalized_activations, t)
        escorts = probabilities.pow(t)
        escorts = escorts / escorts.sum(dim=-1, keepdim=True)
        grad_input = escorts * grad_output
        
        return grad_input, None, None

def compute_normalization(activations, t, num_iters=5):
    """Returns the normalization value for each example. 
    Backward pass is implemented.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    return ComputeNormalization.apply(activations, t, num_iters)

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid function.
    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature tensor > 0.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    internal_activations = torch.stack([activations,
        torch.zeros_like(activations)],
        dim=-1)
    internal_probabilities = tempered_softmax(internal_activations, t, num_iters)
    return internal_probabilities[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax function.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature > 1.0.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor.
    """
    if t == 1.0:
        return activations.softmax(dim=-1)

    normalization_constants = compute_normalization(activations, t, num_iters)
    return exp_t(activations - normalization_constants, t)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss.
    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
    Returns:
      A loss tensor.
    """
    internal_activations = torch.stack([activations,
        torch.zeros_like(activations)],
        dim=-1)
    internal_labels = torch.stack([labels.to(activations.dtype),
        1.0 - labels.to(activations.dtype)],
        dim=-1)
    return bi_tempered_logistic_loss(internal_activations, 
            internal_labels,
            t1,
            t2,
            label_smoothing = label_smoothing,
            num_iters = num_iters,
            reduction = reduction)

def bi_tempered_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing=0.0,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.
    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot), 
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing parameter between [0, 1). Default 0.0.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over minibatch. Return shape (1,)
        ``'sum'``: Loss is summed over minibatch. Return shape (1,)
    Returns:
      A loss tensor.
    """

    if len(labels.shape)<len(activations.shape): #not one-hot
        labels_onehot = torch.zeros_like(activations)
        labels_onehot.scatter_(1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    if reduction == 'mean':
        return loss_values.mean()

In [10]:
class BiTemperedLogisticLoss(nn.Module): 
    def __init__(self, t1, t2, smoothing=0.0): 
        super(BiTemperedLogisticLoss, self).__init__() 
        self.t1 = t1
        self.t2 = t2
        self.smoothing = smoothing
    def forward(self, logit_label, truth_label):
        loss_label = bi_tempered_logistic_loss(
            logit_label, truth_label,
            t1=self.t1, t2=self.t2,
            label_smoothing=self.smoothing,
            reduction='none'
        )
        
        loss_label = loss_label.mean()
        return loss_label

# Helper functions

In [11]:
# Helper functions
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(images)
        loss = criterion(y_preds, labels)
        # record loss
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  #'LR: {lr:.6f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   #lr=scheduler.get_lr()[0],
                   ))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(images)
        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)
        # record accuracy
        
        preds.append(y_preds.softmax(1).to('cpu').numpy())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# Train loop

In [12]:
folds.sample()

Unnamed: 0,image_id,label,fold
9134,2615227158.jpg,4,2


In [13]:
# Train loop
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # loader
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds, soft_labels_filename=soft_labels,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, soft_labels_filename=soft_labels, 
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=True, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=False, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    
    #Scheduler
    def get_scheduler(optimizer):
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    #Model
    model = CustomResNext(CFG.model_name, pretrained=True)    
    model.to(device)

    #Optimzer, scheduler and loss
    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)
    criterion = BiTemperedLogisticLoss(t1=CFG.t1, t2=CFG.t2, smoothing=CFG.smoothing)
    #criterion = CrossEntropyLossOneHot


    # loop 
    LOGGER.info(f'Criterion: {criterion}')

    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        
        start_time = time.time()
        
        #train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        #eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        
        #Read the labels for validation
        if CFG.solf_label:
            #Reverse One hot encoding
            valid_labels=valid_folds.rename(columns={'label_0':0,'label_1':1,'label_2':2,'label_3':3,'label_4':4})
            valid_labels['label']=valid_labels.iloc[:,1:].idxmax(axis=1)
            
            #Compute accuracy with original labels
            valid_labels = valid_labels[CFG.target_col].values
        else:
            valid_labels = valid_folds[CFG.target_col].values
            
        #Save oof labels
        
        
        
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()


        
        # scoring
        score = get_score(valid_labels, preds.argmax(1))
        del valid_labels

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Accuracy: {score}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(), 
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    valid_folds[[str(c) for c in range(5)]] = check_point['preds']
    valid_folds['preds'] = check_point['preds'].argmax(1)
    

    return valid_folds

In [14]:
def main():
    def get_result(result_df):
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")

In [15]:
if __name__ == '__main__':
    main()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnext50_32x4d_ra-d733960d.pth" to /root/.cache/torch/hub/checkpoints/resnext50_32x4d_ra-d733960d.pth
Criterion: BiTemperedLogisticLoss()


Epoch: [1][0/534] Data 2.389 (2.389) Elapsed 0m 4s (remain 38m 56s) Loss: 0.2933(0.2933) Grad: 1.8009  
Epoch: [1][100/534] Data 0.000 (0.024) Elapsed 1m 5s (remain 4m 39s) Loss: 0.1039(0.1375) Grad: 1.1592  
Epoch: [1][200/534] Data 0.000 (0.012) Elapsed 2m 5s (remain 3m 27s) Loss: 0.1386(0.1249) Grad: 1.1669  
Epoch: [1][300/534] Data 0.000 (0.008) Elapsed 3m 5s (remain 2m 23s) Loss: 0.1113(0.1181) Grad: 1.0025  
Epoch: [1][400/534] Data 0.000 (0.007) Elapsed 4m 5s (remain 1m 21s) Loss: 0.1223(0.1135) Grad: 1.2078  
Epoch: [1][500/534] Data 0.000 (0.005) Elapsed 5m 6s (remain 0m 20s) Loss: 0.1264(0.1098) Grad: 1.2845  
Epoch: [1][533/534] Data 0.000 (0.005) Elapsed 5m 25s (remain 0m 0s) Loss: 0.0658(0.1086) Grad: 0.8226  
EVAL: [0/134] Data 1.521 (1.521) Elapsed 0m 1s (remain 3m 42s) Loss: 0.1126(0.1126) 
EVAL: [100/134] Data 0.680 (0.253) Elapsed 0m 40s (remain 0m 13s) Loss: 0.0896(0.0875) 


Epoch 1 - avg_train_loss: 0.1086  avg_val_loss: 0.0879  time: 379s
Epoch 1 - Accuracy: 0.8378504672897197
Epoch 1 - Save Best Score: 0.8379 Model


EVAL: [133/134] Data 0.000 (0.247) Elapsed 0m 53s (remain 0m 0s) Loss: 0.1345(0.0879) 
Epoch: [2][0/534] Data 1.845 (1.845) Elapsed 0m 2s (remain 22m 11s) Loss: 0.0741(0.0741) Grad: 0.8962  
Epoch: [2][100/534] Data 0.000 (0.019) Elapsed 1m 3s (remain 4m 30s) Loss: 0.0945(0.0915) Grad: 0.9347  
Epoch: [2][200/534] Data 0.003 (0.010) Elapsed 2m 2s (remain 3m 23s) Loss: 0.0796(0.0912) Grad: 0.7225  
Epoch: [2][300/534] Data 0.000 (0.007) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0597(0.0901) Grad: 0.7590  
Epoch: [2][400/534] Data 0.000 (0.005) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0915(0.0885) Grad: 0.8910  
Epoch: [2][500/534] Data 0.000 (0.004) Elapsed 5m 1s (remain 0m 19s) Loss: 0.0686(0.0879) Grad: 0.8598  
Epoch: [2][533/534] Data 0.000 (0.004) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0752(0.0879) Grad: 0.7848  
EVAL: [0/134] Data 1.604 (1.604) Elapsed 0m 1s (remain 3m 53s) Loss: 0.1130(0.1130) 
EVAL: [100/134] Data 0.589 (0.221) Elapsed 0m 37s (remain 0m 12s) Loss: 0.1062(0.0910) 


Epoch 2 - avg_train_loss: 0.0879  avg_val_loss: 0.0904  time: 371s
Epoch 2 - Accuracy: 0.8558411214953271
Epoch 2 - Save Best Score: 0.8558 Model


EVAL: [133/134] Data 0.000 (0.214) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1184(0.0904) 
Epoch: [3][0/534] Data 1.764 (1.764) Elapsed 0m 2s (remain 21m 45s) Loss: 0.0857(0.0857) Grad: 0.8318  
Epoch: [3][100/534] Data 0.000 (0.018) Elapsed 1m 3s (remain 4m 30s) Loss: 0.0599(0.0799) Grad: 0.3873  
Epoch: [3][200/534] Data 0.000 (0.009) Elapsed 2m 2s (remain 3m 23s) Loss: 0.0690(0.0821) Grad: 0.5025  
Epoch: [3][300/534] Data 0.000 (0.007) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0765(0.0824) Grad: 0.8225  
Epoch: [3][400/534] Data 0.005 (0.005) Elapsed 4m 2s (remain 1m 20s) Loss: 0.1214(0.0830) Grad: 1.0834  
Epoch: [3][500/534] Data 0.000 (0.004) Elapsed 5m 2s (remain 0m 19s) Loss: 0.0687(0.0832) Grad: 0.6559  
Epoch: [3][533/534] Data 0.000 (0.004) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0725(0.0832) Grad: 0.5923  
EVAL: [0/134] Data 1.464 (1.464) Elapsed 0m 1s (remain 3m 35s) Loss: 0.0985(0.0985) 
EVAL: [100/134] Data 0.802 (0.225) Elapsed 0m 37s (remain 0m 12s) Loss: 0.0782(0.0792) 


Epoch 3 - avg_train_loss: 0.0832  avg_val_loss: 0.0792  time: 371s
Epoch 3 - Accuracy: 0.8658878504672897
Epoch 3 - Save Best Score: 0.8659 Model


EVAL: [133/134] Data 0.000 (0.218) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1102(0.0792) 
Epoch: [4][0/534] Data 2.232 (2.232) Elapsed 0m 2s (remain 26m 0s) Loss: 0.0336(0.0336) Grad: 0.3723  
Epoch: [4][100/534] Data 0.000 (0.023) Elapsed 1m 3s (remain 4m 31s) Loss: 0.0562(0.0777) Grad: 0.5306  
Epoch: [4][200/534] Data 0.000 (0.012) Elapsed 2m 2s (remain 3m 23s) Loss: 0.1069(0.0774) Grad: 0.8286  
Epoch: [4][300/534] Data 0.000 (0.008) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0971(0.0772) Grad: 0.6754  
Epoch: [4][400/534] Data 0.000 (0.006) Elapsed 4m 3s (remain 1m 20s) Loss: 0.0919(0.0776) Grad: 0.8855  
Epoch: [4][500/534] Data 0.000 (0.005) Elapsed 5m 3s (remain 0m 19s) Loss: 0.1096(0.0779) Grad: 0.9963  
Epoch: [4][533/534] Data 0.000 (0.005) Elapsed 5m 22s (remain 0m 0s) Loss: 0.0955(0.0781) Grad: 0.6026  
EVAL: [0/134] Data 1.400 (1.400) Elapsed 0m 1s (remain 3m 25s) Loss: 0.1012(0.1012) 
EVAL: [100/134] Data 0.926 (0.229) Elapsed 0m 38s (remain 0m 12s) Loss: 0.0788(0.0777) 


Epoch 4 - avg_train_loss: 0.0781  avg_val_loss: 0.0774  time: 371s
Epoch 4 - Accuracy: 0.8707943925233644
Epoch 4 - Save Best Score: 0.8708 Model


EVAL: [133/134] Data 0.000 (0.213) Elapsed 0m 48s (remain 0m 0s) Loss: 0.1101(0.0774) 
Epoch: [5][0/534] Data 1.797 (1.797) Elapsed 0m 2s (remain 22m 0s) Loss: 0.0818(0.0818) Grad: 0.6481  
Epoch: [5][100/534] Data 0.001 (0.018) Elapsed 1m 2s (remain 4m 30s) Loss: 0.1109(0.0751) Grad: 0.9106  
Epoch: [5][200/534] Data 0.000 (0.010) Elapsed 2m 2s (remain 3m 23s) Loss: 0.0791(0.0760) Grad: 0.5616  
Epoch: [5][300/534] Data 0.000 (0.007) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0607(0.0762) Grad: 0.5065  
Epoch: [5][400/534] Data 0.000 (0.005) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0667(0.0755) Grad: 0.5711  
Epoch: [5][500/534] Data 0.000 (0.004) Elapsed 5m 1s (remain 0m 19s) Loss: 0.0788(0.0754) Grad: 0.7910  
Epoch: [5][533/534] Data 0.000 (0.004) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0761(0.0756) Grad: 0.7487  
EVAL: [0/134] Data 1.717 (1.717) Elapsed 0m 1s (remain 4m 7s) Loss: 0.0919(0.0919) 
EVAL: [100/134] Data 0.484 (0.223) Elapsed 0m 37s (remain 0m 12s) Loss: 0.0857(0.0769) 


Epoch 5 - avg_train_loss: 0.0756  avg_val_loss: 0.0778  time: 370s
Epoch 5 - Accuracy: 0.8710280373831776
Epoch 5 - Save Best Score: 0.8710 Model


EVAL: [133/134] Data 0.000 (0.215) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1227(0.0778) 
Epoch: [6][0/534] Data 1.991 (1.991) Elapsed 0m 2s (remain 23m 16s) Loss: 0.0747(0.0747) Grad: 0.5342  
Epoch: [6][100/534] Data 0.000 (0.020) Elapsed 1m 3s (remain 4m 31s) Loss: 0.0811(0.0758) Grad: 0.6503  
Epoch: [6][200/534] Data 0.000 (0.010) Elapsed 2m 3s (remain 3m 24s) Loss: 0.0569(0.0752) Grad: 0.5478  
Epoch: [6][300/534] Data 0.000 (0.007) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0391(0.0748) Grad: 0.5272  
Epoch: [6][400/534] Data 0.000 (0.006) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0641(0.0741) Grad: 0.5892  
Epoch: [6][500/534] Data 0.000 (0.005) Elapsed 5m 2s (remain 0m 19s) Loss: 0.0771(0.0737) Grad: 0.5781  
Epoch: [6][533/534] Data 0.000 (0.004) Elapsed 5m 22s (remain 0m 0s) Loss: 0.0744(0.0734) Grad: 0.8270  
EVAL: [0/134] Data 1.530 (1.530) Elapsed 0m 1s (remain 3m 43s) Loss: 0.0954(0.0954) 
EVAL: [100/134] Data 1.128 (0.233) Elapsed 0m 38s (remain 0m 12s) Loss: 0.0736(0.0736) 


Epoch 6 - avg_train_loss: 0.0734  avg_val_loss: 0.0741  time: 372s
Epoch 6 - Accuracy: 0.880607476635514
Epoch 6 - Save Best Score: 0.8806 Model


EVAL: [133/134] Data 0.000 (0.224) Elapsed 0m 50s (remain 0m 0s) Loss: 0.1337(0.0741) 
Epoch: [7][0/534] Data 1.750 (1.750) Elapsed 0m 2s (remain 21m 14s) Loss: 0.0800(0.0800) Grad: 0.5425  
Epoch: [7][100/534] Data 0.000 (0.018) Elapsed 1m 2s (remain 4m 29s) Loss: 0.1016(0.0673) Grad: 0.9042  
Epoch: [7][200/534] Data 0.000 (0.009) Elapsed 2m 2s (remain 3m 22s) Loss: 0.0962(0.0688) Grad: 0.7449  
Epoch: [7][300/534] Data 0.000 (0.007) Elapsed 3m 1s (remain 2m 20s) Loss: 0.0311(0.0686) Grad: 0.5540  
Epoch: [7][400/534] Data 0.010 (0.005) Elapsed 4m 1s (remain 1m 20s) Loss: 0.0711(0.0689) Grad: 0.7482  
Epoch: [7][500/534] Data 0.000 (0.004) Elapsed 5m 1s (remain 0m 19s) Loss: 0.0479(0.0693) Grad: 0.7920  
Epoch: [7][533/534] Data 0.000 (0.004) Elapsed 5m 20s (remain 0m 0s) Loss: 0.0543(0.0689) Grad: 0.5413  
EVAL: [0/134] Data 2.120 (2.120) Elapsed 0m 2s (remain 5m 1s) Loss: 0.0892(0.0892) 
EVAL: [100/134] Data 0.826 (0.234) Elapsed 0m 38s (remain 0m 12s) Loss: 0.0685(0.0741) 


Epoch 7 - avg_train_loss: 0.0689  avg_val_loss: 0.0745  time: 370s
Epoch 7 - Accuracy: 0.8789719626168224


EVAL: [133/134] Data 0.000 (0.220) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1334(0.0745) 
Epoch: [8][0/534] Data 2.067 (2.067) Elapsed 0m 2s (remain 24m 36s) Loss: 0.0582(0.0582) Grad: 0.6563  
Epoch: [8][100/534] Data 0.000 (0.021) Elapsed 1m 2s (remain 4m 29s) Loss: 0.0621(0.0669) Grad: 0.8415  
Epoch: [8][200/534] Data 0.000 (0.011) Elapsed 2m 2s (remain 3m 23s) Loss: 0.0508(0.0675) Grad: 0.5527  
Epoch: [8][300/534] Data 0.000 (0.007) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0514(0.0660) Grad: 0.6277  
Epoch: [8][400/534] Data 0.000 (0.006) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0908(0.0666) Grad: 0.6926  
Epoch: [8][500/534] Data 0.000 (0.005) Elapsed 5m 1s (remain 0m 19s) Loss: 0.0617(0.0666) Grad: 0.9075  
Epoch: [8][533/534] Data 0.000 (0.005) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0689(0.0666) Grad: 0.7719  
EVAL: [0/134] Data 1.459 (1.459) Elapsed 0m 1s (remain 3m 33s) Loss: 0.0941(0.0941) 
EVAL: [100/134] Data 0.652 (0.228) Elapsed 0m 38s (remain 0m 12s) Loss: 0.0750(0.0708) 


Epoch 8 - avg_train_loss: 0.0666  avg_val_loss: 0.0710  time: 371s
Epoch 8 - Accuracy: 0.880607476635514


EVAL: [133/134] Data 0.000 (0.217) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1406(0.0710) 
Epoch: [9][0/534] Data 1.987 (1.987) Elapsed 0m 2s (remain 23m 56s) Loss: 0.0740(0.0740) Grad: 0.8147  
Epoch: [9][100/534] Data 0.000 (0.020) Elapsed 1m 3s (remain 4m 31s) Loss: 0.0602(0.0645) Grad: 0.6171  
Epoch: [9][200/534] Data 0.000 (0.011) Elapsed 2m 3s (remain 3m 24s) Loss: 0.0335(0.0640) Grad: 0.4622  
Epoch: [9][300/534] Data 0.000 (0.007) Elapsed 3m 3s (remain 2m 21s) Loss: 0.0530(0.0634) Grad: 0.6419  
Epoch: [9][400/534] Data 0.000 (0.006) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0386(0.0630) Grad: 0.6531  
Epoch: [9][500/534] Data 0.000 (0.005) Elapsed 5m 2s (remain 0m 19s) Loss: 0.0742(0.0631) Grad: 0.7997  
Epoch: [9][533/534] Data 0.000 (0.004) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0604(0.0629) Grad: 0.6158  
EVAL: [0/134] Data 1.454 (1.454) Elapsed 0m 1s (remain 3m 37s) Loss: 0.0909(0.0909) 
EVAL: [100/134] Data 1.434 (0.236) Elapsed 0m 39s (remain 0m 12s) Loss: 0.0666(0.0703) 


Epoch 9 - avg_train_loss: 0.0629  avg_val_loss: 0.0707  time: 372s
Epoch 9 - Accuracy: 0.8841121495327103
Epoch 9 - Save Best Score: 0.8841 Model


EVAL: [133/134] Data 0.000 (0.221) Elapsed 0m 50s (remain 0m 0s) Loss: 0.1307(0.0707) 
Epoch: [10][0/534] Data 1.738 (1.738) Elapsed 0m 2s (remain 21m 11s) Loss: 0.0591(0.0591) Grad: 0.7369  
Epoch: [10][100/534] Data 0.000 (0.018) Elapsed 1m 3s (remain 4m 30s) Loss: 0.0636(0.0636) Grad: 0.7908  
Epoch: [10][200/534] Data 0.000 (0.009) Elapsed 2m 2s (remain 3m 23s) Loss: 0.0841(0.0622) Grad: 0.8928  
Epoch: [10][300/534] Data 0.000 (0.006) Elapsed 3m 2s (remain 2m 21s) Loss: 0.0751(0.0623) Grad: 0.7309  
Epoch: [10][400/534] Data 0.000 (0.005) Elapsed 4m 2s (remain 1m 20s) Loss: 0.0306(0.0621) Grad: 0.5439  
Epoch: [10][500/534] Data 0.000 (0.004) Elapsed 5m 2s (remain 0m 19s) Loss: 0.0473(0.0618) Grad: 0.7679  
Epoch: [10][533/534] Data 0.000 (0.004) Elapsed 5m 21s (remain 0m 0s) Loss: 0.0540(0.0620) Grad: 0.8300  
EVAL: [0/134] Data 1.432 (1.432) Elapsed 0m 1s (remain 3m 31s) Loss: 0.0904(0.0904) 
EVAL: [100/134] Data 0.964 (0.232) Elapsed 0m 38s (remain 0m 12s) Loss: 0.0681(0.0694) 

Epoch 10 - avg_train_loss: 0.0620  avg_val_loss: 0.0698  time: 371s
Epoch 10 - Accuracy: 0.8827102803738318


EVAL: [133/134] Data 0.000 (0.219) Elapsed 0m 49s (remain 0m 0s) Loss: 0.1321(0.0698) 
