For Google Colab, run the cells below

In [1]:
GOOGLE_COLAB = False
if GOOGLE_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    #!nvidia-smi
    !pip install albumentations==0.4.6 --quiet
    !pip install pytorch-lightning --quiet
    !pip install timm --quiet
    !pip install transformers --quiet

In [2]:
if GOOGLE_COLAB:
    # Install RAPIDS
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !bash rapidsai-csp-utils/colab/rapids-colab.sh stable

    import sys, os, shutil

    sys.path.append('/usr/local/lib/python3.7/site-packages/')
    os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
    os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
    os.environ["CONDA_PREFIX"] = "/usr/local"
    for so in ['cudf', 'rmm', 'nccl', 'cuml', 'cugraph', 'xgboost', 'cuspatial']:
      fn = 'lib'+so+'.so'
      source_fn = '/usr/local/lib/'+fn
      dest_fn = '/usr/lib/'+fn
      if os.path.exists(source_fn):
        print(f'Copying {source_fn} to {dest_fn}')
        shutil.copyfile(source_fn, dest_fn)
    # fix for BlazingSQL import issue
    # ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /usr/local/lib/python3.7/site-packages/../../libblazingsql-engine.so)
    if not os.path.exists('/usr/lib64'):
        os.makedirs('/usr/lib64')
    for so_file in os.listdir('/usr/local/lib'):
      if 'libstdc' in so_file:
        shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib64/'+so_file)
        shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib/x86_64-linux-gnu/'+so_file)

# Shopee Deep Learning

In [3]:
INTERNET_ON = False#True # Whether the notebook can use internet # Should be set to False for submissions
if INTERNET_ON:
    ! pip install neptune-client --quiet

To make use of Neptune, you need to have an account

In [4]:
if INTERNET_ON:
    # Local import: this cell runs before the notebook's main import cell.
    import os

    # Read the credentials from the environment so that the secret never has
    # to be written into (and shared with) the notebook. The "..." placeholders
    # are kept as fallbacks for anyone who still wants to fill them in by hand.
    NEPTUNE_KEY = os.environ.get("NEPTUNE_API_TOKEN", "...")
    NEPTUNE_PROJECT_NAME = os.environ.get("NEPTUNE_PROJECT", "...")

# Load Libraries

In [5]:
# System and OS
import sys
sys.path = [
    '../input/timm-pytorch-image-models/pytorch-image-models-master'
] + sys.path
import os

# Fundamentals
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
#from matplotlib import style
#style.use('fivethirtyeight')

# Premade Modules and Models
import timm # for efficientnet backbone


# Machine Learning Standards
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

# Garbage Collector
import gc

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

if torch.cuda.is_available():
    # cuDF mirrors Pandas but with GPU integration
    import cudf
    # cuML mirrors Sci-kit learn but with GPU integration
    import cuml
    from cuml.feature_extraction.text import TfidfVectorizer
    from cuml.neighbors import NearestNeighbors
    # CuPy mirrors Scipy but with GPU integration
    import cupy

# Image Augmentation
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Natural Language Processing
# Transformers
import transformers
from transformers import AutoModel, AutoTokenizer
#from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel

# Fast Training 
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import BackboneFinetuning, StochasticWeightAveraging, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.metrics.classification import F1


# Progressbars
from tqdm.notebook import tqdm 

# Randomness
import random

# For reproducibility: seed every random number generator used in the notebook
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)           # Python standard library
np.random.seed(SEED)        # NumPy (legacy global generator)
torch.manual_seed(SEED)     # PyTorch
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

# Configuration

In [6]:
class Config:
    """Central place for every tunable setting of the notebook.

    All settings are class attributes and are read as ``Config.<NAME>``; the
    class is never instantiated in the visible code. The image settings are
    only defined when ``MODEL_TYPE`` contains ``'Image'`` and the text
    settings only when it contains ``'Text'``.

    Raises:
        ValueError: at class-definition time, when ``EXPERIMENT`` has no text
            model associated with it (DistilBERT mode only).
    """
    # ---- Reproducibility ---- 
    SEED = SEED
    
    # ----  Directories & Files ---- 
    ROOT_DIR = '../input/shopee-product-matching'
    TRAIN_DIR = '../input/shopee-product-matching/train_images'
    TEST_DIR = '../input/shopee-product-matching/test_images'
    TRAIN_CSV = '../input/shopee-product-matching/train.csv'
    TEST_CSV = '../input/shopee-product-matching/test.csv'
    
    # ---- Ensemble ----
    ENSEMBLE = False#True # Whether to ensemble the embeddings of multiple models
    
    # ---- Experiment ----
    DESCRIPTION = "DistilBert" # Comment on the training
    EXPERIMENT = 0 # ID of the experiment, used for Ensembling
    MAX_NUM_EXPERIMENTS = 4 # Max number of experiments on which we ensemble
    MODEL_TYPE = 'Image_And_Text' #'Text' #'Image' # Model type to train and predict 
    # 'Image_And_Text' is only used to combine the predictions from the image and text model
    
    # ---- Training ----
    MAX_EPOCHS = 20 #25 # Maximal number of epochs (influence SWA callback !)
    WEIGHT_DECAY = 0.00005 # L2 Regularization
    NORM_CLIPPING = 0.015 # Clip the gradient norm to prevent gradient explosion (value advised for nfnets)
    BATCH_SIZE = 2048 #64
    LR = 3.5e-4 #1e-3  # Learning Rate
    NUM_WORKERS = 4
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # not working... -> #PRECISION = 16 # 16-bit precision rather than 32-bit (at least 2x faster, even more if GPU optimised for 16-bit precision)
    VALIDATION_SIZE = 0.43 #0.5 #10032 # Size of dataset used for validation
    COSINE_ANNEALING_WARM_RESTARTS = True #False # Whether to use the cosine annealing warm restart scheduler
    BACKBONE_FINETUNING = False # Whether to use the backbone fine-tuning callback
    STOCHASTIC_WEIGHT_AVERAGING = True # Whether to use stochastic weight averaging during the last epochs
    EARLY_STOPPING = False # Whether to use the early stopping callback

    # ---- Cosine Annealing Warm Restarts ----
    #if COSINE_ANNEALING_WARM_RESTARTS:
    CAWR_T0 = 15 #5 # Period of the cosine annealing warm restart scheduler
    CAWR_T_MULT = 1 #2 # Multiplier applied to increase the period during training
    CAWR_MIN_LR = 1e-7 # Minimum learning rate that the scheduler is allowed to reach

    # ---- Backbone Finetuning ----
    #if BACKBONE_FINETUNING:  
    UNFREEZE_EPOCH = 5 # Unfreeze backbone parameters from epoch <UNFREEZE_EPOCH> onwards
    MULTIPLY_LR = 1.5 # Multiply the lr by <MULTIPLY_LR> at each epoch
    BACKBONE_RATIO_LR = 0.1 # The backbone will be tuned at first at a <BACKBONE_RATIO_LR> ratio of the model head lr
    
    # ---- Stochastic Weight Averaging ----
    #if STOCHASTIC_WEIGHT_AVERAGING:
    SWA_EPOCH_START = 0.80 # At (SWA_EPOCH_START * max_epoch)-th epoch, start SWA
    SWA_ANNEALING_EPOCHS = 10 #5 # Number of epochs in the annealing phase
    
    # ---- Early Stopping ----
    #if EARLY_STOPPING:
    PATIENCE = 5 # Minimal number of epochs without improvements to stop training
    MIN_DELTA = 0.001 # Minimal perceivable change in the score between 2 epochs
    
    # ---- Classification ---- 
    NUM_CLASSES = 11014 # Number of different <label_group> in the dataset

    # ---- Loss ----
    LOSS = 'CurricularFace' #'CurricularFace' #'AdaCos' #'ArcFace' # None
    #if LOSS != 'AdaCos':
    SCALE = 30 # Scale logits for ArcFace
    MARGIN = 0.5 # Additional margin for ArcFace

    # ---- Model ----
    SAVE_MODEL = True #False #True # Whether to save the model weights in Neptune
    MODEL_DIR = '/shopee-models' #'./drive/MyDrive/Shopee/shopeemodels' # Path where the model is saved
    USE_FC = True # whether to add a classifier made of fc layers on top of the backbone
    #if USE_FC:
    DROPOUT = 0 
    FC_DIM = 512 # Number of dimensions at the output of the fully connected layer
    IS_TRAINING_FROM_SCRATCH = False # False = train only the head of the model # True = train the whole model
    
    if 'Image' in MODEL_TYPE:
        # ----  Images ----  
        IMG_SIZE = 512 #128
        IMG_N_CHANNELS = 3 # R-G-B
        IMG_MEAN = [0.485, 0.456, 0.406] # Mean of ImageNet
        IMG_STD = [0.229, 0.224, 0.225] # Std of ImageNet

        # ---- Data Augmentation ----
        DATA_AUGMENTATION = True # Whether to use data augmentation
        # Always defined (even when DATA_AUGMENTATION is False) so that reading
        # Config.BOOST_DATA_AUGMENTATION can never raise an AttributeError.
        BOOST_DATA_AUGMENTATION = False # Whether to boost augmentation with added noise 
                                        # to prevent overfitting (bad performance)

        # ---- Image Model ---- 
        IMAGE_MODEL_NAME = 'eca_nfnet_l0' #'resnet34' #'resnet50' #'eca_nfnet_l0'
        ACTIVATIONS = 'Mish' #'ReLU' # 'SiLU' #'Mish'
    
    if 'Text' in MODEL_TYPE:
        # ---- Text Model ----
        # TEXT_MODEL_NAME and TEXT_MODEL_PATH are defined on every branch
        # because other cells read Config.TEXT_MODEL_NAME unconditionally.
        DISTIL_BERT = True
        if DISTIL_BERT:
            if EXPERIMENT == 0 or EXPERIMENT == 1:
                TEXT_MODEL_NAME = 'distilbertbaseindonesian'
                TEXT_MODEL_PATH = '../input/cahya' + TEXT_MODEL_NAME
            elif EXPERIMENT == 2 or EXPERIMENT == 3:
                TEXT_MODEL_NAME = 'paraphrase-xlm-r-multilingual-v1'
                TEXT_MODEL_PATH = '../input/sentence-transformers/' + TEXT_MODEL_NAME
            else:
                # Fail here with a clear message instead of later with an
                # AttributeError on Config.TEXT_MODEL_NAME.
                raise ValueError(f'No text model is configured for EXPERIMENT={EXPERIMENT}')
        else:
            TEXT_MODEL_NAME = 'bertbaseindonesian522M'
            TEXT_MODEL_PATH = '../input/cahya' + TEXT_MODEL_NAME
        
        if ENSEMBLE:
            TEXT_BACKBONE_PATHS = ['../input/cahyadistilbertbaseindonesian', 
                                   '../input/cahyadistilbertbaseindonesian',
                                   '../input/sentence-transformers/paraphrase-xlm-r-multilingual-v1',
                                   '../input/sentence-transformers/paraphrase-xlm-r-multilingual-v1']
            TEXT_MODEL_PATHS = ['...']
        
        # ---- Tokenizer ----
        TOKENIZER_MAX_LENGTH = 70

## Load Test Data

In [7]:
# Load the test set; its size tells whether the notebook runs as a submission.
test = pd.read_csv(Config.TEST_CSV)

# More than 3 test rows -> Submission Mode, otherwise -> Commit Mode
IS_SUBMISSION = len(test) > 3
if not IS_SUBMISSION:
    print('Commit Mode')

# NOTE(review): this unconditional assignment overrides the detection above, so
# every `if not IS_SUBMISSION:` cell is skipped. Presumably a manual switch used
# to exercise the inference path - confirm before relying on the auto-detection.
IS_SUBMISSION = True

Commit Mode


## Neptune Logger 
To save all experiments

In [8]:
#NEPTUNE_LOGGER = True
if INTERNET_ON:
    from pytorch_lightning.loggers.neptune import NeptuneLogger

    # Hyper-parameters that are logged for every experiment
    PARAMS = dict(
        seed=Config.SEED,
        model_type=Config.MODEL_TYPE,
        loss=Config.LOSS,
        is_training_from_scratch=Config.IS_TRAINING_FROM_SCRATCH,
        validation_size=Config.VALIDATION_SIZE,
        max_epochs=Config.MAX_EPOCHS,
        batch_size=Config.BATCH_SIZE,
        lr=Config.LR,
    )

    # Optional hyper-parameters: only logged when the matching option is in use
    if Config.WEIGHT_DECAY > 0.0:
        PARAMS['weight_decay'] = Config.WEIGHT_DECAY

    if Config.DESCRIPTION != '':
        PARAMS['description'] = Config.DESCRIPTION

    if 'Image' in Config.MODEL_TYPE:
        PARAMS['image_model_name'] = Config.IMAGE_MODEL_NAME
        PARAMS['activations'] = Config.ACTIVATIONS
        PARAMS['img_size'] = Config.IMG_SIZE
        PARAMS['data_augmentation'] = Config.DATA_AUGMENTATION

        if Config.DATA_AUGMENTATION:
            PARAMS['boost_data_augmentation'] = Config.BOOST_DATA_AUGMENTATION

    if 'Text' in Config.MODEL_TYPE:
        PARAMS['text_model_name'] = Config.TEXT_MODEL_NAME
        PARAMS['tokenizer_max_length'] = Config.TOKENIZER_MAX_LENGTH

    if Config.NORM_CLIPPING < 1.0:
        PARAMS['norm_clipping'] = Config.NORM_CLIPPING

    if Config.USE_FC:
        PARAMS['use_fc'] = Config.USE_FC
        PARAMS['dropout'] = Config.DROPOUT

    if Config.COSINE_ANNEALING_WARM_RESTARTS:
        PARAMS['cawr_T0'] = Config.CAWR_T0
        PARAMS['cawr_T_mult'] = Config.CAWR_T_MULT
        PARAMS['cawr_min_lr'] = Config.CAWR_MIN_LR

    if Config.BACKBONE_FINETUNING:
        PARAMS['unfreeze_epoch'] = Config.UNFREEZE_EPOCH
        PARAMS['multiply_lr'] = Config.MULTIPLY_LR
        PARAMS['initial_backbone_ratio_lr'] = Config.BACKBONE_RATIO_LR

    if Config.STOCHASTIC_WEIGHT_AVERAGING:
        PARAMS['swa_epoch_start'] = Config.SWA_EPOCH_START
        PARAMS['swa_annealing_epochs'] = Config.SWA_ANNEALING_EPOCHS

    if Config.EARLY_STOPPING:
        PARAMS['patience'] = Config.PATIENCE
        PARAMS['min_delta'] = Config.MIN_DELTA

    if Config.LOSS in ("ArcFace", "CurricularFace"):
        PARAMS['scale'] = Config.SCALE
        PARAMS['margin'] = Config.MARGIN

    # The logger is created with close_after_fit=False
    NEPTUNE_LOGGER = NeptuneLogger(
        api_key=NEPTUNE_KEY,
        project_name=NEPTUNE_PROJECT_NAME,
        close_after_fit=False,
        params=PARAMS,
    )

# Pytorch Lightning Callbacks

In [9]:
if not IS_SUBMISSION:
    # Callbacks enabled through Config, collected in the order they are declared
    callbacks = []

    if Config.BACKBONE_FINETUNING:
        # Finetune the backbone model at lower lr progressively increasing to align to the model head lr
        backbone_finetuning_callback = BackboneFinetuning(
            # unfreeze backbone parameters from epoch <UNFREEZE_EPOCH> onwards
            unfreeze_backbone_at_epoch=Config.UNFREEZE_EPOCH,
            # multiply the lr by <MULTIPLY_LR> at each epoch
            lambda_func=lambda epoch: Config.MULTIPLY_LR,
            # the backbone will be tuned at first at a <BACKBONE_RATIO_LR> ratio of the model head lr
            backbone_initial_ratio_lr=Config.BACKBONE_RATIO_LR,
            verbose=True,
        )
        callbacks += [backbone_finetuning_callback]

    if Config.STOCHASTIC_WEIGHT_AVERAGING:
        # Stochastic Weight Averaging is a very efficient approach to obtain more generalizable model (better test error)
        # source: https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/
        SWA_callback = StochasticWeightAveraging(
            swa_epoch_start=Config.SWA_EPOCH_START,
            annealing_epochs=Config.SWA_ANNEALING_EPOCHS,
        )
        callbacks += [SWA_callback]

    if Config.COSINE_ANNEALING_WARM_RESTARTS:
        # Log the learning rate once per epoch
        lr_monitor_callback = LearningRateMonitor(logging_interval='epoch')
        callbacks += [lr_monitor_callback]

    if Config.EARLY_STOPPING:
        # Stop the training if the <val_loss> does not improve by at least <min_delta> during <patience> epochs
        # Pay attention that early stopping does not enter in conflict with SWA
        early_stopping_callback = EarlyStopping(
            monitor='val_loss',
            mode='min',
            min_delta=Config.MIN_DELTA,
            patience=Config.PATIENCE,
            verbose=True,
        )
        callbacks += [early_stopping_callback]

    if Config.SAVE_MODEL:
        # The checkpoint file name starts with the model description;
        # the image model takes precedence over the text model.
        if 'Image' in Config.MODEL_TYPE:
            filename_prefix = f'{Config.IMAGE_MODEL_NAME}_{Config.ACTIVATIONS}_{Config.LOSS}'
        elif 'Text' in Config.MODEL_TYPE:
            filename_prefix = f'{Config.TEXT_MODEL_NAME}_{Config.LOSS}'
        else:
            filename_prefix = 'model'

        # Checkpoint the model at each epoch where the <val_loss> improves
        checkpoint_callback = ModelCheckpoint(
            dirpath=Config.MODEL_DIR,
            filename=filename_prefix + '-{epoch:02d}-{val_loss:.2f}',
            monitor='val_loss',
            mode='min',
            save_top_k=2,
        )
        callbacks += [checkpoint_callback]

# Mish Activation Layer
State-of-the-art activation function that outperforms Swish and ReLU in DNNs.

In [10]:
class Mish(nn.Module):
    """Mish activation layer: ``x * tanh(softplus(x))``, applied element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return the Mish activation of ``input`` (same shape as ``input``)."""
        softplus = F.softplus(input)
        return input * torch.tanh(softplus)

# Loss Functions

## Arc Margin Product 
Additive Angular Margin Loss (ArcFace)
- Loss function to obtain highly discriminative image features
- Used initially in 2018 for state-of-the-art Face Recognition

In [11]:
# credits: https://arxiv.org/pdf/1801.07698.pdf
#          https://github.com/ronghuaiyang/arcface-pytorch/blob/master/models/metrics.py
class ArcMarginProduct(nn.Module):
    """ArcFace head: cosine logits with an additive angular margin on the target class.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (one weight vector per class).
        scale: factor by which the final logits are multiplied.
        margin: angular margin (in radians) added to the target-class angle.
        easy_margin: if True, the margin is only applied where ``cosine > 0``;
            otherwise where ``cosine > cos(pi - margin)``, with
            ``cosine - sin(pi - margin) * margin`` used elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot targets
            (``0`` disables smoothing).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        # torch.empty is enough: the values are overwritten by the Xavier init below
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Compute the scaled margin logits.

        Args:
            input: embeddings, one row per sample (``in_features`` columns).
            label: class index of each sample; reshaped to a column and cast to long.

        Returns:
            Tensor with one row per sample and ``out_features`` columns,
            on the same device as ``input``.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before the square root: rounding can push 1 - cos^2 slightly
        # below zero, which would produce NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + margin)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # zeros_like keeps the one-hot tensor on the device (and dtype) of the
        # logits instead of hard-coding 'cuda', so the layer also runs on CPU.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Target entries take phi, all other entries keep the plain cosine
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

## AdaCos
Adaptively Scaling Cosine Logits
- State-of-the-art since 2019 in Deep Face Recognition (https://arxiv.org/abs/1905.00292)
- Adaptation of CosFace
- Outperforms CosFace and ArcFace
- No parameter to tune -> self-tuning loss

In [12]:
# credits: https://catalyst-team.github.io/catalyst/_modules/catalyst/contrib/nn/modules/cosface.html#AdaCos
class AdaCos(nn.Module):
    r"""Implementation of
    `AdaCos\: Adaptively Scaling Cosine Logits for Effectively Learning Deep Face Representations`_.

    .. _AdaCos\: Adaptively Scaling Cosine Logits for\
        Effectively Learning Deep Face Representations:
        https://arxiv.org/abs/1905.00292

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        dynamical_s: option to use dynamical scale parameter.
            If ``False`` then will be used initial scale.
            Default: ``True``.
        eps: operation accuracy.
            Default: ``1e-6``.

    Shape:
        - Input: :math:`(batch, H_{in})` where
          :math:`H_{in} = in\_features`.
        - Output: :math:`(batch, H_{out})` where
          :math:`H_{out} = out\_features`.

    Example:
        >>> layer = AdaCos(5, 10)
        >>> loss_fn = nn.CrossEntropyLoss()
        >>> embedding = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.empty(3, dtype=torch.long).random_(10)
        >>> output = layer(embedding, target)
        >>> loss = loss_fn(output, target)
        >>> loss.backward()

    """  # noqa: E501,W505

    def __init__(  # noqa: D107
        self, in_features: int, out_features: int, dynamical_s: bool = True, eps: float = 1e-6,
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dynamical_s = dynamical_s
        # Initial scale; replaced by a 0-dim tensor once the dynamic update has run
        self.s = math.sqrt(2) * math.log(out_features - 1)
        self.eps = eps

        # torch.empty is enough: the values are overwritten by the Xavier init below
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def __repr__(self) -> str:
        """Object representation."""
        rep = (
            "AdaCos("
            f"in_features={self.in_features},"
            f"out_features={self.out_features},"
            f"s={self.s},"
            f"eps={self.eps}"
            ")"
        )
        return rep

    def forward(self, input: torch.Tensor, target: torch.LongTensor = None) -> torch.Tensor:
        """
        Args:
            input: input features,
                expected shapes ``BxF`` where ``B``
                is batch dimension and ``F`` is an
                input feature dimension.
            target: target classes,
                expected shapes ``B`` where
                ``B`` is batch dimension.
                If `None` then will be returned
                projection on centroids.
                Default is `None`.

        Returns:
            tensor (logits) with shapes ``BxC``
            where ``C`` is a number of classes
            (out_features).
        """
        cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))

        if target is None:
            return cos_theta

        # The scale is only re-estimated while training (``self.training`` is the
        # module flag; ``self.train`` is a method and therefore always truthy)
        # and only when the dynamical scale was requested.
        if self.training and self.dynamical_s:
            with torch.no_grad():
                theta = torch.acos(torch.clamp(cos_theta, -1.0 + self.eps, 1.0 - self.eps))
                one_hot = torch.zeros_like(cos_theta)
                one_hot.scatter_(1, target.view(-1, 1).long(), 1)
                # Batch mean of the summed exponentiated non-target logits
                b_avg = (
                    torch.where(
                        one_hot < 1, torch.exp(self.s * cos_theta), torch.zeros_like(cos_theta),
                    )
                    .sum(1)
                    .mean()
                )
                # Median target angle, capped at pi / 4
                theta_median = theta[one_hot > 0].median()
                theta_median = torch.min(torch.full_like(theta_median, math.pi / 4), theta_median)
                self.s = torch.log(b_avg) / torch.cos(theta_median)

        logits = self.s * cos_theta
        return logits

## CurricularFace
https://arxiv.org/pdf/2004.00288.pdf
https://github.com/HuangYG123/CurricularFace/blob/8b2f47318117995aa05490c05b455b113489917e/head/metrics.py#L70
- Supersedes ArcFace and its variants
- Adaptive Curriculum Learning: learn over easy (=low-loss scoring) samples during the first epochs and then learn over hard (=high-loss scoring) samples.

In [13]:
def l2_norm(input, axis = 1):
    """Divide ``input`` by its L2 norm taken along ``axis``.

    The norm keeps the reduced dimension, so it broadcasts against ``input``
    and the result has the same shape as ``input``.
    """
    return input / torch.norm(input, p=2, dim=axis, keepdim=True)

class CurricularFace(pl.LightningModule):#nn.Module):
    """CurricularFace head: margin-based cosine logits with re-weighted hard negatives.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (one kernel column per class).
        m: angular margin added to the target-class angle.
        s: factor by which the final logits are multiplied.
    """
    def __init__(self, in_features, out_features, m = 0.5, s = 64.):
        super(CurricularFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.m = m
        self.s = s
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.threshold = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        # Class weights, stored as (in_features, out_features) and initialised below
        self.kernel = nn.Parameter(torch.Tensor(in_features, out_features, device=self.device))
        # `t` is a buffer (not a parameter); forward() updates it as a moving
        # average of the mean target logit
        self.register_buffer('t', torch.zeros(1, device=self.device))
        nn.init.normal_(self.kernel, std=0.01)

    def forward(self, embbedings, label):
        """Return the scaled CurricularFace logits for a batch.

        Args:
            embbedings: batch of embeddings, one row per sample.
            label: class index of each sample, used to select and overwrite
                the target logit of every row.

        Returns:
            The modified cosine logits multiplied by ``self.s``.

        Note:
            ``self.t`` is updated on every call, whatever the module mode.
        """
        # Cosine similarity between L2-normalised embeddings and kernel columns
        embbedings = l2_norm(embbedings, axis = 1)
        kernel_norm = l2_norm(self.kernel, axis = 0)
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
        with torch.no_grad():
            # Unmodified copy of the cosines; only referenced by the
            # commented-out second return value at the end of this method
            origin_cos = cos_theta.clone()
        # Cosine of each sample with its own class, as a column vector
        target_logit = cos_theta[torch.arange(0, embbedings.size(0)), label].view(-1, 1)

        sin_theta = torch.sqrt(1.0 - torch.pow(target_logit, 2))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m #cos(target+margin)
        # Entries whose cosine exceeds the margin-adjusted target cosine
        mask = cos_theta > cos_theta_m
        final_target_logit = torch.where(target_logit > self.threshold, cos_theta_m, target_logit - self.mm)

        hard_example = cos_theta[mask]
        with torch.no_grad():
            # Moving average of the mean target logit (weight 0.01 on the new value)
            self.t = target_logit.mean() * 0.01 + (1 - 0.01) * self.t
        # Re-weight the masked entries in place, then write the target logits back
        cos_theta[mask] = hard_example * (self.t + hard_example)
        cos_theta.scatter_(1, label.view(-1, 1).long(), final_target_logit)
        output = cos_theta * self.s
        return output#, origin_cos * self.s

# F1 Score

In [14]:
# credits: https://www.kaggle.com/c/shopee-product-matching/discussion/225093
def getMetric(col):
    """Build a row-wise F1 (Dice) scorer for the predictions stored in column ``col``.

    The returned function takes a row exposing ``row.targets`` and ``row[col]``
    and returns ``2 * |targets ∩ predictions| / (|targets| + |predictions|)``.
    """
    def f1score(row): # or Dice Metric
        expected = row.targets
        predicted = row[col]
        n_common = len(np.intersect1d(expected, predicted))
        return 2 * n_common / (len(expected) + len(predicted))
    return f1score

# Load Train Data

In [15]:
if not IS_SUBMISSION:
    # The train set is only needed (and loaded) outside of submission mode
    train = pd.read_csv(Config.TRAIN_CSV)
    # Number of unique values in the dataset
    print(train.loc[:, ['posting_id', 'label_group', 'image_phash', 'title']].nunique())

# Use Image Embeddings

Remove duplicate images from the train set

In [16]:
if not IS_SUBMISSION:
    # Keep a single row per (image_phash, label_group) pair
    train.drop_duplicates(subset=['image_phash', 'label_group'], inplace=True)
    print("Number of unique values")
    print(train[['posting_id', 'label_group', 'image_phash', 'title']].nunique())

    # Rows whose label_group appears exactly once in the remaining train set
    print("Number of instances alone (occurence = 1 in a label_group)")
    rows_alone = train[~train.duplicated(subset=['label_group'], keep=False)]
    n_instances_alone = len(rows_alone)
    print(n_instances_alone)

    if not Config.ENSEMBLE:
        # Without ensembling, those single-member groups are removed in place
        indices_drop_alone = train.index[~train.duplicated('label_group', keep=False)]
        train.drop(index=indices_drop_alone, inplace=True)
        print("Number of unique values after dropping alone instances")
        print(train[['posting_id', 'label_group', 'image_phash', 'title']].nunique())

Separation of the dataset at random into stratified folds

In [17]:
if not IS_SUBMISSION and Config.ENSEMBLE:
    NUM_FOLDS = 4

    # Row positions and label groups already assigned to each fold. Plain
    # lists/sets are used so that every row is placed exactly once and each
    # fold DataFrame is built a single time, instead of appending row by row.
    fold_positions = [[] for _ in range(NUM_FOLDS)]
    fold_label_groups = [set() for _ in range(NUM_FOLDS)]

    for i, label_group in enumerate(train.label_group.tolist()):
        # Starting from fold i % NUM_FOLDS, pick the first fold that does not
        # contain this label group yet. The membership test is done on the
        # label *values* (`in` on a pandas Series would test its index).
        # When every fold already has the group, j reaches NUM_FOLDS and the
        # row falls back to fold i % NUM_FOLDS.
        j = 0
        while j < NUM_FOLDS and label_group in fold_label_groups[(i + j) % NUM_FOLDS]:
            j += 1
        fold_id = (i + j) % NUM_FOLDS
        fold_positions[fold_id].append(i)
        fold_label_groups[fold_id].add(label_group)

    # Each fold keeps the original index labels of its rows
    folds = [train.iloc[positions].copy() for positions in fold_positions]

    for i in range(NUM_FOLDS):
        print(f"Fold #{i} length: {len(folds[i])}")
        print(f"Fold #{i} -> number of unique labels: {folds[i].label_group.nunique()}")

In [18]:
if not (IS_SUBMISSION or Config.ENSEMBLE):
    # Shuffle the train set rows and renumber them from 0
    train = train.sample(frac=1, random_state=Config.SEED).reset_index(drop=True)

    # The first occurrence of every label_group goes to the validation set,
    # all the remaining rows stay in the train set.
    indices_validation = train.index[~train.duplicated('label_group')]
    validation = train.loc[indices_validation]
    train = train.drop(indices_validation).reset_index(drop=True)

    print(f"Trainset length: {len(train)}")
    print(f"Validation set length: {len(validation)}")
    print(f"Validation set -> number of unique labels: {validation.label_group.nunique()}")
    print(f"Train set -> number of unique labels: {train.label_group.nunique()}")

Add a column `targets` containing a list of target ids to which each sample should be associated (including itself)

In [19]:
if not IS_SUBMISSION:
    # NOTE(review): in the visible cells `validation` is only created when
    # Config.ENSEMBLE is False - confirm that this cell is never reached in
    # ensemble mode without it.
    # For every label_group, the unique posting ids that belong to it
    targets_per_label_group = validation.groupby('label_group')['posting_id'].unique()
    validation['targets'] = validation['label_group'].map(targets_per_label_group)
    print('Validation shape:', validation.shape)

## Image Dataset

In [20]:
class ShopeeImageDataset(torch.utils.data.Dataset):
    """Dataset of product images read from disk, optionally with encoded labels.

    When `labels` is provided (non-empty), the dataset is in "train" mode and
    yields `(image, encoded_label)`; otherwise it is in "test" mode and yields
    `(image, dummy_label)` so that both modes produce 2-tuples.
    """

    def __init__(self, image_dir, image_files, labels=pd.Series(dtype=object), transform=None):
        """Initializes a dataset containing images and labels.

        Args:
            image_dir: Directory containing the image files.
            image_files: pandas Series of file names relative to `image_dir`
                (accessed positionally through `.iloc`).
            labels: Optional sequence of raw labels aligned with `image_files`.
                Leave empty for an unlabeled (test) dataset.
            transform: Optional callable invoked as `transform(image=...)` and
                returning a dict with an `'image'` entry.
        """
        super().__init__()
        self.image_dir = image_dir
        self.image_files = image_files
        # Decide on train/test mode from the *presence* of labels: `labels.any()`
        # would depend on the truthiness of the label values themselves.
        if labels is not None and len(labels) > 0:
            print("Shopee Dataset Train")
            label_encoder = LabelEncoder()
            self.labels = label_encoder.fit_transform(labels)
            self.train = True
        else:
            print("Shopee Dataset Test")
            self.train = False
        self.transform = transform

    def __len__(self):
        """Returns the size of the dataset."""
        return len(self.image_files)

    def __getitem__(self, index):
        """Returns the index-th data item of the dataset.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        image_file = self.image_files.iloc[index]
        image_path = os.path.join(self.image_dir, image_file)
        image = cv2.imread(image_path)
        # cv2.imread signals a missing/unreadable file by returning None.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # By default OpenCV uses BGR color space for color images,
        # so we need to convert the image to RGB color space.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        if self.train:
            return image, torch.tensor(self.labels[index]).long()
        else:
            # Dummy label so that test batches have the same structure as train batches.
            return image, torch.tensor(1)


In [21]:
if not IS_SUBMISSION:
    def show_sample_data_loader(loader, predicted_labels=(), cols=3):
        """Displays the first batch of `loader` as a grid of images.

        Args:
            loader: Iterable yielding `(images, labels)` batches; images are
                expected channel-first (they are transposed to channel-last
                for display).
            predicted_labels: Optional sequence of predictions aligned with the
                batch. When given, titles read "predicted | true" and are
                colored green on a match, red otherwise.
            cols: Number of columns of the grid.
        """
        images, true_labels = next(iter(loader))
        has_predictions = len(predicted_labels) > 0
        rows = int(np.ceil(len(true_labels) / cols))
        # squeeze=False guarantees a 2D array of axes even for a 1x1 grid.
        figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 24), squeeze=False)
        axes = ax.ravel()
        for i, (image, true_label) in enumerate(zip(images, true_labels)):
            # Unwrap 0-d tensors / numpy scalars so they can be formatted and compared.
            true_value = true_label.item() if hasattr(true_label, 'item') else true_label
            if has_predictions:
                predicted_value = predicted_labels[i]
                if hasattr(predicted_value, 'item'):
                    predicted_value = predicted_value.item()
                title = f"{predicted_value} | {true_value}"
                color = "green" if predicted_value == true_value else "red"
            else:
                title = str(true_value)
                color = "green"
            axes[i].imshow(np.transpose(np.asarray(image), (1, 2, 0)))
            axes[i].set_title(title, color=color)
            axes[i].set_axis_off()
        # Hide the cells of the grid left empty by an incomplete last row.
        for unused_axis in axes[len(true_labels):]:
            unused_axis.set_axis_off()
        plt.tight_layout()
        plt.show()

## Data Augmentation (Validation & Test Set)

In [22]:
if 'Image' in Config.MODEL_TYPE:
    # Deterministic preprocessing for validation/test images:
    # resize -> normalize -> convert to tensor (no random augmentation).
    test_transform = A.Compose([
        A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
        A.Normalize(Config.IMG_MEAN, Config.IMG_STD),
        ToTensorV2(),
    ])

    if not IS_SUBMISSION:
        validation_set = ShopeeImageDataset(
            Config.TRAIN_DIR,
            validation.image,
            labels=validation.label_group,
            transform=test_transform,
        )
        # Small shuffled loader, handy for visual inspection of samples.
        validation_loader = DataLoader(
            validation_set,
            shuffle=True,
            batch_size=20,
            num_workers=0,
        )

        #show_sample_data_loader(validation_loader, cols=5)

# Data Augmentation (Train Set)

We apply different random transformations to the train set images in order to create additional coherent and plausible product images to train from, i.e. images that could very well be found as such in the test set (e.g. we should not apply a random vertical flip, because a large share of products have an upright position and are very often shown in it, sometimes with a bit of rotation). We thus opted for a random combination of:
- a horizontal flip to cope with asymmetric characteristics that could be seen from both sides.
- a random rotation to cope with the different angles of view.
- a variable zoom (most often towards the center of the image) to focus on different key characteristics of products leaving aside misleading backgrounds.

In [23]:
if not IS_SUBMISSION and 'Image' in Config.MODEL_TYPE:
    # Each side of the image may lose between 0% and 20% of its length when cropping.
    crop_factor_range = (0.0, 0.20)

    def _random_margin_crop(image, **kwargs):
        """Crops a random margin (drawn from `crop_factor_range`) off each side of `image`.

        The margins are drawn on every call, so every image gets its own crop.
        Passing `random.uniform(...)` results directly to `A.Crop(...)` would
        instead freeze a single crop window when the pipeline is built.
        """
        height, width = image.shape[:2]
        x_min = int(random.uniform(*crop_factor_range) * width)
        y_min = int(random.uniform(*crop_factor_range) * height)
        x_max = int((1 - random.uniform(*crop_factor_range)) * width)
        y_max = int((1 - random.uniform(*crop_factor_range)) * height)
        return image[y_min:y_max, x_min:x_max]

    def _make_random_crop(p):
        """Wraps `_random_margin_crop` as an albumentations transform applied with probability `p`."""
        return A.Lambda(image=_random_margin_crop, p=p)

    if Config.DATA_AUGMENTATION:
        if Config.BOOST_DATA_AUGMENTATION:
            # Heavy augmentation: the inner pipeline is applied with probability 0.9,
            # then the image is always resized back, normalized and converted to a tensor.
            train_transform = A.Compose([
                A.Compose([
                    A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
                    A.HorizontalFlip(),
                    A.OneOf([
                        A.IAAAdditiveGaussianNoise(),
                        A.GaussNoise(),
                    ], p=0.2),
                    A.OneOf([
                        A.MotionBlur(blur_limit=(5, 7), p=0.2),
                        A.MedianBlur(blur_limit=5, p=0.1),
                        A.Blur(blur_limit=5, p=0.1),
                    ], p=0.2),
                    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=(0, 0.3), rotate_limit=0, p=0.2),
                    A.Rotate(limit=45, p=0.8),
                    A.OneOf([
                        A.CLAHE(clip_limit=2),
                        A.IAASharpen(),
                        A.IAAEmboss(),
                        A.RandomBrightnessContrast(),
                        A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.2, hue=0.05)
                    ], p=0.4),
                    A.HueSaturationValue(),
                    _make_random_crop(p=1),
                ], p=0.9),
                A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
                A.Normalize(Config.IMG_MEAN, Config.IMG_STD),
                ToTensorV2()
            ])
        else:
            # Light augmentation: flip, rotation and a random zoom (crop + resize).
            train_transform = A.Compose([
                A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
                A.HorizontalFlip(),
                A.Rotate(limit=45, p=0.7),
                _make_random_crop(p=0.8),
                A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
                A.Normalize(Config.IMG_MEAN, Config.IMG_STD),
                ToTensorV2()
            ])
    else:
        # No augmentation: same deterministic preprocessing as for validation/test.
        train_transform = A.Compose([
            A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
            A.Normalize(Config.IMG_MEAN, Config.IMG_STD),
            ToTensorV2()
        ])

    train_set = ShopeeImageDataset(Config.TRAIN_DIR, train.image, train.label_group, transform=train_transform) # training dataloader
    train_loader = DataLoader(train_set, batch_size=20, shuffle=True, num_workers=0)

    #show_sample_data_loader(train_loader, cols=5)

In [24]:
if 'Image' in Config.MODEL_TYPE:
    if not IS_SUBMISSION:
        # Sampler: draw samples of rare label groups more often than those of
        # frequent ones. The weight of a class is 1 / (log(count) + 1); the "+ 1"
        # keeps the weight finite for classes with a single sample (log(1) == 0)
        # and matches the weighting used for the text model.
        label_counts = train.label_group.value_counts()
        classes_weight = (1. / (np.log(label_counts) + 1)).to_dict()
        samples_weight = train.label_group.map(classes_weight).to_numpy()
        samples_weight = torch.from_numpy(samples_weight)
        samples_weight = samples_weight.double()

        weighted_random_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        # Training set
        train_set = ShopeeImageDataset(Config.TRAIN_DIR,
                                       train.image,
                                       train.label_group,
                                       transform=train_transform)

        # Validation set
        validation_set = ShopeeImageDataset(Config.TRAIN_DIR,
                                            validation.image,
                                            validation.label_group,
                                            transform=test_transform)
    else:
        # Testing set
        test_set = ShopeeImageDataset(Config.TEST_DIR,
                                      test.image,
                                      transform=test_transform)

Shopee Dataset Test


In [25]:
if 'Image' in Config.MODEL_TYPE:
    if IS_SUBMISSION:
        # Submission run: only the (ordered) test loader is needed.
        test_loader = DataLoader(
            test_set,
            batch_size=Config.BATCH_SIZE,
            num_workers=Config.NUM_WORKERS,
            shuffle=False,
            drop_last=False,
            pin_memory=True,
        )
    else:
        # Training loader: sample order is driven by the weighted sampler
        # (hence no `shuffle`), and the incomplete last batch is dropped.
        train_loader = DataLoader(
            train_set,
            batch_size=Config.BATCH_SIZE,
            num_workers=Config.NUM_WORKERS,
            sampler=weighted_random_sampler,
            drop_last=True,
            pin_memory=True,
        )
        # Validation loader: fixed order, every sample kept.
        validation_loader = DataLoader(
            validation_set,
            batch_size=Config.BATCH_SIZE,
            num_workers=Config.NUM_WORKERS,
            shuffle=False,
            drop_last=False,
            pin_memory=True,
        )

## Model

In [26]:
 if 'Image' in Config.MODEL_TYPE:
    class ShopeeImageModel(pl.LightningModule):
        """Image embedding model: timm backbone, optional FC neck and a classification head.

        The backbone's own pooling and classifier are replaced by identities; its
        feature map is average-pooled, optionally passed through
        dropout -> linear -> batch-norm, and fed to the head selected by
        `loss_name` (a margin-based head, or a plain linear layer when None).
        """

        def __init__(
            self,
            num_classes = Config.NUM_CLASSES,
            backbone_name = Config.IMAGE_MODEL_NAME,
            activations = Config.ACTIVATIONS,
            use_fc = Config.USE_FC,
            fc_dim = Config.FC_DIM,
            dropout = Config.DROPOUT,
            loss_name = Config.LOSS,
            margin = Config.MARGIN,
            scale = Config.SCALE,
            is_training_from_scratch = Config.IS_TRAINING_FROM_SCRATCH,
            pretrained = INTERNET_ON):
            """Builds the backbone, the optional neck and the head.

            Args:
                num_classes: Number of output classes of the head.
                backbone_name: timm model name; must contain 'resnet',
                    'efficientnet' or 'nfnet'.
                activations: "Mish" to swap the backbone activations for Mish.
                use_fc: Whether to add the dropout/linear/batch-norm neck.
                fc_dim: Output dimension of the neck (embedding size when used).
                dropout: Dropout probability of the neck.
                loss_name: 'ArcFace', 'AdaCos', 'CurricularFace' or None
                    (plain linear classifier).
                margin: Margin passed to the ArcFace/CurricularFace heads.
                scale: Scale passed to the ArcFace/CurricularFace heads.
                is_training_from_scratch: When False, the backbone parameters
                    are frozen and only the neck/head are trained.
                pretrained: Whether timm should load pretrained weights.

            Raises:
                ValueError: If `backbone_name` or `loss_name` is not supported.
            """
            super(ShopeeImageModel,self).__init__()
            print('Building model backbone from {}'.format(backbone_name))

            self.backbone = timm.create_model(backbone_name, pretrained=pretrained)

            # Strip the backbone's pooling and classifier so that it outputs a
            # feature map; remember which activation type it uses.
            if 'resnet' in backbone_name:
                final_in_features = self.backbone.fc.in_features
                self.backbone.fc = nn.Identity()
                self.backbone.global_pool = nn.Identity()
                existing_layer = torch.nn.ReLU

            elif 'efficientnet' in backbone_name:
                final_in_features = self.backbone.classifier.in_features
                self.backbone.classifier = nn.Identity()
                self.backbone.global_pool = nn.Identity()
                existing_layer = torch.nn.SiLU

            elif 'nfnet' in backbone_name:
                final_in_features = self.backbone.head.fc.in_features
                self.backbone.head.fc = nn.Identity()
                self.backbone.head.global_pool = nn.Identity()
                existing_layer = torch.nn.SiLU

            else:
                # Fail early with a clear message rather than with a NameError below.
                raise ValueError(
                    'Argument `backbone_name` expected to contain one of the following:'
                    f' \'resnet\', \'efficientnet\', \'nfnet\' but got {backbone_name}'
                )

            if activations == "Mish":
                # Change the activation functions of the model with Mish
                new_layer = Mish()
                self.backbone = self.replace_activations(self.backbone, existing_layer, new_layer)

            if not is_training_from_scratch:
                for param in self.backbone.parameters(): # Fix backbone parameters, we will therefore only tune the model head
                    param.requires_grad = False

            self.pooling = nn.AdaptiveAvgPool2d(1)

            self.use_fc = use_fc
            # Remember the head type of *this instance*: `forward` must not depend
            # on the global Config, which may differ from the constructor argument.
            self.loss_name = loss_name

            if use_fc:
                self.dropout = nn.Dropout(p=dropout)
                self.fc = nn.Linear(final_in_features, fc_dim)
                self.bn = nn.BatchNorm1d(fc_dim)

                self.init_params()
                final_in_features = fc_dim

            if loss_name == 'ArcFace':
                self.final = ArcMarginProduct(
                    final_in_features,
                    num_classes,
                    scale = scale,
                    margin = margin,
                    easy_margin = False,
                    ls_eps = 0.0
                )

            elif loss_name == 'AdaCos':
                self.final = AdaCos(final_in_features, num_classes)

            elif loss_name == 'CurricularFace':
                self.final = CurricularFace(
                    final_in_features,
                    num_classes,
                    m = margin,
                    s = scale
                )

            elif loss_name is None:
                self.final = nn.Linear(final_in_features, num_classes)

            else:
                raise ValueError(
                    'Argument `loss_name` expected to be one of the following:'
                    f' \'ArcFace\', \'AdaCos\', \'CurricularFace\' but got {loss_name}'
                )

        # credits: https://www.kaggle.com/parthdhameliya77/shopee-pytorch-eca-nfnet-l0-image-training/execution
        def replace_activations(self, model, existing_layer, new_layer):
            """Recursively replaces every submodule of type `existing_layer` by `new_layer`.

            The same `new_layer` instance is installed at every replaced position.
            Returns the (modified in place) `model`.
            """
            # Iterate over a snapshot since entries of `_modules` are reassigned.
            for name, module in list(model._modules.items()):
                if len(list(module.children())) > 0:
                    model._modules[name] = self.replace_activations(module, existing_layer, new_layer)

                if type(module) == existing_layer:
                    model._modules[name] = new_layer
            return model

        def init_params(self):
            """Initializes the neck: Xavier-normal linear weights, zero biases, unit batch-norm scale."""
            nn.init.xavier_normal_(self.fc.weight)
            nn.init.constant_(self.fc.bias, 0)
            nn.init.constant_(self.bn.weight, 1)
            nn.init.constant_(self.bn.bias, 0)

        def forward(self, image, label):
            """Returns the logits for `image`; margin-based heads also consume `label`."""
            features = self.extract_feat(image)
            if self.loss_name is None:
                logits = self.final(features)
            else:
                logits = self.final(features, label)
            return logits

        def extract_feat(self, x):
            """Returns the embedding of a batch of images (backbone -> pooling -> optional neck)."""
            batch_size = x.shape[0]
            x = self.backbone(x)
            x = self.pooling(x).view(batch_size, -1)

            if self.use_fc:
                x = self.dropout(x)
                x = self.fc(x)
                x = self.bn(x)

            return x

        def training_step(self, train_batch, batch_idx):
            """Computes and logs the cross-entropy loss of a training batch."""
            output = self.forward(*train_batch)
            _, y = train_batch
            loss = nn.CrossEntropyLoss()(output, y)
            logs = {'train_loss': loss}
            self.log_dict(logs, on_epoch=True)
            return loss

        def validation_step(self, val_batch, batch_idx):
            """Computes and logs the cross-entropy loss of a validation batch."""
            output = self.forward(*val_batch)
            _, y = val_batch
            loss = nn.CrossEntropyLoss()(output, y)
            logs = {'val_loss': loss}
            self.log_dict(logs)
            return loss

        def configure_optimizers(self, 
                                 lr=Config.LR, 
                                 weight_decay=Config.WEIGHT_DECAY, 
                                 T0=Config.CAWR_T0, 
                                 T_mult=Config.CAWR_T_MULT, 
                                 min_lr=Config.CAWR_MIN_LR):
            """Returns an Adam optimizer, plus a cosine-annealing-with-warm-restarts
            scheduler when `Config.COSINE_ANNEALING_WARM_RESTARTS` is set."""
            optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
            if Config.COSINE_ANNEALING_WARM_RESTARTS:
                scheduler = CosineAnnealingWarmRestarts(optimizer, T0, T_mult=T_mult, eta_min=min_lr, verbose=True)
                return [optimizer], [scheduler]

            return optimizer

    if not IS_SUBMISSION:
        # Outside of submission runs, instantiate the image model with its default (Config-driven) arguments.
        image_model = ShopeeImageModel()
        #print(image_model)

In [27]:
if not IS_SUBMISSION and 'Image' in Config.MODEL_TYPE:
    # Fit the image model on all available GPUs.
    # (Optional Trainer arguments left disabled: precision=Config.PRECISION, profiler="simple")
    trainer = pl.Trainer(
        gpus=-1,
        max_epochs=Config.MAX_EPOCHS,
        callbacks=callbacks,
        stochastic_weight_avg=Config.STOCHASTIC_WEIGHT_AVERAGING,
        gradient_clip_val=Config.NORM_CLIPPING,
        logger=NEPTUNE_LOGGER,
    )
    trainer.fit(image_model, train_loader, validation_loader)

    if Config.SAVE_MODEL and INTERNET_ON:
        # Upload every retained checkpoint to Neptune under `checkpoints/<file name>`
        for k in checkpoint_callback.best_k_models:
            model_name = f"checkpoints/{k.split('/')[-1]}"
            NEPTUNE_LOGGER.experiment.log_artifact(k, model_name)

        # Record the score of the best checkpoint alongside the run
        NEPTUNE_LOGGER.experiment.set_property('best_model_score', checkpoint_callback.best_model_score.tolist())

    if Config.SAVE_MODEL:
        best_model_path = checkpoint_callback.best_model_path
        print("Best model path: ", best_model_path)

    if INTERNET_ON:
        # Close the Neptune run once training is over
        NEPTUNE_LOGGER.experiment.stop()

# Use Text Embeddings

In [28]:
if not IS_SUBMISSION and 'Text' in Config.MODEL_TYPE:
    # Stratified hold-out split: the validation set keeps the label-group
    # proportions of the train set.
    train, validation = train_test_split(
        train,
        test_size=Config.VALIDATION_SIZE,
        random_state=Config.SEED,
        stratify=train.label_group,
    )
    print(f"Trainset length: {len(train)}")
    print(f"Validation set length: {len(validation)}")
    print(f"Validation set -> number of unique labels: {validation.label_group.nunique()}")
    print(f"Train set -> number of unique labels: {train.label_group.nunique()}")

In [29]:
if not IS_SUBMISSION and 'Text' in Config.MODEL_TYPE and False: # Would have been used for ensembling
    # Disabled cell: split the data into two stratified halves; even experiments
    # train on the first half, odd experiments train on the second one.
    if Config.EXPERIMENT in (0, 1, 2, 3):
        first_half, second_half = train_test_split(
            train,
            test_size=0.5,  # instead of Config.VALIDATION_SIZE
            random_state=Config.SEED,
            stratify=train.label_group,
        )
        if Config.EXPERIMENT in (0, 2):
            train, validation = first_half, second_half
        else:
            train, validation = second_half, first_half

    print(f"Trainset length: {len(train)}")
    print(f"Validation set length: {len(validation)}")
    print(f"Validation set -> number of unique labels: {validation.label_group.nunique()}")
    print(f"Train set -> number of unique labels: {train.label_group.nunique()}")

In [30]:
if not IS_SUBMISSION and 'Text' in Config.MODEL_TYPE:
    # For every label group, collect the unique posting ids it contains, then
    # attach that array to each validation row sharing the label group.
    targets_per_label_group = validation.groupby('label_group')['posting_id'].unique()
    validation['targets'] = validation['label_group'].map(targets_per_label_group)
    print('Validation shape:', validation.shape)

In [31]:
if 'Text' in Config.MODEL_TYPE:
    # Load the tokenizer from Config.TEXT_MODEL_PATH (the same path the text model loads its backbone from).
    tokenizer = AutoTokenizer.from_pretrained(Config.TEXT_MODEL_PATH)

In [32]:
class ShopeeTextDataset(torch.utils.data.Dataset):
    """Dataset of tokenized texts, optionally with encoded labels.

    All texts are tokenized once at construction time. Items are dicts holding
    one tensor per tokenizer output, plus a `'label'` tensor when labels were
    provided.
    """

    def __init__(self, tokenizer, texts, labels=pd.Series(dtype=object), max_length=None):
        """Initializes a dataset containing texts and labels.

        Args:
            tokenizer: Callable invoked as
                `tokenizer(texts, padding=True, truncation=True, max_length=...)`
                and returning a mapping from field name to per-sample sequences.
            texts: Iterable of texts; every element is converted with `str`.
            labels: Optional sequence of raw labels aligned with `texts`.
                Leave empty for an unlabeled (test) dataset.
            max_length: Maximum length forwarded to the tokenizer.
        """
        super().__init__()
        texts = [str(text) for text in texts]
        self.encodings = tokenizer(texts, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=max_length)

        # Decide on train/test mode from the *presence* of labels: `labels.any()`
        # would depend on the truthiness of the label values themselves.
        if labels is not None and len(labels) > 0:
            label_encoder = LabelEncoder()
            self.labels = label_encoder.fit_transform(labels)
            self.train = True
        else:
            self.train = False

    def __len__(self):
        """Returns the size of the dataset."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        """Returns the index-th data item of the dataset."""
        item = {key: torch.tensor(values[index]) for key, values in self.encodings.items()}

        if self.train:
            item['label'] = torch.tensor(self.labels[index]).long()

        return item

In [33]:
if not IS_SUBMISSION and 'Text' in Config.MODEL_TYPE:
    # Report the range of title lengths (in space-separated words) of each split
    # and add the corresponding histogram to the current figure.
    for split_name, split_frame in (("Train", train), ("Validation", validation)):
        title_lengths = split_frame['title'].map(lambda title: len(title.split(" "))).to_numpy()
        print(f"{split_name} set -> Min words: {title_lengths.min()}, Max words: {title_lengths.max()}")
        plt.hist(title_lengths)

In [34]:
if 'Text' in Config.MODEL_TYPE:
    if not IS_SUBMISSION:
        # Sampler: draw samples of rare label groups more often than those of
        # frequent ones, with class weight 1 / (log(count) + 1). The counts are
        # computed in a single pass with `value_counts` rather than by
        # re-scanning the whole column for every class.
        label_counts = train.label_group.value_counts()
        classes_weight = (1. / (np.log(label_counts) + 1)).to_dict()
        samples_weight = train.label_group.map(classes_weight).to_numpy()
        samples_weight = torch.from_numpy(samples_weight)
        samples_weight = samples_weight.double()

        weighted_random_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        trainset = ShopeeTextDataset(tokenizer, 
                                     train['title'], 
                                     labels=train['label_group'], 
                                     max_length=Config.TOKENIZER_MAX_LENGTH)
        # Sample order is driven by the weighted sampler (hence no `shuffle`).
        train_loader = torch.utils.data.DataLoader(trainset, 
                                                   batch_size=Config.BATCH_SIZE, 
                                                   num_workers=Config.NUM_WORKERS, 
                                                   sampler=weighted_random_sampler,
                                                   pin_memory=True,
                                                   drop_last=False)

        validationset = ShopeeTextDataset(tokenizer, 
                                          validation['title'], 
                                          labels=validation['label_group'], 
                                          max_length=Config.TOKENIZER_MAX_LENGTH)
        validation_loader = torch.utils.data.DataLoader(validationset, 
                                                        batch_size=Config.BATCH_SIZE, 
                                                        num_workers=Config.NUM_WORKERS, 
                                                        shuffle=False,
                                                        pin_memory=True,
                                                        drop_last=False)
    else:
        # Submission run: unlabeled test titles, kept in order.
        testset = ShopeeTextDataset(tokenizer,
                                    test['title'], 
                                    max_length=Config.TOKENIZER_MAX_LENGTH)
        test_loader = DataLoader(testset, 
                                 batch_size=Config.BATCH_SIZE, 
                                 num_workers=Config.NUM_WORKERS,
                                 shuffle=False, 
                                 pin_memory=True, 
                                 drop_last=False)

In [35]:
if 'Text' in Config.MODEL_TYPE:
    class ShopeeTextModel(pl.LightningModule):
        """Text embedding model: transformer backbone, optional FC neck and a classification head.

        The embedding of a text is the backbone's last hidden state at the first
        token position, optionally passed through dropout -> linear -> batch-norm.
        The head is selected by `loss_name` (a margin-based head, or a plain
        linear layer when None).
        """

        def __init__(self,
                     num_classes=Config.NUM_CLASSES,
                     backbone_path=Config.TEXT_MODEL_PATH,
                     use_fc=Config.USE_FC,
                     fc_dim=Config.FC_DIM,
                     dropout = Config.DROPOUT,
                     loss_name = Config.LOSS,
                     margin = Config.MARGIN,
                     scale = Config.SCALE,
                     is_training_from_scratch=Config.IS_TRAINING_FROM_SCRATCH):
            """Builds the backbone, the optional neck and the head.

            Args:
                num_classes: Number of output classes of the head.
                backbone_path: Path/name given to `AutoModel.from_pretrained`.
                use_fc: Whether to add the dropout/linear/batch-norm neck.
                fc_dim: Output dimension of the neck (embedding size when used).
                dropout: Dropout probability of the neck.
                loss_name: 'ArcFace', 'AdaCos', 'CurricularFace' or None
                    (plain linear classifier).
                margin: Margin passed to the ArcFace/CurricularFace heads.
                scale: Scale passed to the ArcFace/CurricularFace heads.
                is_training_from_scratch: When False, the backbone parameters
                    are frozen and only the neck/head are trained.

            Raises:
                ValueError: If `loss_name` is not supported.
            """
            super().__init__()
            self.backbone = AutoModel.from_pretrained(backbone_path)
            final_in_features = self.backbone.config.hidden_size

            if not is_training_from_scratch:
                for param in self.backbone.parameters(): # Fix backbone parameters, we will therefore only tune the model head
                    param.requires_grad = False
            else:
                self.backbone.train()

            self.use_fc = use_fc
            # Remember the head type of this instance so that `forward` knows
            # whether the head expects the labels.
            self.loss_name = loss_name
            if use_fc:
                self.dropout = nn.Dropout(p=dropout)
                self.fc = nn.Linear(final_in_features, fc_dim)
                self.bn = nn.BatchNorm1d(fc_dim)
                self.init_params()
                final_in_features = fc_dim

            if loss_name == 'ArcFace':
                self.final = ArcMarginProduct(
                    final_in_features,
                    num_classes,
                    scale = scale,
                    margin = margin,
                    easy_margin = False,
                    ls_eps = 0.0
                )

            elif loss_name == 'AdaCos':
                self.final = AdaCos(final_in_features, num_classes)

            elif loss_name == 'CurricularFace':
                self.final = CurricularFace(
                    final_in_features,
                    num_classes,
                    m = margin,
                    s = scale
                )
            elif loss_name is None:
                self.final = nn.Linear(final_in_features, num_classes)
            else:
                raise ValueError(
                    'Argument `loss_name` expected to be one of the following:'
                    f' \'ArcFace\', \'AdaCos\', \'CurricularFace\' but got {loss_name}'
                )

        def init_params(self):
            """Initializes the neck: Xavier-normal linear weights, zero biases, unit batch-norm scale."""
            nn.init.xavier_normal_(self.fc.weight)
            nn.init.constant_(self.fc.bias, 0)
            nn.init.constant_(self.bn.weight, 1)
            nn.init.constant_(self.bn.bias, 0)

        def extract_features(self, input_ids, attention_mask):
            """Returns the embedding of a batch of tokenized texts."""
            output = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = output.last_hidden_state # shape: (batch_size, seq_length, bert_hidden_dim)
            CLS_token_state = last_hidden_state[:, 0, :] # obtaining CLS token state which is the first token.
            x = CLS_token_state
            if self.use_fc:
                x = self.dropout(x)
                x = self.fc(x)
                x = self.bn(x)
            return x

        def forward(self, batch):
            """Returns the logits of `batch` (a dict with 'input_ids', 'attention_mask' and 'label')."""
            CLS_hidden_state = self.extract_features(batch['input_ids'], batch['attention_mask'])
            if self.loss_name is None:
                # The plain linear head only takes the features.
                output = self.final(CLS_hidden_state)
            else:
                # Margin-based heads also consume the labels.
                output = self.final(CLS_hidden_state, batch['label'])
            return output

        def training_step(self, train_batch, batch_idx):
            """Computes and logs the cross-entropy loss of a training batch."""
            output = self.forward(train_batch)
            y = train_batch['label']
            loss = nn.CrossEntropyLoss()(output, y)
            logs = {'train_loss': loss}
            self.log_dict(logs, on_epoch=True)
            return loss

        def validation_step(self, val_batch, batch_idx):
            """Computes and logs the cross-entropy loss of a validation batch."""
            output = self.forward(val_batch)
            y = val_batch['label']
            loss = nn.CrossEntropyLoss()(output, y)
            logs = {'val_loss': loss}
            self.log_dict(logs)
            return loss

        def configure_optimizers(self, 
                                 lr=Config.LR, 
                                 weight_decay=Config.WEIGHT_DECAY, 
                                 T0=Config.CAWR_T0, 
                                 T_mult=Config.CAWR_T_MULT, 
                                 min_lr=Config.CAWR_MIN_LR):
            """Returns an Adam optimizer, plus a cosine-annealing-with-warm-restarts
            scheduler when `Config.COSINE_ANNEALING_WARM_RESTARTS` is set."""
            optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
            if Config.COSINE_ANNEALING_WARM_RESTARTS:
                scheduler = CosineAnnealingWarmRestarts(optimizer, T0, T_mult=T_mult, eta_min=min_lr, verbose=True)
                return [optimizer], [scheduler]

            return optimizer

    if not IS_SUBMISSION:
        # Outside of submission runs, instantiate the text model with its default (Config-driven) arguments.
        text_model = ShopeeTextModel()
        #print(text_model)

In [36]:
if not IS_SUBMISSION and 'Text' in Config.MODEL_TYPE:
    # Fit the text model on all available GPUs.
    # (Optional Trainer arguments left disabled: precision=Config.PRECISION, profiler="simple")
    trainer = pl.Trainer(
        gpus=-1,
        max_epochs=Config.MAX_EPOCHS,
        callbacks=callbacks,
        stochastic_weight_avg=Config.STOCHASTIC_WEIGHT_AVERAGING,
        gradient_clip_val=Config.NORM_CLIPPING,
        logger=NEPTUNE_LOGGER,
    )
    trainer.fit(text_model, train_loader, validation_loader)

    if Config.SAVE_MODEL and INTERNET_ON:
        # Upload every retained checkpoint to Neptune under `checkpoints/<file name>`
        for k in checkpoint_callback.best_k_models:
            model_name = f"checkpoints/{k.split('/')[-1]}"
            NEPTUNE_LOGGER.experiment.log_artifact(k, model_name)

        # Record the score of the best checkpoint alongside the run
        NEPTUNE_LOGGER.experiment.set_property('best_model_score', checkpoint_callback.best_model_score.tolist())

    if Config.SAVE_MODEL:
        best_model_path = checkpoint_callback.best_model_path
        print("Best model path: ", best_model_path)

    if INTERNET_ON:
        # Close the Neptune run once training is over
        NEPTUNE_LOGGER.experiment.stop()

# Predictions

## Predictions Based on Images

credits: https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700

In [37]:
def get_image_embeddings(model_path, dataloader):
    """Computes L2-normalized image embeddings for every sample of `dataloader`.

    Args:
        model_path: Path of the `ShopeeImageModel` checkpoint to load.
        dataloader: Iterable yielding `(images, labels)` batches; labels are ignored.

    Returns:
        A 2D numpy array with one unit-norm embedding per row, in loader order.
    """
    model = ShopeeImageModel.load_from_checkpoint(model_path)
    model.eval()
    model.to(Config.DEVICE)

    all_embeddings = []
    with torch.no_grad():
        for img, _ in tqdm(dataloader):
            # Move the batch to the same device as the model (rather than
            # unconditionally to CUDA).
            img = img.to(Config.DEVICE)
            feat = model.extract_feat(img)
            all_embeddings.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(all_embeddings)
    image_embeddings /= np.linalg.norm(image_embeddings, 2, axis=1, keepdims=True) # Normalize embeddings
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del all_embeddings
    gc.collect()

    return image_embeddings

In [38]:
def get_image_predictions(df, embeddings, threshold=0.0):
    """Collect, for every row, the posting_ids of its close image neighbours.

    A cosine-metric ``NearestNeighbors`` model is fitted on ``embeddings`` and
    queried with the same ``embeddings``. For each row, the neighbours whose
    distance is strictly below ``threshold`` are kept.

    Args:
        df: Frame with a ``posting_id`` column; neighbour indices are looked
            up positionally (``iloc``), so its row order must match
            ``embeddings``.
        embeddings: 2-D array with one embedding per row.
        threshold: Exclusive upper bound on the neighbour distance.

    Returns:
        List with one array of posting_ids per embedding row (empty list when
        ``embeddings`` has no rows).
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        return []

    # Ask for up to 50 neighbours but never more than there are samples:
    # a fixed 50 is rejected (e.g. by scikit-learn) on datasets of 4-49 rows.
    KNN = min(50, n_samples)

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_samples)):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].to_numpy()
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect() # Garbage Collector
    return predictions

In [39]:
if 'Image' in Config.MODEL_TYPE:
    if not IS_SUBMISSION:
        # Local run: embed the validation split with the existing loader.
        dataloader = validation_loader
    else:
        # Submission run: build an unshuffled loader over the test images.
        test_set = ShopeeImageDataset(Config.TEST_DIR, test.image, transform=test_transform)
        test_loader = DataLoader(
            test_set,
            batch_size=12,
            num_workers=Config.NUM_WORKERS,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )
        dataloader = test_loader

Shopee Dataset Test


In [40]:
if 'Image' in Config.MODEL_TYPE:
    if not Config.ENSEMBLE:
        # Single model: embed the selected split with one checkpoint.
        best_model_path = "../input/shopeepretrainedmodels/eca_nfnet_l0_Mish_CurricularFace-epoch14-val_loss12.28.ckpt"
        image_embeddings = get_image_embeddings(model_path=best_model_path, dataloader=dataloader)
        image_embeddings /= np.linalg.norm(image_embeddings, 2, axis=1, keepdims=True) # Normalize embeddings 
    else:
        # Ensemble: embed with every checkpoint and join the features column-wise.
        # NOTE(review): best_model_paths is still a placeholder; it needs one
        # checkpoint path per experiment (Config.MAX_NUM_EXPERIMENTS entries).
        best_model_paths = ['']
        # Must be a Python list: an ndarray has no .append().
        all_image_embeddings = []
        for experiment in range(Config.MAX_NUM_EXPERIMENTS):
            # Use the loader selected for this run (test or validation) rather
            # than always the validation loader.
            image_embeddings = get_image_embeddings(model_path=best_model_paths[experiment], dataloader=dataloader)
            all_image_embeddings.append(image_embeddings)
        image_embeddings = np.concatenate(all_image_embeddings, axis=1)
        image_embeddings /= np.linalg.norm(image_embeddings, 2, axis=1, keepdims=True) # Normalize embeddings 

Building model backbone from eca_nfnet_l0


  0%|          | 0/1 [00:00<?, ?it/s]

Our image embeddings shape is (3, 512)


In [41]:
if 'Image' in Config.MODEL_TYPE:
    # Same distance threshold (0.35) for both splits; only the target frame differs.
    if IS_SUBMISSION:
        test_image_predictions = get_image_predictions(test, image_embeddings, threshold=0.35)
        test['pred_image'] = test_image_predictions
    else:
        validation_image_predictions = get_image_predictions(validation, image_embeddings, threshold=0.35)
        validation['pred_image'] = validation_image_predictions

#validation.head() 
#test.head()

  0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
if 'Image' in Config.MODEL_TYPE and not IS_SUBMISSION:
    # Score every validation row on its image-based predictions, then average.
    validation['f1score'] = validation.apply(getMetric('pred_image'), axis=1)
    print('Image Prediction Validation Score: ', validation['f1score'].mean())

## Predictions Based on Text

In [43]:
def get_text_embeddings(model_path, dataloader, backbone_path=None):
    """Compute text embeddings for every batch of ``dataloader``.

    A ``ShopeeTextModel`` is restored from ``model_path``, switched to eval
    mode and moved to ``Config.DEVICE``. Each batch's ``input_ids`` and
    ``attention_mask`` are passed through ``model.extract_features`` with
    gradients disabled. The result is NOT normalised here.

    Args:
        model_path: Checkpoint path handed to
            ``ShopeeTextModel.load_from_checkpoint``.
        dataloader: Iterable yielding dict batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        backbone_path: Optional ``backbone_path`` constructor argument; when
            given it is forwarded to the restored model.

    Returns:
        numpy array holding the per-batch features concatenated along axis 0.
    """
    if backbone_path is not None:
        # load_from_checkpoint is a classmethod: pass backbone_path as a
        # keyword so it reaches the restored model's constructor. Building an
        # instance first only creates a throwaway model and ignores the path.
        model = ShopeeTextModel.load_from_checkpoint(model_path, backbone_path=backbone_path)
    else:
        model = ShopeeTextModel.load_from_checkpoint(model_path)
    model.eval()
    model.to(Config.DEVICE)

    all_embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(Config.DEVICE)
            attention_mask = batch['attention_mask'].to(Config.DEVICE)
            feat = model.extract_features(input_ids, attention_mask)
            all_embeddings.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(all_embeddings)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del all_embeddings
    gc.collect()

    return text_embeddings

In [44]:
def get_text_predictions(df, embeddings, threshold=0.0):
    """Collect, for every row, the posting_ids of its close text neighbours.

    A cosine-metric ``NearestNeighbors`` model is fitted on ``embeddings`` and
    queried with the same ``embeddings``. For each row, the neighbours whose
    distance is strictly below ``threshold`` are kept.

    Args:
        df: Frame with a ``posting_id`` column; neighbour indices are looked
            up positionally (``iloc``), so its row order must match
            ``embeddings``.
        embeddings: 2-D array with one embedding per row.
        threshold: Exclusive upper bound on the neighbour distance.

    Returns:
        List with one array of posting_ids per embedding row (empty list when
        ``embeddings`` has no rows).
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        return []

    # Ask for up to 50 neighbours but never more than there are samples:
    # a fixed 50 is rejected (e.g. by scikit-learn) on datasets of 4-49 rows.
    KNN = min(50, n_samples)

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_samples)):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].to_numpy()
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect() # Garbage Collector
    return predictions

In [45]:
if 'Text' in Config.MODEL_TYPE:
    if not IS_SUBMISSION:
        # Local run: embed the validation split with the existing loader.
        dataloader = validation_loader
    else:
        # Submission run: tokenize the test titles and wrap them in an unshuffled loader.
        testset = ShopeeTextDataset(tokenizer, test['title'], max_length=Config.TOKENIZER_MAX_LENGTH)
        test_loader = DataLoader(
            testset,
            batch_size=12,
            num_workers=Config.NUM_WORKERS,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )
        dataloader = test_loader

In [46]:
if 'Text' in Config.MODEL_TYPE:
    if not Config.ENSEMBLE:
        # Single model: embed the selected split with one checkpoint.
        best_model_path = '../input/shopeepretrainedmodels/distilbert-base-indonesian_CurricularFace-epoch08-val_loss12.95.ckpt'
        text_embeddings = get_text_embeddings(model_path=best_model_path, dataloader=dataloader)
        text_embeddings /= np.linalg.norm(text_embeddings, 2, axis=1, keepdims=True) # Normalize embeddings 
    else:
        # Ensemble: embed with every (backbone, checkpoint) pair and join the
        # features column-wise so the row count still matches the data frame.
        # Must be a Python list: an ndarray has no .append().
        all_text_embeddings = []
        for experiment in range(Config.MAX_NUM_EXPERIMENTS):
            text_embeddings = get_text_embeddings(backbone_path=TEXT_BACKBONE_PATHS[experiment], model_path=TEXT_MODEL_PATHS[experiment], dataloader=dataloader)
            # Collect the text embeddings just computed (not image_embeddings).
            all_text_embeddings.append(text_embeddings)
        # axis=1 keeps one row per sample, matching the image ensemble branch.
        text_embeddings = np.concatenate(all_text_embeddings, axis=1)
        text_embeddings /= np.linalg.norm(text_embeddings, 2, axis=1, keepdims=True) # Normalize embeddings 

  0%|          | 0/1 [00:00<?, ?it/s]

Our text embeddings shape is (3, 512)


In [47]:
if 'Text' in Config.MODEL_TYPE:
    # Same distance threshold (0.15) for both splits; only the target frame differs.
    if IS_SUBMISSION:
        test_text_predictions = get_text_predictions(test, text_embeddings, threshold=0.15)
        test['pred_text'] = test_text_predictions
    else:
        validation_text_predictions = get_text_predictions(validation, text_embeddings, threshold=0.15)
        validation['pred_text'] = validation_text_predictions

#test.head()
#validation.head()

  0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
if 'Text' in Config.MODEL_TYPE and not IS_SUBMISSION:
    # Score every validation row on its text-based predictions, then average.
    validation['f1score'] = validation.apply(getMetric('pred_text'), axis=1)
    print('Text Prediction Validation Score: ', validation['f1score'].mean())

## Predictions based on image_phash
Group together all duplicate images, i.e. those with the same `image_phash`. Let's calculate the baseline score for this submission.

In [49]:
def predict_phash(dataset):
    """Map every row to the unique posting_ids that share its ``image_phash``.

    Args:
        dataset: Frame with ``image_phash`` and ``posting_id`` columns.

    Returns:
        Series aligned with ``dataset`` whose values are arrays of the unique
        posting_ids found in the row's ``image_phash`` group.
    """
    phash_to_ids = dataset.groupby('image_phash')['posting_id'].unique()
    return dataset['image_phash'].map(phash_to_ids)

In [50]:
# Attach the phash-based matches to whichever frame this run works on.
if IS_SUBMISSION:
    test['pred_phash'] = predict_phash(test)
else:
    validation['pred_phash'] = predict_phash(validation)

In [51]:
if not IS_SUBMISSION:
    # Score every validation row on its phash-based predictions, then average.
    validation['f1score'] = validation.apply(getMetric('pred_phash'), axis=1)
    print('Phash Prediction Validation Score: ', validation['f1score'].mean())

## Predictions Based on Concatenated Text and Image Embeddings

In [52]:
def get_predictions(df, embeddings, threshold=0.0):
    """Collect, for every row, the posting_ids of its close neighbours.

    A cosine-metric ``NearestNeighbors`` model is fitted on ``embeddings`` and
    queried with the same ``embeddings``. For each row, the neighbours whose
    distance is strictly below ``threshold`` are kept.

    Args:
        df: Frame with a ``posting_id`` column; neighbour indices are looked
            up positionally (``iloc``), so its row order must match
            ``embeddings``.
        embeddings: 2-D array with one embedding per row.
        threshold: Exclusive upper bound on the neighbour distance.

    Returns:
        List with one array of posting_ids per embedding row (empty list when
        ``embeddings`` has no rows).
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        return []

    # Ask for up to 50 neighbours but never more than there are samples:
    # a fixed 50 is rejected (e.g. by scikit-learn) on datasets of 4-49 rows.
    KNN = min(50, n_samples)

    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_samples)):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].to_numpy()
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect() # Garbage Collector
    return predictions

In [53]:
if 'Image' in Config.MODEL_TYPE and 'Text' in Config.MODEL_TYPE:
    # Join image and text features per row and re-normalise the combined vector.
    all_embeddings = np.concatenate([image_embeddings, text_embeddings], axis=1)
    all_embeddings /= np.linalg.norm(all_embeddings, 2, axis=1, keepdims=True)
    if IS_SUBMISSION:
        test['pred_all'] = get_predictions(test, all_embeddings, threshold=0.35)
    else:
        # The validation combine step also reads 'pred_all'; without this
        # branch it fails with a KeyError on non-submission runs.
        validation['pred_all'] = get_predictions(validation, all_embeddings, threshold=0.35)

  0%|          | 0/3 [00:00<?, ?it/s]

## Combine Predictions


In [54]:
if not IS_SUBMISSION:
    def combine_predictions(row):
        """Merge a row's per-method predictions into one sorted array of unique ids.

        ``pred_phash`` is always used. ``pred_text`` / ``pred_image`` are added
        when the matching model type is enabled in ``Config.MODEL_TYPE``, and
        ``pred_all`` when both are enabled and the column exists in ``row``.
        """
        # Start from phash so a result exists even when neither model type is
        # enabled (previously `predictions` was left unbound in that case).
        columns = ['pred_phash']
        use_text = 'Text' in Config.MODEL_TYPE
        use_image = 'Image' in Config.MODEL_TYPE
        if use_text:
            columns.append('pred_text')
        if use_image:
            columns.append('pred_image')
        # 'pred_all' may not have been computed for this frame; skip it then
        # instead of raising a KeyError.
        if use_text and use_image and 'pred_all' in row.index:
            columns.append('pred_all')
        # np.unique sorts, so the concatenation order does not affect the result.
        return np.unique(np.concatenate([row[column] for column in columns]))


    validation['matches'] = validation.apply(combine_predictions, axis=1)
    
#validation.head()

In [55]:
if not IS_SUBMISSION:
    validation['f1score'] = validation.apply(getMetric('matches'), axis=1)
    print('Final Validation Score: ', validation.f1score.mean())

# Make a Submission

In [56]:
# Preview the first rows of the test frame, including the prediction columns added so far.
test.head(n=5)

Unnamed: 0,posting_id,image,image_phash,title,pred_image,pred_text,pred_phash,pred_all
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


In [57]:
def combine_predictions(row):
    """Merge a row's per-method predictions into one space-separated string.

    ``pred_phash`` is always used. ``pred_text`` / ``pred_image`` are added
    when the matching model type is enabled in ``Config.MODEL_TYPE``, and
    ``pred_all`` when both are enabled and the column exists in ``row``.

    Returns:
        The unique ids, sorted by ``np.unique`` and joined with single spaces.
    """
    # Start from phash so a result exists even when neither model type is
    # enabled (previously `predictions` was left unbound in that case).
    columns = ['pred_phash']
    use_text = 'Text' in Config.MODEL_TYPE
    use_image = 'Image' in Config.MODEL_TYPE
    if use_text:
        columns.append('pred_text')
    if use_image:
        columns.append('pred_image')
    # 'pred_all' may not have been computed for this frame; skip it then
    # instead of raising a KeyError.
    if use_text and use_image and 'pred_all' in row.index:
        columns.append('pred_all')
    # np.unique sorts, so the concatenation order does not affect the result.
    predictions = np.concatenate([row[column] for column in columns])
    return ' '.join(np.unique(predictions))

# Write the joined matches string onto whichever frame this run works on.
if IS_SUBMISSION:
    test['matches'] = test.apply(combine_predictions, axis=1)
else:
    validation['matches'] = validation.apply(combine_predictions, axis=1)
#train.head()
#validation.head()
#test.head()

In [58]:
# Keep only the two columns that go into submission.csv, write the file, then preview it.
submission = test.loc[:, ['posting_id', 'matches']]
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929
