In [1]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

In [2]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np

# Visuals and CV2
import cv2
import seaborn as sns

# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

#torch
import torch
import timm
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader


import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from cuml.metrics import pairwise_distances
from scipy.spatial.distance import cosine

from transformers import AutoModel, AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
import transformers
import nltk

import lightgbm as lgb
import joblib
from sklearn.preprocessing import StandardScaler

In [3]:
# Square (height, width) sizes; get_test_transforms() passes DIM[0], DIM[1] to Resize.
DIM768 = (768, 768)
DIM544 = (544, 544)
DIM512 = (512, 512)
DIM224 = (224, 224)

# DataLoader settings used by the embedding-extraction functions below.
NUM_WORKERS = 4
BATCH_SIZE = 12
# Seed passed to seed_torch().
SEED = 2020

# NOTE(review): hard-coded to CUDA with no CPU fallback (CFG.device does have one).
device = torch.device('cuda')

# Same value as 'n_classes' in model_params / sentence_params.
CLASSES = 11014

# Names of distance features and per-group coefficients.
# NOTE(review): not referenced in the visible part of the notebook; presumably they are
# column names / blend weights for a later ranking stage — confirm against the later cells.
FEATURES = ['effnet512_dist', 'nfnet512_dist', 'tfidf_dist', 'bert_dist', 'sentence_dist']
TFIDF = ['tfidf_dist']
tfidf_coef = 0.2
CV = ['effnet512_dist', 'nfnet512_dist']
cv_coef = 0.5
TEXT = ['bert_dist', 'sentence_dist']
text_coef = 0.3
# Earlier, larger feature set kept for reference:
# FEATURES = ['effnet768_dist', 'effnet512_dist', 'effnet224_dist', 'nfnet512_dist', 'tfidf_dist', 'bert_dist']

# NOTE(review): not referenced in the visible part of the notebook.
KNN_classifier = 100

################################################  ADJUSTING FOR CV OR SUBMIT ##############################################

# CHECK_SUB: in CV mode, read_dataset() concatenates the frame with itself when this is True.
CHECK_SUB = False
# GET_CV: when True, read_dataset() loads fold 0 of the training folds and
# get_neighbors() sweeps distance thresholds against the ground-truth matches.
GET_CV = True
# SHORT: in CV mode, read_dataset() subsamples 150 rows when this is True.
SHORT = False

# A test.csv with more than 3 rows switches the notebook to inference mode (GET_CV = False).
test = pd.read_csv('../input/shopee-product-matching/test.csv')
if len(test)>3: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


################################################# MODEL ####################################################################

# model_name = 'efficientnet_b3' #efficientnet_b0-b7

################################################ MODEL PATH ###############################################################

# Checkpoint locations for the image backbones; one of these is passed as PATH_EFFNET
# to get_image_embeddings().
PATH_EFFNET_224 = '../input/shopee-models/model_efficientnet_b3_IMG_SIZE_224_arcface_9.088_FullData.bin'
PATH_EFFNET_512 = '../input/shopee-models/model_efficientnet_b4_IMG_SIZE_512_arcface_8.395_FullData.bin'
PATH_EFFNET_768 = '../input/shopee-models/model_efficientnet_b3_IMG_SIZE_768_arcface_9.358_FullData.bin'
PATH_NFNET_512 = '../input/shopee-models/model_eca_nfnet_l0_IMG_SIZE_512_arcface_8.239_FullData.bin'
PATH_XCEPT_512 = '../input/shopee-models/XCEPTION_512_arcface_9.596_RMAC_GEM.bin'
PATH_NFNET_TEMP_512 = '../input/shopee-models/model_eca_nfnet_l0_IMG_SIZE_512_arcface_6.501_FullData_temp_2.bin'


# Text-model checkpoints: PATH_DISTIL_BERT is loaded by get_bert_embeddings(),
# PATH_SENTENCE_MODEL by get_sentence_embeddings().
# NOTE(review): PATH_IND_ROBERTA is not referenced in the visible part of the notebook.
PATH_DISTIL_BERT = '../input/shopee-models/DistilBertFullData.pt'
PATH_IND_ROBERTA = '../input/shopee-text-indonesian-roberta/indonesian_roberta_base_best_loss_num_epochs_60_arcface.bin'
PATH_SENTENCE_MODEL = '../input/shopee-text-sentence-transformers-multilingual/sentence_transfomer_bert_best_loss_num_epochs_70_arcface.bin'

################################################ Metric Loss and its params #######################################################
# Only loss_module is reused by the dicts below; s, m, ls_eps and easy_margin are repeated
# there as literals, so editing these four names alone does not change the models.
loss_module = 'arcface' #'cosface' #'adacos'
s = 30.0
m = 0.5  # margin
ls_eps = 0.0
easy_margin = False

# Keyword arguments unpacked into ShopeeNet(...) by get_image_embeddings().
model_params = {
    'n_classes':11014,
    'use_fc':True,
    'temperature': 0,
    'fc_dim':512,
    'dropout':0.0,
    'loss_module':loss_module,
    's':30.0,
    'margin':0.50,
    'ls_eps':0.0,
    'theta_zero':0.785,
    'pretrained':False
}

# Local directory of the sentence transformer; also used to build the module-level
# TOKENIZER that SentenceDataset calls.
transformer_model = '../input/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# Keyword arguments unpacked into SentenceModel(...) by get_sentence_embeddings().
sentence_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'pooling':'mean_pooling',   #max_pooling , #mean_pooling
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
    'loss_module':loss_module,
    's':30.0,
    'margin':0.50,
    'ls_eps':0.0,
    'theta_zero':0.785
}

this submission notebook will compute CV score, but commit notebook will not


In [4]:
to_eng = { "wanita": "woman", "anak": "child", "bayi": "baby", "tas": "bag", "masker": "face mask", "pria": "men", "murah": "cheap", "tangan": "hand", "alat": "tool", "motif": "motive", "warna": "color", "bahan": "material", "celana": "pants", "baju": "clothes", "kaos": "t-shirt", "sepatu": "shoes", "rambut": "hair", "mainan": "toy", "sarung": "holster", "polos": "plain", "rak": "rack", "botol": "bottle", "sabun": "soap", "kain": "fabric", "panjang": "long", "kabel": "cable", "buku": "book", "plastik": "plastic", "mobil": "car", "hitam": "black", "karakter": "character", "putih": "white", "dompet": "purse", "kaki": "feet", "pembersih": "cleaners", "lipat": "folding", "silikon": "silicone", "minyak": "oil", "isi": "contents", "paket": "package", "susu": "milk", "gamis": "robe", "mandi": "bath", "madu": "honey", "kulit": "skin", "serbaguna": "multipurpose", "bisa": "can", "kacamata": "spectacles", "pendek": "short", "tali": "rope", "selempang": "sash", "topi": "hat", "obat": "drug", "gantungan": "hanger", "tahun": "year", "jilbab": "hijab", "dapur": "kitchen", "dinding": "wall", "kuas": "brush", "perempuan": "woman", "katun": "cotton", "sepeda": "bike", "lucu": "funny", "lengan": "arm", "kaca": "glass", "garansi": "warranty", "bunga": "flower", "handuk": "towel", "dewasa": "adult", "elektrik": "electric", "timbangan": "balance", "besar": "big", "bahan": "ingredient", "ransel": "backpack", "kertas": "paper"}
# Inverted dictionary (English -> Indonesian). Where several Indonesian words share one
# English translation, the entry listed last in `to_eng` is the one that is kept.
to_ind = dict(zip(to_eng.values(), to_eng.keys()))
# Whole-word regex -> replacement pairs, in the form `Series.replace(..., regex=True)` accepts.
to_ind_reg = dict(
    (r'(\b){}(\b)'.format(english), r'\1{}\2'.format(indonesian))
    for english, indonesian in to_ind.items()
)

In [5]:
def _clean_titles(df):
    """Lower-case titles, turn backslashes into spaces, then apply the `to_ind_reg` replacements."""
    df['title'] = df.title.apply(lambda x: x.lower().replace('\\', ' '))
    df['title'] = df['title'].replace(to_ind_reg, regex=True)
    return df


def read_dataset():
    """Load the working frame for either CV or inference, depending on `GET_CV`.

    CV mode (`GET_CV` true): reads the fold file, keeps fold 0, optionally subsamples
    (`SHORT`), builds the space-joined ground-truth `matches` column per `label_group`
    and optionally doubles the frame (`CHECK_SUB`).
    Inference mode: reads `test.csv`.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cuDF copy of it, and a pandas
        Series of image file paths sharing the index of `df`.
    """
    if GET_CV:
        df = pd.read_csv('../input/shopee-folds/train_fold.csv')
        # .copy(): the assignments below must write to an independent frame, not to a
        # slice of the full table (avoids SettingWithCopyWarning / ambiguous writes).
        df = df[df.fold == 0].copy()
        if SHORT:
            # Guard against folds smaller than the requested sample size.
            df = df.sample(min(150, len(df)))
        df = _clean_titles(df)
        group_members = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(group_members).apply(lambda x: ' '.join(x))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)
        image_dir = '../input/shopee-product-matching/train_images/'
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df = _clean_titles(df)
        image_dir = '../input/shopee-product-matching/test_images/'

    df_cu = cudf.DataFrame(df)
    image_paths = image_dir + df['image']
    return df, df_cu, image_paths

In [6]:
def seed_torch(seed=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) RNGs and make cuDNN deterministic.

    Also exports the seed as PYTHONHASHSEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(SEED)

In [7]:
def _match_counts(y_true, y_pred):
    """Return (hits, n_true, n_pred) float arrays for space-separated id strings, row by row."""
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    hits = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    n_true = np.array([len(t) for t in true_sets], dtype=float)
    n_pred = np.array([len(p) for p in pred_sets], dtype=float)
    return hits, n_true, n_pred


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, yielding 0.0 where the denominator is 0."""
    return np.divide(numerator, denominator,
                     out=np.zeros_like(numerator, dtype=float),
                     where=denominator != 0)


def f1_score(y_true, y_pred):
    """Row-wise F1 between true and predicted match sets.

    Args:
        y_true: iterable (e.g. pandas Series) of space-separated ground-truth ids.
        y_pred: iterable of space-separated predicted ids, aligned with `y_true`.

    Returns:
        numpy float array, one score per row; 0.0 where both sets are empty
        (previously NaN from a 0/0 division).
    """
    hits, n_true, n_pred = _match_counts(y_true, y_pred)
    return _safe_ratio(2 * hits, n_pred + n_true)

def prec_score(y_true, y_pred):
    """Row-wise precision (hits / predicted); 0.0 where nothing was predicted."""
    hits, _, n_pred = _match_counts(y_true, y_pred)
    return _safe_ratio(hits, n_pred)

def recall_score(y_true, y_pred):
    """Row-wise recall (hits / true); 0.0 where the ground-truth set is empty."""
    hits, n_true, _ = _match_counts(y_true, y_pred)
    return _safe_ratio(hits, n_true)


def combine_predictions(row):
    """Merge a row's image, text and BERT predictions into one space-joined string of unique, sorted ids."""
    sources = ('image_predictions', 'text_predictions', 'bert_predictions')
    merged = np.concatenate([row[column] for column in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)


def get_neighbors(df, embeddings, KNN = 50, type_='image'):
    """Find, for every row, the posting ids of its neighbours closer than a distance threshold.

    In CV mode (`GET_CV` true) a grid of thresholds is scored against `df['matches']`
    (f1 / precision / recall are printed for each), and the best-f1 threshold is used for
    the returned predictions. As a side effect `df` gains the columns `pred_matches`,
    `f1`, `precision` and `recall`, holding the values of the last threshold tried.
    Otherwise a fixed threshold is used: 0.65 for 'image', 7.5 for 'bert', 0.7 for anything else.

    Returns:
        (df, predictions) where predictions is a list with one array of posting ids per row.
    """
    knn = NearestNeighbors(n_neighbors = KNN)
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)
    n_rows = embeddings.shape[0]

    if GET_CV:
        # Threshold grid: integer steps for 'bert', fractional steps for everything else.
        if type_ == 'bert':
            thresholds = list(np.arange(1, 35, 1))
        else:
            thresholds = list(np.arange(0.01, 1, 0.05))

        scores = []
        for threshold in thresholds:
            joined = []
            for row in range(n_rows):
                close = np.where(distances[row,] < threshold)[0]
                neighbour_rows = indices[row, close]
                joined.append(' '.join(df['posting_id'].iloc[neighbour_rows].values))
            df['pred_matches'] = joined
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            df['precision'] = prec_score(df['matches'], df['pred_matches'])
            df['recall'] = recall_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            print(f"Our precision score for threshold {threshold} is {df['precision'].mean()}")
            print(f"Our recall score for threshold {threshold} is {df['recall'].mean()}")
            scores.append(score)

        # First threshold that reaches the maximum mean f1.
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        is_best = thresholds_scores['scores'] == thresholds_scores['scores'].max()
        max_score = thresholds_scores[is_best]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Final predictions with the selected threshold (arrays of ids, not joined strings).
        predictions = []
        for row in range(n_rows):
            close = np.where(distances[row,] < best_threshold)[0]
            neighbour_rows = indices[row, close]
            predictions.append(df['posting_id'].iloc[neighbour_rows].values)

    else:
        # Fixed thresholds for inference, chosen per embedding type.
        if type_ == 'image':
            limit = 0.65
        elif type_ == 'bert':
            limit = 7.5
        else:
            limit = 0.7
        predictions = []
        for row in tqdm(range(n_rows)):
            close = np.where(distances[row,] < limit)[0]
            neighbour_rows = indices[row, close]
            predictions.append(df['posting_id'].iloc[neighbour_rows].values)

    del knn, distances, indices
    gc.collect()
    return df, predictions

def get_test_transforms(DIM):
    """Build the inference pipeline: resize to (DIM[0], DIM[1]), normalise, convert to tensor."""
    height, width = DIM[0], DIM[1]
    steps = [
        albumentations.Resize(height, width, always_apply=True),
        albumentations.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

In [8]:
class ShopeeDataset(Dataset):
    """Image dataset that loads a file per item and yields (image, dummy_label).

    Args:
        image_paths: sequence of image file paths (list, numpy array or pandas Series).
        transforms: optional callable invoked as `transforms(image=...)` that returns a
            dict with an 'image' entry (the albumentations convention).
    """

    def __init__(self, image_paths, transforms=None):
        # Keep a plain array so that item lookup is positional. A pandas Series taken from a
        # filtered frame keeps its original labels (e.g. 7, 9, 14, ...), and `series[0]`
        # would then be a label lookup that raises KeyError.
        self.image_paths = np.asarray(image_paths)
        self.augmentations = transforms

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        image_path = str(self.image_paths[index])

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # The label is a constant placeholder; only the image is meaningful.
        return image,torch.tensor(1)
    

class TextDataset(torch.utils.data.Dataset):
    """Tokenises every title of a frame up front and serves the encodings item by item.

    Args:
        dataframe: frame with a 'title' column, plus 'label_code' unless mode is "test".
        tokenizer: callable invoked as tokenizer(texts, padding=True, truncation=True,
            max_length=max_length) that returns a mapping of per-sample sequences.
        mode: any value other than "test" makes items carry a 'labels' entry.
        max_length: forwarded to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        self.mode = mode
        if mode != "test":
            self.targets = dataframe['label_code'].values
        titles = [str(title) for title in dataframe['title']]
        self.encodings = tokenizer(titles,
                                   padding=True,
                                   truncation=True,
                                   max_length=max_length)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # One tensor per tokenizer output key, taken from the pre-computed encodings.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        # Targets only exist outside of test mode.
        if self.mode != "test":
            sample['labels'] = torch.tensor(self.targets[idx]).long()
        return sample
    
class SentenceDataset(Dataset):
    """Serves (input_ids, attention_mask, dummy_label) for each title, tokenised on access
    with the module-level `TOKENIZER` (padded / truncated to 64 tokens)."""

    def __init__(self, csv):
        # reset_index() gives positional rows; the old index is kept as a column.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis of size 1; strip it with [0].
        return encoded['input_ids'][0], encoded['attention_mask'][0], torch.tensor(1)

In [9]:
class CFG:
    """Settings for the DistilBERT text model.

    In the visible part of the notebook only `DistilBERT`, `bert_hidden_size`, `max_length`
    and `device` are read; the remaining fields look like training-time settings
    (NOTE(review): confirm whether anything later still uses them).
    """
    # Gates the DistilBERT loading block right after this class. There is no else branch,
    # so with False neither `tokenizer` nor `bert_model` gets defined.
    DistilBERT = True # if set to False, BERT model will be used
    # Default `last_hidden_size` of BertModel.
    bert_hidden_size = 768
    
    batch_size = 128
    epochs = 200
    num_workers = 4
    learning_rate = 3e-5 #3e-5
    scheduler = "ReduceLROnPlateau"
    step = 'epoch'
    patience = 2
    factor = 0.8
    dropout = 0.5
    model_path = "./"
    # Tokenizer max_length used by get_bert_embeddings().
    max_length = 60
    model_save_name = "distil_bert.pt"
    # Falls back to CPU when CUDA is unavailable (unlike the module-level `device`).
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    
# Load the DistilBERT tokenizer and backbone from the local model directory.
if CFG.DistilBERT:
    model_name='../input/distilbert-base-indonesian'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    bert_model = DistilBertModel.from_pretrained(model_name)
    
class BertModel(nn.Module):
    """Wraps a BERT-style encoder and returns the first-token (CLS) hidden state.

    An ArcMarginProduct head is built as `arc_margin` but is not applied in `forward`.
    """

    def __init__(self,
                 bert_model,
                 num_classes=11014,
                 last_hidden_size=CFG.bert_hidden_size):
        super().__init__()
        self.bert_model = bert_model
        self.arc_margin = ArcMarginProduct(
            last_hidden_size,
            num_classes,
            s=30.0,
            m=0.50,
            easy_margin=False,
        )

    def get_bert_features(self, batch):
        """Run the encoder on batch['input_ids'] / batch['attention_mask'] and return the
        hidden state at sequence position 0."""
        encoder_out = self.bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # last_hidden_state is indexed as (batch, position, hidden); keep position 0 only.
        return encoder_out.last_hidden_state[:, 0, :]

    def forward(self, batch):
        return self.get_bert_features(batch)
     

        
class SentenceModel(nn.Module):
    """Transformer text encoder with optional FC neck and a margin / linear classification head."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785):
        """
        :param n_classes: number of output classes of the head
        :param model_name: name or path given to `transformers.AutoModel.from_pretrained`
        :param pooling: 'mean_pooling' (plain mean over all token positions),
            'max_pooling' (mask-aware max over tokens); any other value uses the first token
        :param use_fc: add dropout -> linear(fc_dim) -> batch-norm after pooling
        :param fc_dim: output size of the optional linear layer
        :param dropout: dropout probability of the optional neck
        :param loss_module: 'arcface' builds an ArcMarginProduct head, anything else a linear head
        :param s: ArcFace scale
        :param margin: ArcFace margin
        :param ls_eps: ArcFace label-smoothing epsilon
        :param theta_zero: accepted for signature compatibility; unused by this class
        """
        super(SentenceModel, self).__init__()

        # Honour the `model_name` argument (it used to be ignored in favour of the
        # module-level `transformer_model`).
        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        final_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim

        self.loss_module = loss_module
        if loss_module == 'arcface':
            self.final = ArcMarginProduct(final_in_features, n_classes,
                                          s=s, m=margin, easy_margin=False, ls_eps=ls_eps)
        else:
            self.final = nn.Linear(final_in_features, n_classes)

    def _init_params(self):
        """Xavier-init the FC weight; zero its bias; set the batch-norm to identity scale/shift."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask, label):
        """Return (features, logits); `label` is only used by the 'arcface' head."""
        feature = self.extract_feat(input_ids,attention_mask)
        if self.loss_module == 'arcface':
            logits = self.final(feature, label)
        else:
            logits = self.final(feature)
        return feature, logits

    def extract_feat(self, input_ids,attention_mask):
        """Encode the tokens and pool them into one feature vector per sample."""
        x = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        if self.pooling == 'mean_pooling':
            # NOTE(review): this mean includes padding positions (the mask-aware
            # `mean_pooling` helper below is not used). Left unchanged on purpose:
            # switching would change the embeddings relative to existing checkpoints.
            features = torch.mean(x[0], 1)
        elif self.pooling == 'max_pooling':
            # Was assigned to a misspelled name, which left `features` undefined.
            features = self.max_pooling(x,attention_mask)
        else:
            features = x[0][:,0,:]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)

        return features

    def max_pooling(self,model_output, attention_mask):
        """Max over the token axis with masked-out positions forced to -1e9.

        Note: overwrites the masked positions of `model_output[0]` in place.
        """
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
        max_over_time = torch.max(token_embeddings, 1)[0]
        return max_over_time

    def mean_pooling(self,model_output, attention_mask):
        """Mask-aware mean over the token axis (currently not called by `extract_feat`)."""
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    
    
class ShopeeNet(nn.Module):
    """timm image backbone + pooling + optional FC neck + metric-learning / linear head."""

    def __init__(self,
                 n_classes,
                 model_name='efficientnet_b0',
                 use_fc=False,
                 temperature=0,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785,
                 pretrained=False):
        """
        :param n_classes: number of output classes of the head
        :param model_name: timm model name; supported families are 'xception*',
            'efficientnet*', 'inception_resnet*' and names containing 'nfnet'
        :param use_fc: add dropout -> linear(fc_dim) -> batch-norm -> L2-normalise after pooling
        :param temperature: when non-zero, registers a learnable `temperature` parameter
            (it is not used anywhere else in this class)
        :param fc_dim: output size of the optional linear layer
        :param dropout: dropout probability of the optional neck
        :param loss_module: one of 'arcface', 'cosface', 'adacos'; anything else gives a linear head
        :param s: scale of the margin head
        :param margin: margin of the margin head
        :param ls_eps: ArcFace label-smoothing epsilon
        :param theta_zero: forwarded to AdaCos
        :param pretrained: forwarded to `timm.create_model`
        :raises ValueError: if `model_name` is not one of the supported families
        """
        super(ShopeeNet, self).__init__()
        print('Model building for {} backbone'.format(model_name))

        if temperature != 0:
            self.temperature = nn.Parameter(torch.ones(1) * temperature)

        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        # Strip the classifier of the backbone and remember its input width.
        if model_name.startswith('xception'):
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        elif model_name.startswith('efficientnet'):
            print("EFFNET")
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif model_name.startswith('inception_resnet'):
            print("INCEPTIONRESNET")
            final_in_features = self.backbone.classif.in_features
            self.backbone.classif = nn.Identity()
        elif 'nfnet' in model_name:
            print("NFNET")
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head = nn.Identity()
        else:
            # Previously this fell through and failed later with a NameError.
            raise ValueError('Unsupported backbone for ShopeeNet: {}'.format(model_name))

        self.backbone.global_pool = nn.Identity()

        # Xception models concatenate RMAC and GeM descriptors; all others use average pooling.
        self.use_rmac_gem = model_name.startswith('xception')
        if self.use_rmac_gem:
            self.rmac_pooling = RMAC()
            self.gem_pooling = GeM()

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        # extract_feat() always applies bn1, so it must exist regardless of use_fc
        # (it used to be created only when use_fc was true).
        self.bn1 = nn.BatchNorm2d(final_in_features)

        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            if self.use_rmac_gem:
                # Two concatenated descriptors double the feature width.
                self.fc = nn.Linear(2*final_in_features, fc_dim)
            else:
                self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn2 = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim

        self.loss_module = loss_module
        if loss_module == 'arcface':
            self.final = ArcMarginProduct(final_in_features, n_classes,
                                          s=s, m=margin, easy_margin=False, ls_eps=ls_eps)
        elif loss_module == 'cosface':
            # NOTE(review): AddMarginProduct is not defined in the visible part of the notebook.
            self.final = AddMarginProduct(final_in_features, n_classes, s=s, m=margin)
        elif loss_module == 'adacos':
            # NOTE(review): AdaCos is not defined in the visible part of the notebook.
            self.final = AdaCos(final_in_features, n_classes, m=margin, theta_zero=theta_zero)
        else:
            self.final = nn.Linear(final_in_features, n_classes)

    def _init_params(self):
        """Xavier-init the FC weight; zero its bias; set both batch-norms to identity scale/shift."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn1.weight, 1)
        nn.init.constant_(self.bn1.bias, 0)
        nn.init.constant_(self.bn2.weight, 1)
        nn.init.constant_(self.bn2.bias, 0)

    def forward(self, x, label):
        """Return (features, logits); `label` is only used by the margin-based heads."""
        feature = self.extract_feat(x)
        if self.loss_module in ('arcface', 'cosface', 'adacos'):
            logits = self.final(feature, label)
        else:
            logits = self.final(feature)
        return feature, logits

    def extract_feat(self, x):
        """Backbone -> bn1 -> pooling -> optional FC neck; returns one vector per sample."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.bn1(x)

        if self.use_rmac_gem:
            # An explicit branch replaces the former bare `except:` fallback, so genuine
            # errors inside the pooling layers are no longer silently swallowed.
            gem_x = self.gem_pooling(x).view(batch_size, -1)
            rmac_x = self.rmac_pooling(x).view(batch_size, -1)
            x = torch.cat([rmac_x, gem_x], axis=1)
        else:
            x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn2(x)
            x = F.normalize(x)

        return x

In [10]:
def rmac(x, L=3, eps=1e-6):
    """Sum of L2-normalised max-pooled descriptors over the whole map and over square regions.

    `x` is read as a 4-D tensor with height at dim 2 and width at dim 3. For each level
    l = 1..L a grid of square regions of side floor(2*min(H, W)/(l+1)) is laid over the map;
    each region is max-pooled, L2-normalised over dim 1 and added to the global descriptor.

    Returns a tensor with the spatial dims reduced to 1x1.
    """
    target_overlap = 0.4  # desired overlap of neighboring regions
    candidate_steps = torch.Tensor([2, 3, 4, 5, 6, 7])  # possible regions for the long dimension

    height = x.size(2)
    width = x.size(3)
    short_side = min(width, height)

    # Pick the step count whose region overlap is closest to the target.
    stride = (max(height, width) - short_side) / (candidate_steps - 1)
    overlap_error = torch.abs(((short_side**2 - short_side*stride) / short_side**2) - target_overlap)
    _, best = torch.min(overlap_error, 0)

    # Extra regions along the longer dimension only.
    extra_w = extra_h = 0
    if height < width:
        extra_w = best.item() + 1
    elif height > width:
        extra_h = best.item() + 1

    def _normalised_max(t):
        pooled = F.max_pool2d(t, (t.size(-2), t.size(-1)))
        return pooled / (torch.norm(pooled, p=2, dim=1, keepdim=True) + eps).expand_as(pooled)

    def _region_starts(extent, side, count, half):
        # Top-left coordinates of `count` regions of width `side` spread over `extent`.
        step = 0 if count == 1 else (extent - side) / (count - 1)
        return torch.floor(half + torch.Tensor(range(count)) * step) - half

    v = _normalised_max(x)

    for level in range(1, L + 1):
        side = math.floor(2 * short_side / (level + 1))
        half = math.floor(side / 2 - 1)

        col_starts = _region_starts(width, side, level + extra_w, half)
        row_starts = _region_starts(height, side, level + extra_h, half)

        if side == 0:
            # Degenerate (empty) regions contribute nothing.
            continue

        for top in row_starts.tolist():
            rows = list(range(int(top), int(top) + side))
            for left in col_starts.tolist():
                cols = list(range(int(left), int(left) + side))
                region = x[:, :, rows, :]
                region = region[:, :, :, cols]
                v += _normalised_max(region)

    return v


def gem(x, p=3, eps=1e-6):
    """Generalised-mean pooling over the last two dims: (mean(clamp(x, eps) ** p)) ** (1 / p)."""
    full_window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    return F.avg_pool2d(powered, full_window).pow(1./p)


    
class RMAC(nn.Module):
    """Module wrapper around `rmac` with a fixed number of levels `L` and epsilon `eps`."""

    def __init__(self, L=3, eps=1e-6):
        super().__init__()
        self.L = L
        self.eps = eps

    def forward(self, x):
        return rmac(x, L=self.L, eps=self.eps)

    def __repr__(self):
        return '{}(L={})'.format(self.__class__.__name__, self.L)
    
    
class GeM(nn.Module):
    """Module wrapper around `gem` whose exponent `p` is a learnable one-element parameter."""

    def __init__(self, p=4, eps=1e-6):
        super().__init__()
        self.p = Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return '{}(p={:.4f}, eps={})'.format(self.__class__.__name__, exponent, str(self.eps))
    
class ArcMarginProduct(nn.Module):
    r"""Additive angular margin (ArcFace) head: scaled cos(theta + m) for the labelled class.

        Args:
            in_features: size of each input sample
            out_features: size of each output sample (number of classes)
            s: scale applied to the output logits
            m: angular margin added to the labelled class
            easy_margin: apply the margin only where cos(theta) > 0
            ls_eps: label-smoothing epsilon applied to the one-hot target (0 disables it)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond theta = pi - m the margin formula is replaced by a linear penalty.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled logits of shape (batch, out_features).

        `input` is (batch, in_features); `label` holds one class index per sample.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before the sqrt: rounding can push cos^2 slightly above 1 and give NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate next to `cosine` instead of a hard-coded 'cuda', so the layer also
        # works on CPU and on whichever GPU the input lives on.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # phi for the labelled class, plain cosine for all other classes
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [11]:
def get_image_embeddings(image_paths, model_name, PATH_EFFNET, DIM):
    """Embed every image with a ShopeeNet checkpoint.

    Args:
        image_paths: image file paths handed to ShopeeDataset.
        model_name: timm backbone name passed to ShopeeNet.
        PATH_EFFNET: path of the state-dict checkpoint to load (any supported backbone).
        DIM: (height, width) the images are resized to.

    Returns:
        numpy array with one embedding row per image, in input order.
    """
    model = ShopeeNet(model_name=model_name, **model_params)
    model.load_state_dict(torch.load(PATH_EFFNET, map_location=device))
    model = model.to(device)
    model.eval()

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms(DIM))
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    embeds = []
    with torch.no_grad():
        for img, _ in tqdm(image_loader):
            # Only the features are needed. ShopeeNet.forward returns exactly
            # extract_feat(x) as its first output, so calling extract_feat directly
            # skips the classification head that was being run on a dummy label.
            feat = model.extract_feat(img.to(device))
            embeds.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

def get_bert_embeddings(df):
    """Embed every title in `df` with the fine-tuned DistilBERT model (CLS hidden state).

    Uses the module-level `tokenizer` / `bert_model` and the PATH_DISTIL_BERT checkpoint.

    Returns:
        numpy array with one embedding row per row of `df`, in order.
    """
    bert_dataset = TextDataset(df, tokenizer, max_length=CFG.max_length, mode='test')
    model = BertModel(bert_model)
    model.load_state_dict(torch.load(PATH_DISTIL_BERT, map_location=device))
    # Model and batches go to the same device (the code used to mix CFG.device and device).
    model = model.to(device)
    model.eval()

    loader = torch.utils.data.DataLoader(
        bert_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    embeds = []
    # Inference only: without no_grad every batch would build and keep an autograd graph.
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            preds = model(batch)
            embeds.append(preds.detach().cpu().numpy())

    del model
    bert_embeddings = np.concatenate(embeds)
    print(f'Our bert embeddings shape is {bert_embeddings.shape}')
    del embeds
    gc.collect()
    return bert_embeddings


def get_sentence_embeddings(df):
    """Embed every title in `df` with the fine-tuned sentence-transformer model.

    Builds SentenceModel from `sentence_params` and loads the PATH_SENTENCE_MODEL checkpoint.

    Returns:
        numpy array with one embedding row per row of `df`, in order.
    """
    sentence_dataset = SentenceDataset(csv=df)
    model = SentenceModel(**sentence_params)
    model.load_state_dict(torch.load(PATH_SENTENCE_MODEL, map_location=device))
    model = model.to(device)
    model.eval()

    loader = torch.utils.data.DataLoader(
        sentence_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    embeds = []
    # Inference only: without no_grad every batch would build and keep an autograd graph.
    with torch.no_grad():
        for input_ids, attention_mask, _ in tqdm(loader, total=len(loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # SentenceModel.forward returns extract_feat(...) as its first output;
            # calling it directly skips the head that was being run on a dummy label.
            preds = model.extract_feat(input_ids, attention_mask)
            embeds.append(preds.detach().cpu().numpy())

    del model
    sentence_embeddings = np.concatenate(embeds)
    print(f'Our sentence embeddings shape is {sentence_embeddings.shape}')
    del embeds
    gc.collect()
    return sentence_embeddings

In [12]:
def get_text_embeddings(df_cu, max_features = 15000, n_components = 5000):
    """Build a dense binary TF-IDF matrix from the ``title`` column.

    Args:
        df_cu: frame whose ``title`` column is vectorised.
        max_features: vocabulary cap passed to ``TfidfVectorizer``.
        n_components: accepted for compatibility but unused; the PCA
            reduction it used to configure is disabled.

    Returns:
        The result of calling ``.get()`` on the dense TF-IDF array
        (presumably a host-side NumPy copy of a device array; confirm).
    """
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    sparse_titles = vectorizer.fit_transform(df_cu['title'])
    dense_titles = sparse_titles.toarray()
    print(f'Our title text embedding shape is {dense_titles.shape}')

    # Drop the fitted vectoriser before handing the matrix back.
    del vectorizer
    gc.collect()
    return dense_titles.get()

In [13]:
# Load the working data. read_dataset() is defined outside this section and
# is unpacked into three objects: the frame used for look-ups (`df`), the
# frame fed to the TF-IDF step (`df_cu`, presumably a cuDF copy; confirm) and
# the image paths consumed by the image-embedding step.
df,df_cu,image_paths = read_dataset()
# Last expression of the cell: show the first rows as a sanity check.
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,fold,matches
7,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,kulot plisket salur /candy plisket /wish kulot...,1565741687,0,train_1806152124
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,besar sale sepatu pantofel kulit keren kerja k...,2630990665,0,train_831680791
14,train_4287573913,001f5580b058c6b8e33132190a757318.jpg,dc85e1750687f932,charger vizz vz-tc11 / batok charger vizz 1a o...,1932232224,0,train_4287573913
19,train_2961381387,00303ad1c062fdeaf5f41b9ffb71a5fb.jpg,e48d9b652098efe1,madame gie makeup blush on by gisell,2098400894,0,train_2961381387
21,train_2238403912,003524b70715bf6bfa00451ca08e66e0.jpg,ba35c44a3fb7c068,kangaroo teflon / allu fry pan 18 cm - kg652,531768918,0,train_2238403912


In [15]:
def find_neighbors(embeddings, n):
    """Blend every embedding with its nearest neighbours.

    For each row the ``n`` nearest rows under the cosine metric are looked up
    in ``embeddings`` itself, and their vectors are combined with weights
    that decay log-uniformly from ``10**0`` to ``10**-1.5`` by neighbour rank.

    The neighbour count is derived from ``embeddings`` itself rather than
    from the notebook-global ``df``, so the function no longer depends on
    hidden state. As before, inputs with at most three rows use three
    neighbours; in addition the count is capped at the number of rows so the
    k-NN query can never ask for more neighbours than exist.

    Args:
        embeddings: 2-D array with one embedding per row.
        n: requested number of neighbours (including the row's own entry,
            presumably returned first because its distance to itself is
            minimal; confirm).

    Returns:
        numpy.ndarray: one blended vector per input row, stacked vertically.
    """
    n_rows = embeddings.shape[0]
    if n_rows <= 3:
        n = 3
    n = min(n, n_rows)

    weights = np.logspace(0, -1.5, n)
    model = NearestNeighbors(n_neighbors=n, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Weighted sum over each row's neighbour vectors: (n,) . (n, dim) -> (dim,)
    neighbors = np.vstack([np.dot(weights, embeddings[row_idx]) for row_idx in indices])

    del model, distances, indices
    gc.collect()
    return neighbors

In [16]:
# Text-model embeddings of the titles (one row per posting in `df`).
bert_embeddings = get_bert_embeddings(df)
sentence_embeddings = get_sentence_embeddings(df)

# Image embeddings from two backbones. Both calls pass DIM544 as the image
# size. NOTE(review): the variable and checkpoint names say "512"; confirm the
# intended inference resolution.
image512_embeddings = get_image_embeddings(image_paths.values, 'efficientnet_b3', PATH_EFFNET_512, DIM544)
image512_embeddings_nfnet = get_image_embeddings(image_paths.values, 'eca_nfnet_l0', PATH_NFNET_512, DIM544)

# Binary TF-IDF of the titles. `n_components` is passed but currently ignored
# by get_text_embeddings because its PCA step is commented out.
text_embeddings = get_text_embeddings(df_cu, max_features = 15000, n_components = 5000)


100%|██████████| 571/571 [00:09<00:00, 58.57it/s]


Our image embeddings shape is (6850, 768)


100%|██████████| 571/571 [00:17<00:00, 32.38it/s]


Our image embeddings shape is (6850, 768)
Model building for efficientnet_b3 backbone
EFFNET


100%|██████████| 571/571 [02:10<00:00,  4.37it/s]


Our image embeddings shape is (6850, 512)
Model building for eca_nfnet_l0 backbone
NFNET


100%|██████████| 571/571 [02:13<00:00,  4.29it/s]


Our image embeddings shape is (6850, 512)
Our title text embedding shape is (6850, 12185)


In [17]:
# Replace every neural embedding by the rank-weighted blend of its 4 nearest
# neighbours (see find_neighbors) and store the result in half precision.
# The TF-IDF matrix is not part of this step.
(
    image512_embeddings_new,
    image512_embeddings_nfnet_new,
    bert_embeddings_new,
    sentence_embeddings_new,
) = (
    find_neighbors(raw_embeddings, n=4).astype(np.half)
    for raw_embeddings in (
        image512_embeddings,
        image512_embeddings_nfnet,
        bert_embeddings,
        sentence_embeddings,
    )
)

# The raw matrices are no longer needed once the blended copies exist.
del image512_embeddings, image512_embeddings_nfnet, bert_embeddings, sentence_embeddings
gc.collect()

0

In [19]:
from functools import reduce

# Thresholds on `match`, the weighted blend of standardised neighbour
# distances computed below (smaller = closer). A candidate pair is kept when
# its `match` is below the threshold that applies to its posting:
#   * postings with at most `numb_unpopular` candidates under `thr` use the
#     looser `thr_unpopular`,
#   * postings with more than `numb_popular` candidates under `thr` use the
#     stricter `thr_popular`,
#   * every other posting uses `thr`.
thr = -3.3
thr_unpopular = -3
numb_unpopular = 2
thr_popular = -3.4
numb_popular = 100
CHUNK = 4096

print("THR: ", thr)

n_rows = len(bert_embeddings_new)
# Number of CHUNK-sized query batches (ceiling division).
cts = n_rows // CHUNK
if n_rows % CHUNK != 0:
    cts += 1

# Loop-invariant inputs, hoisted out of the chunk loop.
posting_ids = df['posting_id'].values
embedding_sets = [image512_embeddings_new, image512_embeddings_nfnet_new,
                  text_embeddings, bert_embeddings_new, sentence_embeddings_new]
# Never ask the index for more neighbours than there are rows.
k_neighbors = min(KNN_classifier, n_rows)

chunk_results = []

for i in range(cts):
    a = i*CHUNK
    b = min((i+1)*CHUNK, n_rows)
    chunk_ids = posting_ids[a:b]

    # One long-form frame (posting_id, neigh) -> distance per feature.
    dfs = []
    for embs, dist_col in zip(embedding_sets, FEATURES):
        # NOTE(review): the index is re-fitted for every chunk, as in the
        # original, so only one fitted index is alive at a time. Fitting all
        # of them up front would be faster but keeps every index resident;
        # confirm memory headroom before hoisting.
        model = NearestNeighbors(n_neighbors = k_neighbors)
        model.fit(embs)
        distances, indices = model.kneighbors(embs[a:b])

        # `indices` holds row positions into `df`; translate them to posting
        # ids in one vectorised step and flatten row-major so that every query
        # id is followed by its own neighbours.
        df_markup = cudf.DataFrame({'posting_id': np.repeat(chunk_ids, indices.shape[1]),
                                    'neigh': posting_ids[indices].ravel(),
                                    dist_col: distances.ravel()})

        dfs.append(df_markup.set_index(['posting_id', 'neigh']))

    # Union of the candidate pairs proposed by any feature.
    df_final = reduce(lambda left,right: cudf.merge(left, right, left_index=True, right_index=True, how='outer'), dfs)
    df_final = df_final.to_pandas().reset_index()

    # A pair missing from one feature's neighbour list gets that posting's
    # largest observed distance for the feature.
    for feat in FEATURES:
        df_final[feat] = df_final.groupby('posting_id')[feat].transform(lambda group: group.fillna(group.max()))

    # Standardise every distance column within the chunk, then blend them.
    df_final[FEATURES] = (df_final[FEATURES] - df_final[FEATURES].mean(0)) / df_final[FEATURES].std(0)
    df_final['match'] = cv_coef*df_final[CV].mean(1) + tfidf_coef*df_final[TFIDF].mean(1) + text_coef*df_final[TEXT].mean(1)

    # Candidates under the base threshold, counted once per posting.
    below_thr = df_final['match'] < thr
    hits_per_posting = df_final[below_thr].groupby(['posting_id'])['neigh'].count()
    unpopular_posting_idx = hits_per_posting[hits_per_posting <= numb_unpopular].index.values
    popular_posting_idx = hits_per_posting[hits_per_posting > numb_popular].index.values

    is_unpopular = df_final['posting_id'].isin(unpopular_posting_idx)
    is_popular = df_final['posting_id'].isin(popular_posting_idx)
    keep = ((is_unpopular & (df_final['match'] < thr_unpopular)) |
            (is_popular & (df_final['match'] < thr_popular)) |
            (~is_unpopular & ~is_popular & below_thr))
    df_pred = df_final[keep]

    print("FOUND: ", df_pred.shape)

    # One row per posting with its space-separated unique neighbours.
    # sort=False keeps the postings in order of first appearance.
    chunk_pred = (
        df_pred.groupby('posting_id', sort=False)['neigh']
        .unique()
        .apply(' '.join)
        .rename('pred_matches')
        .reset_index()
    )

    # Postings for which no candidate survived the thresholds used to vanish
    # from `result` (and therefore from submission.csv). Give them the
    # trivial self-match so every posting has exactly one row.
    missing_ids = pd.Index(chunk_ids).difference(chunk_pred['posting_id'])
    if len(missing_ids) > 0:
        self_matches = pd.DataFrame({'posting_id': missing_ids, 'pred_matches': missing_ids})
        chunk_pred = pd.concat([chunk_pred, self_matches], axis=0, ignore_index=True)

    chunk_results.append(chunk_pred)

# Concatenate once instead of growing the frame inside the loop.
if chunk_results:
    result = pd.concat(chunk_results, axis=0).reset_index(drop=True)
else:
    result = pd.DataFrame(columns=['posting_id', 'pred_matches'])

THR:  -3.3
FOUND:  (7786, 8)
FOUND:  (5248, 8)


In [35]:
if not GET_CV:
    # Submission run: expose the predictions under the `matches` column name
    # and write them out.
    result = result[['posting_id', 'pred_matches']].rename(columns={'pred_matches': 'matches'})
    result.to_csv('submission.csv', index = False)
else:
    # CV run: attach the ground-truth `matches` (inner join, so only postings
    # present in `result` are scored) and compute the per-posting metrics.
    result = result.merge(df[['posting_id','matches']], on='posting_id', how='inner')
    truth = result['matches']
    guess = result['pred_matches']
    result['f1'] = f1_score(truth, guess)
    result['recall'] = recall_score(truth, guess)
    result['precision'] = prec_score(truth, guess)

    # Report the mean of every metric over the scored postings.
    score = result['f1'].mean()
    print(f'Our final f1 cv score is {score}')
    score = result['recall'].mean()
    print(f'Our final recall cv score is {score}')
    score = result['precision'].mean()
    print(f'Our final precision cv score is {score}')

    # The submission file always carries the predictions, not the truth.
    result['matches'] = guess
    submission_columns = ['posting_id', 'matches']
    result[submission_columns].to_csv('submission.csv', index = False)

Our final f1 cv score is 0.9019032518163933
Our final recall cv score is 0.9988877302745915
Our final precision cv score is 0.8579723597570302
