# About this Notebook

After careful consideration and a lot of experiments with TF-IDF and BERT-based models, I strongly feel BERT-based models might do better if trained and used in the right way. [This](https://www.kaggle.com/c/shopee-product-matching/discussion/231510) discussion thread talks about the usage of different approaches for text and discusses why a BERT-based model might be better.

I started with normal Hugging Face BERT-type models, but I found the Sentence Transformers pre-trained models a better idea. As sentence transformer models were already trained in a siamese fashion, especially for information retrieval and semantic similarity tasks, it is a much better idea to start with them and then fine-tune them on our data. I have used <b> paraphrase-xlm-r-multilingual-v1 </b> from sentence transformers; one can try other very good models as well. I have uploaded all models for offline use [here](https://www.kaggle.com/tanulsingh077/sentence-transformer-models)

One additional thing that has come out of experimentation is to train on the full data instead of splitting it and saving models based on an eval set. To avoid overfitting, one can use strong regularizers such as a fully connected layer on top of the BERT output, weight decay, etc.

This is the inference notebook , you can find the training notebook [here]()

# Additional Utils

I have added a function get_neighbours_cos_sim, which is the cosine-similarity equivalent of ragnar's get_neighbours function for KNN. Now one can use both to find the best threshold and see which similarity function works best for you. <b>One thing to note, however, is that you must normalize the embeddings if you use cosine similarity.</b>

In [1]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import transformers

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer

# Test Configuration

In [2]:
# DataLoader settings for the text model and the seed passed to seed_torch.
NUM_WORKERS = 4
BATCH_SIZE = 16
SEED = 42

# Text batches are moved with .cuda() later on, so a CUDA device is required.
device = torch.device('cuda')

################################################  ADJUSTING FOR CV OR SUBMIT ##############################################

# CHECK_SUB: in CV mode, duplicate the train frame (see read_dataset).
# GET_CV: run the threshold sweep on train.csv instead of predicting on test.csv.
CHECK_SUB = False
GET_CV = True

# `test` is kept as a module-level frame; later cells attach their predictions to it.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
# CV mode is switched off as soon as test.csv has more than 2 rows,
# so GET_CV only stays True for a test file with at most 2 rows.
if len(test)>2: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


################################################# MODEL ####################################################################

# Local copy of the sentence-transformers backbone; the tokenizer is loaded from the same folder.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ MODEL PATH ###############################################################

# Fine-tuned checkpoint loaded by get_text_embeddings.
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments for ShopeeNet. With use_fc False, fc_dim and dropout are not used;
# n_classes is accepted by ShopeeNet but not referenced in its body.
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

# Reading Data

In [3]:
def read_dataset():
    """Load the frame to run on and a cuDF copy of it.

    In CV mode (``GET_CV``) train.csv is read and a ``matches`` column is added
    holding the space-separated posting_ids that share the row's ``label_group``;
    with ``CHECK_SUB`` the frame is additionally stacked on itself.
    Otherwise test.csv is read unchanged.

    Returns:
        (pandas.DataFrame, cudf.DataFrame): the frame and its GPU copy.
    """
    if not GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        return df, cudf.DataFrame(df)

    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    # label_group -> array of every posting_id in that group (the ground truth).
    ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(ids_by_group)
    df['matches'] = df['matches'].apply(' '.join)
    if CHECK_SUB:
        # Double the data to mimic a larger submission-sized run.
        df = pd.concat([df, df], axis = 0)
        df.reset_index(drop = True, inplace = True)
    return df, cudf.DataFrame(df)

# Utils

In [4]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, up front, with the notebook-wide SEED.
seed_torch(SEED)

In [5]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split into a set of ids; the score for a row is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        numpy.ndarray: one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values
    return 2 * overlap / (n_pred + n_true)

In [6]:
def get_neighbors_knn(df, embeddings, KNN = 50):
    '''
    Match every row to its nearest neighbours whose distance is below a cut-off.

    Adapted from
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    In CV mode (GET_CV) a sweep over distance thresholds 0.0 .. 1.9 is scored against
    df['matches'] and printed; df gains 'pred_matches' and 'f1' columns from the last
    swept threshold. The returned predictions always use the fixed 0.60 cut-off:
    space-joined strings in CV mode, arrays of posting_ids otherwise.

    Returns:
        (df, predictions)
    '''

    knn_model = NearestNeighbors(n_neighbors = KNN)
    knn_model.fit(embeddings)
    distances, indices = knn_model.kneighbors(embeddings)
    n_rows = embeddings.shape[0]

    if GET_CV:
        # Run this interactively to pick a threshold, then hard-code it below.
        thresholds = list(np.arange(0,2,0.1))

        scores = []
        for threshold in thresholds:
            predictions = []
            for row in range(n_rows):
                keep = np.where(distances[row,] < threshold)[0]
                matched = df['posting_id'].iloc[indices[row,keep]].values
                predictions.append(' '.join(matched))
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        top_rows = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = top_rows['thresholds'].values[0]
        best_score = top_rows['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # The test set has 70K images and different label groups, so a
        # smaller (stricter) fixed cut-off is used for the returned matches.
        predictions = []
        for row in range(n_rows):
            keep = np.where(distances[row,] < 0.60)[0]
            matched = df['posting_id'].iloc[indices[row,keep]].values
            predictions.append(' '.join(matched))

    else:
        # Same fixed cut-off as above, but the raw id arrays are kept (not joined).
        predictions = []
        for row in tqdm(range(n_rows)):
            keep = np.where(distances[row,] < 0.60)[0]
            predictions.append(df['posting_id'].iloc[indices[row,keep]].values)

    del knn_model, distances, indices
    gc.collect()
    return df, predictions

In [7]:
def _cos_sim_matches(df, embeddings, threshold, join_ids, chunk_size = 1024*4, verbose = False):
    '''
    For every row, collect the posting_ids whose dot product with that row exceeds `threshold`.

    The similarity matrix is computed in row chunks of `chunk_size` so only a
    (chunk, n) block is materialised at a time.

    :param df: frame with a posting_id column, row-aligned with `embeddings`
    :param embeddings: cupy array, one row per posting
    :param threshold: a pair matches when its similarity is strictly greater than this
    :param join_ids: True -> each prediction is a space-joined string, False -> an array of ids
    :param chunk_size: number of query rows per matmul
    :param verbose: print the row range of every chunk
    :return: list with one prediction per row
    '''
    preds = []
    total = len(embeddings)
    for a in range(0, total, chunk_size):
        b = min(a + chunk_size, total)
        if verbose:
            print('chunk',a,'to',b)

        # Row k of cts holds the similarity of row a+k against every row.
        cts = cupy.matmul(embeddings,embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(' '.join(o) if join_ids else o)
    return preds


def get_neighbours_cos_sim(df, embeddings, threshold):
    '''
    Cosine-similarity matching. Pass L2-normalized embeddings: the similarity
    is a plain dot product, which equals cosine similarity only for unit vectors.

    In CV mode (GET_CV) thresholds from np.arange(0.5,0.7,0.05) are scored against
    df['matches'] and printed; df keeps the 'pred_matches' / 'f1' columns of the last
    swept threshold. The returned predictions are then computed at the caller's
    `threshold` (space-joined strings). Previously the sweep variable shadowed the
    `threshold` argument, so the function returned the predictions of whichever
    threshold happened to be swept last.

    Outside CV mode the predictions at `threshold` are returned as arrays of posting_ids.

    :return: (df, preds)
    '''
    embeddings = cupy.array(embeddings)

    if GET_CV:
        thresholds = list(np.arange(0.5,0.7,0.05))

        scores = []
        # `candidate` (not `threshold`) so the caller's argument is not overwritten.
        for candidate in thresholds:
            print('Finding similar titles...for threshold :',candidate)
            df['pred_matches'] = _cos_sim_matches(df, embeddings, candidate, join_ids = True)
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Final predictions honour the threshold the caller asked for.
        print('Finding similar texts...for threshold :',threshold)
        preds = _cos_sim_matches(df, embeddings, threshold, join_ids = True)

    else:
        print('Finding similar texts...for threshold :',threshold)
        preds = _cos_sim_matches(df, embeddings, threshold, join_ids = False, verbose = True)

    return df, preds

# Generating Embeddings

In [8]:
class ShopeeDataset(Dataset):
    """Torch dataset that tokenizes the ``title`` of each row with the global TOKENIZER."""

    def __init__(self, csv):
        # reset_index() gives positional rows and keeps the old index as a column.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for row ``index``.

        Titles are padded/truncated to 128 tokens; the leading batch
        dimension added by ``return_tensors="pt"`` is dropped.
        """
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [9]:
class ShopeeNet(nn.Module):
    """Transformer encoder that maps tokenized titles to L2-normalized embeddings.

    The embedding is the hidden state of the first token, optionally passed
    through dropout -> linear -> batch-norm when ``use_fc`` is set.
    """

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for interface compatibility; not used in this class
        :param model_name: name or path handed to transformers.AutoModel.from_pretrained
        :param use_fc: add the dropout / linear / batch-norm head on top of the encoder
        :param fc_dim: output width of the linear head (only with use_fc)
        :param dropout: dropout probability in front of the linear head (only with use_fc)
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        self.use_fc = use_fc
        if self.use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        """Xavier-normal weights / zero bias for fc, identity scale / zero shift for bn."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        """Return the L2-normalized feature for every sequence in the batch."""
        return F.normalize(self.extract_feat(input_ids,attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the un-normalized feature: first-token hidden state, plus the fc head if enabled."""
        outputs = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        # outputs[0] is indexed as (batch, token, hidden); keep the first token only.
        first_token = outputs[0][:,0,:]
        if not self.use_fc:
            return first_token

        return self.bn(self.fc(self.dropout(first_token)))

# Generating Submission

In [10]:
def get_text_embeddings(df):
    """Embed every title in ``df`` with the fine-tuned ShopeeNet.

    Returns:
        numpy.ndarray: one embedding row per row of ``df``, in order.
    """
    model = ShopeeNet(**model_params)
    model.eval()

    # The last checkpoint entry is dropped before loading
    # (presumably the training-only ArcFace head - confirm against the training notebook).
    checkpoint = torch.load(TEXT_MODEL_PATH)
    model.load_state_dict(dict(list(checkpoint.items())[:-1]))
    model = model.to(device)

    text_loader = torch.utils.data.DataLoader(
        ShopeeDataset(df),
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    batches = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            feat = model(input_ids.cuda(), attention_mask.cuda())
            batches.append(feat.detach().cpu().numpy())

    # Release the model before building the final array.
    del model
    text_embeddings = np.concatenate(batches)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del batches
    gc.collect()
    return text_embeddings

In [11]:
# Load the working frame (train.csv in CV mode, test.csv otherwise) and its cuDF copy.
df,df_cu = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [12]:
# Transformer title embeddings, one row per row of df (normalized by ShopeeNet.forward).
text_embeddings = get_text_embeddings(df)

100%|██████████| 1/1 [00:01<00:00,  1.65s/it]

Our text embeddings shape is (3, 768)





In [13]:
# Tuning log - presumably "<image threshold> and <text threshold> -> score"; confirm with the author.
# 12.5 and 0.70 -> 0.719 
# 12.5 and 0.71 -> 0.721
# 12.5 and 0.73 -> 0.724
# 12.5 and 0.75 -> 0.725
# 12.5 and 0.76 -> 0.726
# 12.5 and 0.76 and tfidf -> 0.732
# 12.5 and 0.77 and tfidf -> 0.733
# 12.5 and 0.78 and tfidf -> 0.734
# 12.5 and 0.79 and tfidf -> 0.734 better
# 12.5 and 0.80 and tfidf -> 0.734 more better
# 12.5 and 0.81 and tfidf -> 0.735
# 12.5 and 0.82 and tfidf -> 0.735 better
# 12.5 and 0.83 and tfidf -> 0.736
# 12.5 and 0.84 and tfidf -> 0.735
threshold = 0.83
df,text_predictions = get_neighbours_cos_sim(df, text_embeddings, threshold)
# NOTE(review): predictions are computed from `df` but stored on `test`; the two frames
# only have the same rows when read_dataset() loaded test.csv (GET_CV False).
test['preds'] = text_predictions
# print(test.head())
del text_predictions
gc.collect()

Finding similar texts...for threshold : 0.83
chunk 0 to 3


20

In [14]:
test.head()

Unnamed: 0,posting_id,image,image_phash,title,preds
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929]


# images

In [15]:
import numpy as np, pandas as pd, gc
import math
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
# Record the library versions this run used.
print('RAPIDS',cuml.__version__)
print('TF',tf.__version__)

RAPIDS 0.16.0
TF 2.4.1


In [16]:
# RESTRICT TENSORFLOW TO 1GB OF GPU RAM
# SO THAT WE HAVE 15GB RAM FOR RAPIDS
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # memory_limit is given as 1024*LIMIT, i.e. LIMIT is expressed in GB.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # The error is printed and the notebook carries on without the limit.
    print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The 16 below is a hard-coded total; the message assumes a 16GB GPU.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 1GB GPU RAM
then RAPIDS can use 15GB GPU RAM


In [17]:
# ArcMarginProduct Keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    Takes [features, labels] and returns logits of shape (batch, n_classes):
    the cosine between each normalized feature and each normalized class
    weight, with an angular margin `m` added to the true class and the result
    scaled by `s`.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        '''
        :param n_classes: number of classes (columns of the weight matrix)
        :param s: scale applied to the output logits
        :param m: angular margin in radians
        :param easy_margin: apply the margin only where cosine > 0
        :param ls_eps: label-smoothing epsilon; 0 disables smoothing
        '''

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Beyond th (theta + m > pi) the fallback `cosine - mm` is used instead of phi.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created.'''

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Create the (feature_dim, n_classes) weight; input_shape[0] is the feature shape.'''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Return the scaled margin logits for inputs = [features, labels].'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Rounding can push cosine**2 marginally above 1, which would make the
        # sqrt argument negative and yield NaN; floor it at a small epsilon
        # (this also keeps the sqrt gradient finite).
        sine = tf.math.sqrt(
            tf.math.maximum(1.0 - tf.math.pow(cosine, 2), tf.keras.backend.epsilon())
        )
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the true class, plain cosine for every other class.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [18]:
# Offline installs from attached datasets (no network access is used).
# !pip install --no-index --find-links '/kaggle/input/faiss-170-latest-cpu-gpu/' faiss-cpu==1.7.0
!pip install --no-index --find-links '../input/faiss170latestcpugpu/' faiss-gpu==1.7.0
!pip install ../input/arcface-baseline/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/arcface-baseline/efficientnet-1.1.0-py3-none-any.whl
import efficientnet.tfkeras as efn
import faiss
# NOTE(review): 11011 here vs n_classes 11014 in the text model_params; presumably this
# must match the ArcMargin weight shape stored in the image checkpoint - confirm.
N_CLASSES = 11011
IMAGE_SIZE = [512, 512]
margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.7, 
            name='head/arc_margin', 
            dtype='float32'
            )

# Rebuild the training graph (image + label inputs -> ArcMargin -> softmax)
# so that the saved weights can be loaded into it.
inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
label = tf.keras.layers.Input(shape = (), name = 'inp2')
x = efn.EfficientNetB3(weights = None, include_top = False)(inp)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = margin([x, label])

output = tf.keras.layers.Softmax(dtype='float32')(x)

model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
#     model.load_weights('../input/shopee-efficientnetb3-arcmarginproduct/EfficientNetB3_512_42.h5')
#     model.load_weights('../input/ragnar-best-val-score-weights/EfficientNetB3_512_42.h5')
model.load_weights('../input/ragnar715score/EfficientNetB3_512_42.h5')

# Inference model: image input only, output taken from layers[-4]
# (presumably the GlobalAveragePooling2D features - verify with model.summary()).
model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)
# model.summary()

Looking in links: ../input/faiss170latestcpugpu/
Processing /kaggle/input/faiss170latestcpugpu/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0
Processing /kaggle/input/arcface-baseline/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/arcface-baseline/efficientnet-1.1.0-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.0


In [19]:
# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE
# NOTE: this rebinds the BATCH_SIZE (16) set in the configuration cell; get_dataset,
# and any later call to get_text_embeddings, will see 8 from here on.
BATCH_SIZE =8 

# Decode raw JPEG bytes into a float image tensor
def decode_image(image_data):
    """Decode JPEG bytes to 3 channels, resize to IMAGE_SIZE and scale by 1/255."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
#     resized = tf.image.resize_with_pad(decoded, target_width = IMAGE_SIZE[0], target_height = IMAGE_SIZE[1])
    return tf.cast(resized, tf.float32) / 255.0

# Load one image file from disk and decode it
def read_image(image):
    """Read the file at path ``image`` and return the tensor produced by decode_image."""
    return decode_image(tf.io.read_file(image))
# Build the batched tf.data pipeline over a collection of image paths
def get_dataset(image):
    """Return a dataset that reads the paths in ``image`` in parallel, batched by BATCH_SIZE and prefetched."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [20]:
# NOTE(review): BASE is set here but never read below; image_path is built from a
# hard-coded test_images folder regardless of GET_CV.
BASE = '../input/shopee-product-matching/test_images/'
if GET_CV: BASE = '../input/shopee-product-matching/train_images/'

# WGT = '../input/effnetb0/efficientnetb0_notop.h5'
# model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)
# No-op self-assignment; `model` is the feature extractor built in the previous cell.
model = model

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
# Number of CHUNK-sized slices needed to cover `test` (ceiling division).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
image_path='../input/shopee-product-matching/test_images/'+test['image']
for i,j in enumerate( range( CTS ) ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
#     test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path=BASE)
    test_gen = get_dataset(image_path[a:b])
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
#     image_embeddings = model.predict(test_gen)
    embeds.append(image_embeddings)

    #if i>=1: break
    
# The network is no longer needed once all chunks are embedded; the name `model`
# is reused for the faiss index in the next cell.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 3
image embeddings shape (3, 1536)


In [21]:
# Neighbours retrieved per image; reduced to 2 for the 3-row sample test set.
KNN = 50
if len(test)==3: KNN = 2
# model = NearestNeighbors(n_neighbors=KNN)
# model.fit(image_embeddings)

# Flat L2 faiss index over the image embeddings.
model = faiss.IndexFlatL2(image_embeddings.shape[1])   # build the index
model.add(image_embeddings)    # add vectors to the index

In [22]:
preds = []
CHUNK = 1024*4
from tqdm import tqdm

print('Finding similar images...')
# CHUNK / CTS are only used by the commented-out chunked cuML search below.
CTS = len(image_embeddings)//CHUNK
if len(image_embeddings)%CHUNK!=0: CTS += 1
# for j in range( CTS ):
    
#     a = j*CHUNK
#     b = (j+1)*CHUNK
#     b = min(b,len(image_embeddings))
#     print('chunk',a,'to',b)
#     distances, indices = model.kneighbors(image_embeddings[a:b,])
    
#     for k in range(b-a):
#         IDX = np.where(distances[k,]<6.0)[0]
# #         IDX = np.where(distances[k,]<3.6)[0]
#         IDS = indices[k,IDX]
#         o = test.iloc[IDS].posting_id.values
#         preds.append(o)
preds = []
# Tuning log: image_threshold -> score
#13.75 -> 0.728
#13 -> 0.729
#12.5 -> 0.730
#12 -> 0.729
image_threshold = 12.5#13#13.75#3.6
# distances, indices = model.kneighbors(image_embeddings)
# One faiss search for all rows: the KNN nearest neighbours of every embedding.
distances, indices = model.search(image_embeddings, KNN)
for k in tqdm(range(image_embeddings.shape[0])):
    # Keep only neighbours whose returned distance is below the threshold.
    idx = np.where(distances[k,] < image_threshold)[0]
    ids = indices[k,idx]
    posting_ids = test['posting_id'].iloc[ids].values
    preds.append(posting_ids)

100%|██████████| 3/3 [00:00<00:00, 1483.48it/s]

Finding similar images...





In [23]:
# Drop the faiss index, search results and image embeddings before the TF-IDF stage.
del model, distances, indices, image_embeddings, embeds
_ = gc.collect()

In [24]:
# preds2 = image-embedding matches.
test['preds2'] = preds
test.head()

Unnamed: 0,posting_id,image,image_phash,title,preds,preds2
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929]


# tfidf

In [25]:
print('Computing text embeddings...')
# RAPIDS TF-IDF over the titles: binary term counts, at most 25,000 features.
# `model` and `text_embeddings` are reused names; the transformer embeddings are overwritten here.
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
test_gf = cudf.DataFrame(test)
# .toarray() converts the result to a dense array, which the matmul in the next cell uses.
text_embeddings = model.fit_transform(test_gf.title).toarray()
print('text embeddings shape',text_embeddings.shape)

Computing text embeddings...
text embeddings shape (3, 26)


In [26]:
preds = []
CHUNK = 1024*4

# Tuning log - presumably "<image threshold> and <tfidf threshold> -> score"; confirm with the author.
# 12.5 and 0.71 -> 0.731
# 12.5 and 0.72 -> 0.731 better
# 12.5 and 0.73 -> 0.732 
# 12.5 and 0.74 -> 0.732
# 12.5 and 0.75 -> 0.732
# 12.5 and 0.76 -> 0.732 best
text_threshold = 0.76#0.7

print('Finding similar titles...')
# Number of CHUNK-sized slices needed to cover `test` (ceiling division).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    # COSINE SIMILARITY DISTANCE
    # Plain dot products; this is a cosine similarity only if the TF-IDF rows are
    # L2-normalized (assumed to be the vectorizer default - TODO confirm).
    cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T
    
    for k in range(b-a):
        # Row k of cts holds the similarity of title a+k against every title.
        IDX = cupy.where(cts[k,]>text_threshold)[0]
        o = test.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)
        
del model, text_embeddings
_ = gc.collect()

Finding similar titles...
chunk 0 to 3


In [27]:
# preds4 = TF-IDF title matches.
test['preds4'] = preds
test.head()

Unnamed: 0,posting_id,image,image_phash,title,preds,preds2,preds4
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929]


# phash

In [28]:
# preds3 = every posting_id that shares exactly the same image_phash as the row.
tmp = test.groupby('image_phash').posting_id.agg('unique').to_dict()
test['preds3'] = test.image_phash.map(tmp)
test.head()

Unnamed: 0,posting_id,image,image_phash,title,preds,preds2,preds4,preds3
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


# Submission

In [29]:
# def combine_for_sub(row):
#     x = np.concatenate([row.preds,row.preds2, row.preds3])
#     return ' '.join( np.unique(x) )
def combine_for_sub(row):
    """Union the four prediction arrays of ``row`` into one space-separated string.

    Duplicates are removed and the ids come out in the sorted order produced by np.unique.
    """
    merged = np.concatenate((row.preds, row.preds2, row.preds3, row.preds4))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [30]:
# Merge the four prediction columns held on `test` into the final matches string per row.
# The result is assigned to `df` by index; `df` and `test` are both read from test.csv
# when GET_CV is False, so the rows line up in that mode.
df['matches'] = test.apply(combine_for_sub,axis=1)
df[['posting_id','matches']].to_csv('submission.csv',index=False)