# About this Notebook

After careful consideration and a lot of experiments with TF-IDF and BERT-based models, I strongly feel that BERT-based models might do better if trained and used in the right way. [This](https://www.kaggle.com/c/shopee-product-matching/discussion/231510) discussion thread talks about the usage of different approaches for text and discusses why a BERT-based model might be better.

I started with normal Hugging Face BERT-type models, but I found the Sentence Transformers pre-trained models to be a better idea. As the sentence-transformer models were already trained in a siamese fashion, especially for information retrieval and semantic similarity tasks, it is a much better idea to start with them and then fine-tune them on our data. I have used <b> paraphrase-xlm-r-multilingual-v1 </b> from Sentence Transformers; one can try other very good models as well. I have uploaded all models for offline use [here](https://www.kaggle.com/tanulsingh077/sentence-transformer-models)

One more thing that came out of the experiments is to train on the full data instead of splitting it and saving models based on an eval set. To avoid overfitting, one can use strong regularizers such as a fully connected layer on top of the BERT output, weight decay, etc.

This is the inference notebook , you can find the training notebook [here](https://www.kaggle.com/tanulsingh077/metric-learning-pipeline-only-text-sbert)

# Additional Utils

I have added a function get_neighbours_cos_sim, which is the cosine-similarity equivalent of ragnar's get_neighbours function for KNN. Now one can use both to find the best threshold and see which similarity function works best, <b> one thing to note, however, is that you must normalize the embeddings if you use cosine similarity</b>

In [1]:
# !pip install --no-index --find-links '/kaggle/input/faiss-170-latest-cpu-gpu/' faiss-cpu==1.7.0
!pip install --no-index --find-links '../input/faiss170latestcpugpu/' faiss-gpu==1.7.0
!pip install ../input/arcface-baseline/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/arcface-baseline/efficientnet-1.1.0-py3-none-any.whl
import efficientnet.tfkeras as efn
import faiss

import numpy as np, pandas as pd, gc
import math
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
print('RAPIDS',cuml.__version__)
print('TF',tf.__version__)



# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import transformers

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer

Looking in links: ../input/faiss170latestcpugpu/
Processing /kaggle/input/faiss170latestcpugpu/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0
Processing /kaggle/input/arcface-baseline/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/arcface-baseline/efficientnet-1.1.0-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.0
RAPIDS 0.16.0
TF 2.4.1


# Test Configuration

In [2]:
# DataLoader settings used when computing the text embeddings.
NUM_WORKERS = 4
BATCH_SIZE = 16
SEED = 42

# All torch work in this notebook is placed on the GPU.
device = torch.device('cuda')

################################################  ADJUSTING FOR CV OR SUBMIT ##############################################

# CHECK_SUB: when computing CV, duplicate the train frame (see read_dataset).
# GET_CV: True -> work on train.csv and score thresholds; False -> predict test.csv.
CHECK_SUB = False
GET_CV = True

# CV is switched off as soon as test.csv has more than 2 rows.
# NOTE(review): the captured run below reports 3 rows, so this check disables CV
# there as well -- confirm that is intended.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
if len(test)>2: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


################################################# MODEL ###################################################################

# Local directory holding the pretrained sentence-transformer backbone; the
# tokenizer is loaded from the same directory as the model.
# transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
transformer_model = '../input/paraphrase-xlm-r-multilingual-v1/paraphrase-xlm-r-multilingual-v1/'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ MODEL PATH ###############################################################

# Fine-tuned checkpoint loaded by get_text_embeddings.
# TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'
TEXT_MODEL_PATH = '../input/sbert-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments forwarded to ShopeeNet(**model_params).
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

# Reading Data

In [3]:
def read_dataset():
    """Load the frame to predict on, as both a pandas and a cuDF DataFrame.

    With GET_CV off, test.csv is returned unchanged. With GET_CV on, train.csv
    is loaded and a ``matches`` column is added holding the space-separated
    posting_ids that share the row's ``label_group``; if CHECK_SUB is also
    set the frame is stacked on itself and re-indexed.

    Returns:
        (pandas.DataFrame, cudf.DataFrame) built from the same data.
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        return frame, cudf.DataFrame(frame)

    frame = pd.read_csv('../input/shopee-product-matching/train.csv')
    # label_group -> array of all posting_ids in that group (the ground truth)
    members_by_group = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
    frame['matches'] = frame['label_group'].map(members_by_group)
    frame['matches'] = frame['matches'].apply(' '.join)

    if CHECK_SUB:
        frame = pd.concat([frame, frame], axis = 0)
        frame.reset_index(drop = True, inplace = True)

    return frame, cudf.DataFrame(frame)

# Utils

In [4]:
def seed_torch(seed=42):
    """Seed Python, NumPy and torch (CPU and current CUDA device) with `seed`.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(SEED)

In [5]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace into a set; the score of a row is
    2 * |true & pred| / (|true| + |pred|).

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = [set(entry.split()) for entry in y_true]
    pred_sets = [set(entry.split()) for entry in y_pred]
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total_size = np.array([len(p) for p in pred_sets]) + np.array([len(t) for t in true_sets])
    return 2 * overlap / total_size

In [6]:
def get_neighbors_knn(df, embeddings, KNN = 50, threshold=0.7):
    '''
    Find, for every row, the neighbours whose inner-product score exceeds a threshold.

    Adapted from
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    The index is a faiss IndexFlatIP, so the values returned by ``search`` are
    similarities (larger = closer), not distances. They equal cosine similarity
    only if `embeddings` is L2-normalised -- the caller is responsible for that.

    :param df: frame with a ``posting_id`` column (and ``matches`` when GET_CV is on)
    :param embeddings: 2-D array with one embedding per row of `df`
    :param KNN: number of neighbours retrieved per row
    :param threshold: minimum score a neighbour needs to be kept
    :return: ``(df, predictions)``. With GET_CV on, predictions are space-joined
        strings and `df` gains ``pred_matches`` / ``f1`` columns from the
        threshold sweep; otherwise predictions are arrays of posting_ids.
    '''
    model = faiss.IndexFlatIP(embeddings.shape[1])  # inner-product index
    # Index the vectors that were passed in (not a notebook-level global).
    model.add(embeddings)
    distances, indices = model.search(embeddings, KNN)

    posting_ids = df['posting_id'].values
    n_rows = embeddings.shape[0]

    def matches_above(k, cutoff):
        # posting_ids of row k's neighbours scoring strictly above `cutoff`
        return posting_ids[indices[k, distances[k] > cutoff]]

    if GET_CV:
        # Sweep similarity cut-offs and report the F1 of each. The sweep
        # variable is deliberately not called `threshold` so the caller's
        # value survives for the final predictions below.
        thresholds = list(np.arange(0,1,0.1))

        scores = []
        for candidate in thresholds:
            df['pred_matches'] = [' '.join(matches_above(k, candidate)) for k in range(n_rows)]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Final predictions use the caller-supplied threshold.
        predictions = [' '.join(matches_above(k, threshold)) for k in range(n_rows)]
    else:
        predictions = [matches_above(k, threshold) for k in tqdm(range(n_rows))]

    del model, distances, indices
    gc.collect()
    return df, predictions

In [7]:
def _cos_sim_matches(df, embeddings, threshold, limit=None, join=False, verbose=False):
    '''
    Collect, per row, the posting_ids whose dot product with that row exceeds `threshold`.

    The similarity matrix is built in row chunks so that only a
    (chunk x n) block is held at a time.

    :param limit: keep at most this many matches per row (None keeps all)
    :param join: return each row's matches as one space-joined string
    :param verbose: print the bounds of every chunk
    '''
    CHUNK = 1024*4
    posting_ids = df['posting_id'].values
    n_rows = len(embeddings)

    preds = []
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        if verbose:
            print('chunk',a,'to',b)

        # row k of `cts` holds the similarity of item a+k to every item
        cts = cupy.matmul(embeddings,embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0][:limit]
            matched = posting_ids[cupy.asnumpy(IDX)]
            preds.append(' '.join(matched) if join else matched)
    return preds


def get_neighbours_cos_sim(df, embeddings, threshold, KNN):
    '''
    Threshold-based neighbour search on a full similarity matrix (GPU, chunked).

    When using cos_sim use normalized features else use normal features:
    the score is a plain dot product, so it is a cosine similarity only for
    L2-normalised embeddings.

    :param df: frame with a ``posting_id`` column (and ``matches`` when GET_CV is on)
    :param embeddings: 2-D array with one embedding per row of `df`
    :param threshold: minimum similarity for a match; used only when GET_CV is off
    :param KNN: maximum number of matches kept per row; used only when GET_CV is off
    :return: ``(df, preds)``. With GET_CV on, a fixed grid of thresholds is
        scored and `preds` are the space-joined matches of the LAST grid value
        (not the best one), mirrored in ``df['pred_matches']``. Otherwise
        `preds` are arrays of posting_ids.
    '''
    embeddings = cupy.array(embeddings)

    if GET_CV:
        thresholds = list(np.arange(0.5,0.7,0.05))

        scores = []
        # The sweep variable is not called `threshold`, so the argument is left untouched.
        for candidate in thresholds:
            print('Finding similar titles...for threshold :',candidate)
            preds = _cos_sim_matches(df, embeddings, candidate, join=True)
            df['pred_matches'] = preds
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    else:
        print('Finding similar texts...for threshold :',threshold)
        preds = _cos_sim_matches(df, embeddings, threshold, limit=KNN, verbose=True)
        # The similarity blocks are local to the helper and already released,
        # so there is nothing to `del` here (this also makes empty input safe).
        gc.collect()
    return df, preds

# Generating Embeddings

In [8]:
class ShopeeDataset(Dataset):
    """Torch dataset that tokenizes the ``title`` column of a frame on the fly."""

    def __init__(self, csv):
        # reset_index() keeps the old index as an extra column and gives the
        # rows a fresh 0..n-1 index.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for the title at position `index`."""
        title = self.csv.iloc[index]['title']
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        # The tokenizer returns a batch of one; take its only element.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [9]:
class ShopeeNet(nn.Module):
    """Transformer encoder that maps tokenized text to an L2-normalised embedding."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for signature compatibility; not used by this class
        :param model_name: name or path passed to ``transformers.AutoModel.from_pretrained``
        :param use_fc: add a dropout -> linear -> batch-norm head on top of the encoder
        :param fc_dim: output size of that linear layer
        :param dropout: dropout probability of that head
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.use_fc = use_fc

        if self.use_fc:
            hidden_size = self.transformer.config.hidden_size
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        """Xavier-initialise the linear layer; reset batch-norm to identity."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        nn.init.ones_(self.bn.weight)
        nn.init.zeros_(self.bn.bias)

    def forward(self, input_ids,attention_mask):
        """Return the L2-normalised feature for each sequence in the batch."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Return the un-normalised feature: first-token hidden state, optionally through the head."""
        encoder_out = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        # First output of the encoder, sliced at sequence position 0.
        features = encoder_out[0][:, 0, :]

        if not self.use_fc:
            return features

        return self.bn(self.fc(self.dropout(features)))

# Faiss KNN

In [10]:
def get_faiss_neighbors_knn(df, embeddings, KNN = 50, upper_threshold=0.7, lower_threshold =0.7, print_distances=False):
    """Cosine-similarity neighbour search with faiss, at two cut-offs.

    `embeddings` is L2-normalised via ``faiss.normalize_L2`` (no return value
    is used, i.e. the caller's array is modified) and searched with an
    inner-product index moved to the available GPUs, so the returned scores
    are cosine similarities: larger means closer.

    Args:
        df: frame with a ``posting_id`` column (and ``matches`` when GET_CV is on).
        embeddings: 2-D array with one embedding per row of `df`.
        KNN: number of neighbours retrieved per row.
        upper_threshold: similarity cut-off for the first prediction list.
        lower_threshold: similarity cut-off for the second prediction list.
        print_distances: print the raw score matrix.

    Returns:
        ``(df, predictions_upper_threshold, predictions_lower_threshold)``.
        With GET_CV on, a threshold sweep is printed, `df` gains
        ``pred_matches`` / ``f1`` columns from the last swept value, and both
        lists hold space-joined strings. Otherwise both lists hold arrays of
        posting_ids.
    """
    faiss.normalize_L2(embeddings)
    model = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine after normalisation
    model = faiss.index_cpu_to_all_gpus(model)
    model.add(embeddings)    # add vectors to the index
    distances, indices = model.search(embeddings, KNN)
    if print_distances:
        print(distances)

    posting_ids = df['posting_id'].values
    n_rows = embeddings.shape[0]

    def matches_above(k, cutoff):
        # posting_ids of row k's neighbours scoring strictly above `cutoff`
        return posting_ids[indices[k, distances[k] > cutoff]]

    # Iterate through different thresholds to maximize cv, run this in interactive mode
    if GET_CV:
        thresholds = list(np.arange(0.65,0.95,0.01))

        scores = []
        for candidate in thresholds:
            df['pred_matches'] = [' '.join(matches_above(k, candidate)) for k in range(n_rows)]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Final predictions at the caller's cut-offs. The lower list is built
        # from lower_threshold instead of aliasing the upper list.
        predictions_upper_threshold = [' '.join(matches_above(k, upper_threshold)) for k in range(n_rows)]
        predictions_lower_threshold = [' '.join(matches_above(k, lower_threshold)) for k in range(n_rows)]

    else:
        predictions_upper_threshold = []
        predictions_lower_threshold = []
        for k in tqdm(range(n_rows)):
            predictions_upper_threshold.append(matches_above(k, upper_threshold))
            predictions_lower_threshold.append(matches_above(k, lower_threshold))

    del model, distances, indices
    gc.collect()
    return df, predictions_upper_threshold, predictions_lower_threshold

# Generating Submission

In [11]:
def get_text_embeddings(df):
    """Embed every title in `df` with the fine-tuned ShopeeNet.

    Args:
        df: frame with a ``title`` column.

    Returns:
        numpy array with one row per row of `df`; rows are L2-normalised
        because ShopeeNet.forward normalises its output.
    """
    model = ShopeeNet(**model_params)

    # Load onto the CPU first so the checkpoint does not depend on the GPU it
    # was saved from and no second copy of the weights is left on the GPU.
    state = torch.load(TEXT_MODEL_PATH, map_location='cpu')
    # The last checkpoint entry has no counterpart in ShopeeNet and is dropped
    # (presumably the training-time classification head -- TODO confirm).
    model.load_state_dict(dict(list(state.items())[:-1]))
    model = model.to(device)
    model.eval()

    text_dataset = ShopeeDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    embeds = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            feat = model(input_ids.to(device), attention_mask.to(device))
            embeds.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [12]:
# Load the frame to predict on (train.csv or test.csv, depending on GET_CV).
df,df_cu = read_dataset()
# df_cu = cudf.concat(10000*[df_cu], axis=0)
# df_cu.reset_index(drop = True, inplace = True)
# The cuDF copy is dropped straight away; only the pandas frame is used below.
del df_cu
gc.collect()
# Commented-out lines below replicate the frame many times (scale testing).
# df =  pd.concat(500*[df], axis=0)
# df =  pd.concat(11500*[df], axis=0)
# df =  pd.concat(15000*[df], axis=0)
# df.reset_index(drop = True, inplace = True)
# df.head(n=10)
len(df)

3

In [13]:
# df = pd.read_csv('../input/shopee-product-matching/train.csv')
# One embedding row per row of df, computed with the fine-tuned text model.
text_embeddings = get_text_embeddings(df)

100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Our text embeddings shape is (3, 768)


In [14]:
# 12.5 and 0.70 -> 0.719 
# 12.5 and 0.71 -> 0.721
# 12.5 and 0.73 -> 0.724
# 12.5 and 0.75 -> 0.725
# 12.5 and 0.76 -> 0.726
# 12.5 and 0.76 and tfidf -> 0.732
# 12.5 and 0.77 and tfidf -> 0.733
# 12.5 and 0.78 and tfidf -> 0.734
# 12.5 and 0.79 and tfidf -> 0.734 better
# 12.5 and 0.80 and tfidf -> 0.734 more better
# 12.5 and 0.81 and tfidf -> 0.735
# 12.5 and 0.82 and tfidf -> 0.735 better
# 12.5 and 0.83 and tfidf -> 0.736
# 12.5 and 0.84 and tfidf -> 0.735



# __, 12.5 , 0.79(norm) -> 0.727
# 0.83, 12.5 , 0.79(norm) -> 0.725
# 0.84, 12.5 , 0.79(norm) -> 0.724
# 0.82, 12.5 , 0.79(norm) -> 0.726
# 0.81, 12.5 , 0.79(norm) -> 0.726
# 0.80, 12.5 , 0.79(norm) -> 0.726
# 0.79, 12.5 , 0.79(norm) -> 0.727


# threshold = 0.79
# # df,text_predictions = get_neighbours_cos_sim(df, text_embeddings, threshold, KNN = 50)
# try:
#     df,text_predictions = get_neighbors_knn(df, text_embeddings, KNN=50, threshold=threshold)
# except:
#     df,text_predictions = get_neighbors_knn(df, text_embeddings, KNN=3, threshold=threshold)
# test = df
# test['preds'] = text_predictions
# del text_predictions, df, text_embeddings
# gc.collect()

#
#text_KNN =200 , 12.5 , 0.9, 0.75, tfidf=10000 -> 0.729
#text_KNN =200 , 12.5 , 0.9, 0.75, tfidf=5000 -> 0.724
#text_KNN =200 , 12.5 , 0.9, 0.65, tfidf=5000 -> 0.721
#text_KNN =200 , 12.5 , 0.9, 0.70, tfidf=10000 -> 0.730
#text_KNN =150 , 12.5 , 0.9, 0.70, tfidf=10000 -> 0.730
#text_KNN =150 , 12.5 , 0.87, 0.70, 0.9, 0.70, tfidf=10000 -> 0.730
#text_KNN =150 , 12.5 , 0.85, 0.70, 0.9, 0.70, tfidf=10000 -> 0.730
#text_KNN =150 , 12.5 , 0.85, 0.70, 0.9, 0.70, tfidf=10000  New model -> 0.691 
#text_KNN =150 , 12.5 , 0.85, 0.70, 0.9, 0.70, tfidf=10000 New model  better version1->
#text_KNN =100 , 12.5 , 0.85, 0.70, 0.9, 0.70, tfidf=10000
#image_KNN=50, 0.80 text_KNN =100 , 0.85, 0.70, 0.9, 0.70 tfidf=10000 L2norm -> 0.721
#image_KNN=50, 0.75 text_KNN =100 , 0.85, 0.70, 0.9, 0.70 tfidf=10000 L2norm -> 0.726

# KNN=50, 0.75, 0.65 , 0.85, 0.70, 0.9, 0.70 tfidf=10000 L2norm -> 0.730
# KNN=50, 0.75, 0.65 , 0.85, 0.70, 0.9, 0.70 tfidf=25000 L2norm -> 0.734
# KNN=50, 0.75, 0.65 , 0.85, 0.70, 0.85, 0.70 tfidf=25000 L2norm -> 0.735
# KNN=50, 0.75, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm -> 0.736
# KNN=50, 0.75, 0.65 , 0.75, 0.65, 0.75, 0.65 tfidf=25000 L2norm -> 0.729
# KNN=50, 0.75, 0.65 , 0.80, 0.65, 0.80, 0.65 tfidf=25000 L2norm -> 0.735
# KNN=50, 0.75, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm , quora -> 0.518
# KNN=50, 0.75, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm , +quora 0.9 -> 0.558
# KNN=50, 0.75, 0.65 , 0.80, 0.65, 0.85, 0.65 tfidf=25000 L2norm -> 0.734
# KNN=50, 0.75, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm -> 0.736
# KNN=50, 0.80, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm


# Settings for the text neighbour search; trailing comments list other values tried.
text_KNN = 50  # also tried: 100, 150, 200
text_upper_threshold = 0.85  # also tried: 0.75, 0.85, 0.9
text_lower_threshold = 0.65  # also tried: 0.70
print_distance = False  # True prints the raw similarity matrix
# Two prediction lists per row: neighbours above the upper and above the lower cut-off.
df, text_upper_predictions, text_lower_predictions = get_faiss_neighbors_knn(df, text_embeddings, KNN=text_KNN,
                                                                             upper_threshold=text_upper_threshold, 
                                                                             lower_threshold=text_lower_threshold,
                                                                            print_distances=print_distance)

# `test` is another name for the same frame as `df`; predictions are attached as columns.
test = df
test['text_upper_pred_multi'] = text_upper_predictions
test['text_lower_pred_multi'] = text_lower_predictions

# print(test.head())
# Removes the names only; the frame itself stays reachable through `test`.
del text_upper_predictions, text_lower_predictions, df, text_embeddings
gc.collect()

100%|██████████| 3/3 [00:00<00:00, 1268.44it/s]


0

In [15]:
test.head()

Unnamed: 0,posting_id,image,image_phash,title,text_upper_pred_multi,text_lower_pred_multi
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929]


# Quora

In [16]:
# # transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
# transformer_model = '../input/quora-distilbert-multilingual/quora-distilbert-multilingual/'
# TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# ################################################ MODEL PATH ###############################################################

# # TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'
# TEXT_MODEL_PATH = '../input/training-pipeline-xlm-t-better-version/sentence_transfomer_quora_xlm_best_loss_num_epochs_25_arcface.bin'

# model_params = {
#     'n_classes':4036,
#     'model_name':transformer_model,
#     'use_fc':False,
#     'fc_dim':512,
#     'dropout':0.3,
# }
# text_embeddings = get_text_embeddings(test)
# text_KNN = 50#100#150#200
# text_upper_threshold = 0.90#0.80#0.75#0.85#0.9
# text_lower_threshold = 0.65#0.70
# print_distance = False#True
# df, text_upper_predictions, text_lower_predictions = get_faiss_neighbors_knn(test, text_embeddings, KNN=text_KNN,
#                                                                              upper_threshold=text_upper_threshold, 
#                                                                              lower_threshold=text_lower_threshold,
#                                                                             print_distances=print_distance)

# test['text_upper_pred_multi_quora'] = text_upper_predictions
# test['text_lower_pred_multi_quora'] = text_lower_predictions

# # print(test.head())
# del text_upper_predictions, text_lower_predictions, df, text_embeddings
# gc.collect()
# test.head()

# Images

In [17]:
# RESTRICT TENSORFLOW TO 1GB OF GPU RAM
# SO THAT WE HAVE 15GB RAM FOR RAPIDS
LIMIT = 1  # TensorFlow memory cap in GB (converted to MB below)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Only the first physical GPU is configured.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # The error is printed rather than raised, so the notebook keeps running.
    # NOTE(review): presumably raised when the GPU is already initialised -- confirm.
    print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The 16 below is a hard-coded total; it is not queried from the device.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 1GB GPU RAM
then RAPIDS can use 15GB GPU RAM


In [18]:
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Additive angular margin (ArcFace) head.

    Takes ``[features, labels]`` and returns scaled cosine logits in which the
    entry of the labelled class has the margin `m` added to its angle.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        super().__init__(**kwargs)

        # plain hyper-parameters (also what get_config serialises)
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.easy_margin = easy_margin
        self.ls_eps = ls_eps

        # constants derived from the margin, computed once
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        # base-layer config extended with this layer's constructor arguments
        return {
            **super().get_config(),
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        }

    def build(self, input_shape):
        # input_shape is a pair; the first entry is the shape of the features
        feature_shape = input_shape[0]
        super().build(feature_shape)

        # one weight column per class
        self.W = self.add_weight(
            name='W',
            shape=(int(feature_shape[-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        features, labels = inputs
        labels = tf.cast(labels, dtype=tf.int32)

        # cosine between each (normalised) feature row and class column
        unit_features = tf.math.l2_normalize(features, axis=1)
        unit_weights = tf.math.l2_normalize(self.W, axis=0)
        cosine = tf.matmul(unit_features, unit_weights)
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))

        # cos(theta + m) by the angle-addition formula
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)

        one_hot = tf.cast(
            tf.one_hot(labels, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # label smoothing
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # margin logit for the labelled class, plain cosine elsewhere
        blended = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        return blended * self.s

In [19]:
# Rebuild the training-time graph (EfficientNetB3 + ArcFace head + softmax) so
# the saved weights can be loaded, then cut it down to an embedding model.
N_CLASSES = 11011
IMAGE_SIZE = [512, 512]
margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.7, 
            name='head/arc_margin', 
            dtype='float32'
            )

# Two inputs: the image and its label (the ArcFace head consumes the label).
inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
label = tf.keras.layers.Input(shape = (), name = 'inp2')
# weights=None: the backbone starts uninitialised; weights come from the .h5 below.
x = efn.EfficientNetB3(weights = None, include_top = False)(inp)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = margin([x, label])

output = tf.keras.layers.Softmax(dtype='float32')(x)

model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
#     model.load_weights('../input/shopee-efficientnetb3-arcmarginproduct/EfficientNetB3_512_42.h5')
#     model.load_weights('../input/ragnar-best-val-score-weights/EfficientNetB3_512_42.h5')
model.load_weights('../input/ragnar715score/EfficientNetB3_512_42.h5')
# print(model.summary())
# Keep only the image input and an intermediate output as the embedding.
# NOTE(review): layers[-4] looks like the pooled backbone output (the run below
# prints 1536-wide embeddings) -- confirm against model.summary().
model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)
# model.summary()

In [20]:
# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE =8 
# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to a float32 RGB tensor of size IMAGE_SIZE scaled to [0, 1].

    Uses a plain resize (no padding).
    """
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path `image` and return it decoded by decode_image."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)
# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetching tf.data pipeline that yields decoded images.

    `image` is a sequence of file paths; order is preserved.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [21]:
# Compute image embeddings for every row of `test` with the current `model`,
# in chunks of CHUNK images so predictions are collected piecewise.
BASE = '../input/shopee-product-matching/test_images/'
if GET_CV:
    # `test` already refers to the frame being predicted (it was bound when the
    # text predictions were attached). The name `df` has been deleted by then,
    # so re-binding `test = df` here would raise NameError.
    BASE = '../input/shopee-product-matching/train_images/'

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
image_path=BASE+test['image']

for j in range( CTS ):

    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)

    test_gen = get_dataset(image_path[a:b])
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

# The network is no longer needed once all chunks are embedded.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 3
image embeddings shape (3, 1536)


In [22]:
# KNN=50, 0.75, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.75 0.65 -> 0.729
# KNN=50, 0.80, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.80 0.65 -> 0.733
# KNN=50, 0.85, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.85 0.65 -> 0.734
print_distances = False  # True prints the raw similarity matrix

# Settings for the image neighbour search; trailing comments list other values tried.
image_KNN = 50  # also tried: 100, 150, 200
image_upper_threshold = 0.85  # also tried: 0.75
image_lower_threshold = 0.60
# Two prediction lists per row: neighbours above the upper and above the lower cut-off.
df, image_upper_predictions, image_lower_predictions = get_faiss_neighbors_knn(test, image_embeddings, KNN=image_KNN, 
                                                                               upper_threshold=image_upper_threshold, 
                                                                                lower_threshold=image_lower_threshold,
                                                                              print_distances=print_distances)

100%|██████████| 3/3 [00:00<00:00, 2516.58it/s]


In [23]:
# test['preds2'] = preds
# Attach both image prediction lists to the frame as new columns.
test['image_upper_preds'] = image_upper_predictions
test['image_lower_preds'] = image_lower_predictions
# Drop the names that are no longer needed; `test` keeps the frame.
del image_embeddings, image_upper_predictions, image_lower_predictions, df
_ = gc.collect()
test.head()

Unnamed: 0,posting_id,image,image_phash,title,text_upper_pred_multi,text_lower_pred_multi,image_upper_preds,image_lower_preds
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


# image model 2(Eff_B0)

In [24]:
# For tf.dataset
# Rebuild the EfficientNetB0 classifier so its weights can be loaded, then cut
# it down to an embedding model.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# NOTE(review): EPOCHS, LR and VERBOSE are not used in the visible part of the
# notebook -- presumably left over from the training notebook.
EPOCHS = 5
# BATCH_SIZE = 4
IMAGE_SIZE = [384, 384]
# Seed
SEED = 42
# Learning rate
LR = 0.001
# Verbosity
VERBOSE = 2
N_CLASSES = 11011

inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3))
# weights=None: the backbone starts uninitialised; weights come from the .h5 below.
x = efn.EfficientNetB0(weights = None, include_top = False)(inp)
#     x = efn.EfficientNetB4(include_top = False, weights = 'imagenet')(inp)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(512, activation = 'relu')(x)
output = tf.keras.layers.Dense(N_CLASSES, activation = 'softmax')(x)

model = tf.keras.models.Model(inputs = [inp], outputs = [output])

# The dataset folder name mentions B4, but the file loaded is the B0 checkpoint.
model.load_weights('../input/shopee-training-baseline-efficientnetb4/EfficientNetB0_384_42.h5')
# model.summary()
# print('----------------------------')
# Counting back from the softmax layer, layers[-4] is the GlobalAveragePooling2D
# layer, so the embedding is the pooled backbone feature.
model = tf.keras.models.Model(inputs = inp, outputs = model.layers[-4].output)
# model.summary()

In [25]:
# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE =8 
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor padded to IMAGE_SIZE.

    The image is decoded with 3 channels, resized with padding to
    IMAGE_SIZE[0] (width) x IMAGE_SIZE[1] (height), cast to float32 and
    divided by 255.
    """
    width, height = IMAGE_SIZE[0], IMAGE_SIZE[1]
    rgb = tf.image.decode_jpeg(image_data, channels=3)
    padded = tf.image.resize_with_pad(rgb, target_height=height, target_width=width)
    return tf.cast(padded, tf.float32) / 255.0

def read_image(image):
    """Read the file at path `image` and return it processed by decode_image()."""
    return decode_image(tf.io.read_file(image))
def get_dataset(image):
    """Build a batched, prefetching tf.data pipeline over image paths.

    `image` is sliced element-wise into paths; each path is loaded through
    read_image() with AUTO parallel calls, then batched by BATCH_SIZE.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [26]:
# Directory holding the images to embed; CV mode reads the training images.
BASE = '../input/shopee-product-matching/test_images/'
if GET_CV:
    BASE = '../input/shopee-product-matching/train_images/'
    # `test` is deliberately NOT rebound to `df` here: the previous image stage
    # ends with `del ..., df`, so `test = df` raised NameError whenever GET_CV
    # was set. `test` is also the frame that carries the prediction columns
    # added so far (presumably already the training frame in CV mode — verify
    # against the earlier cells).

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
# Number of CHUNK-sized slices needed to cover every row (ceil division).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
image_path = BASE + test['image']

# Embed the images CHUNK rows at a time so only one slice's dataset is alive
# at once; the per-chunk outputs are concatenated afterwards.
for j in range(CTS):
    a = j*CHUNK
    b = min((j+1)*CHUNK, len(test))
    print('chunk',a,'to',b)

    test_gen = get_dataset(image_path.iloc[a:b])
    # use_multiprocessing / workers were dropped: Keras documents them as
    # applying to generator or keras.utils.Sequence inputs only, and the input
    # here is a tf.data dataset that already parallelises its own loading.
    image_embeddings = model.predict(test_gen, verbose=1)
    embeds.append(image_embeddings)

# Release the network before the similarity search runs.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 3
image embeddings shape (3, 1280)


In [27]:
# Experiment log, kept verbatim:
# KNN=50, 0.80, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.9 ->0.736
# KNN=50, 0.80, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.85 -> 0.735
# KNN=50, 0.80, 0.65 , 0.85, 0.65, 0.85, 0.65 tfidf=25000 L2norm Eff_b0:0.95 -> 0.736

# Forwarded to get_faiss_neighbors_knn as `print_distances`.
print_distances = False

# Neighbour count and the two thresholds for the EfficientNet-B0 embeddings.
# Alternatives left in the original cell: KNN 100/150/200, upper 0.75/0.90.
image_KNN = 50
image_upper_threshold = 0.85
image_lower_threshold = 0.60

df, image_upper_predictions, image_lower_predictions = get_faiss_neighbors_knn(
    test,
    image_embeddings,
    KNN=image_KNN,
    upper_threshold=image_upper_threshold,
    lower_threshold=image_lower_threshold,
    print_distances=print_distances,
)

100%|██████████| 3/3 [00:00<00:00, 2775.85it/s]


In [28]:
# test['preds2'] = preds
# Attach the EfficientNet-B0 neighbour lists under their own `_b0` columns so
# they sit next to the earlier image_upper_preds / image_lower_preds columns.
test['image_upper_preds_b0'] = image_upper_predictions
test['image_lower_preds_b0'] = image_lower_predictions
# Drop the embeddings and intermediate results, then force a garbage-collection pass.
del image_embeddings, image_upper_predictions, image_lower_predictions, df
_ = gc.collect()
# Last expression of the cell: the notebook renders the first rows of `test`.
test.head()

Unnamed: 0,posting_id,image,image_phash,title,text_upper_pred_multi,text_lower_pred_multi,image_upper_preds,image_lower_preds,image_upper_preds_b0,image_lower_preds_b0
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


# TF-IDF text model

In [29]:
# test = df
# test = pd.read_csv('../input/shopee-product-matching/train.csv')
# test =  pd.concat(2*[test], axis=0)
print('Computing text embeddings...')
# Binary TF-IDF with English stop words removed and at most 25000 features.
# NOTE(review): presumably the GPU (cuML) vectorizer, since it is fed a cudf
# column and the result is later passed through cupy.asnumpy — confirm against
# the imports.
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25000)
# model = TfidfVectorizer(stop_words='english', binary=True, max_features=10000)
# model = TfidfVectorizer(stop_words='english', binary=True, max_features=5000)
# Copy the frame into a cudf DataFrame so the title column can be vectorized from it.
test_gf = cudf.DataFrame(test)
# text_embeddings = model.fit_transform(test_gf.title).toarray()
# Titles are lower-cased before fitting; .toarray() densifies the result.
text_embeddings = model.fit_transform(test_gf.title.str.lower()).toarray()
print('text embeddings shape',text_embeddings.shape)
# The fitted vectorizer is not needed once the embeddings exist.
del model
_ = gc.collect()

Computing text embeddings...
text embeddings shape (3, 26)


In [30]:
# Convert the TF-IDF matrix to a NumPy array before handing it to
# get_faiss_neighbors_knn.
text_embeddings = cupy.asnumpy(text_embeddings)

# Neighbour count and the two thresholds for the TF-IDF embeddings.
# Alternatives left in the original cell: KNN 100/150/200,
# upper 0.80/0.75/0.85, lower 0.70.
text_KNN = 50
text_upper_threshold = 0.85
text_lower_threshold = 0.65

df, text_upper_predictions, text_lower_predictions = get_faiss_neighbors_knn(
    test,
    text_embeddings,
    KNN=text_KNN,
    upper_threshold=text_upper_threshold,
    lower_threshold=text_lower_threshold,
)

test['text_upper_pred_tfidf'] = text_upper_predictions
test['text_lower_pred_tfidf'] = text_lower_predictions

# Free the intermediates; the columns on `test` keep the results.
del text_upper_predictions, text_lower_predictions, text_embeddings
gc.collect()
test.head()

100%|██████████| 3/3 [00:00<00:00, 2799.31it/s]


Unnamed: 0,posting_id,image,image_phash,title,text_upper_pred_multi,text_lower_pred_multi,image_upper_preds,image_lower_preds,image_upper_preds_b0,image_lower_preds_b0,text_upper_pred_tfidf,text_lower_pred_tfidf
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


In [31]:
# preds = []
# CHUNK = 1024*4

# # 12.5 and 0.71 -> 0.731
# # 12.5 and 0.72 -> 0.731 better
# # 12.5 and 0.73 -> 0.732 
# # 12.5 and 0.74 -> 0.732
# # 12.5 and 0.75 -> 0.732
# # 12.5 and 0.76 -> 0.732 best
# text_threshold = 0.76#.76#0.7

# print('Finding similar titles...')
# CTS = len(test)//CHUNK
# if len(test)%CHUNK!=0: CTS += 1
# for j in range( CTS ):
    
#     a = j*CHUNK
#     b = (j+1)*CHUNK
#     b = min(b,len(test))
#     print('chunk',a,'to',b)
    
#     # COSINE SIMILARITY DISTANCE
#     cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T
# #     print(cts)
#     for k in range(b-a):
#         IDX = cupy.where(cts[k,]>text_threshold)[0][:50]
# # #         print(IDX)
# # #         print(cts[k])
# #         temp_dict = {}
# #         IDX = cupy.asnumpy(IDX)
# #         for i in IDX:
# #             temp_dict[i]=cts[k][i]
# # #         print(temp_dict)
# #         temp_dict =dict(sorted(temp_dict.items(), key=lambda item: item[1], reverse=True))
# # #         print(temp_dict)
# # #         print('-----------')
# #         final_IDX = np.fromiter(temp_dict.keys(), dtype=int)[:50]
# #         o = test.iloc[final_IDX].posting_id.values
#         o = test.iloc[cupy.asnumpy(IDX)].posting_id.values
#         preds.append(o)
# del text_embeddings
# _ = gc.collect()

In [32]:
# KNN = 50
# if len(test)==3: KNN = 3
# # model = NearestNeighbors(n_neighbors=KNN)
# # model.fit(image_embeddings)
# text_embeddings = cupy.asnumpy(text_embeddings)

# # model = faiss.IndexFlatL2(text_embeddings.shape[1])   # build the index
# model = faiss.IndexFlatIP(text_embeddings.shape[1])  # build the index cosine
# model.add(text_embeddings)    # add vectors to the index
# preds = []

# # __, 12.5 , 0.76 -> 0.726
# # 0.83, 12.5 , 0.76 -> 0.726
# # 0.83, 12.5 , 0.7 -> 0.719
# # __, 12.5 , 0.76(norm) -> 0.726
# # __, 12.5 , 0.77(norm) -> 0.727
# # __, 12.5 , 0.78(norm) -> 0.727
# # __, 12.5 , 0.79(norm) -> 0.727
# # __, 12.5 , 0.80(norm) -> 0.726

# text_threshold = 0.79
# # distances, indices = model.kneighbors(image_embeddings)
# distances, indices = model.search(text_embeddings, KNN)
# # print(distances)
# for k in tqdm(range(text_embeddings.shape[0])):
#     idx = np.where(distances[k,] > text_threshold)[0]
#     ids = indices[k,idx]
#     posting_ids = test['posting_id'].iloc[ids].values
#     preds.append(posting_ids)    

# del model, text_embeddings
# _ = gc.collect()

In [33]:
# text_embeddings_np = cupy.asnumpy(text_embeddings)
# # print(text_embeddings)
# # text_embeddings = text_embeddings/np.max(text_embeddings)
# # faiss.normalize_L2(text_embeddings_np)
# print(np.max(text_embeddings_np))
# model = faiss.IndexFlatIP(text_embeddings_np.shape[1])   # build the index
# model.add(text_embeddings_np)    # add vectors to the index
# distances, indices = model.search(text_embeddings_np, KNN)
# print(distances)
# del model

In [34]:
# test['preds4'] = preds
# test.head()

# Perceptual hash (pHash) matches

In [35]:
# Map each image_phash to the unique posting_ids that share it, then give every
# row the group belonging to its own hash.
tmp = test.groupby('image_phash')['posting_id'].unique().to_dict()
test['preds3'] = test['image_phash'].map(tmp)
test.head()

Unnamed: 0,posting_id,image,image_phash,title,text_upper_pred_multi,text_lower_pred_multi,image_upper_preds,image_lower_preds,image_upper_preds_b0,image_lower_preds_b0,text_upper_pred_tfidf,text_lower_pred_tfidf,preds3
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929],[test_4015706929]


# submission

In [36]:
# def combine_for_sub(row):
#     x = np.concatenate([row.preds, row.preds2, row.preds3, row.preds4])
#     return ' '.join( np.unique(x) )

# def combine_for_sub(row):
#     multi = set(row['text_lower_pred_multi'])
#     tfidf = set(row['text_lower_pred_tfidf'])
#     final_text_preds = list(multi.intersection(tfidf))
#     x = np.concatenate([row.preds2, row.preds3, row['text_upper_pred_tfidf'], row['text_upper_pred_multi'],final_text_preds ])
#     return ' '.join( np.unique(x) )

# def combine_for_sub(row):
#     multi = set(row['text_lower_pred_multi'])
#     tfidf = set(row['text_lower_pred_tfidf'])
#     image_lower = set(row['image_lower_preds'])
#     final_text_preds = list(multi.intersection(tfidf))
# #     image_multi_preds = list(multi.intersection(image_lower))
# #     image_tfidf_preds = list(tfidf.intersection(image_lower))
#     x = np.concatenate([row['image_upper_preds'], row.preds3, row['text_upper_pred_tfidf'], 
#                         row['text_upper_pred_multi'], final_text_preds])# image_multi_preds, image_tfidf_preds ])
#     return ' '.join( np.unique(x) )

def combine_for_sub(row):
    """Merge one row's candidate lists into a space-separated match string.

    The "upper" prediction columns and the phash group (`preds3`) are taken
    as they are. The "lower" columns contribute only ids that both models of
    the same modality agree on: multilingual text AND tfidf, and image AND
    image_b0. The result is the sorted, de-duplicated union joined by spaces.
    """
    text_agreed = set(row['text_lower_pred_multi']) & set(row['text_lower_pred_tfidf'])
    image_agreed = set(row['image_lower_preds_b0']) & set(row['image_lower_preds'])

    candidates = [
        row['image_upper_preds'],
        row.preds3,
        row['text_upper_pred_tfidf'],
        row['text_upper_pred_multi'],
        list(text_agreed),
        list(image_agreed),
        row['image_upper_preds_b0'],
    ]
    # np.unique both de-duplicates and sorts the concatenated ids.
    merged = np.unique(np.concatenate(candidates))
    return ' '.join(merged)

In [37]:
# Build the space-separated match string for every row (axis=1 applies
# combine_for_sub row by row).
test['matches'] = test.apply(combine_for_sub,axis=1)
# Write only posting_id and matches to submission.csv, without the index column.
test[['posting_id','matches']].to_csv('submission.csv',index=False)

In [38]:
# test.head()