# References

- https://www.kaggle.com/finlay/unsupervised-image-text-baseline-in-20min
- https://www.kaggle.com/pytorch/resnet152
- https://www.kaggle.com/kwisatzhaderach/glove2word2vec
- https://sigir-ecom.github.io/ecom20DCPapers/SIGIR_eCom20_DC_paper_7.pdf
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


# Goal

1) To use [RESNET152] pretrained model as the backbone to train SHOPEE image dataset for product matching by [IMAGE].

2) To use the [TF-IDF Vectorizer] sklearn model as the backbone to train SHOPEE metadata for product matching by [Title].

3) To retrieve product matches (neighbours) by defining a distance-based threshold for the text and image embeddings.

4) Combine predictions of text, image from Step-[3]

# Import libraries

In [76]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

from PIL import Image

import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from sklearn.preprocessing import normalize

# Helper functions

In [77]:
def evaluate_model_performance(attribute):
    """Build a row-wise F1 scorer for the prediction column ``attribute``.

    Parameters
    ----------
    attribute : str
        Name of the column holding the predicted posting IDs of each row.

    Returns
    -------
    callable
        ``measureF1(sample)``, meant for ``DataFrame.apply(..., axis=1)``.
        It compares ``sample.target`` with ``sample[attribute]`` and returns
        ``2 * |intersection| / (len(target) + len(prediction))``.
    """
    def measureF1(sample):
        # Read the prediction from the row passed in by `apply`; the previous
        # body referenced an undefined name and raised NameError.
        predicted = sample[attribute]
        n = len(np.intersect1d(sample.target, predicted))
        return (2 * n) / (len(sample.target) + len(predicted))
    return measureF1

# Load data 

In [78]:
# Base directory that the CSV files and the image folders are read from.
path = '../input/shopee-product-matching/'

# Store target by Label_group_id

In [80]:
# Switch: True scores against the labelled train.csv, False runs on test.csv.
Training = False

test = pd.read_csv(path + 'test.csv')

# With more than 3 rows in test.csv the run is always forced onto the test data;
# otherwise only a notice is printed and the switch above is left as it is.
if len(test) <= 3:
    print('Configuring CV Settings for training data..')
else:
    Training = False

if not Training:
    # Work on the test set: there is no label_group, hence no target column.
    train = pd.read_csv(path + 'test.csv')
    train['image'] = path + 'test_images/' + train['image']
else:
    train = pd.read_csv(path + 'train.csv')
    train['image'] = path + 'train_images/' + train['image']

    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(tmp)

print()
print('******* Target(Matching) images for training data *******')

train

Configuring CV Settings for training data..

******* Target(Matching) images for training data *******


Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,../input/shopee-product-matching/test_images/0...,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,../input/shopee-product-matching/test_images/0...,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,../input/shopee-product-matching/test_images/0...,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


# Store posting IDs grouped by image hash (image_phash)

In [81]:
# Map every image_phash value to the array of posting_ids that carry it ...
tmp = train.groupby('image_phash').posting_id.agg('unique').to_dict()
# ... and attach that array to each row, so rows with an identical hash list each other.
train['groupedByID_image_phash'] = train.image_phash.map(tmp)
print('******* Image phash for training data grouped by label_groups*******')
# Notebook display: preview the first two rows of the new column.
train['groupedByID_image_phash'].head(2)

******* Image phash for training data grouped by label_groups*******


0    [test_2255846744]
1    [test_3588702337]
Name: groupedByID_image_phash, dtype: object

# Compute Cross-validation score with unique imagehash

In [82]:
if Training:
    # Baseline score using only identical image hashes as predictions.
    # The column must be 'groupedByID_image_phash' (the name written by the
    # grouping step); the previous key 'groupedBy_image_phash' does not exist
    # in `train` and raised KeyError.
    train['f1'] = train.apply(evaluate_model_performance('groupedByID_image_phash'),axis=1)
    print('CV score without training with hash data',train.f1.mean())

# CNN RESNET152 - Find similar products using images

**Note:** Adapted code structure from https://www.kaggle.com/finlay/unsupervised-image-text-baseline-in-20min

In [83]:
class ProductImages(Dataset):
    """Dataset that loads images from file paths and applies a transform.

    Parameters
    ----------
    img_path : sequence of str
        Indexable collection of image file paths.
    transform : callable
        Applied to each RGB ``PIL.Image`` before it is returned.
    """

    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform

    def __getitem__(self, index):
        """Return the transformed RGB image stored at position ``index``."""
        # `Image.open` keeps the underlying file open; the context manager
        # closes it as soon as `convert` has produced an in-memory RGB copy,
        # so file handles are not leaked while iterating a large dataset.
        with Image.open(self.img_path[index]) as raw:
            img = raw.convert('RGB')
        return self.transform(img)

    def __len__(self):
        """Return the number of image paths."""
        return len(self.img_path)

In [84]:
# Image pipeline: resize every picture to 512x512, convert it to a tensor and
# normalise each channel with mean 0.50 / std 0.25.
productImages = ProductImages(
    train['image'].values,
    transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize([0.50, 0.50, 0.50], [0.25, 0.25, 0.25]),
    ]),
)

# shuffle must be False: the extracted features are later looked up by row
# position (`train.iloc[...]`), so batches have to follow the row order of
# `train`. With shuffle=True the embeddings were attributed to the wrong rows.
getProductImages = torch.utils.data.DataLoader(
    productImages,
    batch_size=100, shuffle=False, num_workers=2
)

In [85]:
class ProductImage_Embedding_Net(nn.Module):
    """ResNet-152 with max pooling and its last child module removed.

    The wrapped network is put into eval mode at construction time and is used
    purely as a feature extractor.
    """

    def __init__(self):
        super().__init__()

        # Any other torchvision ResNet constructor could be used here instead.
        backbone = models.resnet152(True)

        # Replace the pooling layer with global max pooling down to a 1x1 map.
        backbone.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))

        # Keep every child module except the final one.
        layers = list(backbone.children())
        extractor = nn.Sequential(*layers[:-1])
        extractor.eval()

        self.model = extractor

    def forward(self, img):
        """Run ``img`` through the truncated network and return its output."""
        return self.model(img)

In [86]:
# `tqdm.tqdm_notebook` is deprecated (tqdm itself asks for `tqdm.notebook.tqdm`),
# so the progress bar is imported from its current location.
from tqdm.notebook import tqdm

DEVICE = 'cuda'

product_Image_Embedding_Net = ProductImage_Embedding_Net()
product_Image_Embedding_Net = product_Image_Embedding_Net.to(DEVICE) #load to GPU cuda device.

# One numpy array per batch, in DataLoader order.
extracted_image_features = []

# Inference only: no gradients are needed, which also lets the outputs be
# converted to numpy directly without the legacy `.data` access.
with torch.no_grad():
    for data in tqdm(getProductImages):
        data = data.to(DEVICE)
        feature = product_Image_Embedding_Net(data)
        # Drop the trailing 1x1 spatial dimensions left by the pooling layer.
        feature = feature.reshape(feature.shape[0], feature.shape[1])
        extracted_image_features.append(feature.cpu().numpy())

print(extracted_image_features)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


  0%|          | 0/1 [00:00<?, ?it/s]

[array([[0.43484437, 0.947295  , 3.708315  , ..., 1.5865095 , 1.3522434 ,
        1.5033988 ],
       [1.7952293 , 1.576125  , 1.4603299 , ..., 1.3755738 , 2.4345665 ,
        3.3643942 ],
       [1.2823381 , 1.2496889 , 3.1288214 , ..., 2.2141232 , 3.9003263 ,
        0.47788632]], dtype=float32)]


In [87]:
# Concatenate the per-batch arrays row-wise and rescale every row with
# sklearn's `normalize` (default settings).
stacked_image_features = normalize(np.vstack(extracted_image_features))

In [88]:
# Wrap the normalised features in a torch tensor and move it to the GPU.
image_embeddings = torch.from_numpy(stacked_image_features).cuda()

In [89]:
# One entry per row of `train`: the posting_ids whose image embedding is similar.
preds = []

IMAGE_CHUNK_SIZE = 512*8

print('Finding similar images...')

# Ceiling division: number of row blocks needed to cover every embedding.
IMAGE_CHUNKS = (len(image_embeddings) + IMAGE_CHUNK_SIZE - 1) // IMAGE_CHUNK_SIZE

for chunk_no in range(IMAGE_CHUNKS):
    start = chunk_no * IMAGE_CHUNK_SIZE
    stop = min(start + IMAGE_CHUNK_SIZE, len(image_embeddings))

    print('Processing product image CHUNKS:',start,'to',stop)

    # Rows: items of this chunk; columns: dot product with every embedding.
    similarities = torch.matmul(image_embeddings, image_embeddings[start:stop].T).T
    similarities = similarities.data.cpu().numpy()

    for row in similarities:
        # Keep the positions whose similarity exceeds the threshold.
        neighbour_idx = np.where(row > 0.75)[0]
        preds.append(train.iloc[neighbour_idx].posting_id.values)

# Release the embeddings and the network once the neighbours are collected.
del image_embeddings, product_Image_Embedding_Net

Finding similar images...
Processing product image CHUNKS: 0 to 3


In [90]:
# Attach the image-based neighbours to their rows.
train['pred_CNN'] = preds

if Training:
    # Row-wise F1 of the image predictions against the target column.
    score_row = evaluate_model_performance('pred_CNN')
    train['f1'] = train.apply(score_row, axis=1)
    print('CV score for RESNET152 Baseline =',train.f1.mean())

# TFIDF - Find similar products by title 

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Binary term counts, no stop-word removal, vocabulary capped at 40000 terms.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=40000)

# Fit on the titles, then densify the matrix returned by the vectorizer.
text_embeddings = model.fit_transform(train.title)
text_embeddings = text_embeddings.toarray()
print('text embeddings shape',text_embeddings.shape)

text embeddings shape (3, 28)


In [92]:
# Wrap the dense title vectors in a torch tensor and move it to the GPU.
text_embeddings = torch.from_numpy(text_embeddings).cuda()

In [93]:
# One entry per row of `train`: the posting_ids whose title vector is similar.
preds = []
TEXT_CHUNK_SIZE = 512*8

print('Finding similar titles...')

# Ceiling division: number of row blocks needed to cover every row of `train`.
TEXT_CHUNKS = (len(train) + TEXT_CHUNK_SIZE - 1) // TEXT_CHUNK_SIZE

# Running count of processed rows.
TEXT_CHUNK_index = 0

for chunk_no in range(TEXT_CHUNKS):
    start = chunk_no * TEXT_CHUNK_SIZE
    stop = min(start + TEXT_CHUNK_SIZE, len(train))
    print('chunk',start,'to',stop)

    # Rows: titles of this chunk; columns: dot product with every title vector.
    cts = torch.matmul(text_embeddings, text_embeddings[start:stop].T).T
    cts = cts.data.cpu().numpy()
    print(cts.shape)
    for row in cts:
        # Keep the positions whose similarity exceeds the threshold.
        neighbour_idx = np.where(row > 0.75)[0]
        preds.append(train.iloc[neighbour_idx].posting_id.values)
        TEXT_CHUNK_index += 1


Finding similar titles...
chunk 0 to 3
(3, 3)


In [94]:
# Attach the title-based neighbours to their rows.
train['pred_TFDIF'] = preds

if Training:
    # Row-wise F1 of the title predictions against the target column.
    score_row = evaluate_model_performance('pred_TFDIF')
    train['f1'] = train.apply(score_row, axis=1)
    print('CV score for TFDIF baseline =',train.f1.mean())

In [95]:
def merge_Text_Image_Predictions(sample):
    """Return the merged predictions of ``sample`` as one space-separated string.

    Same union as ``merge_Text_Image_CV``, joined with single spaces.
    """
    return ' '.join(merge_Text_Image_CV(sample))


def merge_Text_Image_CV(sample):
    """Return the sorted, de-duplicated union of a row's three prediction sets.

    Reads ``sample.pred_TFDIF``, ``sample.pred_CNN`` and
    ``sample.groupedByID_image_phash`` and returns a numpy array.
    """
    candidates = (
        sample.pred_TFDIF,
        sample.pred_CNN,
        sample.groupedByID_image_phash,
    )
    return np.unique(np.concatenate(candidates))

In [96]:
if Training:
    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(tmp)
    # Union of title, image and hash predictions for each row.
    train['pred'] = train.apply(merge_Text_Image_CV,axis=1)
    # Score the merged 'pred' column; scoring 'pred_CNN' here reported the
    # image-only score under the combined "CV Score" label.
    train['f1'] = train.apply(evaluate_model_performance('pred'),axis=1)
    print('CV Score =', train.f1.mean() )

# Space-separated match list per row, in the format written to the submission file.
train['matches'] = train.apply(merge_Text_Image_Predictions,axis=1)

In [97]:
# Write only the two submission columns, then read the file back to preview it.
train.to_csv('submission.csv', columns=['posting_id', 'matches'], index=False)
sub = pd.read_csv('submission.csv')
sub.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744 test_3588702337 test_4015706929
1,test_3588702337,test_2255846744 test_3588702337 test_4015706929
2,test_4015706929,test_2255846744 test_3588702337 test_4015706929


# Inferences

Combined predictions using [RESNET152] (image processing) along with the [TF-IDF Vectorizer] (text processing) provided a better F1 score than stand-alone predictions.

#### LEARNING
1) Using pretrained models to train the SHOPEE Dataset and make predictions(product match).

2) How to retrieve matched images (neighbours) by defining a distance threshold.

#### Step-3 
To replace [RESNET152] model with the EfficientNet [3,5,6] models and train them on the SHOPEE Dataset, to see if the current [F1] Score can be improved.