In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
import numpy as np 
import pandas as pd 

import pickle
import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import transformers

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

from sklearn.ensemble import RandomForestClassifier

In [None]:
class CFG:
    """Static settings for the image branch of the inference pipeline.

    The class is used purely as a namespace: attributes are read directly
    from the class object, it is never instantiated in this notebook.
    """

    # --- runtime -----------------------------------------------------------
    seed = 2020
    device = 'cuda'
    batch_size = 32

    # --- input -------------------------------------------------------------
    # Side length (pixels) of the square crop fed to the image model.
    img_size = 512

    # --- architecture ------------------------------------------------------
    model_name = 'eca_nfnet_l0'
    classes = 11014
    # ArcFace head hyper-parameters (passed to ArcMarginProduct).
    scale = 30
    margin = 0.5

    # --- checkpoints -------------------------------------------------------
    model_path = '../input/shopee-inference-nfnet/arcface_512x512_nfnet_l0_epoch8_fold1_weights.pth'
    model_path_2 = '../input/shopee-inference-nfnet/arcface_512x512_nfnet_l0_epoch7_fold2_weights.pth'
    model_path_ranger = '../input/shopee-inference/arcface_512x512_ecanfnet_f1_epoch6_fold0.pth'

    
# Fine-tuned weights for the text model (state dict loaded in get_text_embeddings_bert).
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'
# Local directory holding the pretrained transformer; used for both the tokenizer and the model.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
# Module-level tokenizer shared by ShopeeDatasetText.__getitem__ (loaded once, at cell execution).
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# Keyword arguments unpacked into ShopeeNetText(**model_params_bert).
model_params_bert = {
    'n_classes':11014,
    'model_name':transformer_model,
    'pooling':'mean_pooling',
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.0,
}

In [None]:
# Substrings that clean_text surrounds with spaces. Besides punctuation and
# symbols, the tail of the list holds unit tokens ('ml', 'gr', 'mm', 'kg',
# 'pcs', 'ply', 'inch', 'cm'). Matching is plain substring replacement, so
# these units are also split out when they occur inside a longer word.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', 'ml', 'gr', 'mm', 
'kg', 'pcs', 'ply', 'inch', 'cm']

def clean_text(x):
    """Strip newlines from ``x`` and pad every ``puncts`` entry with spaces.

    ``x`` is converted with ``str`` first, so non-string values are accepted.
    Entries are applied one after another in list order, each as a plain
    substring replacement on the result of the previous one.
    """
    text = str(x).replace("\n", "")
    for token in puncts:
        padded = ' ' + token + ' '
        text = text.replace(token, padded)
    return text

def string_escape(s, encoding='utf-8'):
    """Turn literal backslash escapes inside ``s`` into the characters they denote.

    The text is round-tripped through bytes: latin1 -> ``unicode-escape``
    decode (which interprets sequences such as ``\\n`` or ``\\xc3``) ->
    latin1 again (a 1:1 char-to-byte mapping) -> final decode with
    ``encoding``. Characters outside latin1 make the first step raise
    ``UnicodeEncodeError``.
    """
    raw_bytes = s.encode('latin1')
    unescaped = raw_bytes.decode('unicode-escape')
    payload = unescaped.encode('latin1')
    return payload.decode(encoding)

In [None]:
def read_dataset_bert():
    """Return the raw test CSV as a DataFrame (titles left untouched)."""
    return pd.read_csv('../input/shopee-product-matching/test.csv')

In [None]:
def read_dataset_tfidf():
    """Load the test CSV and prepare titles for the TF-IDF branch.

    Each title is unescaped with ``string_escape``, lower-cased, and passed
    through ``clean_text``.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cuDF frame built from
        it, and a Series of image file paths derived from the ``image`` column.
    """
    df = pd.read_csv('../input/shopee-product-matching/test.csv')

    def _normalise(raw_title):
        return clean_text(string_escape(raw_title).lower())

    df['title'] = df['title'].apply(_normalise)
    image_paths = '../input/shopee-product-matching/test_images/' + df['image']
    df_cu = cudf.DataFrame(df)
    return df, df_cu, image_paths

In [None]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` into the process environment.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, using the seed from the shared configuration.
seed_torch(CFG.seed)

In [None]:
def combine_predictions(row):
    """Merge the three prediction lists of ``row`` into one submission string.

    Reads ``image_predictions``, ``text_predictions`` and
    ``text_predictions_bert`` from ``row``, de-duplicates the ids with
    ``np.unique`` (which also sorts them) and joins them with single spaces.
    """
    sources = ('image_predictions', 'text_predictions', 'text_predictions_bert')
    merged = np.concatenate([row[key] for key in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [None]:
def get_image_predictions(df,embeddings,threshold = 0.0):
    """Match each row to its nearest image neighbours by cosine distance.

    Args:
        df: frame with a ``posting_id`` column; row order must match
            ``embeddings``.
        embeddings: one embedding per row of ``df``.
        threshold: neighbours with cosine distance strictly below this value
            are accepted.

    Returns:
        A list with one array of posting ids per row. When a row matched
        exactly one id under ``threshold``, neighbours whose distance lies in
        ``(threshold, threshold + 0.12]`` are appended as a fallback.
    """
    # Never request more neighbours than there are rows: the previous fixed
    # value of 50 for any frame larger than 3 rows breaks on 4..49 rows.
    n_neighbors = min(50, len(df))

    model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
        if len(predictions[-1]) == 1:
            # Only one match (typically the row itself): relax the cut-off
            # by 0.12 and add whatever falls into the extra band.
            idx = np.where((distances[k,] > threshold) & (distances[k,] <= threshold+0.12))[0]
            if len(idx) == 0:
                continue
            ids = indices[k,idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions[-1] = np.concatenate((predictions[-1] , posting_ids))

    del model, distances, indices
    gc.collect()
    return predictions

In [None]:
def get_test_transforms():
    """Build the inference-time albumentations pipeline.

    Images are resized to ``CFG.img_size + 32`` on both sides, centre-cropped
    to ``CFG.img_size``, normalised with ``A.Normalize`` defaults and
    converted to a tensor.
    """
    crop_side = CFG.img_size
    resize_side = crop_side + 32
    steps = [
        A.Resize(resize_side, resize_side, always_apply=True),
        A.CenterCrop(crop_side, crop_side, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

In [None]:
class ShopeeDatasetText(Dataset):
    """Dataset yielding tokenised titles from a DataFrame with a ``title`` column."""

    def __init__(self, csv):
        # reset_index() keeps the old index as a column and makes positions 0..n-1.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for the title at ``index``.

        Titles are padded/truncated to 128 tokens by the module-level
        ``TOKENIZER``; the leading batch dimension it adds is stripped.
        """
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [None]:
class ShopeeDataset(Dataset):
    """Dataset that reads images from disk and applies optional augmentations."""

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        """Return ``(image, dummy_label)`` for the path at ``index``.

        The image is read with OpenCV and converted from BGR to RGB. The
        label is always ``torch.tensor(1)``; it only exists so the loader
        yields the pair the model's ``forward`` signature expects.
        """
        bgr = cv2.imread(self.image_paths[index])
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        return image, torch.tensor(1)

In [None]:
class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (rows of the weight matrix).
        scale: factor the final logits are multiplied by.
        margin: angular margin (radians) added to the target-class angle.
        easy_margin: if True, apply the margin only where ``cos(theta) > 0``.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Pre-computed constants for cos(theta + margin) and its fallback.
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits with the margin applied to each row's ``label`` class."""
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before sqrt: rounding can push cosine^2 slightly above 1,
        # which would otherwise yield NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits instead of hard-coding
        # 'cuda', so the head also works on CPU or on a non-default GPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Target class takes phi, every other class keeps the plain cosine.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ShopeeModel(nn.Module):
    """timm backbone + average pooling (+ optional FC neck) producing image embeddings.

    The backbone's classifier and global pool are replaced with
    ``nn.Identity`` and an ``AdaptiveAvgPool2d(1)`` is applied afterwards.
    An ``ArcMarginProduct`` head is built as ``self.final`` so checkpoints
    containing it can be loaded, but ``forward`` does not call it.

    Args:
        n_classes: number of classes for the ArcFace head.
        model_name: timm model name; must be one of the names handled in
            ``__init__``.
        fc_dim: output width of the optional FC neck.
        margin, scale: forwarded to ``ArcMarginProduct``.
        use_fc: whether to add the two-layer FC neck after pooling.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = False,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('dm_nfnet_f0', 'eca_nfnet_l0', 'nf_resnet50'):
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Without this branch `final_in_features` stays unbound and the
            # constructor fails further down with an unrelated-looking error.
            raise ValueError('Unsupported model_name for ShopeeModel: {!r}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        if self.use_fc:
            self.dropout = nn.Dropout(p=0.0)
            self.fc1 = nn.Linear(final_in_features, fc_dim+512)
            self.bn1 = nn.BatchNorm1d(fc_dim+512)
            self.fc = nn.Linear(fc_dim+512, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self.relu = nn.ReLU()
            self._init_params()
            final_in_features = fc_dim

        self.final = ArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        # Only the last FC/BN pair is re-initialised here; fc1/bn1 keep
        # PyTorch's default initialisation.
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label=None):
        """Return L2-normalised embeddings for ``image``.

        ``label`` is accepted for call-site compatibility but is not used:
        the ArcFace head is bypassed here.
        """
        feature = self.extract_feat(image)
        return F.normalize(feature)

    def extract_feat(self, x):
        """Run backbone, pool, flatten to ``(batch, -1)`` and apply the optional neck."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            # Dropout is constructed but deliberately not applied (p=0.0 anyway).
            x = self.fc1(x)
            x = self.bn1(x)
            x = self.relu(x)
            x = self.fc(x)
            x = self.bn(x)
            x = self.relu(x)

        return x

In [None]:
class ShopeeNetText(nn.Module):
    """Transformer encoder with pooling (+ optional FC neck) producing text embeddings."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for interface compatibility; no
            classification head is built, so it is not used here
        :param model_name: name or path passed to
            ``transformers.AutoModel.from_pretrained``
        :param pooling: ``'mean_pooling'`` or ``'max_pooling'``; any other
            value selects the first token's embedding
        :param use_fc: add a dropout -> linear -> batch-norm -> ReLU neck
        :param fc_dim: output width of the neck
        :param dropout: dropout probability used in the neck
        """
        super(ShopeeNetText, self).__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        final_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self.relu = nn.ReLU()
            self._init_params()
            final_in_features = fc_dim

    def _init_params(self):
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        """Return L2-normalised text embeddings."""
        feature = self.extract_feat(input_ids,attention_mask)
        return F.normalize(feature)

    def extract_feat(self, input_ids,attention_mask):
        """Encode, pool according to ``self.pooling`` and apply the optional neck."""
        x = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        if self.pooling == 'mean_pooling':
            features = self.mean_pooling(x,attention_mask)
        elif self.pooling == 'max_pooling':
            # Was assigned to a misspelled name, leaving `features` unbound
            # and making this branch crash.
            features = self.max_pooling(x,attention_mask)
        else:
            # Fallback: embedding of the first token of each sequence.
            features = x[0]
            features = features[:,0,:]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)
            features = self.relu(features)

        return features

    def max_pooling(self,model_output, attention_mask):
        """Element-wise max over tokens, ignoring positions where the mask is 0."""
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # masked_fill returns a new tensor, so the encoder output held by
        # the caller is no longer overwritten in place.
        masked = token_embeddings.masked_fill(input_mask_expanded == 0, -1e9)
        max_over_time = torch.max(masked, 1)[0]
        return max_over_time

    def mean_pooling(self,model_output, attention_mask):
        """Mask-weighted mean over tokens."""
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp avoids division by zero for an all-zero mask.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

In [None]:
class Mish_func(torch.autograd.Function):

    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward.

    Only the input is saved for backward; intermediates are recomputed.
    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is deprecated.
        (i,) = ctx.saved_tensors

        tanh_sp = torch.tanh(F.softplus(i))

        # d/dx tanh(softplus(x)) = sech^2(softplus(x)) * sigmoid(x),
        # with sech^2(h) = 1 - tanh^2(h). Reusing tanh avoids the explicit
        # exp/log/cosh chain and its overflow to inf for large inputs.
        grad_gx = (1. - tanh_sp * tanh_sp) * i.sigmoid()

        grad_f = tanh_sp + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around ``Mish_func``.

    Any keyword arguments are accepted and ignored, so the class can be
    constructed with the arguments of the activation it replaces.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input_tensor):
        return Mish_func.apply(input_tensor)


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of exact type ``existing_layer``.

    ``new_layer`` is an already-constructed module; the same instance is
    installed at every replaced position. ``model`` is modified in place and
    also returned.
    """

    for child_name, child in reversed(list(model._modules.items())):
        has_children = any(True for _ in child.children())
        if has_children:
            # Descend first; the call returns the same (mutated) container.
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        # Exact type match only: subclasses of existing_layer are left alone.
        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

In [None]:
def get_image_embeddings(image_paths, model_name = CFG.model_name):
    """Embed all images with the checkpoint at ``CFG.model_path``.

    Each batch is embedded twice, as-is and flipped along the last axis,
    and the two results are averaged.

    Returns:
        A NumPy array with one row per entry of ``image_paths``.
    """
    net = ShopeeModel(model_name = model_name)
    net.eval()

    # No activation replacement for this checkpoint: the model is used as built.
    state = torch.load(CFG.model_path)
    net.load_state_dict(state, strict=False)
    net = net.to(CFG.device)

    image_loader = torch.utils.data.DataLoader(
        ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms()),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    per_batch = []
    with torch.no_grad():
        for img,label in tqdm(image_loader):
            img = img.cuda()
            label = label.cuda()
            plain = net(img,label).detach().cpu().numpy()
            mirrored = net(img.flip(-1),label).detach().cpu().numpy()
            per_batch.append((plain + mirrored)/2)

    del net
    image_embeddings = np.concatenate(per_batch)

    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del per_batch
    gc.collect()
    return image_embeddings

In [None]:
def get_image_embeddings_fold2(image_paths, model_name = CFG.model_name):
    """Embed all images with the checkpoint at ``CFG.model_path_2``.

    Each batch is embedded twice, as-is and flipped along the last axis,
    and the two results are averaged.

    Returns:
        A NumPy array with one row per entry of ``image_paths``.
    """
    net = ShopeeModel(model_name = model_name)
    net.eval()

    # No activation replacement for this checkpoint: the model is used as built.
    state = torch.load(CFG.model_path_2)
    net.load_state_dict(state, strict=False)
    net = net.to(CFG.device)

    image_loader = torch.utils.data.DataLoader(
        ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms()),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    per_batch = []
    with torch.no_grad():
        for img,label in tqdm(image_loader):
            img = img.cuda()
            label = label.cuda()
            plain = net(img,label).detach().cpu().numpy()
            mirrored = net(img.flip(-1),label).detach().cpu().numpy()
            per_batch.append((plain + mirrored)/2)

    del net
    image_embeddings = np.concatenate(per_batch)

    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del per_batch
    gc.collect()
    return image_embeddings

In [None]:
def get_image_embeddings_ranger(image_paths, model_name = CFG.model_name):
    """Embed all images with the checkpoint at ``CFG.model_path_ranger``.

    For the two NFNet model names, ``SiLU`` activations are replaced with
    ``Mish`` before the weights are loaded. Each batch is embedded twice,
    as-is and flipped along the last axis, and the results are averaged.

    Returns:
        A NumPy array with one row per entry of ``image_paths``.
    """
    net = ShopeeModel(model_name = model_name)
    net.eval()

    if model_name in ('dm_nfnet_f0', 'eca_nfnet_l0'):
        net = replace_activations(net, torch.nn.SiLU, Mish())

    state = torch.load(CFG.model_path_ranger)
    net.load_state_dict(state, strict=False)
    net = net.to(CFG.device)

    image_loader = torch.utils.data.DataLoader(
        ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms()),
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    per_batch = []
    with torch.no_grad():
        for img,label in tqdm(image_loader):
            img = img.cuda()
            label = label.cuda()
            plain = net(img,label).detach().cpu().numpy()
            mirrored = net(img.flip(-1),label).detach().cpu().numpy()
            per_batch.append((plain + mirrored)/2)

    del net
    image_embeddings = np.concatenate(per_batch)

    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del per_batch
    gc.collect()
    return image_embeddings

In [None]:
def get_text_embeddings_bert(df):
    """Embed every title in ``df`` with the fine-tuned text model.

    The checkpoint at ``TEXT_MODEL_PATH`` is loaded with its last state-dict
    entry dropped before being passed to ``load_state_dict``.

    Returns:
        A NumPy array with one embedding row per row of ``df``.
    """
    net = ShopeeNetText(**model_params_bert)
    net.eval()

    checkpoint_items = list(torch.load(TEXT_MODEL_PATH).items())
    # Everything except the final entry is loaded (strict matching).
    net.load_state_dict(dict(checkpoint_items[:-1]))
    net = net.to(CFG.device)

    text_loader = torch.utils.data.DataLoader(
        ShopeeDatasetText(df),
        batch_size=32,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    per_batch = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            batch_feat = net(input_ids.cuda(), attention_mask.cuda())
            per_batch.append(batch_feat.detach().cpu().numpy())

    del net
    text_embeddings = np.concatenate(per_batch)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del per_batch
    gc.collect()
    return text_embeddings

In [None]:
def get_text_predictions(df, max_features = 25_000):
    """Match rows by TF-IDF similarity of their titles.

    NOTE(review): the vectoriser is fitted on the module-level ``df_cu``,
    not on the ``df`` argument; ``df`` only supplies the row count and the
    ``posting_id`` values. Confirm this is intended.

    Returns:
        A list with one array of posting ids per row: those with similarity
        above 0.85, plus those in ``[0.82, 0.85)`` when only one id matched.
    """
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    text_embeddings = vectorizer.fit_transform(df_cu['title']).toarray()

    n_rows = len(df)
    chunk_size = 1024*4
    preds = []

    print('Finding similar titles...')
    # Process rows in chunks so only a slice of the similarity matrix exists at once.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk',start,'to',stop)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul( text_embeddings, text_embeddings[start:stop].T).T

        for offset in range(stop - start):
            sims = cts[offset,]
            strong = cupy.where(sims>0.85)[0]
            preds.append(df.iloc[cupy.asnumpy(strong)].posting_id.values)

            if len(preds[-1]) == 1:
                # Single match: widen the window slightly.
                weak = cupy.where((sims >= 0.82) & (sims < 0.85))[0]
                if len(weak) != 0:
                    extra = df.iloc[cupy.asnumpy(weak)].posting_id.values
                    preds[-1] = np.concatenate((preds[-1], extra))

    del vectorizer,text_embeddings
    gc.collect()
    return preds

In [None]:
def get_neighbours_cos_sim(df,embeddings,threshold = 0.85):
    '''
    Match rows by dot-product similarity of their embeddings.

    When using cos_sim use normalized features else use normal features

    Args:
        df: frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: one embedding per row; copied to the GPU with cupy.
        threshold: similarity strictly above this value counts as a match.

    Returns:
        A list with one array of posting ids per row. When only one id
        matched, ids with similarity in ``[threshold - 0.03, threshold)``
        are appended as a fallback.
    '''
    embeddings = cupy.array(embeddings)

    preds = []
    CHUNK = 1024*4

    print('Finding similar texts...for threshold :',threshold)
    CTS = len(embeddings)//CHUNK
    if len(embeddings)%CHUNK!=0:
        CTS += 1

    for j in range( CTS ):
        a = j*CHUNK
        b = (j+1)*CHUNK
        b = min(b,len(embeddings))
        print('chunk',a,'to',b)

        # Similarity of every row in the chunk against all rows.
        cts = cupy.matmul(embeddings,embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

            if len(preds[-1]) == 1:
                IDX = cupy.where((cts[k,] >= threshold-0.03) & (cts[k,] < threshold))[0]
                if len(IDX) == 0:
                    continue
                o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                preds[-1] = np.concatenate((preds[-1], o))

    # The return belongs inside the function body; at column 0 it is a
    # syntax error and the function would have no return value.
    return preds

In [None]:
def filter_by_sim(df, image_embeddings):
    """Prune text matches that do not look like the row's image matches.

    For rows where the text predictions outnumber the image predictions by
    more than one, each text-predicted item is kept only if its mean
    dot-product with the embeddings of the image-predicted items exceeds
    0.25. All other rows keep their text predictions unchanged.

    Args:
        df: frame with ``posting_id``, ``text_predictions`` and
            ``image_predictions`` columns, row-aligned with
            ``image_embeddings``.
        image_embeddings: array indexed by row position.

    Returns:
        A list with one collection of posting ids per row.
    """
    posting_ids = df['posting_id'].values
    text_col = df['text_predictions'].values
    image_col = df['image_predictions'].values

    # Build the id -> row-position lookup once instead of scanning the whole
    # frame for every predicted id. Positions (not index labels) are stored
    # because they index `image_embeddings`; setdefault keeps the first
    # occurrence, matching the previous `.index[0]` on a default index.
    position_of = {}
    for position, posting_id in enumerate(posting_ids):
        position_of.setdefault(posting_id, position)

    predictions = []
    for i in tqdm(range(df.shape[0])):
        text_preds = text_col[i]
        image_preds = image_col[i]
        if len(text_preds) - len(image_preds) > 1:
            text_idxs = [position_of[pid] for pid in text_preds]
            img_idxs = [position_of[pid] for pid in image_preds]

            # Mean similarity of each text candidate to the image matches.
            sim_scores = np.sum(np.dot(image_embeddings[img_idxs],image_embeddings[text_idxs].T),axis=0)/len(img_idxs)
            filt_ind = np.where(sim_scores > 0.25)[0]
            new_idxs = np.asarray(text_idxs)[filt_ind]

            predictions.append(posting_ids[new_idxs])
        else:
            predictions.append(text_preds)

    return predictions

In [None]:
def make_dictionaries():
    """Load the Indonesian<->English word lists into two lookup dicts.

    Each usable line is sliced with ``line[2:-3]`` and split on ``"', '"``;
    the first two pieces are taken as the Indonesian and English word.
    Lines containing a double quote are printed and skipped. Later lines
    overwrite earlier ones that share a key.

    Returns:
        (indo_to_eng_dict, eng_to_indo_dict), both with lower-cased keys
        and values.
    """
    indo_to_eng_dict, eng_to_indo_dict = {}, {}
    with open('../input/offline-translator-indonesean-to-english-reverse/indonesean_english_dict.txt') as f:
        for line in tqdm(f.readlines()):
            if '"' in line:
                print("ignoring: ", line) # skip non-sense line
                continue
            pieces = line[2:-3].split("', '")
            indo_word, eng_word = pieces[0].lower(), pieces[1].lower()

            indo_to_eng_dict[indo_word] = eng_word
            eng_to_indo_dict[eng_word] = indo_word
    # The `with` block has already closed the file at this point.
    return indo_to_eng_dict, eng_to_indo_dict



def translate_indo_to_eng(text):
    """Lower-case ``text`` and replace each whitespace-separated word found
    in ``INDO_TO_ENG_DICT`` with its mapped value; other words are kept."""
    return " ".join(INDO_TO_ENG_DICT.get(word, word) for word in text.lower().split())


def translate_eng_to_indo(text):
    """Lower-case ``text`` and replace each whitespace-separated word found
    in ``ENG_TO_INDO_DICT`` with its mapped value; other words are kept."""
    return " ".join(ENG_TO_INDO_DICT.get(word, word) for word in text.lower().split())


# Module-level lookup tables read by translate_indo_to_eng / translate_eng_to_indo.
INDO_TO_ENG_DICT, ENG_TO_INDO_DICT = make_dictionaries()

In [None]:
# Load the test set with cleaned titles, its cuDF copy, and the image paths.
df,df_cu,image_paths = read_dataset_tfidf()
# NOTE(review): df_cu is built inside read_dataset_tfidf, i.e. before this
# translation, and get_text_predictions fits on df_cu — so the translated
# titles do not appear to reach the TF-IDF model. Confirm this is intended.
df["title"] = df["title"].apply(translate_indo_to_eng)
df.head()

In [None]:
# Separate, unmodified copy of the test set for the transformer text model.
df_bert = read_dataset_bert()
df_bert.head()

In [None]:
# Embed the images with two checkpoints and concatenate the results
# feature-wise (axis 1), giving one wider embedding per image.
image_embeddings_fold1 = get_image_embeddings(image_paths.values)
image_embeddings_fold2 = get_image_embeddings_fold2(image_paths.values)
image_embeddings = np.concatenate((image_embeddings_fold1, image_embeddings_fold2),1)
print(image_embeddings.shape)

# Transformer embeddings of the raw titles.
text_embeddings_bert = get_text_embeddings_bert(df_bert)

In [None]:
# One candidate list per row from each of the three branches:
# image KNN, TF-IDF title similarity, transformer title similarity.
image_predictions = get_image_predictions(df, image_embeddings, threshold = 0.3)
text_predictions = get_text_predictions(df, max_features = 25_000)
text_predictions_bert = get_neighbours_cos_sim(df_bert,text_embeddings_bert)

In [None]:
# filter_by_sim reads both columns from df, so they are attached first.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions

In [None]:
# Prune TF-IDF matches using image-embedding similarity.
filtered_predictions = filter_by_sim(df, image_embeddings)

In [None]:
# Replace the TF-IDF matches with their filtered version, add the transformer
# matches, merge all three lists per row, and write the submission file.
df['text_predictions'] = filtered_predictions
df['text_predictions_bert'] = text_predictions_bert
df['matches'] = df.apply(combine_predictions, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)