In [None]:
!pip install transformers datasets

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import pandas as pd
import os
import re
import csv
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import AutoTokenizer,create_optimizer,TFAutoModel

In [None]:
# Number of examples per batch; passed as batch_size when the tf.data dataset is built.
BATCH_SIZE=128
# Fixed token length: the tokenizer pads/truncates every query and product title to this size.
MAX_LENGTH=64

# Data Preparation

## Downloading

In [None]:
!pip install aicrowd-cli
!aicrowd login

In [None]:
!aicrowd dataset download -c esci-challenge-for-improving-product-search

In [None]:
!unzip "/content/train-v0.3.csv.zip" -d "/content/dataset/"

In [None]:
!unzip "/content/product_catalogue-v0.3.csv.zip" -d "/content/dataset/"

In [None]:
!cp /content/dataset/data/processed/public/task_3_product_substitute_identification/train-v0.3.csv /content/drive/MyDrive/datasets/kdd_cup

In [None]:
!cp /content/dataset/data/processed/public/task_3_product_substitute_identification/product_catalogue-v0.3.csv /content/drive/MyDrive/datasets/kdd_cup

## Loading

In [None]:
# NOTE(review): the download cells copy `train-v0.3.csv` into this Drive folder, but this path
# reads `train-v0.3_1.csv` — confirm that file exists (or correct the name) before running.
filepath_train='/content/drive/MyDrive/datasets/kdd_cup/train-v0.3_1.csv'

In [None]:
filepath_catalogue='/content/drive/MyDrive/datasets/kdd_cup/product_catalogue-v0.3.csv'

In [None]:
df_catalogue=pd.read_csv(filepath_catalogue)

In [None]:
df_train=pd.read_csv(filepath_train)

In [None]:
df_train.loc[df_train['query_locale'] == 'jp']

In [None]:
df_catalogue

# Dataset Generation

In [None]:
def get_product_title(product_id, catalogue=None):
  """Return the title of the first catalogue row whose product_id matches.

  Args:
    product_id: Product identifier; converted to ``str`` before matching.
    catalogue: DataFrame with ``product_id`` and ``product_title`` columns.
      Defaults to the notebook-level ``df_catalogue``.

  Returns:
    The ``product_title`` value of the first matching row (this can be NaN
    when the catalogue stores no title for that row).

  Raises:
    ValueError: If no catalogue row has the given product_id.
  """
  if catalogue is None:
    catalogue=df_catalogue
  matched_titles=catalogue.loc[catalogue['product_id']==str(product_id),'product_title']
  if matched_titles.empty:
    raise ValueError(f"product_id {product_id!r} not found in catalogue")
  # Positional access: no need to recover the row label from the Series' string repr.
  return matched_titles.iloc[0]

In [None]:
get_product_title("B08N6NCKRX")

In [None]:
header = ['query', 'product', 'label']

# Rows exported to this shard; capped at the frame length so a shorter df_train cannot
# run past its last row.
n_rows = min(781636, len(df_train))

# Build the product_id -> title lookup once instead of scanning the whole catalogue for
# every training row. keep='first' mirrors "first matching row wins".
title_by_id = (
    df_catalogue
    .drop_duplicates(subset='product_id', keep='first')
    .set_index('product_id')['product_title']
    .to_dict()
)

# newline='' is what the csv module expects, so embedded newlines/quoting round-trip correctly.
with open('/content/drive/MyDrive/datasets/kdd_cup/csv_file_9.csv', 'w', encoding='UTF8', newline='') as f:

  # create the csv writer
  writer = csv.writer(f)

  # write the header
  writer.writerow(header)

  subset = df_train.iloc[:n_rows]
  for query, product_id, label in zip(subset['query'], subset['product_id'], subset['esci_label']):
    # A product_id missing from the catalogue raises KeyError here rather than being skipped.
    writer.writerow([query, title_by_id[str(product_id)], label])


In [None]:
filepath="/content/drive/MyDrive/datasets/kdd_cup/csv_file_9.csv"
train_data=pd.read_csv(filepath)

In [None]:
train_data

## Preparation

In [None]:
# Paths of the ten CSV shards csv_file_0.csv ... csv_file_9.csv on Drive.
filepaths=[]
for shard_idx in range(10):
  filepaths.append(f"/content/drive/MyDrive/datasets/kdd_cup/csv_file_{shard_idx}.csv")
print(filepaths)

In [None]:
dataset = load_dataset('csv', data_files=filepaths)

In [None]:
dataset

In [None]:
dataset['train'][0]

In [None]:
def get_label(label):
  """Map an ESCI label string to its numeric relevance target.

  'exact' -> 1.0, 'substitute' -> 0.7, 'complement' -> 0.5; any other value
  yields 0.0.
  """
  graded_labels=(('exact',1.0),('substitute',0.7),('complement',0.5))
  for name,score in graded_labels:
    if label==name:
      return score
  return 0.0

In [None]:
model_id="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def preprocess(dataset):
  """Tokenize one example's query and product title and convert its label to a score.

  Args:
    dataset: A single example (dict-like) with 'query', 'product' and 'label'
      entries. It is updated in place.

  Returns:
    The same example with six added fields (input_ids / token_type_ids /
    attention_mask, each suffixed with _query or _product) and 'label'
    replaced by the value from get_label. Each token field is a one-element
    list wrapping the tokenizer output, which is the extra axis train_step
    removes with its [:,0,:] slice.
  """
  # Examples without a product title fall back to the query text.
  if dataset['product'] is None:
    dataset['product']=dataset['query']

  for field in ('query','product'):
    tokenized=tokenizer(dataset[field],max_length=MAX_LENGTH,padding='max_length',truncation=True)
    for key in ('input_ids','token_type_ids','attention_mask'):
      dataset[f'{key}_{field}']=[tokenized[key]]

  dataset['label']=get_label(dataset['label'])

  return dataset

In [None]:
prep_dataset=dataset.map(preprocess)

In [None]:
prep_dataset['train'][21:22]

In [None]:
tf_dataset = prep_dataset["train"].to_tf_dataset(
    columns=['input_ids_query', 'token_type_ids_query', 'attention_mask_query','input_ids_product', 'token_type_ids_product', 'attention_mask_product', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [None]:
tf_dataset

In [None]:
for i in tf_dataset.take(1):
  print(i)

# Model

In [None]:
model = TFAutoModel.from_pretrained(model_id)
model.summary()

In [None]:
class SentenceTransformer(tf.keras.Model):
  """Siamese sentence encoder with a single sigmoid scoring unit.

  The wrapped transformer embeds the query and the product title separately
  (mask-weighted mean of the token embeddings). The two embeddings and their
  absolute difference are concatenated and passed to a one-unit sigmoid layer.
  """

  def __init__(self,model):
    """Keep the transformer backbone and create the one-unit sigmoid head."""
    super(SentenceTransformer,self).__init__()
    self.model=model
    self.dense=Dense(1,activation='sigmoid')

  def compile(self,optimizer,loss_fn):
    """Store the optimizer and loss used by train_step and create the running loss metric."""
    super(SentenceTransformer,self).compile()
    self.optimizer=optimizer
    self.loss_fn=loss_fn
    self.loss_metric=tf.keras.metrics.Mean(name='loss')

  @property
  def metrics(self):
    """Metrics tracked by this model (only the running mean of the loss)."""
    return [self.loss_metric]

  def mean_pooling(self, model_output, attention_mask):
    """Average the token embeddings in model_output[0], weighting each position by attention_mask.

    The divisor is clipped at 1e-9 so an all-zero mask cannot divide by zero.
    """
    token_embeddings = model_output[0]

    input_mask_expanded = tf.cast(
        tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
        tf.float32
    )
    return tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1)/tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)

  def _embed(self,train_data,suffix,training=False):
    """Run the backbone on the '<field>_<suffix>' entries of a batch and mean-pool the output."""
    # Index 0 along axis 1 is selected because preprocessing wraps each sequence in a
    # one-element list.
    inputs={name:train_data[f'{name}_{suffix}'][:,0,:]
            for name in ('input_ids','token_type_ids','attention_mask')}
    outputs=self.model(inputs,training=training)
    return self.mean_pooling(outputs,inputs['attention_mask'])

  def train_step(self,train_data):
    """Run one optimisation step on a batch and return the running mean loss.

    Args:
      train_data: Mapping with the six tokenizer fields (suffixed _query and
        _product) and 'label'.

    Returns:
      Dict with the current value of the 'loss' metric.
    """
    # Give the labels an explicit trailing axis so they line up element-wise with the
    # (batch, 1) predictions whatever loss callable is used.
    labels=tf.reshape(tf.cast(train_data['label'],tf.float32),(-1,1))

    with tf.GradientTape() as recorder:
      # training=True so the backbone runs in training mode (e.g. dropout active).
      pred_query=self._embed(train_data,'query',training=True)
      pred_product=self._embed(train_data,'product',training=True)

      pred_concat=tf.concat([pred_query,pred_product,tf.abs(pred_query-pred_product)],axis=-1)

      predictions=self.dense(pred_concat)
      loss=self.loss_fn(labels,predictions)

    # Differentiate w.r.t. every trainable variable of this model: the backbone AND the
    # dense head. Using only self.model.trainable_weights left the head at its random
    # initial values for the whole training run.
    trainable=self.trainable_variables
    partial_derivatives = recorder.gradient(loss,trainable)
    self.optimizer.apply_gradients(zip(partial_derivatives, trainable))

    self.loss_metric.update_state(loss)

    return {'loss':self.loss_metric.result(),}

# Training

In [None]:
stransformer=SentenceTransformer(model)
stransformer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5,),
    loss_fn=tf.keras.losses.BinaryCrossentropy(),
)

In [None]:
EPOCHS=5
history=stransformer.fit(tf_dataset,epochs=EPOCHS,)

In [None]:
model_path='/content/drive/MyDrive/stransformer/stransformers.h5'

In [None]:
#stransformer.model.save_weights(model_path)

In [None]:
model.load_weights(model_path)

In [None]:
def mean_pooling(model_output, attention_mask):
  """Mask-weighted mean of the token embeddings found in model_output[0].

  The attention mask is broadcast over the embedding axis, used to zero out
  masked positions, and the sum over axis 1 is divided by the mask count
  (clipped to at least 1e-9).
  """
  hidden_states = model_output[0]
  mask = tf.expand_dims(attention_mask, -1)
  mask = tf.broadcast_to(mask, tf.shape(hidden_states))
  mask = tf.cast(mask, tf.float32)

  summed = tf.math.reduce_sum(hidden_states * mask, axis=1)
  counts = tf.clip_by_value(tf.math.reduce_sum(mask, axis=1), 1e-9, tf.float32.max)
  return summed/counts

# Testing

In [None]:
filepath_catalogue='/content/drive/MyDrive/datasets/kdd_cup/product_catalogue-v0.3.csv'
df_catalogue=pd.read_csv(filepath_catalogue)

In [None]:
df_catalogue['product_title'][1000000]

In [None]:
# Every catalogue title as a plain string, in row order. Iterating the column directly avoids
# one label lookup per row; with the default RangeIndex from read_csv the result is the same.
product_titles=[str(title) for title in df_catalogue['product_title']]

In [None]:
print(len(product_titles))

In [None]:
embeddings=[]

In [None]:
INFERENCE_BATCH_SIZE=640
len(product_titles)//INFERENCE_BATCH_SIZE

In [None]:
# Embed the catalogue titles batch by batch and append one pooled tensor per batch to `embeddings`.
# NOTE(review): floor division means the last len(product_titles) % INFERENCE_BATCH_SIZE titles
# are never embedded. The later reshape of the saved embeddings appears to rely on every batch
# having the same size — confirm before changing this to include the remainder.
# NOTE(review): `embeddings` is created in a separate cell, so re-running this cell alone
# appends a second copy of every batch.
for i in range(len(product_titles)//INFERENCE_BATCH_SIZE):
  tokenized_output=tokenizer(
      product_titles[INFERENCE_BATCH_SIZE*i:INFERENCE_BATCH_SIZE*(i+1)],max_length=MAX_LENGTH,padding='max_length',truncation=True,return_tensors="tf")
  model_output=model(tokenized_output)
  embedding=mean_pooling(model_output,tokenized_output['attention_mask'])
  embeddings.append(embedding)
  # Print the batch index every 100 batches as a progress marker.
  if i%100==0:
    print(i)

In [None]:
embeddings

In [None]:
#np.savez_compressed('embeddings.npz', embeddings)
#np.savez_compressed('product_titles.npz',product_titles)

In [None]:
!cp '/content/embeddings.npz' '/content/drive/MyDrive/stransformer/'
!cp '/content/product_titles.npz' '/content/drive/MyDrive/stransformer/'

# Using Embeddings

In [None]:
loaded_embedding=np.load('/content/drive/MyDrive/stransformer/embeddings.npz')
loaded_embedding_array=np.array(loaded_embedding['arr_0'])

In [None]:
loaded_titles=np.load('/content/drive/MyDrive/stransformer/product_titles.npz')
loaded_titles_array=np.array(loaded_titles['arr_0'])

In [None]:
loaded_embedding_array.shape

In [None]:
# Collapse all leading axes so each row is one embedding vector. Using shape[-1] instead of
# shape[2] gives the same result on the 3-D array loaded above and keeps the cell idempotent:
# re-running it on the already-flattened 2-D array is a no-op rather than an IndexError.
loaded_embedding_array=loaded_embedding_array.reshape(-1,loaded_embedding_array.shape[-1])
print(loaded_embedding_array.shape)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
inputs = tokenizer(["針なしほっちきす"],max_length=MAX_LENGTH,padding='max_length',truncation=True,return_tensors="tf")

logits = model(**inputs)
out_embedding=mean_pooling(logits,inputs['attention_mask'])
print(out_embedding.shape)

In [None]:
u_dot_v=np.matmul(loaded_embedding_array,(np.array(out_embedding).T))
print(u_dot_v.shape)

In [None]:
u_magnitude=np.sqrt(np.sum(loaded_embedding_array*loaded_embedding_array,axis=-1))
print(u_magnitude.shape)
print(u_magnitude)

In [None]:
v_magnitude=np.sqrt(np.sum(out_embedding*out_embedding,axis=-1))
print(v_magnitude.shape)
print(v_magnitude)

In [None]:
cosine_similarity=u_dot_v.T/(u_magnitude*v_magnitude)
print(cosine_similarity)

In [None]:
sorted_indices=np.argsort(cosine_similarity,axis=-1)
print(sorted_indices)

In [None]:
for i in range(25):
  print(i,loaded_titles_array[sorted_indices[:,len(sorted_indices[0])-i-1]])

In [None]:
# Keep only the examples whose query starts with "A".
small_dataset = dataset.filter(lambda example: example["query"].startswith("A"))
# `dataset` is indexed by split name elsewhere in this notebook (dataset['train']), so len()
# on it counts splits; report the number of remaining rows in the 'train' split instead.
print(len(small_dataset["train"]))