In [1]:
import argparse
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# --- Training configuration ------------------------------------------------
# Sizing and reproducibility knobs.
random_state = 42        # seed handed to train_test_split
n_dev_queries = 200      # how many queries are held out as the dev split
train_batch_size = 32    # batch size of the training DataLoader

# Which marketplace locale to train on, and where the dataset lives.
locale = "us"
dataset_path = "../shopping_queries_dataset/"

# Where the final model is saved, and the path given to model.fit as output_path.
model_save_path = "./models_" + locale
output_path = model_save_path + "_tmp"

In [3]:
""" 0. Init variables """
# Column names of the example/product frames referenced throughout the notebook.
col_query_id = "query_id"
col_query = "query"
col_product_id = "product_id"
col_product_title = "product_title"
col_product_locale = "product_locale"
col_esci_label = "esci_label"
col_small_version = "small_version"
col_split = "split"
col_gain = "gain"

# Numeric gain assigned to each esci_label value (stored in the col_gain column).
esci_label2gain = dict(E=1.0, S=0.1, C=0.01, I=0.0)

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Three-line banner announcing the selected device.
to_print = "-" * 40
print(to_print, f"---------> {device} is activated <----------", to_print, sep="\n")

----------------------------------------
---------> cuda is activated <----------
----------------------------------------


In [4]:
""" 1. Load data """    
# Read the query/product judgements and the product catalogue.
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))

# Attach the product columns to every example; the join key is (locale, product id).
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)

# Keep only small-version, train-split rows of the selected locale.
# One combined mask followed by an explicit copy yields an independent frame,
# so adding the gain column below never writes into a slice of the merged
# frame (which is what raises SettingWithCopyWarning).
train_mask = (
    (df_examples_products[col_small_version] == 1)
    & (df_examples_products[col_split] == "train")
    & (df_examples_products[col_product_locale] == locale)
)
df_examples_products = df_examples_products[train_mask].copy()

# Translate each ESCI label into its numeric gain (KeyError on an unknown label).
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(
    lambda esci_label: esci_label2gain[esci_label]
)

# Split at query level so that all products of one query land on the same side.
list_query_id = df_examples_products[col_query_id].unique()
# Fraction of queries held out; still computed for reference.
dev_size = n_dev_queries / len(list_query_id)
# Pass the absolute count rather than the fraction: with a float, scikit-learn
# takes ceil(fraction * n), and floating-point round-off in
# (n_dev_queries / n) * n can then yield n_dev_queries + 1 dev queries.
list_query_id_train, list_query_id_dev = train_test_split(
    list_query_id,
    test_size=int(n_dev_queries),
    random_state=random_state,
)

# Only these four columns are needed from here on.
df_examples_products = df_examples_products[[col_query_id, col_query, col_product_title, col_gain]]
df_train = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_train)]
df_dev = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_dev)]

In [5]:
""" 2. Prepare data loaders """
# Training pairs: every (query, product title) pair carries its gain as a float label.
# The three columns are walked in parallel instead of using DataFrame.iterrows(),
# which builds a Series object for every single row.
train_samples = [
    InputExample(texts=[query, title], label=float(gain))
    for query, title, gain in zip(
        df_train[col_query], df_train[col_product_title], df_train[col_gain]
    )
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

if locale == "us":
    # Dev data for the reranking evaluator: one entry per distinct query text,
    # with its product titles split by whether the gain is greater than zero.
    dev_samples = {}
    query2id = {}
    for query, title, gain in zip(df_dev[col_query], df_dev[col_product_title], df_dev[col_gain]):
        # The first occurrence of a query text allocates the next integer id.
        qid = query2id.setdefault(query, len(query2id))
        if qid not in dev_samples:
            dev_samples[qid] = {'query': query, 'positive': set(), 'negative': set()}
        bucket = 'positive' if gain > 0 else 'negative'
        dev_samples[qid][bucket].add(title)
    evaluator = CERerankingEvaluator(dev_samples, name='train-eval')

    # 3. Prepare Cross-encoder model:
    #    https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_kd.py
    model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
    num_epochs = 1
    num_labels = 1  # single output, trained as a regression against the gain
    max_length = 512
    # Identity activation: the raw model output is used as the score.
    default_activation_function = torch.nn.Identity()
    model = CrossEncoder(
        model_name,
        num_labels=num_labels,
        max_length=max_length,
        default_activation_function=default_activation_function,
        device=device
    )
    loss_fct = torch.nn.MSELoss()
    evaluation_steps = 5000
    warmup_steps = 5000
    lr = 7e-6

    # 4. Train Cross-encoder model
    model.fit(
        train_dataloader=train_dataloader,
        loss_fct=loss_fct,
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=evaluation_steps,
        warmup_steps=warmup_steps,
        output_path=output_path,
        optimizer_params={'lr': lr},
    )
    model.save(model_save_path)
else:
    # Other locales: evaluate embedding similarity between query and title
    # against the gain of each dev pair.
    dev_queries = df_dev[col_query].to_list()
    dev_titles = df_dev[col_product_title].to_list()
    dev_scores = df_dev[col_gain].to_list()
    evaluator = evaluation.EmbeddingSimilarityEvaluator(dev_queries, dev_titles, dev_scores)

    # 3. Prepare sentence transformers model:
    #    https://www.sbert.net/docs/training/overview.html
    model_name = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
    model = SentenceTransformer(model_name)
    train_loss = losses.CosineSimilarityLoss(model=model)
    num_epochs = 1
    evaluation_steps = 1000

    # 4. Train Sentence transformer model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=evaluation_steps,
        output_path=output_path,
    )
    model.save(model_save_path)

  return self.fget.__get__(instance, owner)()


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12989 [00:00<?, ?it/s]

# Test

In [6]:
import argparse
import numpy as np
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from tqdm import tqdm

In [7]:
# --- Inference configuration -----------------------------------------------
locale = "us"
dataset_path = "../shopping_queries_dataset/"
model_path = model_save_path  # load from the directory the training section saved to
batch_size = 256              # number of (query, title) pairs scored per step
hypothesis_path_file = "./hypothesis/hypothesis_" + locale + ".csv"

# Column names used by the inference section.
col_query_id = "query_id"
col_query = "query"
col_product_id = "product_id"
col_product_title = "product_title"
col_product_locale = "product_locale"
col_small_version = "small_version"
col_split = "split"
col_scores = "scores"

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
""" 1. Load data """    
# Read the query/product judgements and the product catalogue.
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))

# Attach the product columns to every example; the join key is (locale, product id).
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)

# Keep only small-version, test-split rows of the selected locale. The explicit
# copy makes the column assignment below safe (no write into a slice).
test_mask = (
    (df_examples_products[col_small_version] == 1)
    & (df_examples_products[col_split] == "test")
    & (df_examples_products[col_product_locale] == locale)
)
df_examples_products = df_examples_products[test_mask].copy()
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(
    lambda esci_label: esci_label2gain[esci_label]
)

# Parallel lists of query / product-title texts, plus the score buffer that
# the batched loops below fill in place.
features_query = df_examples_products[col_query].to_list()
features_product = df_examples_products[col_product_title].to_list()
n_examples = len(features_query)
scores = np.zeros(n_examples)

if locale == "us":
    # 2. Prepare Cross-encoder model
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 3. Generate hypothesis
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            features = tokenizer(
                features_query[i:j],
                features_product[i:j],
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            # Flatten the logits to one score per pair before storing them.
            scores[i:j] = model(**features).logits.reshape(-1).cpu().numpy()
else:
    # 2. Prepare Sentence transformer model
    model = AutoModel.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def cls_pooling(model_output):
        """Return the hidden state of the first token of every sequence."""
        return model_output.last_hidden_state[:, 0]

    def encode(texts):
        """Tokenize `texts`, run the model without gradients, and CLS-pool the output."""
        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input, return_dict=True)
        return cls_pooling(model_output)

    model.eval()

    # 3. Generate hypothesis
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            query_emb = encode(features_query[i:j])
            product_emb = encode(features_product[i:j])
            # Dot product of each query with its own product. This equals the
            # diagonal of query_emb @ product_emb.T without building the full
            # batch x batch matrix (values may differ by float round-off).
            scores[i:j] = (query_emb * product_emb).sum(dim=1).cpu().numpy()

# 4. Prepare hypothesis file
df_hypothesis = pd.DataFrame({
    col_query_id: df_examples_products[col_query_id].to_list(),
    col_product_id: df_examples_products[col_product_id].to_list(),
    col_esci_label: df_examples_products[col_esci_label].to_list(),
    col_scores: scores,
})
# Order by query id, then by score, both descending.
df_hypothesis = df_hypothesis.sort_values(by=[col_query_id, col_scores], ascending=False)

100%|██████████| 710/710 [03:04<00:00,  3.86it/s]


In [12]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score
from collections import OrderedDict
from pprint import pprint


def calculate_metrics(df, col, hit_at_n=(1, 5, 10), pure_python=False, missing_rank=1000):
    """Calculate MRR and Hits@n for one score column.

    Rows are ranked within each ``query_id`` by ``col`` (descending, ties get
    the smallest rank). For every query, the rank of its best-ranked row with
    ``gain == 1.0`` is taken; queries without such a row get ``missing_rank``.

    By default the metrics come from AmpliGraph (based on TensorFlow):
    https://docs.ampligraph.org/en/latest/index.html
    Set ``pure_python=True`` to compute them with NumPy only.

    Args:
        df: DataFrame with ``query_id`` and ``gain`` columns plus ``col``.
            Side effect: a ``rank`` column is written onto ``df``.
        col: Name of the score column to rank by.
        hit_at_n: Iterable of cut-offs ``n`` for the Hits@n metrics.
        pure_python: If True, do not use AmpliGraph.
        missing_rank: Rank substituted for queries that have no row with
            ``gain == 1.0``.

    Returns:
        OrderedDict with key ``"MRR"`` followed by one ``"Hits@{n}"`` key per
        cut-off, every value rounded to 4 decimals.
    """
    result = OrderedDict()
    df.loc[:, 'rank'] = df.groupby('query_id')[col].rank(method='min', ascending=False).values
    # Best (smallest) rank among the gain == 1.0 rows of each query; NaN when
    # a query has none.
    first_hit_rank_position = df.groupby('query_id')[['gain', 'rank']] \
        .apply(lambda x: x[x.gain == 1.0]['rank'].min()).values

    first_hit_rank_position = np.nan_to_num(first_hit_rank_position, nan=missing_rank)

    if not pure_python:
        # Imported here so this function needs AmpliGraph only when the
        # AmpliGraph path is actually requested.
        from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score

        result["MRR"] = mrr_score(first_hit_rank_position).round(4)
        for h in hit_at_n:
            result[f"Hits@{h}"] = hits_at_n_score(first_hit_rank_position, n=h).round(4)
    else:
        n_queries = first_hit_rank_position.shape[0]
        # Mean of the reciprocal first-hit ranks.
        result["MRR"] = np.divide(np.divide(1, first_hit_rank_position).sum(),
                                  n_queries).round(4)
        for h in hit_at_n:
            # Share of queries whose first hit is ranked within the top h.
            result[f"Hits@{h}"] = np.divide((first_hit_rank_position <= h).sum(),
                                            n_queries).round(4)
    return result

# Score columns of df_hypothesis to evaluate; each gets its own metrics dict.
target_cols = ['scores']

# calculate_metrics reads a gain column, so derive it from the ESCI labels first.
df_hypothesis[col_gain] = df_hypothesis[col_esci_label].map(
    lambda label: esci_label2gain[label]
)

print("--> MRR for baseline models: \n")
for col in target_cols:
    metrics_for_col = calculate_metrics(df_hypothesis, col)
    pprint({col: metrics_for_col})
    

--> MRR for baseline models: 

{'scores': OrderedDict([('MRR', 0.8204),
                        ('Hits@1', 0.7211),
                        ('Hits@5', 0.9447),
                        ('Hits@10', 0.9834)])}
