In [1]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from tqdm import tqdm

import os
import pandas as pd
import numpy as np
import torch

import faiss

In [2]:
dataset_path = '../shopping_queries_dataset/'
locale ="us"
model_save_path = f"./models_{locale}"
output_path = f"{model_save_path}_training"
random_state = 42
n_dev_queries = 200
train_batch_size = 32
train = False

In [3]:
""" 0. Init variables """
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_esci_label = "esci_label" 
col_small_version = "small_version"
col_split = "split"
col_gain = 'gain'
col_features = [col_product_id]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
to_print = "".join(['-']*40)
print(to_print)
print(f"---------> {device} is activated <----------")
print(to_print)
esci_label2gain = {
    'E' : 1.0,
    'S' : 0.1,
    'C' : 0.01,
    'I' : 0.0,
}

----------------------------------------
---------> cuda is activated <----------
----------------------------------------


In [4]:
""" 1. Load data """    
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)
df_examples_products = df_examples_products[df_examples_products[col_small_version] == 1]
df_examples_products = df_examples_products[df_examples_products[col_product_locale] == locale]
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(lambda esci_label: esci_label2gain[esci_label])

df_train = df_examples_products[[col_query_id, col_query, *col_features, col_gain]][df_examples_products[col_split] == "train"]
list_query_id = df_train[col_query_id].unique()
dev_size = n_dev_queries / len(list_query_id)
list_query_id_train, list_query_id_dev = train_test_split(list_query_id, test_size=dev_size, random_state=random_state)

df_train = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_train)]
df_dev = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_dev)]
df_test = df_examples_products[df_examples_products[col_split] == "test"]

# This part of the code is for indexing and it is assumed the only input feature is product_title.
# Otherwise it shoudl be updated asccordingly
id_features_product_test = df_test[[col_product_id, col_product_title]].drop_duplicates(subset=col_product_title)

features_product_test = id_features_product_test[col_product_title].to_list()
id_product_test = id_features_product_test[col_product_id].to_list()

# Inferencing Pretrained Models

In [5]:
def retrieval_inference(model_path, text=None, batch_scoring=False, query_result_pair=None, batch_size=256):
    """ Embeddings for the trained bi-encoder models """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModel.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # CLS Pooling - Take output from first token
    def cls_pooling(model_output):
        return model_output.last_hidden_state[:,0]
    # Encode text
    def encode(texts):
        # Tokenize sentences
        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input, return_dict=True)
        # Perform pooling
        embeddings = cls_pooling(model_output)
        return embeddings
    model.eval()
    if not batch_scoring:
        return encode(text)
    features_query, features_product = query_result_pair
    n_examples = len(features_query)
    scores = np.zeros(n_examples)
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            features_query_ = features_query[i:j]
            features_product_ = features_product[i:j]
            query_emb = encode(features_query_)
            product_emb = encode(features_product_)
            scores[i:j] = torch.diagonal(torch.mm(query_emb, product_emb.transpose(0, 1)).to('cpu'))
            i = j
    return scores

In [11]:
def reranking_inference(model_path, features_query, features_product, batch_size=256):
    """ Scoring for the trained cross-encoder models """
    n_examples = len(features_query)
    scores = np.zeros(n_examples)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()
    
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            features_query_ = features_query[i:j]
            features_product_ = features_product[i:j]
            features = tokenizer(features_query_, features_product_, 
                                 padding=True, truncation=True, return_tensors="pt").to(device)
            scores[i:j] = np.squeeze(model(**features).logits.cpu().detach().numpy())
            i = j
    return scores


In [12]:
features_query = df_test[col_query]
features_products = df_test[col_product_title]

retrieval_model_paths = [
    'sentence-transformers/multi-qa-mpnet-base-dot-v1', # specific for semantic search
    'sentence-transformers/all-mpnet-base-v2' # general purpose model
]
ranking_model_paths = [
    'cross-encoder/ms-marco-MiniLM-L-12-v2',
    # './models_us_training_reranking_cross-encoder/stsb-roberta-large'
]

for retrieval_model_path in retrieval_model_paths:
    scores = retrieval_inference(retrieval_model_path, batch_scoring=True, 
                                 query_result_pair=(features_query.to_list(), 
                                                    features_products.to_list()))
    df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
    
for ranking_model_path in ranking_model_paths:     
    scores = reranking_inference(ranking_model_path, 
                                 features_query.to_list(), 
                                 features_products.to_list())
    df_test.loc[:, f"ranking_{ranking_model_path.split('/')[-1]}"] = scores.copy()

100%|██████████| 710/710 [09:46<00:00,  1.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
100%|██████████| 710/710 [09:46<00:00,  1.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
  return self.fget.__get__(instance, owner)()
100%|██████████| 710/710 [03:03<00:00,  3.88it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] =

In [13]:
df_test.to_csv("./scores_from_pretrained_models.csv")

In [14]:
df_test[['retrieval_multi-qa-mpnet-base-dot-v1',
         'retrieval_all-mpnet-base-v2',
         'ranking_ms-marco-MiniLM-L-12-v2']]

Unnamed: 0,retrieval_multi-qa-mpnet-base-dot-v1,retrieval_all-mpnet-base-v2,ranking_ms-marco-MiniLM-L-12-v2
32,19.696350,2.835467,-11.331335
33,22.133604,3.423887,-11.169686
34,21.063263,3.756823,-10.976269
35,21.452841,4.178871,-9.424541
36,21.738901,4.446245,-9.285576
...,...,...,...
2614589,12.671896,1.713727,-11.321844
2614590,9.298196,1.120765,-11.325283
2614591,10.096718,1.734619,-11.307865
2614592,12.950536,1.788636,-11.316772


## Evaluating Results and Model Selection

In [18]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score
from collections import OrderedDict
from pprint import pprint


def calculate_metrics(df, col, hit_at_n=[1, 5, 10], pure_python=False):
    """ Calculatye Metrics: MRR and Hits@n
    It uses Ampligraph based on Tensorflow: https://docs.ampligraph.org/en/latest/index.html
    If you prefer to do calculation based on pure Python, set pure_python=True
    """
    result = OrderedDict()
    df.loc[:, 'rank'] = df.groupby('query_id')[col].rank(method='min', ascending=False).values
    first_hit_rank_position = df.groupby('query_id')[['gain', 'rank']] \
        .apply(lambda x: x[x.gain == 1.0]['rank'].min()).values
    
    first_hit_rank_position = np.nan_to_num(first_hit_rank_position, nan=1000)
    
    if not pure_python:
        result["MRR"] = mrr_score(first_hit_rank_position).round(4)
        for h in hit_at_n:
            result[f"Hits@{h}"] = hits_at_n_score(first_hit_rank_position, n=h).round(4)
        
    else:
        n_queries = first_hit_rank_position.shape[0]
        result["MRR"] = np.divide(np.divide(1, first_hit_rank_position).sum(), 
                                  n_queries).round(4)
        for h in hit_at_n:
            result[f"Hits@{h}"] = np.divide((first_hit_rank_position <= h).sum(),
                                            n_queries).round(4)
    return result

target_cols = ['retrieval_multi-qa-mpnet-base-dot-v1',
               'retrieval_all-mpnet-base-v2',
               'ranking_ms-marco-MiniLM-L-12-v2']
metrics = OrderedDict()
print(f"--> MRR for trained models: \n")
for col in target_cols:    
    pprint({col: calculate_metrics(df_test, col)})
    metrics[col] = calculate_metrics(df_test, col).copy()

--> MRR for trained models: 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'rank'] = df.groupby('query_id')[col].rank(method='min', ascending=False).values


{'retrieval_multi-qa-mpnet-base-dot-v1': OrderedDict([('MRR', 0.7767),
                                                      ('Hits@1', 0.6556),
                                                      ('Hits@5', 0.932),
                                                      ('Hits@10', 0.9792)])}
{'retrieval_all-mpnet-base-v2': OrderedDict([('MRR', 0.7648),
                                             ('Hits@1', 0.6426),
                                             ('Hits@5', 0.9252),
                                             ('Hits@10', 0.979)])}
{'ranking_ms-marco-MiniLM-L-12-v2': OrderedDict([('MRR', 0.7898),
                                                 ('Hits@1', 0.6738),
                                                 ('Hits@5', 0.9358),
                                                 ('Hits@10', 0.9774)])}


# Indexing with FAISS 

Note: Just indexing test set to save time and for evaluation purposes

In [19]:
model_path = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'

def indexing_faiss(list_to_index, model_path, index_file_name, batch_size=256):
    embedding_size = 768
    n_examples = len(list_to_index)
    index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_size))
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            list_to_index_ = list_to_index[i:j]
            index.add_with_ids(
                retrieval_inference(model_path=model_path, text=list_to_index_) \
                    .to('cpu').numpy().astype('float32'), 
                np.array(range(i, j))
            )
    assert index.ntotal == n_examples, "Not all the inputs are indexed"
    faiss.write_index(index, index_file_name)

def global_index_file_name(model_path, locale):
    if locale: return f"./{locale}_{model_path.split('/')[-1]}.index"
    return f"./{model_path.split('/')[-1]}.index"

index_file_name = f"./pretrained_multi-qa-mpnet-base-dot-v1.index"
if not os.path.isfile(index_file_name):
    indexing_faiss(list_to_index=features_product_test, 
                   model_path=model_path,
                   index_file_name=index_file_name, 
                   batch_size=256
                  )
else:
    print(f"The index file exist {index_file_name}")

100%|██████████| 639/639 [18:47<00:00,  1.76s/it]


## Inference Retrieval-Indexing Results

In [41]:
import time 
from pprint import pprint

model_path = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'

def fetch_id_product(indices):
    return [{"Product ID": id_product_test[i], "Product Title": features_product_test[i]} for i in indices]

def retriev(query, top_k=5, locale="us", model_path=model_path):
    index = faiss.read_index(f"pretrained_{model_path.split('/')[-1]}.index")
    tick = time.time()
    query_vector = retrieval_inference(model_path, query).to('cpu').numpy().astype('float32')
    top_k = index.search(query_vector, top_k)
    print(f"Results in Total Time: {time.time() - tick}")
    top_k_ids = top_k[1].tolist()[0]
    return fetch_id_product(top_k_ids)

query = df_test[col_query].iloc[0]
pprint({"query": query, "retrieval results": retriev(query)})



Results in Total Time: 1.0279555320739746
{'query': '!qscreen fence without holes',
 'retrieval results': [{'Product ID': 'B01N1P9MYW', 'Product Title': 'Fences'},
                       {'Product ID': 'B07R6P8TK8',
                        'Product Title': "Amgo 4' x 50' Black Fence Privacy "
                                         'Screen Windscreen,with Bindings & '
                                         'Grommets, Heavy Duty for Commercial '
                                         'and Residential, 90% Blockage, Cable '
                                         'Zip Ties Included, (Available for '
                                         'Custom Sizes)'},
                       {'Product ID': 'B07XCGC4ZM',
                        'Product Title': 'Good Fences'},
                       {'Product ID': 'B00ZBE9IMQ',
                        'Product Title': 'The Fence'},
                       {'Product ID': 'B07R3TNQDM',
                        'Product Title': "Amgo 6' x 50' Black 

## End-to-End System Performance 

In [42]:
import time 
from pprint import pprint

# Selected Retrieval Model
model_path = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
n_batches=10
batch_size=10
top_k=30

def fetch_id_product(row, indices):
    return [{col_query_id: row[col_query_id],
             col_query: row[col_query],
             col_product_id: id_product_test[i], 
             col_product_title: features_product_test[i]} for i in indices]

def retriev(row, top_k=5, locale="us", model_path=model_path):
    index = faiss.read_index(f"pretrained_{model_path.split('/')[-1]}.index")
    query_vector = retrieval_inference(model_path, row[col_query]).to('cpu').numpy().astype('float32')
    top_k = index.search(query_vector, top_k)
    top_k_ids = top_k[1].tolist()[0]
    return fetch_id_product(row, top_k_ids)

def sampling_retrieval(model_path, df_queries, n_batches=n_batches, batch_size=batch_size, top_k=top_k):
    result = []
    for i in tqdm(range(n_batches)):
        features_queries = df_queries.sample(n=batch_size) # default replacement is False
        for (_, row) in features_queries.iterrows():
            result.append(retriev(row, top_k=top_k, locale="us", model_path=model_path))       
    return result

df_queries = df_test[[col_query_id, col_query, col_gain]].drop_duplicates()
result = sampling_retrieval(model_path, df_queries)

100%|██████████| 10/10 [02:41<00:00, 16.10s/it]


In [43]:
from itertools import chain

import random


def flatten_chain(matrix):
    return list(chain.from_iterable(matrix))

ranking_model_path = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
col = 'ranking_ms-marco-MiniLM-L-12-v2'
sample_size = 20 # less than 100
sample_result = []
n_iterations = 30

for i in range(n_iterations):
    df_ = pd.DataFrame(flatten_chain(random.sample(result, sample_size)))
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        scores = reranking_inference(ranking_model_path, 
                                     df_[col_query].to_list(), 
                                     df_[col_product_title].to_list())
    score_col = f"ranking_{ranking_model_path.split('/')[-1]}"
    df_.loc[:, score_col] = scores.copy()
    df_.loc[:, "rank"] = df_.groupby(col_query_id)[score_col].rank(method='min', ascending=False).values
    df_ = df_[df_['rank'] <= 10]
    df_rank = df_test[df_test[col_query_id].isin(df_[col_query_id].unique())][[col_query_id, col_product_id, col_gain]].merge(
        df_,
        how='left',
        on=[col_query_id, col_product_id]
    )
    df_rank.loc[:, 'rank'] = df_rank.groupby('query_id')[col]. \
    rank(method='min', ascending=False).values
    df_rank['ranking_ms-marco-MiniLM-L-12-v2'] = df_rank['ranking_ms-marco-MiniLM-L-12-v2'].fillna(0)
    sample_result.append(calculate_metrics(df_rank, col).copy())

100%|██████████| 3/3 [00:00<00:00,  5.82it/s]
100%|██████████| 3/3 [00:00<00:00,  4.86it/s]
100%|██████████| 3/3 [00:00<00:00,  5.67it/s]
100%|██████████| 3/3 [00:00<00:00,  5.81it/s]
100%|██████████| 3/3 [00:00<00:00,  4.78it/s]
100%|██████████| 3/3 [00:00<00:00,  5.16it/s]
100%|██████████| 3/3 [00:00<00:00,  4.66it/s]
100%|██████████| 3/3 [00:00<00:00,  5.43it/s]
100%|██████████| 3/3 [00:00<00:00,  5.42it/s]
100%|██████████| 3/3 [00:00<00:00,  5.52it/s]
100%|██████████| 3/3 [00:00<00:00,  4.66it/s]
100%|██████████| 3/3 [00:00<00:00,  4.28it/s]
100%|██████████| 3/3 [00:00<00:00,  5.06it/s]
100%|██████████| 3/3 [00:00<00:00,  4.87it/s]
100%|██████████| 3/3 [00:00<00:00,  5.19it/s]
100%|██████████| 3/3 [00:00<00:00,  5.42it/s]
100%|██████████| 3/3 [00:00<00:00,  4.89it/s]
100%|██████████| 3/3 [00:00<00:00,  5.05it/s]
100%|██████████| 3/3 [00:00<00:00,  5.82it/s]
100%|██████████| 3/3 [00:00<00:00,  4.86it/s]
100%|██████████| 3/3 [00:00<00:00,  5.69it/s]
100%|██████████| 3/3 [00:00<00:00,

In [44]:
pd.DataFrame(sample_result).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MRR,30.0,0.896953,0.05395,0.7542,0.865875,0.89585,0.927325,1.0
Hits@1,30.0,0.821667,0.092553,0.55,0.8,0.8,0.8875,1.0
Hits@5,30.0,0.988333,0.021509,0.95,1.0,1.0,1.0,1.0
Hits@10,30.0,0.988333,0.021509,0.95,1.0,1.0,1.0,1.0
