In [1]:
!pip install faiss-cpu

Looking in indexes: https://artifacts.dell.com/artifactory/api/pypi/python/simple, https://artifacts.dell.com/artifactory/api/pypi/ailfc-1003745-pypi-prd-local/simple, https://artifacts.dell.com/artifactory/api/pypi/aia-1001238-pypi-prd-local/simple, https://artifacts.dell.com/artifactory/api/pypi/aiops-1002685-pypi-prd-local/simple


In [2]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from tqdm import tqdm

import os
import pandas as pd
import numpy as np
import torch

import faiss

In [3]:
dataset_path = '../shopping_queries_dataset/'
locale ="us"
model_save_path = f"./models_{locale}"
output_path = f"{model_save_path}_training"
random_state = 42
n_dev_queries = 200
train_batch_size = 32
train = False

In [4]:
""" 0. Init variables """
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_esci_label = "esci_label" 
col_small_version = "small_version"
col_split = "split"
col_gain = 'gain'
col_features = [col_product_id]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
to_print = "".join(['-']*40)
print(to_print)
print(f"---------> {device} is activated <----------")
print(to_print)
esci_label2gain = {
    'E' : 1.0,
    'S' : 0.1,
    'C' : 0.01,
    'I' : 0.0,
}

----------------------------------------
---------> cuda is activated <----------
----------------------------------------


In [5]:
""" 1. Load data """    
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)
df_examples_products = df_examples_products[df_examples_products[col_small_version] == 1]
df_examples_products = df_examples_products[df_examples_products[col_product_locale] == locale]
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(lambda esci_label: esci_label2gain[esci_label])

df_train = df_examples_products[[col_query_id, col_query, *col_features, col_gain]][df_examples_products[col_split] == "train"]
list_query_id = df_train[col_query_id].unique()
dev_size = n_dev_queries / len(list_query_id)
list_query_id_train, list_query_id_dev = train_test_split(list_query_id, test_size=dev_size, random_state=random_state)

df_train = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_train)]
df_dev = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_dev)]
df_test = df_examples_products[df_examples_products[col_split] == "test"]

# This part of the code is for indexing and it is assumed the only input feature is product_title.
# Otherwise it shoudl be updated asccordingly
id_features_product_test = df_test[[col_product_id, col_product_title]].drop_duplicates(subset=col_product_title)

features_product_test = id_features_product_test[col_product_title].to_list()
id_product_test = id_features_product_test[col_product_id].to_list()

# Training Retrieval Bi-encoder Models and Indexing with FAISS

## Retrieval Training

In [6]:
if train:
    """ Prepare data loaders """
    train_samples = []
    for (_, row) in df_train.iterrows():
        train_samples.append(InputExample(texts=[row[col_query], row[col_product_title]], label=float(row[col_gain])))
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

    dev_queries = df_dev[col_query].to_list()
    dev_titles = df_dev[col_product_title].to_list()
    dev_scores = df_dev[col_gain].to_list()   
    evaluator = evaluation.EmbeddingSimilarityEvaluator(dev_queries, dev_titles, dev_scores)

    """ Prepare sentence transformers model: 
        https://www.sbert.net/docs/training/overview.html 
    """
    model_names = [
        'sentence-transformers/multi-qa-mpnet-base-dot-v1', # specific for semantic search
        'sentence-transformers/all-mpnet-base-v2' # general purpose model
    ]

    for model_name in model_names:
        model = SentenceTransformer(model_name)
        train_loss = losses.CosineSimilarityLoss(model=model)
        num_epochs = 1
        evaluation_steps = 1000
        """ 4. Train Sentence transformer model """
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=evaluation_steps,
            output_path=f"{output_path}_retrieval_{model_name}",
        )
        model.save(f"{model_save_path}_retrieval_{model_name}")

## Retrieval Inference

In [7]:
def retrieval_inference(model_path, text=None, batch_scoring=False, query_result_pair=None, batch_size=256):
    """ Embeddings for the trained bi-encoder models """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModel.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # CLS Pooling - Take output from first token
    def cls_pooling(model_output):
        return model_output.last_hidden_state[:,0]
    # Encode text
    def encode(texts):
        # Tokenize sentences
        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input, return_dict=True)
        # Perform pooling
        embeddings = cls_pooling(model_output)
        return embeddings
    model.eval()
    if not batch_scoring:
        return encode(text)
    features_query, features_product = query_result_pair
    n_examples = len(features_query)
    scores = np.zeros(n_examples)
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            features_query_ = features_query[i:j]
            features_product_ = features_product[i:j]
            query_emb = encode(features_query_)
            product_emb = encode(features_product_)
            scores[i:j] = torch.diagonal(torch.mm(query_emb, product_emb.transpose(0, 1)).to('cpu'))
            i = j
    return scores

In [8]:
model_path = './models_us_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1'
retrieval_inference(model_path, features_product_test[0:5])

tensor([[ 0.0909, -0.3888, -0.1402,  ..., -0.3611,  0.3270, -0.5371],
        [-0.1767,  0.0056, -0.2427,  ..., -0.3935,  0.0651, -0.4160],
        [-0.1198, -0.2018, -0.2021,  ..., -0.3449,  0.1465, -0.3667],
        [-0.3555, -0.5554, -0.2343,  ...,  0.0134,  0.1727, -0.4320],
        [-0.2690, -0.6249, -0.2034,  ..., -0.0024,  0.0924, -0.3410]],
       device='cuda:0')

In [9]:
model_path = './models_us_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1'
retrieval_inference(model_path, batch_scoring=True, 
                    query_result_pair=(df_test[col_query].to_list()[0:30],
                                       df_test[col_product_title].to_list()[0:30]))

100%|██████████| 1/1 [00:00<00:00,  7.72it/s]


array([14.44833279, 14.23288059, 15.44969654, 16.10173988, 15.30306435,
        4.43290424,  8.929492  , 15.40648079, 12.94305611, 15.44424057,
       15.81794357, 18.69197273,  7.63707542, 14.20569992, 12.38149071,
       12.53182888, 13.94957447,  1.99389601,  5.57536268, 18.19939423,
        6.5950985 ,  5.41637468, 10.56408119, 25.58846283, 26.34155083,
       17.76875305, 15.93656158,  5.68320227, 15.00357914,  6.81787443])

# Indexing with FAISS 

Note: Just indexing test set to save time and for evaluation purposes

In [10]:
def indexing_faiss(list_to_index, model_path, index_file_name, batch_size=256):
    embedding_size = 768
    n_examples = len(list_to_index)
    index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_size))
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            list_to_index_ = list_to_index[i:j]
            index.add_with_ids(
                retrieval_inference(model_path=model_path, text=list_to_index_) \
                    .to('cpu').numpy().astype('float32'), 
                np.array(range(i, j))
            )
    assert index.ntotal == n_examples, "Not all the inputs are indexed"
    faiss.write_index(index, index_file_name)

model_path = './models_us_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1'

def global_index_file_name(model_path, locale):
    if locale: return f"./{locale}_{model_path.split('/')[-1]}.index"
    return f"./{model_path.split('/')[-1]}.index"

index_file_name = f"./{locale}_{model_path.split('/')[-1]}.index"
if not os.path.isfile(index_file_name):
    indexing_faiss(list_to_index=features_product_test, 
                   model_path=model_path,
                   index_file_name=f"./{locale}_{model_path.split('/')[-1]}.index", 
                   batch_size=256
                  )
else:
    print(f"The index file exist {index_file_name}")

The index file exist ./us_multi-qa-mpnet-base-dot-v1.index


## Inference Retrieval-Indexing Results

In [11]:
import time 
from pprint import pprint

model_path = './models_us_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1'

def fetch_id_product(indices):
    return [{"Product ID": id_product_test[i], "Product Title": features_product_test[i]} for i in indices]

def retriev(query, top_k=5, locale="us", model_path=model_path):
    index = faiss.read_index(global_index_file_name(model_path, locale))
    tick = time.time()
    query_vector = retrieval_inference(model_path, query).to('cpu').numpy().astype('float32')
    top_k = index.search(query_vector, top_k)
    print(f"Results in Total Time: {time.time() - tick}")
    top_k_ids = top_k[1].tolist()[0]
    return fetch_id_product(top_k_ids)

query = df_test[col_query].iloc[0]
pprint({"query": query, "retrieval results": retriev(query)})

Results in Total Time: 1.3086068630218506
{'query': '!qscreen fence without holes',
 'retrieval results': [{'Product ID': '1933054395',
                        'Product Title': 'Qwirkle Board Game'},
                       {'Product ID': 'B018UP8Z26',
                        'Product Title': 'QMX Desktop Model Hoverboard'},
                       {'Product ID': 'B018JYCD9Y',
                        'Product Title': 'Zippity Outdoor Products ZP19002 No '
                                         'Dig Fence Newport, 36"H x 72"W, '
                                         'White'},
                       {'Product ID': 'B001OJXVKW',
                        'Product Title': 'Windscreen4less Heavy Duty Privacy '
                                         "Screen Fence in Color Solid Black 6' "
                                         "x 50' Brass Grommets 150 GSM - "
                                         'Customized'},
                       {'Product ID': 'B07QZ8BXVB',
                    

# Training Re-Ranking Cross-Encoder Models

## Training

In [12]:
if train:
    """ Prepare data loaders """
    train_samples = []
    for (_, row) in df_train.iterrows():
        train_samples.append(InputExample(texts=[row[col_query], row[col_product_title]], label=float(row[col_gain])))
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

    dev_samples = {}
    query2id = {}
    for (_, row) in df_dev.iterrows():
        try:
            qid = query2id[row[col_query]]
        except KeyError:
            qid = len(query2id)
            query2id[row[col_query]] = qid
        if qid not in dev_samples:
            dev_samples[qid] = {'query': row[col_query], 'positive': set(), 'negative': set()}
        if row[col_gain] > 0:
            dev_samples[qid]['positive'].add(row[col_product_title])
        else:
            dev_samples[qid]['negative'].add(row[col_product_title])
    evaluator = CERerankingEvaluator(dev_samples, name='train-eval')

    """ Prepare Cross-enconder model:
        https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_kd.py
    """
    model_names = ['cross-encoder/ms-marco-MiniLM-L-12-v2', 'cross-encoder/stsb-roberta-large']
    num_epochs = 1
    num_labels = 1
    max_length = 512
    default_activation_function = torch.nn.Identity()

    for model_name in model_names:
        model = CrossEncoder(
            model_name, 
            num_labels=num_labels, 
            max_length=max_length, 
            default_activation_function=default_activation_function, 
            device=device
        )
        loss_fct=torch.nn.MSELoss()
        evaluation_steps = 5000
        warmup_steps = 5000
        lr = 7e-6
        """ Train Cross-encoder model """
        model.fit(
            train_dataloader=train_dataloader,
            loss_fct=loss_fct,
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=evaluation_steps,
            warmup_steps=warmup_steps,
            output_path=f"{output_path}_reranking_{model_name}",
            optimizer_params={'lr': lr},
        )
        model.save(f"{model_save_path}_reranking_{model_name}")

## Re-Ranking Inference

In [13]:
def reranking_inference(model_path, features_query, features_product, batch_size=256):
    """ Scoring for the trained cross-encoder models """
    n_examples = len(features_query)
    scores = np.zeros(n_examples)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()
    
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            j = min(i + batch_size, n_examples)
            features_query_ = features_query[i:j]
            features_product_ = features_product[i:j]
            features = tokenizer(features_query_, features_product_, 
                                 padding=True, truncation=True, return_tensors="pt").to(device)
            scores[i:j] = np.squeeze(model(**features).logits.cpu().detach().numpy())
            i = j
    return scores


In [14]:
model_path = './models_us_reranking_cross-encoder/ms-marco-MiniLM-L-12-v2'
reranking_inference(model_path, 
                    features_query=df_test[col_query].to_list()[0:10], 
                    features_product=features_product_test[0:10])

100%|██████████| 1/1 [00:00<00:00, 63.20it/s]


array([0.09063359, 0.21230277, 0.32579595, 0.32453728, 0.31910166,
       0.05987772, 0.29635704, 0.40455851, 0.23366484, 0.40120405])

# Testing 

## Trained Performance of Models

In [15]:
features_query = df_test[col_query]
features_products = df_test[col_product_title]

retrieval_model_paths = [
    './models_us_training_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1',
    './models_us_training_retrieval_sentence-transformers/all-mpnet-base-v2'
]
ranking_model_paths = [
    './models_us_training_reranking_cross-encoder/ms-marco-MiniLM-L-12-v2',
    # './models_us_training_reranking_cross-encoder/stsb-roberta-large'
]

for retrieval_model_path in retrieval_model_paths:
    scores = retrieval_inference(retrieval_model_path, batch_scoring=True, 
                                 query_result_pair=(features_query.to_list(), 
                                                    features_products.to_list()))
    df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
    
for ranking_model_path in ranking_model_paths:     
    scores = reranking_inference(ranking_model_path, 
                                 features_query.to_list(), 
                                 features_products.to_list())
    df_test.loc[:, f"ranking_{ranking_model_path.split('/')[-1]}"] = scores.copy()

100%|██████████| 710/710 [09:50<00:00,  1.20it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
100%|██████████| 710/710 [09:49<00:00,  1.20it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, f"retrieval_{retrieval_model_path.split('/')[-1]}"] = scores.copy()
100%|██████████| 710/710 [03:04<00:00,  3.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documen

In [16]:
df_test.to_csv("./scores_from_trained_models.csv")

In [19]:
df_test[['retrieval_multi-qa-mpnet-base-dot-v1',
        'retrieval_all-mpnet-base-v2',
        'ranking_ms-marco-MiniLM-L-12-v2']]

Unnamed: 0,retrieval_multi-qa-mpnet-base-dot-v1,retrieval_all-mpnet-base-v2,ranking_ms-marco-MiniLM-L-12-v2
32,16.829475,3.194620,0.224360
33,20.914606,3.413810,0.315618
34,21.134993,3.965578,0.383481
35,19.273392,3.752367,0.412195
36,19.473473,3.713756,0.411183
...,...,...,...
2614589,11.579144,3.313579,0.142542
2614590,12.583954,3.255656,0.187810
2614591,9.215116,2.284095,0.280399
2614592,15.037724,3.381042,0.189536


In [23]:
# !pip install ampligraph

In [25]:
from ampligraph.evaluation.metrics import mrr_score, hits_at_n_score
from collections import OrderedDict


def calculate_metrics(df, col, hit_at_n=[1, 5, 10], pure_python=False):
    """ Calculatye Metrics: MRR and Hits@n
    It uses Ampligraph based on Tensorflow: https://docs.ampligraph.org/en/latest/index.html
    If you prefer to do calculation based on pure Python, set pure_python=True
    """
    result = OrderedDict()
    df.loc[:, 'rank'] = df.groupby('query_id')[col].rank(method='min', ascending=False).values
    first_hit_rank_position = df.groupby('query_id')[['gain', 'rank']] \
        .apply(lambda x: x[x.gain == 1.0]['rank'].min()).values
    
    first_hit_rank_position = np.nan_to_num(first_hit_rank_position, nan=1000)
    
    if not pure_python:
        result["MRR"] = mrr_score(first_hit_rank_position).round(4)
        for h in hit_at_n:
            result[f"Hits@{h}"] = hits_at_n_score(first_hit_rank_position, n=h).round(4)
        
    else:
        n_queries = first_hit_rank_position.shape[0]
        result["MRR"] = np.divide(np.divide(1, first_hit_rank_position).sum(), 
                                  n_queries).round(4)
        for h in hit_at_n:
            result[f"Hits@{h}"] = np.divide((first_hit_rank_position <= h).sum(),
                                            n_queries).round(4)
    return result

target_cols = ['retrieval_multi-qa-mpnet-base-dot-v1',
               'retrieval_all-mpnet-base-v2',
               'ranking_ms-marco-MiniLM-L-12-v2']
metrics = OrderedDict()
print(f"--> MRR for trained models: \n")
for col in target_cols:    
    pprint({col: calculate_metrics(df_test, col)})
    metrics[col] = calculate_metrics(df_test, col).copy()

--> MRR for trained models: 

{'retrieval_multi-qa-mpnet-base-dot-v1': OrderedDict([('MRR', 0.803),
                                                      ('Hits@1', 0.697),
                                                      ('Hits@5', 0.9395),
                                                      ('Hits@10', 0.9797)])}
{'retrieval_all-mpnet-base-v2': OrderedDict([('MRR', 0.7943),
                                             ('Hits@1', 0.6812),
                                             ('Hits@5', 0.9378),
                                             ('Hits@10', 0.9817)])}
{'ranking_ms-marco-MiniLM-L-12-v2': OrderedDict([('MRR', 0.8109),
                                                 ('Hits@1', 0.7067),
                                                 ('Hits@5', 0.9442),
                                                 ('Hits@10', 0.9825)])}


## End-to-End System Performance 

In [26]:
 # Cleaning GPU 
import gc

gc.collect()
torch.cuda.empty_cache()

In [27]:
df_test[col_query].nunique()

8956

In [None]:
import time 
from pprint import pprint

# Selected Retrieval Model
model_path = './models_us_retrieval_sentence-transformers/multi-qa-mpnet-base-dot-v1'
n_batches=100
batch_size=80
top_k=30

def fetch_id_product(row, indices):
    return [{col_query_id: row[col_query_id],
             col_query: row[col_query],
             col_product_id: id_product_test[i], 
             col_product_title: features_product_test[i]} for i in indices]

def retriev(row, top_k=5, locale="us", model_path=model_path):
    index = faiss.read_index(global_index_file_name(model_path, locale))
    query_vector = retrieval_inference(model_path, row[col_query]).to('cpu').numpy().astype('float32')
    top_k = index.search(query_vector, top_k)
    top_k_ids = top_k[1].tolist()[0]
    return fetch_id_product(row, top_k_ids)

def sampling_retrieval(model_path, df_queries, n_batches=n_batches, batch_size=batch_size, top_k=top_k):
    result = []
    for i in tqdm(range(n_batches)):
        features_queries = df_queries.sample(n=batch_size) # default replacement is False
        for (_, row) in features_queries.iterrows():
            result.append(retriev(row, top_k=top_k, locale="us", model_path=model_path))       
    return result

df_queries = df_test[[col_query_id, col_query, col_gain]].drop_duplicates()
result = sampling_retrieval(model_path, df_queries)

  5%|▌         | 5/100 [13:21<4:12:29, 159.47s/it]

In [None]:
from itertools import chain

import random


def flatten_chain(matrix):
    return list(chain.from_iterable(matrix))

ranking_model_path = './models_us_training_reranking_cross-encoder/ms-marco-MiniLM-L-12-v2'
col = 'ranking_ms-marco-MiniLM-L-12-v2'
sample_size = 50 
sample_result = []
n_iterations = 1000

for i in range(n_iterations):
    df_ = pd.DataFrame(flatten_chain(random.sample(result, sample_size)))
    scores = reranking_inference(ranking_model_path, 
                                 df_[col_query].to_list(), 
                                 df_[col_product_title].to_list())
    score_col = f"ranking_{ranking_model_path.split('/')[-1]}"
    df_.loc[:, score_col] = scores.copy()
    df_.loc[:, "rank"] = df_.groupby(col_query_id)[score_col].rank(method='min', ascending=False).values
    df_ = df_[df_['rank'] <= 10]
    df_rank = df_test[df_test[col_query_id].isin(df_[col_query_id].unique())][[col_query_id, col_product_id, col_gain]].merge(
        df_,
        how='left',
        on=[col_query_id, col_product_id]
    )
    df_rank.loc[:, 'rank'] = df_rank.groupby('query_id')[col]. \
    rank(method='min', ascending=False).values
    df_rank['ranking_ms-marco-MiniLM-L-12-v2'] = df_rank['ranking_ms-marco-MiniLM-L-12-v2'].fillna(0)
    sample_result.append(calculate_metrics(df_rank, col).copy())

In [None]:
pd.DataFrame(sample_result).describe().T