In [1]:
!pip install -U transformers faiss-gpu rank_bm25 --quiet

In [2]:
import transformers

transformers.__version__

'4.36.2'

In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, Pipeline
from datasets import Dataset, load_dataset
import numpy as np
import pandas as pd

2023-12-20 20:03:27.777298: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Dataset

In [4]:
# Load the "squad_v2" dataset and keep only its "train" split; the bare `ds`
# on the last line displays the dataset summary (features and row count).
ds = load_dataset("squad_v2")["train"]
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})

In [5]:
# One-column frame holding the context of every row in `ds`; the displayed
# tuple is (number of distinct contexts, total number of rows).
df = pd.DataFrame(ds["context"], columns=["context"])
len(df["context"].unique()), len(df)

(19029, 130319)

In [6]:
# Collapse to the distinct contexts and give each one a sequential integer id
# (0 .. n-1, in order of first appearance).
df = pd.DataFrame({"context": df["context"].unique()})
df["id"] = list(range(len(df)))

In [7]:
# Helper to map question to unique context
def get_context_id(row: str) -> pd.Series:
    """Look up the id of the unique context whose text equals ``row``.

    Args:
        row: The context text to search for in the module-level ``df``
            (despite the name, this is the context string, not a full row).

    Returns:
        The ``"id"`` values of every ``df`` row whose ``"context"`` equals
        ``row``, as a pandas Series -- NOT a bare integer. Because contexts in
        ``df`` are unique this Series normally holds exactly one element;
        callers must unwrap it themselves (e.g. by taking element 0). It is
        empty if the text is not present.
    """
    # Single .loc selection instead of chained indexing (df[mask]["id"]);
    # the selected values and index are the same.
    return df.loc[df["context"] == row, "id"]

In [8]:
# Remove rows without answer: keep only rows whose answers["text"] list is
# non-empty. `ds` is rebound to the filtered dataset and then displayed.
ds = ds.filter(lambda row: len(row["answers"].get("text")) > 0)
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 86821
})

In [9]:
# Map questions to unique context id by adding a "context_id" column.
# NOTE(review): get_context_id returns a pandas selection rather than a scalar,
# so each "context_id" ends up stored as a one-element list (e.g. [0]); code
# that consumes it has to take element 0.
ds = ds.map(lambda row: {"context_id": get_context_id(row["context"])})

In [10]:
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'context_id'],
    num_rows: 86821
})

In [11]:
# Extract each question together with its context id into a DataFrame that
# can be iterated row by row; show the first two rows as a sanity check.
questions = pd.DataFrame(ds).loc[:, ["question", "context_id"]]
questions.head(2)

Unnamed: 0,question,context_id
0,When did Beyonce start becoming popular?,[0]
1,What areas did Beyonce compete in when she was...,[0]


In [12]:
# Create searchable "database": convert the frame of unique contexts (columns
# "context" and "id") into a `datasets.Dataset`, then display its summary.
database = Dataset.from_pandas(df)
database

Dataset({
    features: ['context', 'id'],
    num_rows: 19029
})

# Model

In [13]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [14]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask that is expanded along a new last axis to the
            shape of the token embeddings; positions with value 0 do not
            contribute to the average.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the mask total along that dimension (clamped to at least
        1e-9 so a fully masked input does not divide by zero).
    """
    token_embeddings = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total


class EmbeddingPipeline(Pipeline):
    """Pipeline that turns a text into an L2-normalised, mean-pooled embedding.

    Stages: tokenize -> run the model -> mean-pool the token embeddings with
    the attention mask -> L2-normalise -> return the first row as a NumPy
    array. Only row 0 is returned, so each call yields one embedding vector.
    """

    def _sanitize_parameters(self, **kwargs):
        """Accept no call-time options: every stage receives an empty kwargs dict."""
        return {}, {}, {}

    def preprocess(self, text):
        """Tokenize ``text`` (padded, truncated) into PyTorch tensors.

        The tensors are placed on the device this pipeline was constructed
        with (``self.device``) instead of a notebook-level global, so the
        class does not silently depend on outside state.
        """
        encoded_text = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        return encoded_text.to(self.device)

    def _forward(self, model_inputs):
        """Run the model and keep the attention mask alongside its outputs for pooling."""
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}

    def postprocess(self, model_outputs):
        """Pool, normalise and return the first embedding as a NumPy array."""
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # detach()/cpu() make the NumPy conversion safe even if the tensor
        # still lives on an accelerator or carries autograd history.
        return sentence_embeddings[0].detach().cpu().numpy()

In [15]:
def get_true_values(q_id: int, preds: list) -> list:
    """Build a binary relevance list for retrieved ids.

    Args:
        q_id: The id that counts as the correct hit.
        preds: Retrieved ids, in ranked order.

    Returns:
        A list as long as ``preds`` holding 1 where the retrieved id equals
        ``q_id`` and 0 everywhere else.
    """
    relevance = []
    for retrieved_id in preds:
        relevance.append(1 if retrieved_id == q_id else 0)
    return relevance

In [None]:
from tqdm import tqdm
from sklearn.metrics import ndcg_score


# Number of nearest contexts retrieved (and scored) per question.
TOP_K = 5

result = []

encoder_list = [
    "BAAI/bge-large-en-v1.5",
    "WhereIsAI/UAE-Large-V1",
    "sentence-transformers/all-mpnet-base-v2"
]


def _evaluate_encoder(model_id: str, top_k: int = TOP_K) -> float:
    """Embed all contexts with one encoder and score its retrieval quality.

    Loads the model and tokenizer for ``model_id``, embeds every row of the
    module-level ``database``, indexes the embeddings with FAISS, retrieves the
    ``top_k`` nearest contexts for every row of ``questions`` and returns the
    NDCG over all questions.

    Args:
        model_id: Hugging Face model identifier of the encoder to evaluate.
        top_k: How many nearest contexts to retrieve per question.

    Returns:
        The NDCG computed by ``sklearn.metrics.ndcg_score`` over all questions.
    """
    model = AutoModel.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    encoder = EmbeddingPipeline(model=model, tokenizer=tokenizer, device=device)

    # Work on a per-model dataset instead of overwriting the shared `database`:
    # every encoder starts from the same clean input and re-running the cell
    # does not depend on embeddings or indexes left behind by a previous model.
    indexed = database.map(lambda row: {"embedding": encoder(row["context"])})
    indexed = indexed.add_faiss_index(column="embedding")

    y_true = []
    y_pred = []
    # Iterate over the two columns directly; much cheaper than iterrows().
    for q, cid in zip(questions["question"], questions["context_id"]):
        query = encoder(q)
        scores, docs = indexed.get_nearest_examples("embedding", query, k=top_k)
        # "context_id" is stored as a one-element list, hence cid[0].
        y_true.append(get_true_values(cid[0], docs["id"]))
        # The returned scores are treated as distances (smaller = closer), as
        # the original 1/score inversion did. ndcg_score only needs the
        # ordering, so negate them: same ranking as 1/score, but an exact
        # match (distance 0) no longer produces inf, which ndcg_score rejects.
        y_pred.append(np.negative(scores))

    return ndcg_score(y_true, y_pred)


for model_id in tqdm(encoder_list):
    result.append(
        {
            "model": model_id,
            "score": _evaluate_encoder(model_id)
        }
    )

  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/19029 [00:00<?, ? examples/s]



  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# TODO: BM25

In [None]:
# Collect the per-model score dicts from `result` into a table, write it to
# results.csv in the working directory, and display it.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column.
result_df = pd.DataFrame(result)
result_df.to_csv("results.csv")
result_df