In [2]:
import transformers

transformers.__version__

'4.36.2'

In [3]:
import chromadb
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, Pipeline
from datasets import Dataset, load_dataset
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckeibel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Dataset

In [4]:
ds = load_dataset("squad_v2")["train"]
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})

## Semantic search prep

In [5]:
contexts = pd.DataFrame(ds["context"])
contexts.columns = ["context"]
len(contexts.context.unique()), len(contexts)

(19029, 130319)

In [6]:
# Get unique contexts and create id
contexts = pd.DataFrame(columns=["context"], data=contexts.context.unique())
contexts["id"] = [i for i in range(len(contexts))]

In [7]:
# Helper to map question to unique context
def get_context_id(row: str) -> int:
    num = contexts[contexts["context"] == row]["id"]
    return num

In [8]:
# Remove rows without answer
ds = ds.filter(lambda row: len(row["answers"].get("text")) > 0)
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 86821
})

In [9]:
# Map questions to unique context id
ds = ds.map(lambda row: {"context_id": get_context_id(row["context"])})

In [10]:
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'context_id'],
    num_rows: 86821
})

In [11]:
# Extract questions with id to iterratable DataFrame
questions = pd.DataFrame(ds)[["question", "context_id"]]
questions.head(2)

Unnamed: 0,question,context_id
0,When did Beyonce start becoming popular?,[0]
1,What areas did Beyonce compete in when she was...,[0]


In [12]:
contexts.head(2)

Unnamed: 0,context,id
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,0
1,Following the disbandment of Destiny's Child i...,1


## Keyword search prep

In [13]:
def bm25_tokenize(doc: str) -> list[str]:
    tokens = nltk.word_tokenize(doc)
    return tokens

# Model

In [14]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [15]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class EmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}


    def preprocess(self, text):
        encoded_text = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
        return encoded_text


    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}


    def postprocess(self, model_outputs):
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings[0].numpy()

In [16]:
def get_true_values(q_id: int, preds: list) -> list:
    return [1 if i == q_id else 0 for i in preds]

In [18]:
def hybrid_score(semantic_score: float, keyword_score: float, alpha: float = 0.5) -> float:
    hybrid_score = (1 - alpha) * keyword_score + alpha * semantic_score
    return hybrid_score

In [76]:
chroma_client.delete_collection(name="squad_v2")

In [77]:
from tqdm import tqdm
from sklearn.metrics import ndcg_score

# init chroma client
chroma_client = chromadb.Client()

eval = []

encoder_list = [
    "BAAI/bge-large-en-v1.5",
    "WhereIsAI/UAE-Large-V1",
    "sentence-transformers/all-mpnet-base-v2"
]

for model_id in encoder_list:
    # Store preds and true relevance for metrics
    semantic_true = []
    semantic_pred = []
    hybrid_true = []
    hybrid_pred = [] 
    # Init chroma collection
    collection = chroma_client.create_collection(name="squad_v2", metadata={"hnsw:space": "cosine"})
    # init model
    model = AutoModel.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    encoder = EmbeddingPipeline(model=model, tokenizer=tokenizer, device=device)

    # Add embeddings to chroma
    for i, (context, cid) in contexts.iterrows():
        embedding_vector = encoder(context).tolist()
        collection.add(
            embeddings=[embedding_vector],
            documents=[context],
            metadatas=[{"context_id": cid}],
            ids=[str(i)]
        )

    for i2, (q, cid) in questions.iterrows():
        question = encoder(q).tolist()
        result = collection.query(
            query_embeddings=question,
            n_results=5,
        )
        # doc results
        doc_ids = [metadata["context_id"] for metadata in result["metadatas"][0]]
        scores = result["distances"][0]
        retrieved_docs = {did: score for did, score in zip(doc_ids, scores)}
        
        # without bm25
        ## true relevance
        relevance = get_true_values(cid[0], retrieved_docs.keys())
        semantic_true.append(relevance)
        ## preds
        semantic_pred.append(list(retrieved_docs.values()))

        # hybrid
        ## bm25 scores
        tokenized_docs = [bm25_tokenize(doc) for doc in result["documents"][0]] 
        bm25 = BM25Okapi(tokenized_docs)
        tokenized_query = bm25_tokenize(q)
        bm25_scores = bm25.get_scores(tokenized_query)
        # Calcualte hybrid score
        hybrid_docs = {
            key: hybrid_score(s_score, bm25_score) 
            for key, s_score, bm25_score 
            in zip(retrieved_docs.keys(), list(retrieved_docs.values()), bm25_scores)
        }
        # Sort by score
        hybrid_docs = {k: v for k, v in sorted(hybrid_docs.items(), key=lambda item: item[1], reverse=True)}
        
        ## true relevance
        relevance = get_true_values(cid[0], hybrid_docs.keys())
        hybrid_true.append(relevance)
        ## preds
        hybrid_pred.append(list(hybrid_docs.values()))
    
    semantic_res = ndcg_score(semantic_true, semantic_pred)
    hybrid_res = ndcg_score(hybrid_true, hybrid_pred)
    eval.append(
        {
            "model": model_id,
            "semantic_score": semantic_res,
            "hybrid_score": hybrid_res
        }
    )
    print(f"Semantic: {semantic_res} - Hybrid: {hybrid_res}")
    chroma_client.delete_collection(name="squad_v2")

Chroma: 1/19029
Questions: 1/86821
Semantic: 0.337729806375078 - Hybrid: 0.7150613800281623
Chroma: 1/19029




Questions: 1/86821
Semantic: 0.33024802195630615 - Hybrid: 0.7005957366790988
Chroma: 1/19029




Questions: 1/86821
Semantic: 0.341901282067171 - Hybrid: 0.7122281567391423


In [78]:
result_df = pd.DataFrame(eval)
result_df.to_csv("results-bm25.csv")
result_df

Unnamed: 0,model,semantic_score,hybrid_score
0,BAAI/bge-large-en-v1.5,0.33773,0.715061
1,WhereIsAI/UAE-Large-V1,0.330248,0.700596
2,sentence-transformers/all-mpnet-base-v2,0.341901,0.712228
