In [1]:
#!pip install -U transformers rank_bm25 evaluate unstructured --quiet

In [2]:
# Show which transformers release is installed in this kernel; the trailing
# expression is rendered as the cell output.
import transformers
transformers.__version__

'4.38.1'

In [3]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from model import EncoderModel, DecoderModel, BM25Model
from store import VectorStore
from tqdm import tqdm
import torch
from sklearn.metrics import ndcg_score

[nltk_data] Downloading package wordnet to /home/chkei001/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Interactive Hugging Face Hub login (renders a token widget).
# NOTE(review): presumably needed for the gated checkpoints loaded later
# (e.g. the Llama-2 / Gemma models) — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset

In [5]:
# Load the SQuAD v2 dataset via `datasets.load_dataset`; the trailing expression
# displays the available splits and their features.
ds = load_dataset("squad_v2")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [6]:
# Keep only the columns the benchmark uses, for both splits.
qa_columns = ["context", "question", "answers"]
df_train = ds["train"].to_pandas()[qa_columns]
df_val = ds["validation"].to_pandas()[qa_columns]

# Preview the first rows of each split (train first, then validation).
display(df_train.head(3))
display(df_val.head(3))

Unnamed: 0,context,question,answers
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'..."
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star..."
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}"


Unnamed: 0,context,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'Franc..."
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries', 'in the 1..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"{'text': ['Denmark, Iceland and Norway', 'Denm..."


In [7]:
def extract_answers(answer):
    """Return the first reference answer text, or "" when there is none.

    Parameters
    ----------
    answer : mapping
        One entry of the ``answers`` column; only its ``'text'`` sequence is read.

    Returns
    -------
    str
        ``answer['text'][0]`` if at least one answer text exists, otherwise ``""``.
    """
    texts = answer["text"]
    # len() instead of truthiness: 'text' may be a NumPy array, whose truth
    # value is ambiguous when it holds more than one element.
    return texts[0] if len(texts) > 0 else ""


# otypes=[object] skips the extra probing call np.vectorize otherwise makes on
# the first element to infer the output dtype, and lets size-0 inputs through
# (without otypes np.vectorize raises on empty arrays).
v_extract_answers = np.vectorize(extract_answers, otypes=[object])

In [8]:
# Collapse every answers entry to its first answer text ("" when there is none).
# The column is overwritten in place, so this cell cannot be re-run without
# rebuilding df_train / df_val first.
for split_df in (df_train, df_val):
    split_df["answers"] = v_extract_answers(split_df["answers"].values)

# Report "<rows without an answer>/<total rows>" for train, then validation.
for split_df in (df_train, df_val):
    unanswerable = (split_df["answers"] == "").sum()
    print(f"{unanswerable}/{len(split_df)}")

43498/130319
5945/11873


In [9]:
# Evaluation subset: 500 validation questions that have an answer, drawn with a
# fixed seed so the sample is the same on every run.
has_answer = df_val["answers"] != ""
test_set = df_val.loc[has_answer].sample(n=500, random_state=1)

In [10]:
def true_binary_relevance(result_idxs, original_id):
    """Mark which retrieved ids match the expected one.

    Parameters
    ----------
    result_idxs : iterable
        Ids of the retrieved items, in ranking order.
    original_id :
        Id of the item that should have been retrieved.

    Returns
    -------
    list[int]
        One entry per element of ``result_idxs``: 1 where it equals
        ``original_id``, 0 elsewhere.
    """
    relevance = []
    for retrieved_id in result_idxs:
        if retrieved_id == original_id:
            relevance.append(1)
        else:
            relevance.append(0)
    return relevance

In [None]:
# Embedding models benchmarked as retrievers.
retriever_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "BAAI/bge-base-en-v1.5",
    "WhereIsAI/UAE-Large-V1",
    "BAAI/bge-m3"
]
# Instruction-tuned LLMs that answer each question from the retrieved contexts.
causal_models = [
    "HuggingFaceH4/zephyr-7b-beta",
    "google/gemma-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.2",
    #"mistralai/Mixtral-8x7B-Instruct-v0.1"
    "meta-llama/Llama-2-7b-chat-hf"
]
distance_metrics = ["cosine", "ip", "l2"]

# One row per (hybrid, retriever, query, metric) with the NDCG of that ranking.
retriever_results = []
# One row per (hybrid, retriever, causal model, query) with the generated answer.
causal_lm_results = []

# (gold document id, question) pairs; the test_set index doubles as document id.
queries = list(zip(test_set.index.tolist(), test_set.question.values))

for hybrid in [False, True]:
    hybrid_label = "yes" if hybrid else "no"
    for retriever_id in retriever_models:
        db = VectorStore(retriever_id, hybrid)
        db.add_documents(test_set["context"].values.tolist(), test_set.index.tolist())

        # Retrieval does not depend on the LLM, so it runs once per retriever
        # configuration instead of once per causal model. This also keeps
        # retriever_results free of duplicated rows.
        retrieved = []
        for document_id, query in tqdm(queries, desc=f"retrieve: {retriever_id} (hybrid={hybrid_label})"):
            best_contexts = []
            # Start below any achievable NDCG (which is >= 0) so the first
            # ranking is always kept, even when the gold passage is missing.
            best_ndcg = -1.0
            for distance_metric in distance_metrics:
                # NOTE(review): distance_metric is only recorded in the results;
                # it is never handed to db.search, so all three iterations issue
                # the same query. Pass it through once the VectorStore search
                # signature is confirmed.
                results = db.search(query)
                # unpack results
                idxs = [result["id"] for result in results]
                scores = [result["score"] for result in results]
                contexts = [result["document"] for result in results]

                # retriever results
                true_relevance = true_binary_relevance(idxs, document_id)
                if len(results) > 1:
                    # ndcg_score expects 2-D (n_samples, n_labels) inputs, so the
                    # single ranking is wrapped as a one-row batch.
                    ndcg = ndcg_score([true_relevance], [scores])
                else:
                    # NDCG over fewer than two documents is rejected by recent
                    # scikit-learn versions; it reduces to "was the hit found".
                    ndcg = float(any(true_relevance))

                retriever_results.append({
                    "model": retriever_id,
                    "ndcg": ndcg,
                    "metric": distance_metric,
                    "hybrid": hybrid_label
                })

                # Compare BEFORE updating best_ndcg; updating first made this
                # branch unreachable and left the LLM with an empty context.
                if ndcg > best_ndcg:
                    best_ndcg = ndcg
                    best_contexts = contexts

            retrieved.append((query, "\n\n".join(best_contexts)))

        # Release the retriever before the 7B models are loaded.
        del db
        torch.cuda.empty_cache()

        for causal_id in causal_models:
            causal_lm = DecoderModel(causal_id, device="cuda")
            for query, contexts in tqdm(retrieved, desc=f"generate: {causal_id}"):
                answer = causal_lm(query, contexts)

                causal_lm_results.append(
                    {
                        "model": causal_id,
                        # Record the retrieval setup so answers produced with
                        # different retrievers can be told apart afterwards.
                        "retriever": retriever_id,
                        "hybrid": hybrid_label,
                        "question": query,
                        "answer": answer,
                        "context": contexts
                    }
                )
            del causal_lm
            torch.cuda.empty_cache()

  return self.fget.__get__(instance, owner)()


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Write the collected result records to CSV. The previous version serialised
# `causal_models` (the list of model names) into both files instead.
pd.DataFrame(causal_lm_results).to_csv("causal_lm_results.csv")
pd.DataFrame(retriever_results).to_csv("retriever_results.csv")

# Evaluation

In [None]:
# `evaluate` is not among the imports at the top of the notebook, so this cell
# would fail with a NameError on a fresh kernel; import it here.
import evaluate

bleu = evaluate.load("bleu") # https://huggingface.co/spaces/evaluate-metric/bleu
