In [1]:
#!pip install -U transformers rank_bm25 evaluate unstructured --quiet

In [1]:
import transformers

transformers.logging.disable_progress_bar()
transformers.logging.set_verbosity_error()

transformers.__version__

'4.38.1'

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from model import EncoderModel, DecoderModel, BM25Model
from store import VectorStore
from tqdm import tqdm
import torch
from sklearn.metrics import ndcg_score

[nltk_data] Downloading package wordnet to /home/chkei001/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset

In [None]:
ds = load_dataset("squad_v2")
ds

In [None]:
df_val = ds["validation"].to_pandas()[["context", "question", "answers"]]
display(df_val.head(3))

In [None]:
extract_answers = lambda answer: "" if len(answer['text']) == 0 else answer['text'][0]
v_extract_answers = np.vectorize(extract_answers)

In [None]:
df_val["answers"] = v_extract_answers(df_val["answers"].values)

print(f"{df_val[df_val['answers'] == ''].shape[0]}/{df_val.shape[0]}")

In [9]:
test_set_answerable = df_val[df_val['answers'] != ''].sample(n=400, random_state=1)
test_set_not_answerable = df_val[df_val['answers'] == ''].sample(n=100, random_state=1)
test_set = pd.concat([test_set_answerable, test_set_not_answerable])
test_set

Unnamed: 0,context,question,answers
6719,According to PolitiFact the top 400 richest Am...,What did the richest 400 Americans have as chi...,grew up in substantial privilege
11420,"The British failures in North America, combine...",How many of the Pitt's planned expeditions wer...,"Two of the expeditions were successful, with F..."
7963,At the same time the Mongols imported Central ...,Who did the Mongols send to Bukhara as adminis...,Han Chinese and Khitans
9256,The other third of the water flows through the...,Where does the Nederrijn change it's name?,Wijk bij Duurstede
6749,"In Marxian analysis, capitalist firms increasi...",What do capitalist firms substitute equipment ...,labor inputs
...,...,...,...
4613,The Very high-speed Backbone Network Service (...,What were select locations connected to?,
257,"When considering computational problems, a pro...",What is a string over a Greek number when cons...,
233,Closely related fields in theoretical computer...,What is the process that asks a more specific ...,
4784,A variety of alternatives to the Y. pestis hav...,In what year was Scott and Duncan's research p...,


In [10]:
def true_binary_relevance(result_idxs, original_id):
    return [1 if i == original_id else 0 for i in result_idxs]

In [11]:
import warnings
warnings.filterwarnings('ignore')

retriever_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "BAAI/bge-base-en-v1.5",
    "WhereIsAI/UAE-Large-V1",
    "BAAI/bge-m3"
]
causal_models = [
    "google/gemma-7b-it",
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.2",
    #"mistralai/Mixtral-8x7B-Instruct-v0.1"
    "meta-llama/Llama-2-7b-chat-hf"
]

retriever_results = []
causal_lm_results = []

for causal_id in causal_models:
    causal_lm = DecoderModel(causal_id, device="cuda")
    
    for retriever_id in retriever_models:
        for hybrid in [True, False]:
            db = VectorStore(retriever_id, hybrid)
            db.add_documents(test_set["context"].values.tolist(), test_set.index.tolist())

            print(f"Retriever: {retriever_id} - Causal LM: {causal_id} - hybrid: {'yes' if hybrid else 'no'}")
            with tqdm(total=len(test_set.question.values)) as pbar:
                for document_id, (_, query, correct_answer) in test_set.iterrows():
                    
                    best_contexts = ""
                    best_ndcg = 0
                    for distance_metric in ["cosine", "ip", "l2"]:
                        results = db.search(query)
                        # unpack results
                        idxs = [result["id"] for result in results]
                        scores = [result["score"] for result in results]
                        contexts = [result["document"] for result in results]

                        # retriever results
                        true_relevance = true_binary_relevance(idxs, document_id)
                        ndcg = ndcg_score([true_relevance], [scores])
                        
                        if correct_answer != "":
                            retriever_results.append({
                                "model": retriever_id,
                                "ndcg": ndcg,
                                "metric": distance_metric,
                                "hybrid": "yes" if hybrid else "no"
                            })

                        best_ndcg = ndcg if ndcg > best_ndcg else best_ndcg
                        if ndcg > best_ndcg:
                            best_ndcg = ndcg
                            best_contexts = contexts

                    contexts = "\n\n".join(best_contexts)


                    answer = causal_lm(query, contexts)

                    causal_lm_results.append(
                        {
                            "model": causal_id,
                            "question": query,
                            "answer": answer,
                            "context": contexts,
                            "correct_answer": correct_answer if correct_answer != "" else "Not answerable from the given context."
                        }
                    )
                    pbar.update(1)
                del db
                torch.cuda.empty_cache()
    del causal_lm
    torch.cuda.empty_cache()

KeyboardInterrupt: 

In [None]:
pd.DataFrame(causal_lm_results).to_csv("causal_lm_results_v2.csv")
pd.DataFrame(retriever_results).to_csv("retriever_results_v2.csv")

# Evaluation

In [None]:
bleu = evaluate.load("bleu") # https://huggingface.co/spaces/evaluate-metric/bleu


In [12]:
pd.DataFrame(causal_lm_results)

Unnamed: 0,model,question,answer,context,correct_answer
0,google/gemma-7b-it,What did the richest 400 Americans have as chi...,<pad><pad><pad><eos>,,grew up in substantial privilege
1,google/gemma-7b-it,How many of the Pitt's planned expeditions wer...,I do know. The provided text does not contain ...,,"Two of the expeditions were successful, with F..."
2,google/gemma-7b-it,Who did the Mongols send to Bukhara as adminis...,"Sure, here is the answer to the question:\n\nT...",,Han Chinese and Khitans
3,google/gemma-7b-it,Where does the Nederrijn change it's name?,I do know. The text does not provide informati...,,Wijk bij Duurstede
4,google/gemma-7b-it,What do capitalist firms substitute equipment ...,"Sure, here is the answer to the question:\n\nI...",,labor inputs
...,...,...,...,...,...
10995,mistralai/Mistral-7B-Instruct-v0.2,What were select locations connected to?,I donot have enough context to provide an answ...,,Not answerable from the given context.
10996,mistralai/Mistral-7B-Instruct-v0.2,What is a string over a Greek number when cons...,I donot know. The context does not provide eno...,,Not answerable from the given context.
10997,mistralai/Mistral-7B-Instruct-v0.2,What is the process that asks a more specific ...,I donotknow. The context provided does not con...,,Not answerable from the given context.
10998,mistralai/Mistral-7B-Instruct-v0.2,In what year was Scott and Duncan's research p...,I donot have access to the specific context pr...,,Not answerable from the given context.


In [18]:
pd.DataFrame(retriever_results)

Unnamed: 0,model,ndcg,metric,hybrid
0,sentence-transformers/all-MiniLM-L6-v2,1.00000,cosine,no
1,sentence-transformers/all-MiniLM-L6-v2,1.00000,ip,no
2,sentence-transformers/all-MiniLM-L6-v2,1.00000,l2,no
3,sentence-transformers/all-MiniLM-L6-v2,1.00000,cosine,no
4,sentence-transformers/all-MiniLM-L6-v2,1.00000,ip,no
...,...,...,...,...
17995,WhereIsAI/UAE-Large-V1,0.63093,ip,no
17996,WhereIsAI/UAE-Large-V1,0.63093,l2,no
17997,WhereIsAI/UAE-Large-V1,1.00000,cosine,no
17998,WhereIsAI/UAE-Large-V1,1.00000,ip,no


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("retriever_results.csv")

In [6]:
df[df["model"]=="sentence-transformers/all-MiniLM-L6-v2"]["ndcg"].mean()

0.8272514504055871