In [1]:
#!pip install -U transformers rank_bm25 evaluate unstructured bitsandbytes rouge_score python-dotenv --quiet

In [2]:
import transformers

transformers.logging.disable_progress_bar()
transformers.logging.set_verbosity_error()

transformers.__version__

'4.38.2'

In [3]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from model import EncoderModel, DecoderModel, BM25Model
from store import VectorStore
from tqdm import tqdm
import torch
from sklearn.metrics import ndcg_score

[nltk_data] Downloading package wordnet to /home/chkei001/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()
hf_auth = os.getenv("HF")

login(token=hf_auth)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/chkei001/.cache/huggingface/token
Login successful


# Dataset

In [5]:
ds = load_dataset("squad_v2")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [6]:
df_val = ds["validation"].to_pandas()[["context", "question", "answers"]]
display(df_val.head(3))

Unnamed: 0,context,question,answers
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'Franc..."
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries', 'in the 1..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"{'text': ['Denmark, Iceland and Norway', 'Denm..."


In [7]:
# extract first answer of answer list
extract_answers = lambda answer: "" if len(answer['text']) == 0 else answer['text'][0]
v_extract_answers = np.vectorize(extract_answers)

In [8]:
df_val["answers"] = v_extract_answers(df_val["answers"].values)

print(f"{df_val[df_val['answers'] == ''].shape[0]}/{df_val.shape[0]}")

5945/11873


In [9]:
# sample 400 answerable examples and 100 unanswerable examples 
test_set_answerable = df_val[df_val['answers'] != ''].sample(n=400, random_state=1)
test_set_not_answerable = df_val[df_val['answers'] == ''].sample(n=100, random_state=1)
test_set = pd.concat([test_set_answerable, test_set_not_answerable])
test_set

Unnamed: 0,context,question,answers
6719,According to PolitiFact the top 400 richest Am...,What did the richest 400 Americans have as chi...,grew up in substantial privilege
11420,"The British failures in North America, combine...",How many of the Pitt's planned expeditions wer...,"Two of the expeditions were successful, with F..."
7963,At the same time the Mongols imported Central ...,Who did the Mongols send to Bukhara as adminis...,Han Chinese and Khitans
9256,The other third of the water flows through the...,Where does the Nederrijn change it's name?,Wijk bij Duurstede
6749,"In Marxian analysis, capitalist firms increasi...",What do capitalist firms substitute equipment ...,labor inputs
...,...,...,...
4613,The Very high-speed Backbone Network Service (...,What were select locations connected to?,
257,"When considering computational problems, a pro...",What is a string over a Greek number when cons...,
233,Closely related fields in theoretical computer...,What is the process that asks a more specific ...,
4784,A variety of alternatives to the Y. pestis hav...,In what year was Scott and Duncan's research p...,


In [11]:
# calcualte true binary relevance for ndcg
def true_binary_relevance(result_idxs, original_id):
    return [1 if i == original_id else 0 for i in result_idxs]

In [None]:
import warnings
warnings.filterwarnings('ignore')

retriever_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "BAAI/bge-base-en-v1.5",
    "WhereIsAI/UAE-Large-V1",
    "BAAI/bge-m3"
]
causal_models = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "HuggingFaceH4/zephyr-7b-gemma-v0.1",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "google/gemma-2b-it",
    "google/gemma-7b-it",
]

retriever_results = []

# causal models loop
for causal_id in causal_models:
    causal_lm = DecoderModel(causal_id, device="auto")
    
    # retriever models loop
    for retriever_id in retriever_models:
        
        # retriever setup loop
        for hybrid in [True, False]:
            # init new vector store with retriever
            db = VectorStore(retriever_id, hybrid)
            # embed documents
            db.add_documents(test_set["context"].values.tolist(), test_set.index.tolist(), batch_size=50)
            
            is_hybrid = 'yes' if hybrid else 'no'
            causal_lm_results = []
            
            print(f"Retriever: {retriever_id} - Causal LM: {causal_id} - hybrid: {is_hybrid}")
            
            with tqdm(total=len(test_set.question.values)) as pbar:
                # loop through dataset
                for document_id, (_, query, correct_answer) in test_set.iterrows():
                    
                    best_contexts = ""
                    best_ndcg = -1000000
                    
                    # loop distance metrics
                    for distance_metric in ["cosine", "ip", "l2"]:
                        # retrieve documents
                        results = db.search(query)
                        
                        # unpack results
                        idxs = [result["id"] for result in results]
                        scores = [result["score"] for result in results]
                        contexts = [result["document"] for result in results]
                        
                        # retriever results
                        true_relevance = true_binary_relevance(idxs, document_id)
                        ndcg = ndcg_score([true_relevance], [scores])
                        
                        # Only save results for examples for which a context could be found
                        if correct_answer != "":
                            retriever_results.append({
                                "model": retriever_id,
                                "ndcg": ndcg,
                                "metric": distance_metric,
                                "hybrid": is_hybrid
                            })
                        
                        
                        # caching to give generator best possible context
                        if ndcg > best_ndcg:
                            best_ndcg = ndcg
                            best_contexts = contexts

                    # concatenate list of contexts to one string
                    context_input = "\n\n".join(best_contexts)
                    
                    # generate an answer
                    answer = causal_lm(query, context_input)

                    causal_lm_results.append(
                        {
                            "model": causal_id,
                            "question": query,
                            "answer": answer,
                            "context": context_input,
                            "correct_answer": correct_answer if correct_answer != "" else "Not answerable from the given context."
                        }
                    )
                    pbar.update(1)
                del db
                torch.cuda.empty_cache()
                pd.DataFrame(causal_lm_results).to_csv(f"./results/{causal_id.replace('/', '-')}_({retriever_id.replace('/', '-')})_{is_hybrid}.csv")
    del causal_lm
    torch.cuda.empty_cache()

In [None]:
pd.DataFrame(retriever_results).to_csv("retriever_results_v2.csv")

# Evaluation

In [12]:
import pandas as pd
import numpy as np

retriever_results = pd.read_csv("retriever_results_v2.csv", index_col=0)

## Retriever

In [13]:
retriever_results.groupby(['model', 'metric', 'hybrid']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ndcg
model,metric,hybrid,Unnamed: 3_level_1
BAAI/bge-base-en-v1.5,cosine,no,0.855504
BAAI/bge-base-en-v1.5,cosine,yes,0.869621
BAAI/bge-base-en-v1.5,ip,no,0.855504
BAAI/bge-base-en-v1.5,ip,yes,0.869621
BAAI/bge-base-en-v1.5,l2,no,0.855504
BAAI/bge-base-en-v1.5,l2,yes,0.869621
BAAI/bge-m3,cosine,no,0.795663
BAAI/bge-m3,cosine,yes,0.859185
BAAI/bge-m3,ip,no,0.795663
BAAI/bge-m3,ip,yes,0.859185


# Decoder

In [14]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=False)

In [16]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

results_path = Path("./results/")

result_rows = []

for file in results_path.glob("*.csv"):
    df = pd.read_csv(file, index_col=0)
    model = df.model.values[0]
    model_result = {
        "bleu": [],
        "rouge_1_precision": [],
        "rouge_1_recall": [],
        "rouge_1_fmeasure": [],
        "rouge_L_precision": [],
        "rouge_L_recall": [],
        "rouge_L_fmeasure": [],
    }
    for i, (model_id, question, answer, context, correct_answer) in df.iterrows():
        bleu = sentence_bleu(
            references=correct_answer,
            hypothesis=answer
        )
    
        scores = scorer.score(correct_answer, answer)
        precision, recall, fmeasure = scores["rouge1"]
        precision_L, recall_L, fmeasure_L = scores["rougeL"]
        
        model_result["bleu"].append(bleu)
        model_result["rouge_1_precision"].append(precision)
        model_result["rouge_1_recall"].append(recall)
        model_result["rouge_1_fmeasure"].append(fmeasure)
        model_result["rouge_L_precision"].append(precision_L)
        model_result["rouge_L_recall"].append(recall_L)
        model_result["rouge_L_fmeasure"].append(fmeasure_L)
        
    result_rows.append(
        {
            "model": model,
            "bleu": np.mean(model_result["bleu"]),
            "rouge_1_precision": np.mean(model_result["rouge_1_precision"]),
            "rouge_1_recall": np.mean(model_result["rouge_1_recall"]),
            "rouge_1_fmeasure": np.mean(model_result["rouge_1_fmeasure"]),
            "rouge_L_precision": np.mean(model_result["rouge_L_precision"]),
            "rouge_L_recall": np.mean(model_result["rouge_L_recall"]),
            "rouge_L_fmeasure": np.mean(model_result["rouge_L_fmeasure"])
        }
    )
pd.DataFrame(result_rows)

Unnamed: 0,model,bleu,rouge_1_precision,rouge_1_recall,rouge_1_fmeasure,rouge_L_precision,rouge_L_recall,rouge_L_fmeasure
0,google/gemma-2b-it,9.45266e-232,0.090218,0.394386,0.133061,0.080232,0.364439,0.118982
1,mistralai/Mistral-7B-Instruct-v0.2,8.661046e-232,0.067992,0.510451,0.107381,0.063086,0.476818,0.099192
2,HuggingFaceH4/zephyr-7b-beta,7.555362e-232,0.040825,0.559209,0.071184,0.037134,0.526515,0.064916
3,google/gemma-7b-it,1.036681e-231,0.000559,0.00125,0.000444,0.000559,0.00125,0.000444
4,mistralai/Mixtral-8x7B-Instruct-v0.1,9.063576e-232,0.076472,0.522214,0.121311,0.070003,0.483617,0.110831
5,HuggingFaceH4/zephyr-7b-gemma-v0.1,7.211556e-232,0.028043,0.474737,0.049678,0.026126,0.453578,0.04642


# Judging LLM-as-a-Judge

Results can be found in `llm-as-a-judge.ipynb`.