In [None]:
import nltk
nltk.download('punkt_tab')
import os
import torch


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:

from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load question encoder
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Load context encoder
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")


  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if y

In [6]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [9]:
import faiss
import numpy as np

def encode_contexts(context):
    ctx_vectors, titles, texts = [], [], []
    for title, sentences in context:
        text = " ".join(sentences)
        inputs = ctx_tokenizer(title, text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            emb = ctx_encoder(**inputs).pooler_output.squeeze().cpu().numpy()
        ctx_vectors.append(emb)
        titles.append(title)
        texts.append(text)
    return np.array(ctx_vectors), titles, texts

def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # inner product for DPR
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    return index


In [10]:
def retrieve_dpr(question, context, k=5):
    ctx_vecs, titles, texts = encode_contexts(context)
    index = build_faiss_index(ctx_vecs)

    q_inputs = q_tokenizer(question, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        q_vec = q_encoder(**q_inputs).pooler_output.cpu().numpy()
    faiss.normalize_L2(q_vec)

    D, I = index.search(q_vec, k)
    return [(titles[i], texts[i]) for i in I[0]]


In [11]:
def generate_answer(question, retrieved_docs, max_len=512):
    context_text = " ".join([text for _, text in retrieved_docs])
    prompt = f"question: {question} context: {context_text}"

    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_len, truncation=True).to(device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [12]:
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer

def compute_f1(pred, gold):
    pred_tokens = pred.lower().split()
    gold_tokens = gold.lower().split()
    common = set(pred_tokens) & set(gold_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def compute_mrr(pred_titles, gold_titles):
    for rank, title in enumerate(pred_titles, start=1):
        if title in gold_titles:
            return 1 / rank
    return 0.0


In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load FLAN-T5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)


In [18]:
import json
from tqdm.notebook import tqdm
with open("data/dev.json") as f:
    data = json.load(f)

retrieval_f1s, retrieval_mrrs = [], []
gen_f1s, rouge_ls = [], []

for item in tqdm(data):  # subset for quick test
    q = item["question"]
    context = item["context"]
    gold_titles = [t for t, _ in item["supporting_facts"]]
    gold_answer = item["answer"]

    top_docs = retrieve_dpr(q, context, k=5)
    pred_titles = [title for title, _ in top_docs]
    
    # Retrieval evaluation
    retrieval_f1s.append(compute_f1(" ".join(pred_titles), " ".join(gold_titles)))
    retrieval_mrrs.append(compute_mrr(pred_titles, gold_titles))
    
    # Generation
    gen_ans = generate_answer(q, top_docs)
    gen_f1s.append(compute_f1(gen_ans, gold_answer))
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_ls.append(rouge.score(gen_ans, gold_answer)["rougeL"].fmeasure)



  0%|          | 0/12576 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [20]:
print(f"DPR Retrieval F1: {sum(retrieval_f1s)/len(retrieval_f1s):.4f}")
print(f"DPR Retrieval MRR: {sum(retrieval_mrrs)/len(retrieval_mrrs):.4f}")
print(f"Generation F1: {sum(gen_f1s)/len(gen_f1s):.4f}")
print(f"Generation ROUGE-L: {sum(rouge_ls)/len(rouge_ls):.4f}")

DPR Retrieval F1: 0.5340
DPR Retrieval MRR: 0.9113
Generation F1: 0.3723
Generation ROUGE-L: 0.3926
