In [1]:
!pip install rank_bm25 nltk rouge-score tqdm --quiet

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [1]:
import json
import os
import nltk
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import numpy as np
from tqdm.notebook import tqdm
import torch
nltk.download('punkt_tab')
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:

with open("data/dev.json") as f:
    data = json.load(f)
    

data[0]


{'_id': '8813f87c0bdd11eba7f7acde48001122',
 'type': 'compositional',
 'question': 'Who is the mother of the director of film Polish-Russian War (Film)?',
 'context': [['Maheen Khan',
   ['Maheen Khan is a Pakistani fashion and costume designer, also an award winner fashion designer for fashion labels like" The Embroidery HouseMaheen" and" Gulabo".',
    'She has done many national and international fashion events and shows.',
    'She undertook embroidery for the film Snow White and the Huntsman and television series',
    'The Jewel in the Crown.']],
  ['Viktor Yeliseyev',
   ['Viktor Petrovich Yeliseyev( born June 9, 1950) is a Russian general, orchestra conductor and music teacher.',
    'He is the director of the Ministry of the Interior Ensemble, one of the two Russian Red Army Choirs.']],
  ['Alice Washburn',
   ['Alice Washburn( 1860- 1929) was an American stage and film actress.',
    'She worked at the Edison, Vitagraph and Kalem studios.',
    'Her final film Snow White was 

In [3]:
def prepare_corpus(context):
    corpus = []
    doc_map = []
    for title, sentences in context:
        doc_text = ' '.join(sentences)
        corpus.append(word_tokenize(doc_text.lower()))
        doc_map.append((title, doc_text))
    return corpus, doc_map

def retrieve_top_k(question, context, k=5):
    corpus_tokens, doc_map = prepare_corpus(context)
    bm25 = BM25Okapi(corpus_tokens)
    question_tokens = word_tokenize(question.lower())
    scores = bm25.get_scores(question_tokens)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [doc_map[i] for i in top_k_indices]


In [4]:
def compute_f1(pred_titles, gold_titles):
    pred_set = set(pred_titles)
    gold_set = set(gold_titles)
    if not pred_set and not gold_set:
        return 1.0
    if not pred_set or not gold_set:
        return 0.0
    tp = len(pred_set & gold_set)
    precision = tp / len(pred_set) if pred_set else 0
    recall = tp / len(gold_set) if gold_set else 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def compute_mrr(pred_titles, gold_titles):
    for rank, title in enumerate(pred_titles, start=1):
        if title in gold_titles:
            return 1.0 / rank
    return 0.0


In [5]:
import json
import torch
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load FLAN-T5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)


In [6]:
f1s, mrrs, rouge_scores, f1_scores = [], [], [], []
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def generate_answer(question, retrieved_docs, max_len=512):
    context_text = " ".join([doc for  _, doc in retrieved_docs])
    prompt = f"question: {question} context: {context_text}"

    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_len, truncation=True).to(device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

for item in tqdm(data):
    question = item["question"]
    context = item["context"]
    gold_titles = [title for title, _ in item["supporting_facts"]]
    gold_answer = item["answer"]

    top_docs = retrieve_top_k(question, context, k=5)
    pred_titles = [title for title, _ in top_docs]
    pred_answer= generate_answer(question, top_docs)
    f1s.append(compute_f1(pred_titles, gold_titles))
    mrrs.append(compute_mrr(pred_titles, gold_titles))
    f1_scores.append(compute_f1(pred_answer, gold_answer))
    rouge_scores.append(rouge.score(gold_answer,pred_answer)["rougeL"].fmeasure)


  0%|          | 0/12576 [00:00<?, ?it/s]

In [8]:
print(f"Retrieval F1: {np.mean(f1s):.4f}")
print(f"Retrieval MRR: {np.mean(mrrs):.4f}")
print(f"Generation F1: {sum(f1_scores)/len(f1_scores):.4f}")
print(f"Generation ROUGE-L: {sum(rouge_scores)/len(rouge_scores):.4f}")
# print(f"Generation ROUGE-L (dummy): {np.mean(rouge_scores):.4f}")


Retrieval F1: 0.4841
Retrieval MRR: 0.8312
Generation F1: 0.5669
Generation ROUGE-L: 0.3340


In [27]:
def prepare_corpus(context):
    doc_texts, corpus_tokens, doc_map = [], [], []
    for title, sentences in context:
        text = " ".join(sentences)
        doc_texts.append(text)
        corpus_tokens.append(word_tokenize(text.lower()))
        doc_map.append(title)
    return corpus_tokens, doc_texts, doc_map

def retrieve_top_k(question, context, k=5):
    corpus_tokens, doc_texts, doc_titles = prepare_corpus(context)
    bm25 = BM25Okapi(corpus_tokens)
    q_tokens = word_tokenize(question.lower())
    scores = bm25.get_scores(q_tokens)
    ranked = sorted(zip(scores, doc_titles, doc_texts), reverse=True)
    return ranked[:k]


In [28]:
def generate_answer(question, retrieved_docs, max_len=512):
    context_text = " ".join([doc for _, _, doc in retrieved_docs])
    prompt = f"question: {question} context: {context_text}"

    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_len, truncation=True).to(device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [29]:
def compute_f1(pred, gold):
    pred_tokens = pred.lower().split()
    gold_tokens = gold.lower().split()
    common = set(pred_tokens) & set(gold_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


In [30]:
with open("data/train.json") as f:
    data = json.load(f)

f1_scores = []
rouge_scores = []

for item in data[:20]:  # Limit for speed
    question = item["question"]
    context = item["context"]
    gold_answer = item["answer"]

    top_docs = retrieve_top_k(question, context, k=5)
    pred_answer = generate_answer(question, top_docs)

    f1_scores.append(compute_f1(pred_answer, gold_answer))
    rougeL = rouge.score(pred_answer, gold_answer)["rougeL"].fmeasure
    rouge_scores.append(rougeL)
print(f"Retrieval F1: {np.mean(f1s):.4f}")
print(f"Retrieval MRR: {np.mean(mrrs):.4f}")
print(f"Generation F1: {sum(f1_scores)/len(f1_scores):.4f}")
print(f"Generation ROUGE-L: {sum(rouge_scores)/len(rouge_scores):.4f}")


 Generation F1: 0.2300
Generation ROUGE-L: 0.2333
