In [None]:
# Imports
import os
import random
from typing import cast
from datasets import load_dataset, Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
import torch
import string
from dotenv import load_dotenv


In [3]:
# Load or set environment variables
# os.environ["OPENAI_API_KEY"] = ""
load_dotenv()

False

In [4]:
# Config (tweak these)
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
K_RETRIEVE = 5
N_EVAL = 5
SENTENCE_TRANSFORMER_MODEL = "all-mpnet-base-v2"
CHAT_MODEL = "gpt-4o-mini"


# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [5]:
# Load HotpotQA: use TRAIN for corpus, VALIDATION for evaluation
print("Loading HotpotQA...")
ds_train = cast(Dataset, load_dataset("hotpot_qa", "distractor", split="train", streaming=False)) # cast to Dataset to avoid pylance error
ds_val = cast(Dataset, load_dataset("hotpot_qa", "distractor", split="validation", streaming=False))

print("Train size:", len(ds_train))
print("Validation size:", len(ds_val))


Loading HotpotQA...
Train size: 90447
Validation size: 7405


In [6]:
# Build a corpus from TRAIN context only
corpus_rows = []
for example in ds_train:
    titles = example["context"]["title"]
    sentences_lists = example["context"]["sentences"]
    for title, sents in zip(titles, sentences_lists):
        paragraph_text = " ".join(sents)
        corpus_rows.append({"title": title, "text": paragraph_text})

print("Paragraphs:", len(corpus_rows))


Paragraphs: 899667


In [7]:
# Chunk with RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

texts, metas = [], []
for r in corpus_rows:
    chunks = text_splitter.split_text(r['text'])
    texts.extend(chunks)
    metas.extend([{"title": r['title']} for _ in chunks])

print("Chunks indexed:", len(texts))


Chunks indexed: 979285


In [8]:
# Build FAISS vector store with 
embedding_model = HuggingFaceEmbeddings(
    model_name=SENTENCE_TRANSFORMER_MODEL, 
    model_kwargs={"device": device}, # Use GPU if available
    encode_kwargs={"normalize_embeddings": True}, 
    show_progress=True
)

vector_store = FAISS.from_texts(
    texts,
    embedding_model, 
    metadatas=metas
) 

Batches:   0%|          | 0/30603 [00:00<?, ?it/s]

In [9]:
# Let's test retrieval on 1 example
query = "Which magazine was started first Arthur's Magazine or First for Women?"
docs = vector_store.similarity_search(query, k=5)
for i, doc in enumerate(docs):
    print(f"Document {i+1}:")
    print(doc.page_content)
    print("Metadata:", doc.metadata)
    print()


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document 1:
Arthur magazine was a bi-monthly periodical that was founded in October 2002, by publisher Laris Kreslins and editor Jay Babcock.  It received favorable attention from other periodicals such as "L.A. Weekly", "Print", "Punk Planet" and "Rolling Stone".  "Arthur" featured photography and artwork from Spike Jonze, Art Spiegelman, Susannah Breslin, Gary Panter and Godspeed You!  Black Emperor.  Arthur's regular columnists included Byron Coley, Thurston Moore, Daniel Pinchbeck, Paul Cullum, Douglas Rushkoff, and T-Model Ford.
Metadata: {'title': 'Arthur (magazine)'}

Document 2:
Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.  Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.  In May 1846 it was merged into "Godey's Lady's Book".
Metadata: {'title': "Arthur's Magazine"}

Document 3:
Arthur's Magazine (1844–1846) was an American literary periodi

In [25]:
# Answer with LLM + Retrieved Docs
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

SYSTEM_PROMPT = ("You are a precise QA assistant. Return just the short answer phrase (no explanation).")

def build_user_prompt(question, passages):
    bundle = "\n\n".join([f"PASSAGE {i+1}:\n{p}" for i, p in enumerate(passages)])
    return f"{bundle}\n\nQUESTION: {question}\nANSWER:"

def singlehop_answer(question, k = K_RETRIEVE):
    docs = vector_store.similarity_search(question, k=k)
    # Keep only the page content to reduce tokens
    passages = [d.page_content for d in docs]
    user_prompt = build_user_prompt(question, passages)
    resp = llm.invoke([{"role":"system","content": SYSTEM_PROMPT},
                       {"role":"user","content": user_prompt}])
    pred = resp.content
    return pred, passages

In [26]:
# EM/F1 evaluation
_ARTICLES = {"a", "an", "the"}
_PUNCT = set(string.punctuation)

def _normalize(s: str) -> str:
    s = s.lower().strip()
    # remove punctuation
    s = "".join(ch for ch in s if ch not in _PUNCT)
    # remove articles
    tokens = [t for t in s.split() if t not in _ARTICLES]
    return " ".join(tokens)

def exact_match(pred: str, gold: str) -> float:
    return 1.0 if _normalize(pred) == _normalize(gold) else 0.0

def f1_score(pred: str, gold: str) -> float:
    pred_tokens = _normalize(pred).split()
    gold_tokens = _normalize(gold).split()
    if len(pred_tokens) == 0 and len(gold_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0
    common = {}
    for t in gold_tokens:
        common[t] = common.get(t, 0) + 1
    num_same = 0
    for t in pred_tokens:
        if common.get(t, 0) > 0:
            num_same += 1
            common[t] -= 1
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall + 1e-12)

In [None]:
def eval(ds, n, k=K_RETRIEVE):
    # idxs = list(range(min(n, len(ds)))) # first n examples
    idxs = random.sample(range(len(ds)), min(n, len(ds)))  # random n examples

    ems, f1s = [], []

    for i in idxs:
        ex = ds[i]
        q = ex["question"]
        gold_answer = ex["answer"]

        # Predictions from your singlehop system
        pred, _ = singlehop_answer(q, k=k)
        print(f"Q: {q}")
        print(f"Pred: {pred}")
        print(f"Gold: {gold_answer}")

        ems.append(exact_match(pred, gold_answer))
        f1s.append(f1_score(pred, gold_answer))

    m = len(idxs) if idxs else 1
    return {
        "n": len(idxs),
        "k": k,
        "EM": sum(ems)/m,
        "F1": sum(f1s)/m,
    }

# Run eval
metrics = eval(ds_val, N_EVAL, k=K_RETRIEVE)
print("Metrics:", metrics)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Q: Were Scott Derrickson and Ed Wood of the same nationality?
Pred: No.
Gold: yes


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Q: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Pred: None
Gold: Chief of Protocol


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Q: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Pred: The Wess'har Wars
Gold: Animorphs


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Q: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
Pred: No.
Gold: no


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Q: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
Pred: Not specified.
Gold: Greenwich Village, New York City


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Metrics: {'n': 5, 'k': 5, 'EM': 0.2, 'F1': 0.19999999999989998, 'hit@k_any': 0.0, 'hit@k_all': 0.0}
