In [1]:
# Imports
import os
import random
import re
from collections import Counter
from typing import cast
from datasets import load_dataset, Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
import torch
import string
from dotenv import load_dotenv


In [2]:
# Load environment variables from a local .env file (if one exists).
# Alternative: set the key directly in the process environment instead, e.g.
# os.environ["OPENAI_API_KEY"] = ""
# NOTE: keep real keys out of the notebook itself; the bare call below is the
# cell's last expression, so its return value is what the cell displays.
load_dotenv()

True

In [3]:
# Tunable settings for the whole notebook (edit here, then re-run downstream cells)
CHUNK_SIZE = 1000        # chunk_size handed to the text splitter
CHUNK_OVERLAP = 200      # chunk_overlap handed to the text splitter
K_RETRIEVE = 5           # default number of passages retrieved per question
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
CHAT_MODEL = "gpt-4o-mini"

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Using device: cpu


In [12]:
# Load HotpotQA (fullwiki config): the TRAIN split feeds the corpus, the
# VALIDATION split is used for evaluation.
print("Loading HotpotQA...")
# Each result is cast to Dataset to avoid a pylance error on the return type.
ds_train, ds_val = (
    cast(Dataset, load_dataset("hotpot_qa", "fullwiki", split=split_name, streaming=False))
    for split_name in ("train", "validation")
)

print(f"Train size: {len(ds_train)}")
print(f"Validation size: {len(ds_val)}")


Loading HotpotQA...


fullwiki/train-00000-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/train-00001-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/validation-00000-of-00001.parqu(…):   0%|          | 0.00/28.0M [00:00<?, ?B/s]

fullwiki/test-00000-of-00001.parquet:   0%|          | 0.00/27.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Train size: 90447
Validation size: 7405


In [None]:
# Build a de-duplicated paragraph corpus from the TRAIN split's contexts.
#
# Each example's context holds parallel lists of titles and sentence lists; the
# sentences of one paragraph are joined with single spaces. Duplicates are
# dropped while iterating (single pass), so the full list of repeated
# paragraphs is never held in memory. The first occurrence of each paragraph
# is kept, in encounter order.
#
# NOTE(review): the corpus comes from ds_train only. Validation questions whose
# supporting paragraphs are not also present in train contexts cannot be
# answered from this index - confirm this is the intended setup.
seen_keys = set()
corpus_rows = []
for example in ds_train:
    context = example["context"]
    for title, sents in zip(context["title"], context["sentences"]):
        paragraph_text = " ".join(sents)
        # Dedup key: exact title + whitespace-collapsed, lower-cased text.
        # The stored text keeps its original casing and spacing.
        normalized_text = re.sub(r"\s+", " ", paragraph_text).strip().lower()
        key = (title, normalized_text)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        corpus_rows.append({"title": title, "text": paragraph_text})

print("Paragraphs:", len(corpus_rows))


Paragraphs: 508826


In [6]:
# Split every corpus paragraph into chunks with RecursiveCharacterTextSplitter.
# `texts` and `metas` are kept index-aligned: metas[i] carries the title of the
# paragraph that texts[i] was cut from.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

texts = []
metas = []
for paragraph in corpus_rows:
    for piece in text_splitter.split_text(paragraph['text']):
        texts.append(piece)
        metas.append({"title": paragraph['title']})

print(f"Chunks indexed: {len(texts)}")


Chunks indexed: 505435


In [20]:
# Build or load the FAISS vector store.
# TODO: move this cell and the corpus/chunking code before it into a separate
# script so the index can be reused later.
FAISS_INDEX_DIR = "faiss_hotpotqa"  # on-disk location of the cached index

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": device},  # Use GPU if available
    encode_kwargs={"normalize_embeddings": True},
    # show_progress=True
)

if os.path.exists(FAISS_INDEX_DIR):
    # NOTE: an existing index is reused as-is. If EMBEDDING_MODEL, CHUNK_SIZE or
    # CHUNK_OVERLAP were changed since it was built, delete FAISS_INDEX_DIR first;
    # otherwise queries run against a stale index.
    print(f"Loading existing FAISS vector store from {FAISS_INDEX_DIR}...")
    # SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
    # store. Only load index directories produced by this notebook; never point
    # this at files from an untrusted source.
    vector_store = FAISS.load_local(
        FAISS_INDEX_DIR,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
else:
    print("Creating new FAISS vector store...")
    vector_store = FAISS.from_texts(
        texts,
        embedding_model,
        metadatas=metas,
    )

    # Persist the index so later runs take the load branch above
    vector_store.save_local(FAISS_INDEX_DIR)
    print(f"FAISS vector store saved to {FAISS_INDEX_DIR}")

Loading existing FAISS vector store from faiss_hotpotqa...


In [None]:
# Answer with LLM + Retrieved Docs
# System instruction sent with every question: short answer phrase only.
SYSTEM_PROMPT = "You are a precise QA assistant. Return just the short answer phrase with no explanation, and no full sentences."

# Chat model client; the model name comes from the config cell.
llm = ChatOpenAI(
    model=CHAT_MODEL,
    temperature=0,
)

def build_user_prompt(question, passages):
    """Format the retrieved passages and the question into one user message.

    Args:
        question: The question text.
        passages: Iterable of passage strings; they are numbered from 1 in
            the order given.

    Returns:
        A string with one "PASSAGE <n>:" section per passage (separated by
        blank lines), followed by the question and a trailing "ANSWER:" cue.
    """
    sections = []
    for number, passage in enumerate(passages, start=1):
        sections.append(f"PASSAGE {number}:\n{passage}")
    bundle = "\n\n".join(sections)
    return f"{bundle}\n\nQUESTION: {question}\nANSWER:"

def singlehop_answer(question, k = K_RETRIEVE):
    """Answer a question with one retrieval step followed by one LLM call.

    Args:
        question: The question text; it is also used verbatim as the
            similarity-search query.
        k: Number of documents to retrieve from the vector store.

    Returns:
        A tuple (pred, passages): the content of the LLM response and the
        list of retrieved passage texts that were placed in the prompt.
    """
    retrieved = vector_store.similarity_search(question, k=k)

    # Only the page content goes into the prompt (metadata is dropped) to
    # reduce tokens.
    passages = []
    for doc in retrieved:
        passages.append(doc.page_content)

    messages = [
        {"role":"system","content": SYSTEM_PROMPT},
        {"role":"user","content": build_user_prompt(question, passages)},
    ]
    response = llm.invoke(messages)
    return response.content, passages

In [22]:
# Smoke test: run the single-hop pipeline on one question and show what was retrieved
query = "The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?"
pred, passages = singlehop_answer(query, k=K_RETRIEVE)

print(f"Question: {query}")
print(f"Predicted Answer: {pred}")
print("Retrieved Passages:")
for number, passage in enumerate(passages, start=1):
    print(f"Passage {number}:\n{passage}\n")

Question: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
Predicted Answer: Not applicable.
Retrieved Passages:
Passage 1:
Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society.  Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s.  The film had its world premiere at the Virginia Film Festival on November 6, 2014.

Passage 2:
Little Manhattan is a 2005 American romantic comedy film directed and written by husband and wife Mark Levin and Jennifer Flackett.  Though Levin is credited as the director and Flackett as the writer, in the film's DVD commentary the two reveal that they collaborated on both tasks.  "Little Manhattan" depicts the story of ten-year-old Gabe's realization that girls can be pretty and nice to be with.  The story t

In [10]:
# EM/F1 evaluation
def normalize_answer(s):
    """Normalize an answer string for EM/F1 comparison.

    Steps, in order: lower-case, delete every character found in
    string.punctuation, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces and
    trim the ends.

    Args:
        s: The raw answer string.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = s.lower()
    # Deleting via a translation table removes exactly the string.punctuation characters
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted answer and the ground truth.

    Both strings are passed through normalize_answer and split on
    whitespace; overlap is counted as a multiset intersection of tokens.

    Args:
        prediction: The predicted answer string.
        ground_truth: The reference answer string.

    Returns:
        0 when either side normalizes to "yes"/"no" and the two normalized
        strings differ, or when no tokens overlap; otherwise the harmonic
        mean of token precision and recall.
    """
    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)

    # yes/no answers get no partial credit: they must match exactly
    is_yes_no = pred_norm in ('yes', 'no') or gold_norm in ('yes', 'no')
    if is_yes_no and pred_norm != gold_norm:
        return 0

    pred_tokens = pred_norm.split()
    gold_tokens = gold_norm.split()
    overlap = Counter(pred_tokens) & Counter(gold_tokens)
    shared = sum(overlap.values())
    if not shared:
        # Also covers an empty prediction or ground truth, so no division by zero below
        return 0

    precision = shared / len(pred_tokens)
    recall = shared / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return 1.0 if both answers are equal after normalize_answer, else 0.0."""
    if normalize_answer(prediction) == normalize_answer(ground_truth):
        return 1.0
    return 0.0

In [23]:
def eval(ds, n, k=K_RETRIEVE):
    """Score the single-hop system on the first `n` examples of `ds`.

    NOTE: this function's name shadows Python's builtin eval() for the rest
    of the notebook.

    Args:
        ds: Indexable dataset whose items have "question" and "answer" keys.
        n: Maximum number of examples to evaluate (capped at len(ds)).
        k: Number of passages to retrieve per question.

    Returns:
        A dict with "n" (examples evaluated), "k", and the mean "EM" and
        "F1" over those examples (both 0.0 when nothing was evaluated).
    """
    indices = range(min(n, len(ds)))  # first n examples
    em_scores = []
    f1_scores = []

    for idx in indices:
        example = ds[idx]
        question = example["question"]
        ground_truth = example["answer"]

        # Prediction from the single-hop system; the retrieved passages are not needed here
        pred, _ = singlehop_answer(question, k=k)
        print(f"Q: {question}")
        print(f"Pred: {pred}")
        print(f"Ground Truth: {ground_truth}")

        em_scores.append(exact_match_score(pred, ground_truth))
        f1_scores.append(f1_score(pred, ground_truth))

    evaluated = len(indices)
    denominator = evaluated or 1  # avoid dividing by zero when nothing was evaluated
    return {
        "n": evaluated,
        "k": k,
        "EM": sum(em_scores)/denominator,
        "F1": sum(f1_scores)/denominator,
    }

# Run the evaluation on a small prefix of the validation split
# TODO: change the example count to the ds_val size for the full eval later
metrics = eval(
    ds_val,
    100,
    k=K_RETRIEVE,
)
print(f"Metrics: {metrics}")

Q: Were Scott Derrickson and Ed Wood of the same nationality?
Pred: Yes.
Ground Truth: yes
Q: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Pred: None
Ground Truth: Chief of Protocol
Q: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Pred: Animorphs
Ground Truth: Animorphs
Q: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
Pred: No.
Ground Truth: no
Q: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
Pred: Not applicable
Ground Truth: Greenwich Village, New York City
Q: 2014 S/S is the debut album of a South Korean boy group that was formed by who?
Pred: YG Entertainment
Ground Truth: YG Entertainment
Q: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?
Pred: Eenasul Fateh
Ground Truth: Eenasul Fateh
Q: Th