In [1]:
# Imports
import os
import random
import re
from collections import Counter
from typing import cast
from datasets import load_dataset, Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from pydantic import Field, conlist, create_model
import torch
import string
from dotenv import load_dotenv


In [2]:
# Load or set environment variables
# Either call load_dotenv() (as done here) or set the key by hand with the
# commented line below -- never commit a real key to the notebook.
# os.environ["OPENAI_API_KEY"] = ""
load_dotenv()

True

In [None]:
# Config
CHUNK_SIZE = 1000  # chunk_size passed to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 200  # chunk_overlap passed to RecursiveCharacterTextSplitter
K_RETRIEVE = 5  # k passed to similarity_search for every sub-question
MAX_HOPS = 3 # realistically only 2 hops needed for these datasets
MAX_DOCS_PER_HOP = 3  # passages kept per sub-question when building the final prompt
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"  # model_name given to HuggingFaceEmbeddings
CHAT_MODEL = "gpt-4o-mini"  # model given to ChatOpenAI

# Check for GPU
# `device` is forwarded to the embedding model via model_kwargs
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cpu


In [4]:
# Load HotpotQA (distractor config): TRAIN feeds the retrieval corpus,
# VALIDATION is held out for evaluation.
print("Loading HotpotQA...")
# cast(...) only narrows the static type to Dataset so pylance stops complaining
ds_train, ds_val = (
    cast(Dataset, load_dataset("hotpot_qa", "distractor", split=split_name, streaming=False))
    for split_name in ("train", "validation")
)

print(f"Train size: {len(ds_train)}")
print(f"Validation size: {len(ds_val)}")


Loading HotpotQA...
Train size: 90447
Validation size: 7405


In [5]:
# Build a corpus from TRAIN context only: one row per (title, paragraph),
# where a paragraph is the space-joined sentence list of that title.
corpus_rows = [
    {"title": title, "text": " ".join(sents)}
    for example in ds_train
    for title, sents in zip(example["context"]["title"], example["context"]["sentences"])
]

# Remove duplicates, keeping the first occurrence of each paragraph.
# The comparison key ignores case and whitespace runs; the stored text is left untouched.
unique_seen = set()
unique_rows = []
for row in corpus_rows:
    key = (row["title"], re.sub(r"\s+", " ", row["text"]).strip().lower())
    if key in unique_seen:
        continue
    unique_seen.add(key)
    unique_rows.append({"title": row["title"], "text": row["text"]})

corpus_rows = unique_rows
print(f"Paragraphs: {len(corpus_rows)}")


Paragraphs: 482021


In [6]:
# Split every paragraph into chunks with RecursiveCharacterTextSplitter.
# `texts` and `metas` stay index-aligned: metas[i] holds the title of texts[i].
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

texts = []
metas = []
for paragraph in corpus_rows:
    for chunk in text_splitter.split_text(paragraph['text']):
        texts.append(chunk)
        metas.append({"title": paragraph['title']})

print(f"Chunks indexed: {len(texts)}")


Chunks indexed: 505435


In [7]:
# Build or load the FAISS vector store
# TODO: move this cell and the corpus/chunking code before it into a separate script for reuse
FAISS_INDEX_DIR = "faiss_hotpotqa"  # single place that names the on-disk index location

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": device},  # Use GPU if available
    encode_kwargs={"normalize_embeddings": True},
    # show_progress=True
)

if os.path.exists(FAISS_INDEX_DIR):
    print(f"Loading existing FAISS vector store from {FAISS_INDEX_DIR}...")
    # SECURITY: allow_dangerous_deserialization=True opts in to pickle-based loading,
    # which can execute arbitrary code. Only load an index directory that this
    # notebook (or another trusted source) produced.
    vector_store = FAISS.load_local(
        FAISS_INDEX_DIR,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
else:
    print("Creating new FAISS vector store...")
    # Embeds every chunk in `texts`; metas[i] is attached to texts[i]
    vector_store = FAISS.from_texts(
        texts,
        embedding_model,
        metadatas=metas,
    )

    # Save vector store to disk so later runs take the load branch above
    vector_store.save_local(FAISS_INDEX_DIR)
    print(f"FAISS vector store saved to {FAISS_INDEX_DIR}")

Loading existing FAISS vector store from faiss_hotpotqa...


In [51]:
# LLM question decomposition
# Build a structured-output LLM that enforces 2..max_subqs sub-questions
def make_decomposer(llm, max_subqs=MAX_HOPS):
    """Create a function that splits a complex question into ordered sub-questions.

    Args:
        llm: Chat model exposing ``bind_tools`` (a ``ChatOpenAI`` instance in this notebook).
        max_subqs: Upper bound on the number of sub-questions; the lower bound is fixed at 2.

    Returns:
        A callable ``decompose(question)`` that returns a list of sub-question strings.

    Raises:
        ValueError: If ``max_subqs`` is below 2 (the schema requires at least 2 items).
    """
    if max_subqs < 2:
        raise ValueError(f"max_subqs must be at least 2, got {max_subqs}")

    # Dynamic schema so max_subqs can be chosen at runtime
    DecompSchema = create_model(
        "DecompSchema",
        subquestions=(
            conlist(str, min_length=2, max_length=max_subqs),
            Field(description="Ordered sub-questions to solve the original in sequence."),
        ),
    )

    # Use strict structured output (no heuristics, no fallback)
    structured_llm = llm.bind_tools(
        tools=[],                     # no tools needed; we just want the schema
        response_format=DecompSchema, # pydantic schema
        strict=True,
    )

    SYSTEM = "You have to break complex questions into concise, sequential sub-questions."
    # Only the first literal is an f-string; "{q}" in the last literal is filled in
    # later by str.format. Every instruction ends with a newline so the bullet list
    # reaches the model as separate lines.
    USER_TMPL = (
        f"- Produce BETWEEN 2 and {max_subqs} sub-questions that will help answer the main question.\n"
        "- Each sub-question MUST be under 18 words.\n"
        "- EVERY sub-question MUST be SELF-CONTAINED and independently retrievable.\n"
        "- NO pronouns like this/that/these/they/she/he/it/there; repeat names explicitly.\n"
        "- Preserve ALL important details from the original question:\n"
        "  • entities (people, works, places)\n"
        "  • attributes (nationality, birthplace, year, etc.)\n"
        "  • relationships (comparisons, cause/effect, part-whole, join)\n"
        "  • qualifiers or facets (genre, time, style, perspective, etc.)\n"
        "- Order the list so answering in order solves the original question.\n"
        "- No extra keys. No commentary. No markdown.\n\n"
        "QUESTION: {q}"
    )

    def decompose(question):
        """Return the ordered sub-questions the LLM produced for ``question``."""
        msgs = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": USER_TMPL.format(q=question)},
        ]
        resp = structured_llm.invoke(msgs)
        # The parsed DecompSchema instance (not a dict) is read from additional_kwargs
        parsed = resp.additional_kwargs.get("parsed")
        if parsed is None:
            # presumably a refusal or an unparsable completion -- surface it explicitly
            raise ValueError(
                f"No structured decomposition returned for question: {question!r} "
                f"(refusal: {resp.additional_kwargs.get('refusal')!r})"
            )
        return list(parsed.subquestions)

    return decompose

In [52]:
# Multi-hop QA Pipeline
# One chat model (temperature=0) is shared by the decomposer and the final answering call
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)
# decomposer(question) returns the sub-questions; MAX_HOPS is passed as max_subqs
decomposer = make_decomposer(llm, MAX_HOPS)

def build_user_prompt(question, passages, subquestions):
    """Assemble the user message for the final answering call.

    The message has three sections separated by blank lines: the numbered
    passages under "Context:", the numbered sub-questions under
    "SUB-QUESTIONS:", and the original question followed by "ANSWER:".
    Numbering starts at 1 in both lists.
    """
    passage_blocks = [f"PASSAGE {n}:\n{text}" for n, text in enumerate(passages, start=1)]
    numbered_subqs = [f"{n}. {subq}" for n, subq in enumerate(subquestions, start=1)]
    sections = [
        "Context:\n" + "\n\n".join(passage_blocks),
        "SUB-QUESTIONS:\n" + "\n".join(numbered_subqs),
        f"QUESTION: {question}\nANSWER:",
    ]
    return "\n\n".join(sections)

def multi_hop_qa(question, max_docs_per_hop=MAX_DOCS_PER_HOP, k=K_RETRIEVE, verbose=False):
    """Answer ``question`` by decomposing it, retrieving per sub-question, then prompting the LLM.

    Args:
        question: The original (multi-hop) question.
        max_docs_per_hop: How many of the retrieved passages to keep per sub-question.
        k: Number of passages requested from the vector store per sub-question.
        verbose: When True, print each sub-question and all passages retrieved for it.

    Returns:
        A ``(pred, unique_passages)`` tuple: the LLM's answer content and the
        de-duplicated passages that were placed in the prompt.
    """
    subquestions = decomposer(question)

    all_passages = []
    for subq in subquestions:
        docs = vector_store.similarity_search(subq, k=k)
        passages = [d.page_content for d in docs]
        if verbose:
            print(f"Sub-question: {subq}")
            print("Retrieved passages:")
            for i, p in enumerate(passages):
                print(f"Passage {i+1}:\n{p}\n")
        # Keep only the top max_docs_per_hop passages from this hop
        all_passages.extend(passages[:max_docs_per_hop])

    # Remove duplicate passages; dict.fromkeys keeps first-seen order
    unique_passages = list(dict.fromkeys(all_passages))

    # Build final prompt
    SYSTEM_PROMPT = ("You are a precise QA assistant. You are given sub-questions to help guide you through your thought process. Using the context, return just the short answer phrase with no explanation, and no full sentences.")
    user_prompt = build_user_prompt(question, unique_passages, subquestions)
    resp = llm.invoke([{"role":"system","content": SYSTEM_PROMPT},
                       {"role":"user","content": user_prompt}])
    pred = resp.content
    return pred, unique_passages


In [50]:
# Smoke test: run the whole pipeline on one hand-picked question
query = "What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?"
pred, passages = multi_hop_qa(query, MAX_DOCS_PER_HOP, k=K_RETRIEVE)
print(f"Question: {query}")
print(f"Predicted Answer: {pred}")
print("Retrieved Passages:")
for number, passage in enumerate(passages, start=1):
    print(f"Passage {number}:\n{passage}\n")

Sub-question: What is the title of the science fantasy young adult series told in first person?
Retrieved passages:
Passage 1:
Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant, writing together under the name K. A. Applegate, and published by Scholastic.  It is told in first person, with all six main characters taking turns narrating the books through their own perspectives.  Horror, war, dehumanization, sanity, morality, innocence, leadership, freedom and growing up are the core themes of the series.

Passage 2:
The Mortal Instruments is a series of six young adult fantasy novels written by Cassandra Clare, the last of which was published May 27, 2014.  "The Mortal Instruments" is chronologically the third series of a proposed five in "The Shadowhunter Chronicles" but it was the first one published.  It follows Clary Fray (who interacts with a group of nephilim known as Shadowhunters) while also discovering her own

In [53]:
# EM/F1 evaluation
def normalize_answer(s):
    """Canonicalize an answer string for EM/F1 comparison.

    Steps, in order: lowercase, delete every ``string.punctuation`` character,
    replace the standalone words a/an/the with a space, then collapse all
    whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and the ground truth.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace. Returns 0 when there is no token overlap, or when either side
    is a yes/no answer and the two normalized strings differ.
    """
    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)

    # yes/no answers are all-or-nothing: no partial credit on a mismatch
    binary_answers = ('yes', 'no')
    if pred_norm != gold_norm and (pred_norm in binary_answers or gold_norm in binary_answers):
        return 0

    pred_tokens = pred_norm.split()
    gold_tokens = gold_norm.split()
    # Multiset intersection counts each shared token at most min(count) times
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return 1.0 if both strings are equal after ``normalize_answer``, else 0.0."""
    is_match = normalize_answer(prediction) == normalize_answer(ground_truth)
    return float(is_match)

In [54]:
def eval(ds, n, k=K_RETRIEVE):
    """Score the multi-hop pipeline on the first ``n`` examples of ``ds``.

    NOTE(review): this name shadows Python's builtin ``eval`` for the rest of
    the notebook; it is kept because the run cell calls it by this name.

    Args:
        ds: Indexable dataset whose rows expose "question" and "answer".
        n: Number of leading examples to evaluate (capped at ``len(ds)``).
        k: Passages retrieved per sub-question, forwarded to ``multi_hop_qa``.

    Returns:
        Dict with the example count "n", "k", and the mean "EM" and "F1".
    """
    indices = range(min(n, len(ds)))  # first n examples
    em_scores = []
    f1_scores = []

    for idx in indices:
        example = ds[idx]
        question = example["question"]
        gold_answer = example["answer"]

        # Prediction from the multi-hop system; the retrieved passages are not needed here
        prediction, _ = multi_hop_qa(question, MAX_DOCS_PER_HOP, k=k)
        print(f"Q: {question}")
        print(f"Pred: {prediction}")
        print(f"Ground Truth: {gold_answer}")

        em_scores.append(exact_match_score(prediction, gold_answer))
        f1_scores.append(f1_score(prediction, gold_answer))

    # Divide by 1 when nothing was evaluated so the means come out as 0
    denominator = len(indices) or 1
    return {
        "n": len(indices),
        "k": k,
        "EM": sum(em_scores) / denominator,
        "F1": sum(f1_scores) / denominator,
    }

# Run eval on the first 100 validation examples and print the aggregate metrics dict
metrics = eval(ds_val, 100, k=K_RETRIEVE) # TODO: replace 100 with len(ds_val) for the full evaluation
print("Metrics:", metrics)

Q: Were Scott Derrickson and Ed Wood of the same nationality?
Pred: Yes
Ground Truth: yes
Q: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Pred: Information and Broadcasting and Textiles Minister
Ground Truth: Chief of Protocol
Q: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Pred: Animorphs
Ground Truth: Animorphs
Q: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
Pred: No
Ground Truth: no
Q: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
Pred: Not specified
Ground Truth: Greenwich Village, New York City
Q: 2014 S/S is the debut album of a South Korean boy group that was formed by who?
Pred: JYP Entertainment
Ground Truth: YG Entertainment
Q: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?
Pred: No 