In [32]:
# If you already installed these in 01_ingestion_retrieval, you can skip.
!pip install -q transformers sentence-transformers faiss-cpu pypdf


In [33]:
# set up project path (Colab + local)

import os
import sys

def in_colab():
    try:
        import google.colab  # type: ignore
        return True
    except ImportError:
        return False

if in_colab():
    # Clone the repo once per runtime
    if not os.path.exists("/content/MultiDocRAG"):
        !git clone https://github.com/ChengWu-Data/MultiDocRAG.git
    %cd /content/MultiDocRAG

PROJECT_ROOT = os.getcwd()
sys.path.append(PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)
print("Dir listing:", os.listdir(PROJECT_ROOT))


/content/MultiDocRAG
Project root: /content/MultiDocRAG
Dir listing: ['notebooks', 'index_store', 'src', '.gitignore', 'LICENSE', '.git', 'README.md']


In [34]:
# load retriever + index from module #1

from src.retriever import MultiDocRetriever

INDEX_DIR = os.path.join(PROJECT_ROOT, "index_store")

retriever = MultiDocRetriever(
    model_name="all-MiniLM-L6-v2",
    max_chars=800,
    overlap_chars=150,
)
retriever.load(INDEX_DIR)

print("Index loaded from:", INDEX_DIR)
print("Total chunks:", len(retriever.chunks))


[INFO] Loaded index from /content/MultiDocRAG/index_store
Index loaded from: /content/MultiDocRAG/index_store
Total chunks: 420


In [35]:
# helper functions for RAG (context + prompt)

def get_context_for_query(
    retriever: MultiDocRetriever,
    question: str,
    k: int = 6,
) -> str:
    """
    Retrieve top-k chunks and format them into a single context string.
    """
    chunks = retriever.retrieve(question, k=k)
    blocks = []
    for c in chunks:
        header = f"[{c['doc_id']} — chunk {c['chunk_id']}]"
        blocks.append(header + "\n" + c["text"])
    return "\n\n".join(blocks)


def build_rag_prompt(question: str, context: str) -> str:
    """
    Build the prompt given retrieved context + user question.
    The goal is to keep the model grounded in the provided text.
    """
    return f"""You are a teaching assistant for a graduate-level financial economics course.

You are given several excerpts from academic finance papers.
Use ONLY this context to answer the question.
If the context does not contain enough information, say:
"The context does not provide enough information to answer this fully."
Do not introduce facts that are not supported by the context.

Context:
{context}

Question:
{question}

Provide a concise answer in 1–2 short paragraphs.
"""


In [36]:
# load open-source language model (local inference, no API)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# u can switch this to another instruction model if needed!
# Some common open-source options (not all will fit on free Colab GPU):
#   - mistralai/Mistral-7B-Instruct-v0.2
#   - meta-llama/Meta-Llama-3-8B-Instruct
#   - google/gemma-2-7b-it
#   - microsoft/Phi-3-mini-4k-instruct
#   - TinyLlama/TinyLlama-1.1B-Chat-v1.0   (usually OK on Colab Pro)
#
# For now I m using a very small model so the notebook actually runs on
# the default Colab environment. Performance is not the focus here — the
# goal is just to demonstrate the RAG pipeline end-to-end.

MODEL_NAME = "sshleifer/tiny-gpt2"
print(f"Loading model: {MODEL_NAME}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
)
model.eval()


def get_max_context_len(model) -> int:
    """
    Infer the maximum context length (in tokens) of the model from its config.
    This works for GPT-2 style models and most other Hugging Face causal LMs.
    """
    cfg = model.config
    for name in [
        "max_position_embeddings",  # most Transformer models
        "n_positions",              # GPT-2 family
        "max_seq_len",
        "max_sequence_length",
        "seq_length",
    ]:
        if hasattr(cfg, name) and getattr(cfg, name) is not None:
            try:
                return int(getattr(cfg, name))
            except (TypeError, ValueError):
                pass

    # Conservative fallback if nothing is found
    return 1024


# Make sure the tokenizer has a pad token (GPT-2 does not by default).
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # Fallback: reuse unk as pad; rarely matters for small demos.
        tokenizer.add_special_tokens({"pad_token": tokenizer.unk_token})
    model.resize_token_embeddings(len(tokenizer))

max_ctx = get_max_context_len(model)
print("Model loaded.")
print("Max context length:", max_ctx)
print("Pad token id:", tokenizer.pad_token_id)
print("EOS token id:", tokenizer.eos_token_id)


Loading model: sshleifer/tiny-gpt2
Model loaded.
Max context length: 1024
Pad token id: 50256
EOS token id: 50256


In [37]:
def get_max_context_len(model) -> int:
    """
    Infer the maximum context length of the model from its config.
    This works for GPT-2 style models and most other HF causal LMs.
    """
    cfg = model.config
    for name in [
        "max_position_embeddings",  # most Transformer models
        "n_positions",              # GPT-2 family
        "max_seq_len",
        "max_sequence_length",
        "seq_length",
    ]:
        if hasattr(cfg, name) and getattr(cfg, name) is not None:
            try:
                return int(getattr(cfg, name))
            except (TypeError, ValueError):
                pass

    # Conservative fallback
    return 1024

In [38]:
from typing import Optional

def generate_from_model(
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: Optional[int] = None,
    return_only_new: bool = False,
) -> str:
    """
    Run a single forward pass of the causal LM.

    Features:
    - Respects the model's max context length.
    - Ensures input length + generated tokens never exceed that limit.
    - Supports temperature / top-p (nucleus) / top-k sampling.
    - Falls back to greedy decoding when no randomness is requested.
    """

    # Get max context length
    max_ctx = get_max_context_len(model)

    # Tokenize *without* truncation first
    enc = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    )
    input_ids = enc["input_ids"]          # shape: [1, L]
    attn_mask = enc.get("attention_mask", None)
    input_len = input_ids.shape[1]

    # If the prompt alone is too long, keep only the last (max_ctx - 1) tokens
    # so that we still have room to generate at least 1 new token.
    if input_len >= max_ctx:
        keep_len = max_ctx - 1
        if keep_len <= 0:
            keep_len = 1  # extreme edge case
        input_ids = input_ids[:, -keep_len:]
        if attn_mask is not None:
            attn_mask = attn_mask[:, -keep_len:]
        input_len = keep_len

    # Enforce: input_len + max_new_tokens <= max_ctx.
    available_for_gen = max_ctx - input_len
    if available_for_gen <= 0:
        # No room left, force at least 1 token generation.
        max_new_tokens = 1
    else:
        max_new_tokens = min(max_new_tokens, available_for_gen)

    device = model.device
    input_ids = input_ids.to(device)
    if attn_mask is not None:
        attn_mask = attn_mask.to(device)

    inputs = {"input_ids": input_ids}
    if attn_mask is not None:
        inputs["attention_mask"] = attn_mask

    # Decide whether to sample or use greedy decoding.
    use_sampling = False
    if temperature is not None and temperature > 0.0:
        use_sampling = True
    if top_p is not None and top_p < 1.0:
        use_sampling = True
    if top_k is not None and top_k > 0:
        use_sampling = True

    pad_token_id = tokenizer.pad_token_id
    eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else pad_token_id

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
    )

    if use_sampling:
        # Nucleus / top-k sampling (this is your "tup" = top-p).
        gen_kwargs.update(
            dict(
                do_sample=True,
                temperature=max(1e-5, float(temperature)),
                top_p=float(top_p) if top_p is not None else 1.0,
                top_k=int(top_k) if top_k is not None else 0,
            )
        )
    else:
        # Pure greedy decoding (no sampling).
        gen_kwargs.update(
            dict(
                do_sample=False,
            )
        )

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            **gen_kwargs,
        )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if return_only_new:
        original_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
        if full_text.startswith(original_text):
            return full_text[len(original_text):].lstrip()

    return full_text


In [39]:
# unified QA interface (baseline vs RAG)

def answer_question(
    retriever: MultiDocRetriever,
    question: str,
    mode: str = "rag",
    k: int = 6,
    temperature: float = 0.2,
) -> dict:
    """
    Unified entry point for QA.

    mode = "rag": retrieval-augmented generation with multi-document context.
    mode = "baseline": same model but no retrieved context.
    """
    if mode == "rag":
        context = get_context_for_query(retriever, question, k=k)
        prompt = build_rag_prompt(question, context)
    elif mode == "baseline":
        context = None
        prompt = f"""You are a general-purpose assistant.
Answer the question below as best as you can.
Do not assume you have access to any specific research papers.

Question:
{question}
"""
    else:
        raise ValueError(f"Unknown mode: {mode}")


    answer = generate_from_model(
        prompt=prompt,
        max_new_tokens=256,
        temperature=temperature,
        top_p=0.95,
        top_k=None,
        )


    return {
        "mode": mode,
        "question": question,
        "context": context,
        "prompt": prompt,
        "answer": answer,
    }


In [40]:
# quick demo/compare baseline vs RAG on one question

test_question = "What are the main sources of interest rate risk discussed in these papers?"

rag_result = answer_question(retriever, test_question, mode="rag", k=6, temperature=0.1)
baseline_result = answer_question(retriever, test_question, mode="baseline", temperature=0.1)

print("=== RAG ANSWER ===")
print(rag_result["answer"])

print("\n" + "=" * 80 + "\n")

print("=== BASELINE ANSWER ===")
print(baseline_result["answer"])


Token indices sequence length is longer than the specified maximum sequence length for this model (1232 > 1024). Running this sequence through the model will result in indexing errors


=== RAG ANSWER ===
 process is highly correlated with interest rate movements. This, of course, is the case for interest rate derivatives, where the underlying assets are the interest rates themselves. In response, a class of equilibrium-ba

[135 Term Structure Review ARFE 2009 (1).pdf — chunk 93]
ues. Settling these issues is essential for understanding risk management for both corporate and financial institutions. The capital structure decision for a corporation, that is, the determination of the debt equity ratio, or the determination of economic capital for a financial institution (related to the Basel II Accord), depends crucially on the evolution of the term structure of interest rates. In this determination there are four risks to be consid- ered: market, credit, liquidity, and operational risk. Interest rate risk is the major compo- nent of market risk. However, these term structure models still need to be extended to include credit, liquidity, and operational risk. Although th

In [41]:
# want to run over a small question set

questions = [
    "What are the main sources of interest rate risk discussed in these papers?",
    "How do the papers model default or credit risk?",
    "What are some key applications of term structure models to mortgages or corporate bonds?",
]

all_results = []

for q in questions:
    for mode in ["baseline", "rag"]:
        res = answer_question(retriever, q, mode=mode, k=6, temperature=0.1)
        all_results.append(res)

print(f"Collected {len(all_results)} (question, mode) pairs.")


Collected 6 (question, mode) pairs.
