In [14]:
# Notebook shell magic: install the third-party packages imported in the next
# cell (pypdf, sentence-transformers, and the CPU build of FAISS).
!pip install pypdf sentence-transformers faiss-cpu



In [15]:
import os
import json
import textwrap
from typing import List, Dict, Optional

import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import torch


class MultiDocRetriever:
    """
    Minimal but robust implementation of:
    - PDF ingestion
    - text cleaning + chunking
    - embedding computation
    - FAISS-based vector search

    Typical flow: construct, ingest with add_pdf() / add_pdfs_from_dir(),
    call build_index(), then query with retrieve(). save() / load() persist
    and restore the chunks, metadata, embeddings and FAISS index.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        max_chars: int = 800,
        overlap_chars: int = 150,
    ):
        """
        Initialize the retriever.

        Args:
            model_name: SentenceTransformer model name.
            max_chars: Max number of characters per chunk.
            overlap_chars: Overlap between consecutive chunks
                           (helps preserve context across boundaries).

        Raises:
            ValueError: If max_chars is not positive, or overlap_chars is
                negative or not strictly smaller than max_chars.
        """
        # Fail fast on a bad chunking configuration before loading the model.
        self._validate_chunk_params(max_chars, overlap_chars)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(model_name)
        self.model.to(self.device)

        self.max_chars = max_chars
        self.overlap_chars = overlap_chars

        self.chunks: List[str] = []
        self.meta: List[Dict] = []   # per-chunk metadata
        self.embeddings: Optional[np.ndarray] = None
        self.index: Optional[faiss.Index] = None

    @staticmethod
    def _validate_chunk_params(max_chars: int, overlap_chars: int) -> None:
        """
        Check that the sliding window can make forward progress.

        Raises:
            ValueError: If max_chars <= 0, or overlap_chars is outside
                the range [0, max_chars).
        """
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}.")
        if not 0 <= overlap_chars < max_chars:
            raise ValueError(
                "overlap_chars must satisfy 0 <= overlap_chars < max_chars "
                f"(got overlap_chars={overlap_chars}, max_chars={max_chars})."
            )

    # 1. PDF loading w. the text prep
    @staticmethod
    def _load_pdf_text(pdf_path: str) -> str:
        """
        Read all text from a PDF file.

        Pages whose text extraction raises are replaced by an empty string
        (best effort), and a warning is printed so the loss is visible.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Raw extracted text as a single string (pages joined by newlines).
        """
        reader = PdfReader(pdf_path)
        texts = []
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                page_text = page.extract_text() or ""
            except Exception as exc:
                # Keep ingesting the rest of the document, but do not hide
                # the fact that this page contributed no text.
                print(f"[WARN] Could not extract text from page {page_no} of {pdf_path}: {exc}")
                page_text = ""
            texts.append(page_text)
        return "\n".join(texts)

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Basic text cleaning.

        - Replace newlines with spaces.
        - Collapse multiple whitespace into a single space.
        - Strip leading/trailing whitespace.
        """
        # str.split() with no argument splits on any whitespace run
        # (including \r and \n) and drops leading/trailing whitespace.
        return " ".join(text.split())

    def _chunk_text(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks by character length.

        Consecutive windows start (max_chars - overlap_chars) characters
        apart; each window is stripped and empty windows are dropped.

        Args:
            text: Cleaned text.

        Returns:
            List of text chunks.

        Raises:
            ValueError: If the current max_chars / overlap_chars settings
                would prevent the window from advancing.
        """
        if not text:
            return []

        # Re-check here because the attributes may have been changed after
        # construction; an overlap >= max_chars would otherwise loop forever.
        self._validate_chunk_params(self.max_chars, self.overlap_chars)
        step = self.max_chars - self.overlap_chars

        chunks = []
        n = len(text)

        # Sliding window with overlap
        for start in range(0, n, step):
            end = min(start + self.max_chars, n)
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            if end == n:
                # The window reached the end of the text; later starts would
                # only repeat the tail.
                break

        return chunks

    # 2. Ingestion
    def add_pdf(self, pdf_path: str, doc_id: Optional[str] = None):
        """
        Ingest a single PDF: load, clean, chunk, and store metadata.
        Embeddings/index are NOT built here (call build_index separately).

        Args:
            pdf_path: Path to PDF.
            doc_id: Optional explicit document ID (e.g., "paper1").
                    If None, use the filename.

        Raises:
            FileNotFoundError: If pdf_path does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        if doc_id is None:
            doc_id = os.path.basename(pdf_path)

        raw_text = self._load_pdf_text(pdf_path)
        cleaned = self._clean_text(raw_text)
        chunks = self._chunk_text(cleaned)

        # global_id is the position of the chunk in self.chunks, which is
        # also the row it will occupy in the FAISS index.
        start_idx = len(self.chunks)
        for local_idx, chunk in enumerate(chunks):
            self.chunks.append(chunk)
            self.meta.append(
                {
                    "doc_id": doc_id,
                    "chunk_id": local_idx,
                    "global_id": start_idx + local_idx,
                }
            )

        print(f"[INFO] Ingested {doc_id}: {len(chunks)} chunks.")

    def add_pdfs_from_dir(self, pdf_dir: str, recursive: bool = False):
        """
        Ingest all PDFs from a directory.

        Files (and sub-directories, when recursive) are visited in sorted
        order so chunk ids are reproducible across runs and filesystems.

        Args:
            pdf_dir: Directory containing PDF files.
            recursive: If True, walk subdirectories as well.

        Raises:
            NotADirectoryError: If pdf_dir is not an existing directory.
        """
        if not os.path.isdir(pdf_dir):
            raise NotADirectoryError(f"Directory not found: {pdf_dir}")

        count_files = 0
        for root, dirs, files in os.walk(pdf_dir):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for fname in sorted(files):
                if fname.lower().endswith(".pdf"):
                    fpath = os.path.join(root, fname)
                    self.add_pdf(fpath)
                    count_files += 1
            if not recursive:
                # The first os.walk entry is pdf_dir itself; stop there.
                break

        print(f"[INFO] Finished ingesting PDFs from {pdf_dir}. Total files: {count_files}")

    # 3. Embeddings + FAISS index
    def build_index(self, show_progress: bool = True):
        """
        Compute embeddings for all chunks and build a FAISS index.

        Args:
            show_progress: Whether to show a progress bar during encoding.

        Raises:
            ValueError: If no chunks have been ingested yet.
        """
        if not self.chunks:
            raise ValueError("No chunks available. Ingest PDFs before building index.")

        print(f"[INFO] Computing embeddings for {len(self.chunks)} chunks...")
        embeddings = self.model.encode(
            self.chunks,
            batch_size=32,
            convert_to_numpy=True,
            show_progress_bar=show_progress,
            device=self.device,
        )
        # FAISS expects C-contiguous float32 input.
        self.embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        d = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.embeddings)
        print(f"[INFO] FAISS index built. Vector dimension = {d}, size = {self.index.ntotal}")

    # 4. Retrieval
    def retrieve(self, query: str, k: int = 5) -> List[Dict]:
        """
        Retrieve top-k most relevant chunks for a query.

        If fewer than k chunks are indexed, only the available chunks are
        returned.

        Args:
            query: Natural language question / query string.
            k: Number of chunks to return (must be positive).

        Returns:
            List of dicts with: rank, score (distance), doc_id, chunk_id,
            global_id, text

        Raises:
            ValueError: If the index has not been built, the query is
                empty, or k is not positive.
        """
        if self.index is None or self.embeddings is None:
            raise ValueError("Index not built. Call build_index() first.")

        if not query.strip():
            raise ValueError("Query is empty.")

        if k <= 0:
            raise ValueError("k must be a positive integer.")

        # Asking FAISS for more neighbours than it holds pads the result
        # with index -1, which would silently alias the last chunk.
        k = min(int(k), int(self.index.ntotal))
        if k == 0:
            return []

        q_emb = self.model.encode(
            [query],
            convert_to_numpy=True,
            device=self.device,
        )
        q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)

        distances, indices = self.index.search(q_emb, k)
        indices = indices[0]
        distances = distances[0]

        results = []
        for idx, dist in zip(indices, distances):
            idx = int(idx)
            if idx < 0:
                # Defensive: skip padding entries if any remain.
                continue
            meta = self.meta[idx]
            results.append(
                {
                    "rank": len(results) + 1,
                    "score": float(dist),  # smaller = more similar for L2
                    "doc_id": meta["doc_id"],
                    "chunk_id": int(meta["chunk_id"]),
                    "global_id": int(meta["global_id"]),
                    "text": self.chunks[idx],
                }
            )
        return results

    # 5. Save and load index and metadata
    def save(self, out_dir: str):
        """
        Save embeddings, metadata, and FAISS index to disk.

        Args:
            out_dir: Directory to save files into.

        Raises:
            ValueError: If the index has not been built, or if chunks were
                added after the index was built (index is stale).
        """
        if self.embeddings is None or self.index is None:
            raise ValueError("Nothing to save. Build the index first.")

        # Refuse to write a chunk list that no longer matches the index rows.
        if not (len(self.chunks) == len(self.meta) == self.index.ntotal):
            raise ValueError(
                "Index is out of date with the ingested chunks. "
                "Call build_index() again before saving."
            )

        os.makedirs(out_dir, exist_ok=True)

        # Save embeddings
        np.save(os.path.join(out_dir, "embeddings.npy"), self.embeddings)

        # Save chunks and metadata
        with open(os.path.join(out_dir, "chunks.json"), "w", encoding="utf-8") as f:
            json.dump(self.chunks, f, ensure_ascii=False)

        with open(os.path.join(out_dir, "meta.json"), "w", encoding="utf-8") as f:
            json.dump(self.meta, f, ensure_ascii=False)

        # Save FAISS index
        faiss.write_index(self.index, os.path.join(out_dir, "faiss.index"))

        print(f"[INFO] Saved index and metadata to {out_dir}")

    def load(self, in_dir: str):
        """
        Load embeddings, metadata, and FAISS index from disk.

        The retriever's state is replaced only after every file has been
        read and found mutually consistent.

        Args:
            in_dir: Directory from which to load files.

        Raises:
            FileNotFoundError: If any of the four saved files is missing.
            ValueError: If the saved files disagree on the number of chunks.
        """
        emb_path = os.path.join(in_dir, "embeddings.npy")
        chunks_path = os.path.join(in_dir, "chunks.json")
        meta_path = os.path.join(in_dir, "meta.json")
        index_path = os.path.join(in_dir, "faiss.index")

        if not all(os.path.exists(p) for p in [emb_path, chunks_path, meta_path, index_path]):
            raise FileNotFoundError("Missing one or more saved files in the directory.")

        # Read into locals first so a failure leaves the current state intact.
        embeddings = np.load(emb_path)

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)

        index = faiss.read_index(index_path)

        if not (len(chunks) == len(meta) == index.ntotal == len(embeddings)):
            raise ValueError(
                "Saved files are inconsistent: "
                f"{len(chunks)} chunks, {len(meta)} metadata entries, "
                f"{len(embeddings)} embeddings, {index.ntotal} index vectors."
            )

        self.embeddings = embeddings
        self.chunks = chunks
        self.meta = meta
        self.index = index
        print(f"[INFO] Loaded index and metadata from {in_dir}")


# Helper functions built on top of MultiDocRetriever

def summarize_corpus(retriever: MultiDocRetriever) -> None:
    """
    Report corpus size on stdout.

    Prints the total number of stored chunks, followed by one line per
    document giving how many chunks carry that document's ``doc_id``.
    Documents are listed in the order they first appear in the metadata.

    Args:
        retriever: Retriever whose ``chunks`` and ``meta`` lists are read.
    """
    # Tally chunks per document; a plain dict keeps first-seen order.
    counts = {}
    for entry in retriever.meta:
        key = entry["doc_id"]
        counts[key] = counts.get(key, 0) + 1

    print(f"Total chunks: {len(retriever.chunks)}")
    for doc_id, c in counts.items():
        print(f"- {doc_id}: {c} chunks")


def get_context_for_query(
    retriever: MultiDocRetriever,
    query: str,
    k: int = 6,
    max_total_chars: int = 4000,
) -> str:
    """
    Build an LLM-ready context string from the top-k retrieved chunks.

    Each chunk is rendered as a "[doc_id — chunk N]" header line followed
    by the chunk text. Blocks are taken in rank order until adding the next
    one would push the summed block length past ``max_total_chars``; that
    block and all later ones are left out.

    Args:
        retriever: A fitted MultiDocRetriever with an index built.
        query: Natural language question / query string.
        k: Number of chunks to retrieve.
        max_total_chars: Soft limit on total context length (in characters).
            The blank-line separators between blocks are not counted.

    Returns:
        The selected blocks joined by blank lines (possibly an empty string).
    """
    remaining = max_total_chars
    selected = []

    for c in retriever.retrieve(query, k=k):
        header = f"[{c['doc_id']} — chunk {c['chunk_id']}]"
        block = header + "\n" + c["text"]
        remaining -= len(block)
        if remaining < 0:
            # This block does not fit in the budget; stop here.
            break
        selected.append(block)

    return "\n\n".join(selected)


def demo_retrieval_example(retriever: MultiDocRetriever) -> None:
    """
    Run one hard-coded question through the retriever and print the hits.

    For each of the top 6 results, prints a summary line (rank, document,
    chunk number, distance score) and then the first 400 characters of the
    chunk text followed by a dashed rule.

    Args:
        retriever: Retriever with a built index.
    """
    user_question = "What are the main risks discussed in this paper?"
    rule = "-" * 80

    for r in retriever.retrieve(user_question, k=6):
        print(f"[{r['rank']}] {r['doc_id']} (chunk {r['chunk_id']}), score={r['score']:.4f}")
        preview = r["text"][:400]
        print(preview, "\n", rule)



In [16]:
from google.colab import files
import os

# Destination folder for the PDFs; exist_ok avoids an error on re-runs.
os.makedirs("papers", exist_ok=True)

# Colab upload helper. The loop below treats the keys of the returned mapping
# as file names present in the current working directory.
uploaded = files.upload()

# Move every uploaded file into the "papers" folder.
for fname in uploaded.keys():
    os.rename(fname, f"papers/{fname}")

# Last expression of the cell: the notebook displays the folder listing.
os.listdir("papers")




Saving 135 Term Structure Review ARFE 2009.pdf to 135 Term Structure Review ARFE 2009 (1).pdf
Saving 248 Inflation ARFE 2023.pdf to 248 Inflation ARFE 2023 (1).pdf
Saving 259 Coupon Bond JFQA 2024 wp.pdf to 259 Coupon Bond JFQA 2024 wp (1).pdf


['248 Inflation ARFE 2023 (1).pdf',
 '135 Term Structure Review ARFE 2009 (1).pdf',
 '259 Coupon Bond JFQA 2024 wp (1).pdf']

In [17]:
# 1.retriever
# 1. Create the retriever (loads the embedding model; 800-char chunks
#    with a 150-char overlap)
retriever = MultiDocRetriever(
    model_name="all-MiniLM-L6-v2",
    max_chars=800,
    overlap_chars=150,
)

# 2. Ingest all PDFs in the "papers" folder (top level only)
retriever.add_pdfs_from_dir("papers", recursive=False)

# 3. Build vector index (embeds every chunk, then fills the FAISS index)
retriever.build_index(show_progress=True)

# 4. Summarize: total chunk count and chunks per document
summarize_corpus(retriever)

# 5. Run a structured demo (fixed question, prints the top 6 hits)
demo_retrieval_example(retriever)

# 6. Build context for LLM (Module #2 will use this)
ctx = get_context_for_query(
    retriever,
    "What are the main risks discussed in this paper?",
    k=6
)

# Preview only the first 800 characters of the context string.
print(ctx[:800])



[INFO] Ingested 248 Inflation ARFE 2023 (1).pdf: 116 chunks.
[INFO] Ingested 135 Term Structure Review ARFE 2009 (1).pdf: 125 chunks.
[INFO] Ingested 259 Coupon Bond JFQA 2024 wp (1).pdf: 179 chunks.
[INFO] Finished ingesting PDFs from papers. Total files: 3
[INFO] Computing embeddings for 420 chunks...


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

[INFO] FAISS index built. Vector dimension = 384, size = 420
Total chunks: 420
- 248 Inflation ARFE 2023 (1).pdf: 116 chunks
- 135 Term Structure Review ARFE 2009 (1).pdf: 125 chunks
- 259 Coupon Bond JFQA 2024 wp (1).pdf: 179 chunks
[1] 259 Coupon Bond JFQA 2024 wp (1).pdf (chunk 65), score=1.2674
a vector of constants. For an application of such a hazard rate model applied to corporate default probabilities see Chava and Jarrow (2004). As discussed in Jarrow, Lando, and Yu (2005), this assumption does not imply that risky coupon bonds earn no risk premium. Quite the contrary. If the state variables Γt driving the default process represent systematic risk, which is the most likely case, the 
 --------------------------------------------------------------------------------
[2] 259 Coupon Bond JFQA 2024 wp (1).pdf (chunk 67), score=1.2705
robabili- ties from the 10-year term structure of monthly marginal default probabilities (the monthly probability of default conditional on no prior d

In [18]:
# Build the LLM context for one question from the top 6 retrieved chunks,
# using the `retriever` created in an earlier cell.
user_question = "What are the main risks discussed in this paper?"
context = get_context_for_query(retriever, user_question, k=6)

# Preview only the first 800 characters of the context string.
print(context[:800])


[259 Coupon Bond JFQA 2024 wp (1).pdf — chunk 65]
a vector of constants. For an application of such a hazard rate model applied to corporate default probabilities see Chava and Jarrow (2004). As discussed in Jarrow, Lando, and Yu (2005), this assumption does not imply that risky coupon bonds earn no risk premium. Quite the contrary. If the state variables Γt driving the default process represent systematic risk, which is the most likely case, then risky coupon bond prices necessarily earn a risk premium due to the bond price’s correlation to Γt. The diversifiable risk assumption just states that the timing of the default event itself, after conditioning onΓt, is diversifiable in a large portfolio. Alternatively stated, in a poor economy all firms are more likely to default. But, the timing


In [19]:
def get_context_for_query(
    retriever,
    query: str,
    k: int = 6,
    max_total_chars: Optional[int] = None,
) -> str:
    """
    Retrieve top-k chunks and format them as a single context string
    suitable for feeding into an LLM prompt.

    Each chunk is rendered as a "[doc_id — chunk N]" header line followed
    by the chunk text; blocks are separated by a blank line.

    Args:
        retriever: Object exposing ``retrieve(query, k=...)`` that returns
            dicts with "doc_id", "chunk_id" and "text" keys.
        query: Natural language question / query string.
        k: Number of chunks to retrieve.
        max_total_chars: Optional soft limit on the summed block length
            (separators not counted). ``None`` (the default) means no
            limit. Accepted so that calls written against the earlier
            four-argument definition of this function keep working.

    Returns:
        A concatenated string of chunks with simple headers.
    """
    parts = []
    total = 0
    for c in retriever.retrieve(query, k=k):
        header = f"[{c['doc_id']} — chunk {c['chunk_id']}]"
        block = header + "\n" + c["text"]
        if max_total_chars is not None and total + len(block) > max_total_chars:
            # Adding this block would exceed the budget; stop in rank order.
            break
        parts.append(block)
        total += len(block)
    return "\n\n".join(parts)


In [20]:
# Persist embeddings, chunks, metadata and the FAISS index into the
# "multi_doc_index" directory (created if missing) for reuse via load().
retriever.save("multi_doc_index")

[INFO] Saved index and metadata to multi_doc_index
