# Multimodal RAG over Complex Documents (PDF/DOCX/PPTX) — Gemini 2.5 Pro + Gemini Embeddings
_Merged from two earlier notebooks; core logic preserved, models swapped to Gemini._

In [7]:
# ✅ Compatible, version-bounded deps for LangChain + Gemini + Chroma + Unstructured
# Each %pip line below is a separate install step run against the kernel's environment.
# Upgrade pip itself first so the later installs use a current resolver.
%pip install -Uq "pip>=25.0"

# Core LangChain stack (v1.x line to avoid 'langchain.verbose' errors)
# NOTE(review): the output recorded with this notebook shows pip could not find a
# langchain release matching ">=1.0.6,<2.0" in that environment — confirm these bounds
# resolve on the Python version in use before relying on this cell.
%pip install -Uq \
  "langchain>=1.0.6,<2.0" \
  "langchain-core>=1.0.6,<2.0" \
  "langchain-community>=0.4.1,<1.0" \
  "langchain-text-splitters>=1.0.0,<2.0" \
  "langchain-chroma>=1.0.0,<2.0" \
  "chromadb>=0.5.5,<0.6"

# Gemini integration (LangChain + official SDKs)
%pip install -Uq \
  "langchain-google-genai>=2.0.7,<3.0" \
  "google-generativeai>=0.7.2" \
  "google-genai>=0.2.0" \
  "python-dotenv>=1.0.1" \
  "pydantic>=2.7,<3" \
  "typing-extensions>=4.9"

# Document parsing stack
%pip install -Uq \
  "unstructured[all-docs]>=0.15.11,<0.16" \
  "pymupdf>=1.24.10" \
  "pdfplumber>=0.11.4" \
  "pillow>=10.4.0" \
  "pytesseract>=0.3.13"

# Windows MIME helper (no-op on Linux/macOS)
# NOTE(review): `|| true` is presumably meant to ignore a failed install of this optional
# helper; it relies on the shell providing a `true` command — confirm on Windows.
%pip install -Uq "python-magic-bin>=0.4.14" || true

# NOTE: this message is printed unconditionally; it does not check whether the installs
# above succeeded, so read the pip output before trusting it.
print("✔ Dependencies installed. If your kernel had older packages loaded, restart the kernel once.")


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement langchain<2.0,>=1.0.6 (from versions: 0.0.1, 0.0.2, 0.0.3, 0.0.4, 0.0.5, 0.0.6, 0.0.7, 0.0.8, 0.0.9, 0.0.10, 0.0.11, 0.0.12, 0.0.13, 0.0.14, 0.0.15, 0.0.16, 0.0.17, 0.0.18, 0.0.19, 0.0.20, 0.0.21, 0.0.22, 0.0.23, 0.0.24, 0.0.25, 0.0.26, 0.0.27, 0.0.28, 0.0.29, 0.0.30, 0.0.31, 0.0.32, 0.0.33, 0.0.34, 0.0.35, 0.0.36, 0.0.37, 0.0.38, 0.0.39, 0.0.40, 0.0.41, 0.0.42, 0.0.43, 0.0.44, 0.0.45, 0.0.46, 0.0.47, 0.0.48, 0.0.49, 0.0.50, 0.0.51, 0.0.52, 0.0.53, 0.0.54, 0.0.55, 0.0.56, 0.0.57, 0.0.58, 0.0.59, 0.0.60, 0.0.61, 0.0.63, 0.0.64, 0.0.65, 0.0.66, 0.0.67, 0.0.68, 0.0.69, 0.0.70, 0.0.71, 0.0.72, 0.0.73, 0.0.74, 0.0.75, 0.0.76, 0.0.77, 0.0.78, 0.0.79, 0.0.80, 0.0.81, 0.0.82, 0.0.83, 0.0.84, 0.0.85, 0.0.86, 0.0.87, 0.0.88, 0.0.89, 0.0.90, 0.0.91, 0.0.92, 0.0.93, 0.0.94, 0.0.95, 0.0.96, 0.0.97, 0.0.98, 0.0.99rc0, 0.0.99, 0.0.100, 0.0.101rc0, 0.0.101, 0.0.102rc0, 0.0.102, 0.0.103, 0.0.104, 0.0.105, 0.0.106, 0.0.107, 0.0.108, 0.0.109,

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
✔ Dependencies installed. If your kernel had older packages loaded, restart the kernel once.


In [8]:
import os, io, base64, json, uuid, shutil, pathlib, tempfile
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv
load_dotenv()

# --- Gemini API key ---
# Set the key in your environment (or .env) rather than hardcoding it here.
# Both variable names are accepted, GOOGLE_API_KEY first, which is the same precedence the
# model-setup cell uses; checking only GEMINI_API_KEY here would reject a setup that the
# later cell considers valid.
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY", "")
assert GEMINI_API_KEY, "Please set GEMINI_API_KEY (or GOOGLE_API_KEY) in your environment (.env) before proceeding."
# Re-export under the GEMINI_API_KEY name so code reading that variable sees the resolved key.
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

# Silence LangChain's verbose tracing unless you want it
#os.environ.setdefault("LANGCHAIN_TRACING_V2", "false")


## Models: Gemini 2.5 Pro (multimodal) + Gemini Embeddings

In [9]:
# --- Environment & Keys ---
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env if present
# GOOGLE_API_KEY takes precedence; GEMINI_API_KEY is the fallback.
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
assert GEMINI_API_KEY, "Set GOOGLE_API_KEY (or GEMINI_API_KEY) in your environment or .env file."

# Make sure the SDKs see the key (prevents DefaultCredentialsError / ADC fallback)
# This must run before the clients below are constructed.
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

# --- LangChain + Gemini models ---
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

# Embeddings: Gemini embedding model
# Docs: https://ai.google.dev/gemini-api/docs/embeddings
EMBED_MODEL = "gemini-embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(
    model=EMBED_MODEL,
    google_api_key=GEMINI_API_KEY,  # <- forces API key auth (no ADC)
)

# Multimodal LLM: Gemini 2.5 Pro (or use "gemini-2.5-flash" for lower latency)
# Docs / model IDs: https://ai.google.dev/gemini-api/docs/models
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0.2,
    google_api_key=GEMINI_API_KEY,  # <- forces API key auth (no ADC)
)

print("✔ Gemini embeddings and LLM initialized.")


AttributeError: module 'langchain' has no attribute 'verbose'

## Document ingestion (PDF/DOCX/PPTX) with Unstructured + extras
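Extracts text, tables, and images. Element types are kept as a `[Category]` prefix on each extracted text line; page numbers are tracked only for the raster images pulled from PDFs.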

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.partition.pptx import partition_pptx
import pdfplumber, fitz  # pymupdf
from PIL import Image
import numpy as np

# Folder for input documents, relative to the notebook's working directory.
# Created on first run; an existing folder is left untouched.
DATA_DIR = pathlib.Path("data")
DATA_DIR.mkdir(exist_ok=True)

def load_file(path: str):
    """Partition a document into Unstructured elements, chosen by file extension.

    Args:
        path: Location of a ``.pdf``, ``.docx`` or ``.pptx`` file (converted with ``str``).

    Returns:
        Whatever the matching ``partition_*`` function returns for the file.

    Raises:
        ValueError: If the (case-insensitive) extension is not one of the three above.
    """
    filename = str(path)
    ext = pathlib.Path(filename).suffix.lower()

    # Reject unknown extensions up front so no partitioner is touched for them.
    if ext not in (".pdf", ".docx", ".pptx"):
        raise ValueError(f"Unsupported file type: {ext}")

    if ext == ".pdf":
        # PDFs use the layout-aware strategy with image and table extraction enabled.
        pdf_options = dict(
            extract_images_in_pdf=True,
            infer_table_structure=True,
            strategy="hi_res",
        )
        return partition_pdf(filename=filename, **pdf_options)

    # Word and PowerPoint files share the same single option.
    office_partitioner = partition_docx if ext == ".docx" else partition_pptx
    return office_partitioner(filename=filename, include_page_breaks=True)

def pil_to_data_uri(img: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG ``data:`` URI.

    Args:
        img: Image to serialize; it is saved in PNG format to an in-memory buffer.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    payload = base64.b64encode(png_bytes).decode("utf-8")
    return f"data:image/png;base64,{payload}"

def pdf_page_images(path: str, max_per_page: int = 8):
    """Extract raster images for each page (for multimodal grounding).

    Args:
        path: PDF file to open with PyMuPDF.
        max_per_page: Upper bound on the number of images returned for any one page.

    Returns:
        Dict mapping 1-based page number to a list of PNG data URIs (possibly empty)
        for the images referenced by that page.
    """
    out = {}
    doc = fitz.open(path)
    try:
        for pno in range(len(doc)):
            images = []
            for img in doc[pno].get_images(full=True):
                # Check the budget before decoding so max_per_page=0 yields no images.
                if len(images) >= max_per_page:
                    break
                pix = fitz.Pixmap(doc, img[0])  # img[0] is the image xref
                if pix.colorspace is None:
                    # Stencil masks carry no colour data and cannot be converted.
                    continue
                if pix.n - pix.alpha != 3:
                    # Grayscale or CMYK: convert the colour channels to RGB.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                if pix.alpha:
                    # Drop the alpha channel so the samples are exactly 3 bytes per
                    # pixel, which is what the "RGB" decode below requires.
                    pix = fitz.Pixmap(pix, 0)
                img_pil = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                images.append(pil_to_data_uri(img_pil))
            out[pno + 1] = images
    finally:
        # Release the file handle even if an image fails to decode.
        doc.close()
    return out


## Chunking and multi-vector indexing
Splits text into chunks, keeps parent references, and collects per-page PDF images so they can be attached as multimodal context at query time.

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Shared splitter for child chunks: 1200-character chunks with a 200-character overlap.
# Separators are listed from paragraph breaks down to single spaces.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "! ", "? ", ", ", " "],
    chunk_overlap=200,
    chunk_size=1200,
)

def element_to_text(e) -> str:
    """Render an element as ``"[<category>] <text>"`` with outer whitespace stripped.

    Missing or falsy ``text`` / ``category`` attributes are treated as empty strings,
    so an element with neither renders as ``"[]"``.
    """
    body = getattr(e, 'text', '')
    if not body:
        body = ''
    label = getattr(e, 'category', '')
    if not label:
        label = ''
    return "[{}] {}".format(label, body).strip()

def build_parent_and_children(file_path: str) -> Dict[str, Any]:
    """Turn one file into a parent document, embeddable child chunks and PDF images.

    Args:
        file_path: Document to load via ``load_file``.

    Returns:
        Dict with three keys:
        ``"parent"`` - a ``Document`` holding the joined element text (capped at
        2,000,000 characters) and a freshly generated ``parent_id``;
        ``"children"`` - ``Document`` chunks from ``text_splitter``, each carrying the
        ``parent_id``, its chunk index and the source path;
        ``"images_by_page"`` - output of ``pdf_page_images`` for ``.pdf`` paths,
        otherwise an empty dict.
    """
    source = str(file_path)

    # Render every element once and keep the non-empty renderings, one per line.
    rendered = (element_to_text(el) for el in load_file(file_path))
    full_text = "\n".join(line for line in rendered if line)

    parent_id = str(uuid.uuid4())
    parent_meta = {
        "source": source,
        "parent_id": parent_id,
        "modality": "text+tables+images (extracted)",
    }
    # The parent keeps the whole text, truncated as a safety cap.
    parent_doc = Document(page_content=full_text[:2_000_000], metadata=parent_meta)

    # Children are the pieces that get embedded; they point back via parent_id.
    child_docs = [
        Document(
            page_content=piece,
            metadata={"parent_id": parent_id, "chunk": idx, "source": source},
        )
        for idx, piece in enumerate(text_splitter.split_text(full_text))
    ]

    # Raster images (data URIs) are only extracted for PDFs.
    is_pdf = source.lower().endswith(".pdf")
    images_by_page = pdf_page_images(file_path) if is_pdf else {}

    return {
        "parent": parent_doc,
        "children": child_docs,
        "images_by_page": images_by_page,
    }


## Vector store and retriever

In [None]:
from langchain_chroma import Chroma
# InMemoryStore is defined in langchain-core; importing it from there works regardless of
# whether the installed `langchain` package still ships a `storage` module.
from langchain_core.stores import InMemoryStore
# NOTE(review): the LangChain 1.x line requested in the install cell moved legacy
# retrievers out of the `langchain` package — confirm this import resolves with the
# version actually installed.
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Persistent Chroma directory (optional)
VEC_DIR = "chroma_gemini_rag"

# Child store: vector index of chunks, embedded with the Gemini embeddings client
vectorstore = Chroma(
    collection_name="mm_rag_chunks",
    embedding_function=embeddings,
    persist_directory=VEC_DIR,
)

# Parent store: in-memory docstore keyed by parent_id (contents are lost on kernel restart)
docstore = InMemoryStore()

# Links child chunks to their parent through the "parent_id" metadata field
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key="parent_id",
)


## Index documents

In [None]:
def index_files(files: List[str]):
    """Index each file: store its parent document and embed its child chunks.

    Args:
        files: Paths of the documents to index. An empty list is a no-op apart from
            the summary line.

    Returns:
        None. Prints how many files were processed.
    """
    for fp in files:
        bundle = build_parent_and_children(fp)
        parent, children = bundle["parent"], bundle["children"]
        parent_id = parent.metadata["parent_id"]
        # 1) Store the parent
        docstore.mset([(parent_id, parent)])
        # 2) Add children to vector DB; a file that produced no text has no chunks,
        #    and an empty batch is not worth sending to the store.
        if children:
            vectorstore.add_documents(children)
    print(f"Indexed {len(files)} file(s).")


## Answer generation (multimodal)
At query time, we retrieve top chunks and attach matching page images (if any) to the Gemini 2.5 Pro prompt.

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage

# Instruction given to the model for every question.
SYSTEM_PROMPT = (
    "You are a precise assistant. "
    "Answer strictly from the provided context. "
    "Cite page numbers or slide numbers when possible. "
    "If unsure, say you don't know."
)

# Two-message template: the system instruction plus a human turn with {query} and {context}.
PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        ("human", "Query: {query}\n\nContext:\n{context}\n\nAnswer:"),
    ]
)

def docs_to_bullets(docs: List[Document]) -> str:
    """Format documents as a newline-separated bullet list for the prompt context.

    Each bullet shows the ``source`` and ``chunk`` metadata values (``"?"`` when absent)
    followed by the first 500 characters of the page content.
    """
    def _bullet(doc) -> str:
        meta = doc.metadata
        origin = meta.get("source", "?")
        index = meta.get("chunk", "?")
        preview = doc.page_content[:500]
        return f"- ({origin} · chunk {index}) {preview}".strip()

    return "\n".join(_bullet(doc) for doc in docs)

def pick_images_for_docs(docs: "List[Document]", images_by_file: Dict[str, Dict[int, List[str]]], max_images=4) -> List[str]:
    """Choose up to ``max_images`` distinct image URIs for the PDFs behind ``docs``.

    Args:
        docs: Retrieved chunks; only their ``metadata["source"]`` is read.
        images_by_file: Mapping of source path -> page number -> list of image URIs.
        max_images: Maximum number of URIs to return.

    Returns:
        Image URIs in document order, then page order as stored, without repeats.
        Each source is visited once, so several chunks from the same PDF do not fill
        the budget with copies of the same images.
    """
    picked: List[str] = []
    if max_images <= 0:
        return picked

    seen_sources = set()
    seen_uris = set()
    for d in docs:
        src = d.metadata.get("source")
        if not src or src in seen_sources:
            continue
        seen_sources.add(src)
        if not src.lower().endswith(".pdf"):
            continue
        # Take images in stored page order (heuristic: earliest pages first).
        for uris in images_by_file.get(src, {}).values():
            for u in uris:
                if u in seen_uris:
                    continue  # same image repeated on another page or in another file
                seen_uris.add(u)
                picked.append(u)
                if len(picked) >= max_images:
                    return picked
    return picked

def query_rag(query: str, k: int = 6):
    """Answer a question from the indexed chunks, attaching PDF images when available.

    Args:
        query: The user's question.
        k: Number of chunks to fetch from the vector store.

    Returns:
        Tuple ``(answer, chunks)``: the model response's ``content`` and the retrieved
        chunk documents used as context.
    """
    import warnings

    # Retrieve top-k text chunks
    chunks = retriever.vectorstore.similarity_search(query, k=k)
    context = docs_to_bullets(chunks)

    # Images are not kept at index time, so re-extract them for the PDFs that
    # contributed chunks (each source at most once).
    images_map = {}
    for d in chunks:
        src = d.metadata.get("source", "")
        if src.lower().endswith(".pdf") and src not in images_map:
            try:
                images_map[src] = pdf_page_images(src)
            except Exception as exc:
                # Best effort: answer from text alone, but say why images are missing.
                # Recording an empty entry avoids retrying the same file per chunk.
                warnings.warn(f"Could not extract images from {src}: {exc}")
                images_map[src] = {}

    images = pick_images_for_docs(chunks, images_map)

    # format_messages yields the system and human messages separately, so the system
    # prompt is sent once instead of also being embedded in the human text.
    messages = list(PROMPT.format_messages(query=query, context=context))
    if images:
        # A human message can carry a list of content parts: text + images
        parts = [{"type": "text", "text": messages[-1].content}]
        parts.extend({"type": "image_url", "image_url": u} for u in images)
        messages[-1] = HumanMessage(content=parts)

    response = llm.invoke(messages)
    return response.content, chunks


## Usage

In [None]:
# Example:
# files_to_index = ["/path/to/your.pdf", "/path/to/your.docx", "/path/to/slides.pptx"]
# index_files(files_to_index)

# Then ask:
# answer, supporting_chunks = query_rag("What are the key findings? Provide citations.")
# print(answer)
# supporting_chunks[:2]


### Notes
- Embeddings model: `gemini-embedding-001` (flexible output dims; default via LangChain).  
- Multimodal LLM: `gemini-2.5-pro` (supports text, images, video, audio, and PDFs; long context).  
- Vector DB: Chroma. Parent documents are kept in an in-memory docstore; retrieved via the multi-vector pattern.
- You can switch to `gemini-2.5-flash` for lower cost/latency without changing the code.
