# 📚 Multi-modal PDF RAG with LangGraph - Complete Google Colab Guide

**Run this on GPU for best performance!**

This notebook provides a complete multi-modal RAG system that can:
- Process PDFs with text, images, and tables
- Perform semantic search on both text and images  
- Answer questions using a multi-agent system
- Use Hugging Face models (free, no OpenAI required)

## ⚙️ Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 or better)
2. **Get Hugging Face API key**: https://huggingface.co/settings/tokens
3. **Run all cells in order**


## Step 1: Install All Dependencies


In [None]:
# Install all required packages
!pip install -q langchain>=0.1.0 langchain-openai>=0.0.2 langchain-community>=0.0.10 langgraph>=0.0.20
!pip install -q unstructured[pdf] pypdf pdf2image Pillow
!pip install -q chromadb faiss-cpu
!pip install -q sentence-transformers torch torchvision
!pip install -q duckduckgo-search tavily-python
!pip install -q python-dotenv requests opencv-python
!pip install -q numpy==1.24.3 pydantic>=2.7.4,<3.0.0

# Install system dependencies
!apt-get update -qq
!apt-get install -y -qq poppler-utils tesseract-ocr

print("‚úÖ All dependencies installed!")


## Step 2: Set Your API Keys


In [None]:
import os

# WARNING: replace the placeholders below with your actual API keys.
HUGGINGFACE_API_KEY = "YOUR_HUGGINGFACE_API_KEY_HERE"  # Required - Get from https://huggingface.co/settings/tokens
TAVILY_API_KEY = "YOUR_TAVILY_API_KEY_HERE"  # Optional - For web search

# Export the keys and the backend switches to the process environment in a
# single step.
os.environ.update(
    {
        "HUGGINGFACE_API_KEY": HUGGINGFACE_API_KEY,
        "TAVILY_API_KEY": TAVILY_API_KEY,
        "USE_HUGGINGFACE_PRIMARY": "true",
        "USE_OPENAI_EMBEDDINGS": "false",
        "USE_OPENAI_FALLBACK": "false",
        "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
    }
)

print("‚úÖ API keys configured!")
# Reports True only when the key is non-empty and no longer the placeholder.
print(f"‚úì Hugging Face API key set: {HUGGINGFACE_API_KEY not in ('', 'YOUR_HUGGINGFACE_API_KEY_HERE')}")


## Step 3: Create Configuration File


In [None]:
# Create config.py
# The module source is kept in a string and written to the current working
# directory so later cells can `import config`.
config_code = '''"""Configuration settings for the Multi-modal RAG system."""
import os

# API Keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "")

# Model Configuration
LLM_MODEL = "gpt-4-1106-preview"
VISION_MODEL = "gpt-4-vision-preview"
EMBEDDING_MODEL = "text-embedding-3-large"

# Hugging Face Configuration
HUGGINGFACE_LLM_MODEL = os.getenv("HUGGINGFACE_LLM_MODEL", "microsoft/DialoGPT-medium")
HUGGINGFACE_MULTIMODAL_MODEL = "Salesforce/blip-image-captioning-large"
USE_HUGGINGFACE_PRIMARY = os.getenv("USE_HUGGINGFACE_PRIMARY", "true").lower() == "true"
USE_OPENAI_EMBEDDINGS = os.getenv("USE_OPENAI_EMBEDDINGS", "false").lower() == "true"
USE_OPENAI_FALLBACK = os.getenv("USE_OPENAI_FALLBACK", "false").lower() == "true"

# Vector Store Configuration
VECTOR_STORE_PATH = os.getenv("VECTOR_STORE_PATH", "./vector_store")
CHROMA_COLLECTION_NAME = "multimodal_pdf_rag"
MAX_RETRIEVAL_DOCS = 3
MAX_IMAGES_PER_QUERY = 2

# PDF Processing Configuration
PDF_PROCESSING_MODE = "hi_res"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Agent Configuration
MAX_ITERATIONS = 30
TEMPERATURE = 0.0

# Rate Limit Configuration
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 2
'''

# Python reads source files as UTF-8 by default, so write the generated
# module as UTF-8 explicitly instead of relying on the platform's locale
# encoding.
with open('config.py', 'w', encoding='utf-8') as f:
    f.write(config_code)

print("‚úÖ Created config.py")


## Step 4: Create Image Embeddings Module


In [None]:
# Create image_embeddings.py
# The module source is kept in a string and written to the current working
# directory. It defines ImageEmbedder, which wraps a SentenceTransformer
# model and returns None from its embed methods when the model is unavailable.
image_embeddings_code = '''"""Image embedding module using CLIP for semantic image search."""
import base64
from io import BytesIO
from typing import List, Optional
from PIL import Image
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
    CLIP_AVAILABLE = True
except ImportError:
    CLIP_AVAILABLE = False

class ImageEmbedder:
    """Image embedding using CLIP model."""
    
    def __init__(self, model_name: str = "clip-ViT-B-32"):
        self.model = None
        self.model_name = model_name
        if CLIP_AVAILABLE:
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
                self.model = SentenceTransformer(self.model_name, device=device)
                print(f"‚úì Loaded CLIP model: {self.model_name} on {device}")
            except Exception as e:
                print(f"Warning: Could not load CLIP model: {e}")
    
    def is_available(self) -> bool:
        """Check if CLIP is available."""
        return self.model is not None
    
    def embed_image(self, image_base64: str) -> Optional[np.ndarray]:
        """Generate embedding for a base64-encoded image."""
        if not self.is_available():
            return None
        
        try:
            # Decode base64 image
            image_data = base64.b64decode(image_base64)
            image = Image.open(BytesIO(image_data))
            
            # Generate embedding
            embedding = self.model.encode(image, convert_to_numpy=True)
            return embedding
        except Exception as e:
            print(f"Error embedding image: {e}")
            return None
    
    def embed_images(self, images_base64: List[str]) -> List[Optional[np.ndarray]]:
        """Generate embeddings for multiple images."""
        if not self.is_available():
            return [None] * len(images_base64)
        
        embeddings = []
        for img_b64 in images_base64:
            emb = self.embed_image(img_b64)
            embeddings.append(emb)
        return embeddings
'''

# The embedded source contains non-ASCII characters. Python reads source
# files as UTF-8 by default, so the file must be written as UTF-8 explicitly;
# with a non-UTF-8 locale encoding the generated module could not be imported.
with open('image_embeddings.py', 'w', encoding='utf-8') as f:
    f.write(image_embeddings_code)

print("‚úÖ Created image_embeddings.py")


## Step 5: Clone Repository from GitHub


In [None]:
# Clone the repository from GitHub
!git clone https://github.com/DevXSoni021/Multi_model_RAG_Langgraph.git
%cd Multi_model_RAG_Langgraph

print("‚úÖ Repository cloned successfully!")
print("üìÅ All Python files are now available in the current directory")


## Step 6: Initialize the System


In [None]:
# Import all modules
from pdf_processor import MultimodalPDFProcessor
from vector_store import MultimodalVectorStore
from agents import MultiAgentRAG
import config

# Initialize vector store
print("üì¶ Initializing vector store...")
vector_store = MultimodalVectorStore()
print(f"‚úÖ Vector store initialized with: {vector_store.embedding_type} embeddings")

# Initialize RAG system
# Honour the USE_HUGGINGFACE_PRIMARY switch from config rather than
# hard-coding True. If the imported config module does not define it, fall
# back to True, which is the value this cell always passed before.
use_hf_primary = getattr(config, "USE_HUGGINGFACE_PRIMARY", True)
print("ü§ñ Initializing RAG system...")
rag_system = MultiAgentRAG(vector_store, use_huggingface_primary=use_hf_primary)
print(f"‚úÖ RAG system initialized with: {rag_system.primary_llm_type} LLM")

print("\nüéâ System ready!")


## Step 7: Upload and Process PDF


In [None]:
from google.colab import files

# Upload PDF file
print("üì§ Upload your PDF file...")
uploaded = files.upload()

# Process PDF
processor = MultimodalPDFProcessor(processing_mode=config.PDF_PROCESSING_MODE)

all_chunks = []
for filename in uploaded:
    # Case-insensitive match so files such as "REPORT.PDF" are not dropped,
    # and skipped uploads are reported instead of being ignored silently.
    if not filename.lower().endswith('.pdf'):
        print(f"Skipping {filename}: not a PDF file")
        continue

    print(f"\nüìÑ Processing {filename}...")
    try:
        chunks = processor.process_pdf(filename)
    except Exception as e:
        # One unreadable file should not discard the chunks already
        # extracted from the other uploads.
        print(f"Error processing {filename}: {e}")
        continue

    all_chunks.extend(chunks)
    print(f"‚úÖ Extracted {len(chunks)} chunks from {filename}")

# Add to vector store
if all_chunks:
    print(f"\nüíæ Adding {len(all_chunks)} chunks to vector store...")
    vector_store.add_documents(all_chunks)
    print("‚úÖ Documents added successfully!")
else:
    print("‚ö†Ô∏è No chunks extracted from PDF")


## Step 8: Ask Questions


In [None]:
import traceback

# Run a single question against the indexed documents.
question = "tell me about the image in doc"  # Change this to your question

print(f"‚ùì Question: {question}\n")
print("ü§î Thinking...\n")

try:
    answer = rag_system.query(question)
    print(f"\nüí¨ Answer:\n{answer}")
except Exception as exc:
    # Show the short message first, then the full traceback for debugging.
    print(f"\n‚ùå Error: {exc}")
    traceback.print_exc()


## Step 9: Interactive Chat (Optional)


In [None]:
# Simple interactive chat loop.
# Reads questions until the user types an exit command, interrupts the cell,
# or stdin is closed. Each successful (question, answer) pair is recorded in
# chat_history; the history is only stored here and is not passed to
# rag_system.query.
print("üí¨ Chat with your documents (type 'quit' to exit)\n")

chat_history = []
EXIT_COMMANDS = {'quit', 'exit', 'q'}

while True:
    try:
        # Strip first so inputs such as "quit " or " exit" are recognised as
        # exit commands instead of being sent to the model as questions.
        question = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or a closed stdin ends the chat cleanly
        # rather than with a traceback.
        print("üëã Goodbye!")
        break

    if not question:
        continue

    if question.lower() in EXIT_COMMANDS:
        print("üëã Goodbye!")
        break

    try:
        print("ü§î Thinking...")
        answer = rag_system.query(question)
        print(f"\nü§ñ Assistant: {answer}")
        chat_history.append((question, answer))
    except Exception as e:
        # Keep the session alive after a failed query.
        print(f"\n‚ùå Error: {e}")
