# 📚 Multi-modal PDF RAG with LangGraph - Complete Google Colab Guide

**Run this on GPU for best performance!**

This notebook provides a complete multi-modal RAG system that can:
- Process PDFs with text, images, and tables
- Perform semantic search on both text and images  
- Answer questions using a multi-agent system
- Use Hugging Face models (free, no OpenAI required)

## ⚙️ Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 or better)
2. **Get Hugging Face API key**: https://huggingface.co/settings/tokens
3. **Run all cells in order**


## Step 1: Install All Dependencies


In [None]:
# Install all required packages
!pip install -q langchain>=0.1.0 langchain-openai>=0.0.2 langchain-community>=0.0.10 langgraph>=0.0.20
!pip install -q unstructured[pdf] pypdf pdf2image Pillow
!pip install -q chromadb faiss-cpu
!pip install -q sentence-transformers torch torchvision
!pip install -q duckduckgo-search tavily-python
!pip install -q python-dotenv requests opencv-python
!pip install -q numpy==1.24.3 pydantic>=2.7.4,<3.0.0

# Install system dependencies
!apt-get update -qq
!apt-get install -y -qq poppler-utils tesseract-ocr

print("‚úÖ All dependencies installed!")


## Step 2: Set Your API Keys


In [None]:
import os

# WARNING: replace the placeholders below with your actual API keys.
HUGGINGFACE_API_KEY = "YOUR_HUGGINGFACE_API_KEY_HERE"  # Required - Get from https://huggingface.co/settings/tokens
TAVILY_API_KEY = "YOUR_TAVILY_API_KEY_HERE"  # Optional - For web search

# Placeholder text per environment variable; a value equal to its placeholder
# means "not configured".
API_KEY_PLACEHOLDERS = {
    "HUGGINGFACE_API_KEY": "YOUR_HUGGINGFACE_API_KEY_HERE",
    "TAVILY_API_KEY": "YOUR_TAVILY_API_KEY_HERE",
}


def is_configured_key(env_name, value):
    """Return True when ``value`` is non-empty and not the placeholder for ``env_name``."""
    return bool(value) and value != API_KEY_PLACEHOLDERS[env_name]


# Export only real keys. An untouched placeholder is never written to the
# environment (code reading the variable would otherwise see a non-empty,
# invalid key), and a placeholder left behind by an earlier run is removed.
# A real value that is already present in the environment is left alone.
for env_name, key_value in (
    ("HUGGINGFACE_API_KEY", HUGGINGFACE_API_KEY),
    ("TAVILY_API_KEY", TAVILY_API_KEY),
):
    if is_configured_key(env_name, key_value):
        os.environ[env_name] = key_value
    elif not is_configured_key(env_name, os.environ.get(env_name, "")):
        os.environ.pop(env_name, None)

# Feature switches read back by config.py via os.getenv().
os.environ["USE_HUGGINGFACE_PRIMARY"] = "true"
os.environ["USE_OPENAI_EMBEDDINGS"] = "false"
os.environ["USE_OPENAI_FALLBACK"] = "false"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

print("‚úÖ API keys configured!")
print(f"‚úì Hugging Face API key set: {is_configured_key('HUGGINGFACE_API_KEY', HUGGINGFACE_API_KEY)}")


## Step 3: Create Configuration File


In [None]:
# Source text of config.py. The generated module reads its API keys and
# feature switches from environment variables via os.getenv().
config_code = '''"""Configuration settings for the Multi-modal RAG system."""
import os

# API Keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "")

# Model Configuration
LLM_MODEL = "gpt-4-1106-preview"
VISION_MODEL = "gpt-4-vision-preview"
EMBEDDING_MODEL = "text-embedding-3-large"

# Hugging Face Configuration
HUGGINGFACE_LLM_MODEL = os.getenv("HUGGINGFACE_LLM_MODEL", "distilgpt2")  # Use distilgpt2 for faster local loading
HUGGINGFACE_MULTIMODAL_MODEL = "Salesforce/blip-image-captioning-large"
USE_HUGGINGFACE_PRIMARY = os.getenv("USE_HUGGINGFACE_PRIMARY", "true").lower() == "true"
USE_OPENAI_EMBEDDINGS = os.getenv("USE_OPENAI_EMBEDDINGS", "false").lower() == "true"
USE_OPENAI_FALLBACK = os.getenv("USE_OPENAI_FALLBACK", "false").lower() == "true"

# Vector Store Configuration
VECTOR_STORE_PATH = os.getenv("VECTOR_STORE_PATH", "./vector_store")
CHROMA_COLLECTION_NAME = "multimodal_pdf_rag"
MAX_RETRIEVAL_DOCS = 3
MAX_IMAGES_PER_QUERY = 2

# PDF Processing Configuration
PDF_PROCESSING_MODE = "hi_res"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Agent Configuration
MAX_ITERATIONS = 30
TEMPERATURE = 0.0

# Rate Limit Configuration
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 2
'''

# Persist the module into the current working directory, replacing any
# previous copy.
with open('config.py', mode='w') as config_file:
    config_file.write(config_code)

print("‚úÖ Created config.py")


## Step 4: Create Image Embeddings Module


In [None]:
# Source text of image_embeddings.py: an ImageEmbedder class that loads a
# SentenceTransformer CLIP model when sentence_transformers is importable and
# returns None from its embed methods when no model could be loaded.
image_embeddings_code = '''"""Image embedding module using CLIP for semantic image search."""
import base64
from io import BytesIO
from typing import List, Optional
from PIL import Image
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
    CLIP_AVAILABLE = True
except ImportError:
    CLIP_AVAILABLE = False

class ImageEmbedder:
    """Image embedding using CLIP model."""
    
    def __init__(self, model_name: str = "clip-ViT-B-32"):
        self.model = None
        self.model_name = model_name
        if CLIP_AVAILABLE:
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
                self.model = SentenceTransformer(self.model_name, device=device)
                print(f"‚úì Loaded CLIP model: {self.model_name} on {device}")
            except Exception as e:
                print(f"Warning: Could not load CLIP model: {e}")
    
    def is_available(self) -> bool:
        """Check if CLIP is available."""
        return self.model is not None
    
    def embed_image(self, image_base64: str) -> Optional[np.ndarray]:
        """Generate embedding for a base64-encoded image."""
        if not self.is_available():
            return None
        
        try:
            # Decode base64 image
            image_data = base64.b64decode(image_base64)
            image = Image.open(BytesIO(image_data))
            
            # Generate embedding
            embedding = self.model.encode(image, convert_to_numpy=True)
            return embedding
        except Exception as e:
            print(f"Error embedding image: {e}")
            return None
    
    def embed_images(self, images_base64: List[str]) -> List[Optional[np.ndarray]]:
        """Generate embeddings for multiple images."""
        if not self.is_available():
            return [None] * len(images_base64)
        
        embeddings = []
        for img_b64 in images_base64:
            emb = self.embed_image(img_b64)
            embeddings.append(emb)
        return embeddings
'''

# The generated source contains non-ASCII characters, and Python decodes source
# files as UTF-8 by default, so write it as UTF-8 explicitly instead of relying
# on the platform's locale encoding.
with open('image_embeddings.py', 'w', encoding='utf-8') as f:
    f.write(image_embeddings_code)

print("‚úÖ Created image_embeddings.py")


## Step 5: Clone Repository from GitHub


In [None]:
# Clone the repository from GitHub
!git clone https://github.com/DevXSoni021/Multi_model_RAG_Langgraph.git
%cd Multi_model_RAG_Langgraph

print("‚úÖ Repository cloned successfully!")
print("üìÅ All Python files are now available in the current directory")

# FIX 1: Fix import errors and numpy truth-value checks in vector_store.py
import os
import re

# A stand-alone "from langchain.schema import Document" line (optionally
# followed by a comment). Lines importing further names are not matched,
# because those names do not all live in langchain_core.documents.
SCHEMA_DOCUMENT_IMPORT = re.compile(
    r'(?m)^([ \t]*)from langchain\.schema import Document(?=[ \t]*(?:#.*)?$)'
)

# langchain.schema submodules and the langchain_core packages that replace
# them. A blanket "from langchain.schema" -> "from langchain_core" rewrite is
# deliberately NOT done: it produces imports such as
# "from langchain_core import HumanMessage", which do not resolve.
SCHEMA_SUBMODULE_IMPORT_FIXES = (
    ('from langchain.schema.document import', 'from langchain_core.documents import'),
    ('from langchain.schema.messages import', 'from langchain_core.messages import'),
    ('from langchain.schema.runnable import', 'from langchain_core.runnables import'),
    ('from langchain.schema.output_parser import', 'from langchain_core.output_parsers import'),
)

# "if image_embedding:" with any spacing. Truth-testing a numpy array raises
# "The truth value of an array ... is ambiguous".
IMAGE_EMBEDDING_TRUTH_TEST = re.compile(r'if\s+image_embedding\s*:')


def patch_vector_store_source(source):
    """Return ``source`` with legacy imports and array truth tests rewritten.

    Args:
        source: Full text of vector_store.py.

    Returns:
        The patched text. Applying the function to its own output changes
        nothing.
    """
    source = SCHEMA_DOCUMENT_IMPORT.sub(
        r'\1from langchain_core.documents import Document', source
    )
    for legacy_import, core_import in SCHEMA_SUBMODULE_IMPORT_FIXES:
        source = source.replace(legacy_import, core_import)
    return IMAGE_EMBEDDING_TRUTH_TEST.sub('if image_embedding is not None:', source)


if os.path.isfile('vector_store.py'):
    with open('vector_store.py', 'r', encoding='utf-8') as f:
        vector_store_code = f.read()

    vector_store_code = patch_vector_store_source(vector_store_code)

    with open('vector_store.py', 'w', encoding='utf-8') as f:
        f.write(vector_store_code)

    print("‚úÖ Fixed vector_store.py imports and numpy array checks")
else:
    print("WARNING: vector_store.py not found in the current directory; nothing was patched")

# Rewrite legacy langchain import prefixes in agents.py
with open('agents.py', 'r') as agents_file:
    agents_imports_code = agents_file.read()

# (legacy prefix, replacement prefix) pairs, applied in this order:
# prompts first, then tools.
for legacy_prefix, core_prefix in (
    ('from langchain.prompts import', 'from langchain_core.prompts import'),
    ('from langchain.tools import', 'from langchain_core.tools import'),
):
    agents_imports_code = agents_imports_code.replace(legacy_prefix, core_prefix)

# langchain.agents imports are left exactly as they are.

with open('agents.py', 'w') as agents_file:
    agents_file.write(agents_imports_code)

print("‚úÖ Fixed agents.py imports")

# FIX 2: Fix agents.py - replace query() and generate_answer() with versions
# that handle the graph state correctly and guard against repeated questions.
import os
import textwrap

# Replacement for MultiAgentRAG.query(): run the graph with invoke() and pull
# the answer out of the final state.
new_query_method = '''    def query(self, question: str):
        """Query the RAG system - FIXED VERSION."""
        initial_state = {
            "question": question,
            "documents": "",
            "images": [],
            "chat_history": [],
            "image_query_triggered": False,
            "answer": ""  # Initialize answer field
        }

        try:
            # Use invoke() to get final merged state (not stream())
            final_state = self.graph.invoke(initial_state)

            # Extract answer - handle different state structures
            if isinstance(final_state, dict):
                # Method 1: Direct answer field (from invoke)
                answer = final_state.get("answer", "")
                if answer and answer.strip() and len(answer.strip()) > 10:
                    return answer.strip()

                # Method 2: Nested under node name (from stream or node output)
                if "answer_generator" in final_state:
                    nested = final_state["answer_generator"]
                    if isinstance(nested, dict):
                        nested_answer = nested.get("answer", "")
                        if nested_answer and nested_answer.strip() and len(nested_answer.strip()) > 10:
                            return nested_answer.strip()

                # Method 3: From messages
                if "messages" in final_state:
                    messages = final_state["messages"]
                    for msg in reversed(messages):
                        if hasattr(msg, "content") and msg.content:
                            content = str(msg.content).strip()
                            if len(content) > 10:
                                return content
                        elif isinstance(msg, str) and len(msg.strip()) > 10:
                            return msg.strip()

            return "No answer generated. Please try rephrasing your question."

        except Exception as e:
            error_msg = str(e)
            print(f"Error in query: {error_msg[:200]}")
            import traceback
            traceback.print_exc()
            return f"Error: {error_msg[:200]}. Please check your configuration and try again."
'''

# Replacement for the nested generate_answer(state) node function.
new_generate_answer = '''        def generate_answer(state):
            """Generate the final answer - FIXED VERSION."""
            question = state.get("question", "")
            documents = state.get("documents", "")
            images = state.get("images", [])
            chat_history = state.get("chat_history", [])

            # CRITICAL: Prevent infinite loops
            # Check if we're repeating the same question
            if chat_history:
                last_entries = chat_history[-3:] if len(chat_history) >= 3 else chat_history
                question_count = sum(1 for entry in last_entries if entry[0] == "user" and entry[1] == question)
                if question_count >= 2:
                    return {
                        "answer": "I notice this question was already asked. Please try rephrasing or ask a different question.",
                        "chat_history": chat_history
                    }

            # Limit chat history to prevent token overflow
            if len(chat_history) > 6:
                chat_history = chat_history[-6:]  # Keep only last 6 entries

            # If using OpenAI and images are available, use vision model
            if self.primary_llm_type == "openai" and images:
                try:
                    image_messages = [{
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img}"}
                    } for img in images[:2]]  # Limit to 2 images

                    vision_content = [
                        {"type": "text", "text": f"Context: {documents}\\nQuestion: {question}"},
                        *image_messages
                    ]
                    response = self.vision_llm.invoke([("user", vision_content)])
                    answer = response.content if hasattr(response, "content") else str(response)
                except Exception as e:
                    print(f"Vision model error: {e}, falling back to text")
                    answer_chain = self.answer_agent
                    response = answer_chain.invoke({"question": question, "documents": documents, "images": []})
                    answer = response.content if hasattr(response, "content") else str(response)
            else:
                # Use standard LLM
                answer_chain = self.answer_agent
                try:
                    response = answer_chain.invoke({"question": question, "documents": documents, "images": images})

                    # Handle different response types
                    if isinstance(response, str):
                        answer = response
                    elif hasattr(response, "content"):
                        answer = response.content
                    else:
                        answer = str(response)
                except Exception as e:
                    print(f"LLM error: {e}")
                    answer = f"I encountered an error while generating the answer: {str(e)[:100]}"

            # Clean up answer (cap runaway output)
            if answer:
                words = answer.split()
                if len(words) > 200:
                    # If too long, take first 200 words
                    answer = " ".join(words[:200]) + "..."

            # Update chat history - prevent exact duplicates
            if not chat_history or chat_history[-1] != ("user", question):
                new_history = chat_history + [("user", question), ("assistant", answer)]
            else:
                # The question is already the last entry: keep it and append
                # only the answer.
                new_history = chat_history + [("assistant", answer)]

            return {
                "answer": answer,
                "chat_history": new_history
            }
'''


def replace_function_block(source, header, replacement):
    """Swap one ``def`` block in ``source`` for ``replacement``.

    The block is the first line whose stripped text starts with ``header``
    plus every following line that is blank or indented deeper than that
    line. Blank lines trailing the block are kept. ``replacement`` is
    re-indented to the column of the definition it replaces.

    The text is spliced in directly rather than passed to ``re.sub`` as a
    replacement template, because ``re.sub`` would interpret backslash
    escapes (such as ``\\n`` inside a string literal) in the new code.

    Args:
        source: Full text of the module being patched.
        header: Start of the definition line, e.g. ``"def query(self"``.
        replacement: New definition; any uniform indentation is accepted.

    Returns:
        ``(patched_source, True)`` when the block was found, otherwise
        ``(source, False)`` with ``source`` unchanged.
    """
    lines = source.split('\n')
    for start, line in enumerate(lines):
        stripped = line.lstrip()
        if not stripped.startswith(header):
            continue

        indent = line[:len(line) - len(stripped)]
        last_body_line = start
        for position in range(start + 1, len(lines)):
            candidate = lines[position]
            candidate_text = candidate.lstrip()
            if not candidate_text:
                continue  # blank lines never end the block
            if len(candidate) - len(candidate_text) <= len(indent):
                break  # dedent: first line that belongs to the enclosing scope
            last_body_line = position

        new_block = textwrap.indent(
            textwrap.dedent(replacement).strip('\n'), indent
        )
        patched = lines[:start] + new_block.split('\n') + lines[last_body_line + 1:]
        return '\n'.join(patched), True

    return source, False


if os.path.isfile('agents.py'):
    with open('agents.py', 'r', encoding='utf-8') as f:
        agents_code = f.read()

    agents_code, query_patched = replace_function_block(
        agents_code, 'def query(self, question: str):', new_query_method
    )
    agents_code, generate_patched = replace_function_block(
        agents_code, 'def generate_answer(state):', new_generate_answer
    )

    # Write fixed file
    with open('agents.py', 'w', encoding='utf-8') as f:
        f.write(agents_code)

    # Report what was really changed instead of always claiming success.
    if query_patched and generate_patched:
        print("‚úÖ Fixed agents.py successfully!")
    else:
        print("WARNING: agents.py was not fully patched; see details below")

    if query_patched:
        print("   ‚úì Fixed query() method to extract answer correctly")
    else:
        print("   - 'def query(self, question: str):' not found; left unchanged")

    if generate_patched:
        print("   ‚úì Fixed generate_answer() to prevent infinite loops")
        print("   ‚úì Added duplicate detection and chat history limits")
    else:
        print("   - 'def generate_answer(state):' not found; left unchanged")
else:
    print("WARNING: agents.py not found in the current directory; nothing was patched")


## Step 6: Initialize the System


In [None]:
# Import all modules
from pdf_processor import MultimodalPDFProcessor
from vector_store import MultimodalVectorStore
from agents import MultiAgentRAG
import config
import shutil
import os

# Clean up existing vector store if it exists (to avoid ChromaDB conflicts).
# Also works around the Colab read-only database issue.
import time

vector_store_path = os.path.abspath(config.VECTOR_STORE_PATH)
if os.path.exists(vector_store_path):
    # Make the whole tree writable first. Directories are included because
    # deleting an entry needs write permission on its parent directory.
    # The loop variables avoid the name "files" so that google.colab.files is
    # not shadowed when this cell is re-run.
    for root, dir_names, file_names in os.walk(vector_store_path):
        writable_targets = [(root, 0o755)]
        writable_targets += [(os.path.join(root, name), 0o755) for name in dir_names]
        writable_targets += [(os.path.join(root, name), 0o644) for name in file_names]
        for target, mode in writable_targets:
            try:
                os.chmod(target, mode)
            except OSError:
                # Best effort only: rmtree below reports any real failure.
                pass

    try:
        shutil.rmtree(vector_store_path)
        print(f"‚úÖ Cleaned up existing vector store: {vector_store_path}")
    except OSError as e:
        print(f"‚ö†Ô∏è Could not clean up vector store: {e}")
        # Fall back to a fresh, timestamped directory to avoid readonly issues.
        unique_path = f"{vector_store_path}_{int(time.time())}"
        print(f"üîÑ Using unique path instead: {unique_path}")
        os.environ["VECTOR_STORE_PATH"] = unique_path
        # NOTE(review): vector_store was imported before this assignment; if it
        # copied VECTOR_STORE_PATH at import time it will not see the new
        # value - confirm against vector_store.py.
        config.VECTOR_STORE_PATH = unique_path

import traceback

# Build the vector store; on failure print the error and traceback, then
# re-raise so the cell stops.
print("üì¶ Initializing vector store...")
try:
    vector_store = MultimodalVectorStore()
    print(f"‚úÖ Vector store initialized with: {vector_store.embedding_type} embeddings")
except Exception as store_error:
    print(f"‚ùå Error initializing vector store: {store_error}")
    traceback.print_exc()
    raise

# Build the multi-agent RAG system on top of that store, with the same
# report-and-re-raise handling.
print("ü§ñ Initializing RAG system...")
try:
    rag_system = MultiAgentRAG(vector_store, use_huggingface_primary=True)
    print(f"‚úÖ RAG system initialized with: {rag_system.primary_llm_type} LLM")
except Exception as rag_error:
    print(f"‚ùå Error initializing RAG system: {rag_error}")
    traceback.print_exc()
    raise

print("\nüéâ System ready!")


## Step 7: Upload and Process PDF


In [None]:
from google.colab import files
import re
import importlib
import sys
import os

# CRITICAL FIX: Ensure vector_store.py has the numpy array fix applied.
# This is a safety check in case Step 5 didn't catch it.

# "if image_embedding:" with any spacing; the same pattern is used both to
# detect the problem and to fix it, so the two can never disagree.
IMAGE_EMBEDDING_TRUTH_TEST = re.compile(r'if\s+image_embedding\s*:')


def fix_image_embedding_checks(source):
    """Rewrite array truth tests in ``source``.

    Returns:
        ``(patched_source, number_of_replacements)``.
    """
    return IMAGE_EMBEDDING_TRUTH_TEST.subn('if image_embedding is not None:', source)


def locate_vector_store_source():
    """Return the path of the vector_store.py that should be patched.

    The file the already-imported module was loaded from is preferred, since
    that is the file a reload reads again. Otherwise the current directory
    tree is searched, and finally ./vector_store.py is returned as a fallback.
    """
    loaded_module = sys.modules.get('vector_store')
    module_file = getattr(loaded_module, '__file__', None)
    if module_file and os.path.isfile(module_file):
        return module_file

    # NOTE: Use 'file_list' instead of 'files' to avoid shadowing google.colab.files
    for root, _dirs, file_list in os.walk('.'):
        if 'vector_store.py' in file_list:
            return os.path.join(root, 'vector_store.py')

    return 'vector_store.py'


try:
    vs_path = locate_vector_store_source()

    with open(vs_path, 'r') as f:
        vs_content = f.read()

    # Report the first offending line before it is rewritten.
    for line_number, line in enumerate(vs_content.split('\n'), start=1):
        if IMAGE_EMBEDDING_TRUTH_TEST.search(line):
            print(f"üîß Found problematic line {line_number}: {line.strip()}")
            break

    vs_content, fixed_count = fix_image_embedding_checks(vs_content)
    needs_fix = fixed_count > 0

    if needs_fix:
        print("üîß Applying numpy array fix to vector_store.py...")
        with open(vs_path, 'w') as f:
            f.write(vs_content)
        print("‚úÖ Fixed numpy array check in vector_store.py")

        if 'vector_store' in sys.modules:
            importlib.reload(sys.modules['vector_store'])
            print("‚úÖ Reloaded vector_store module")
            # A reload creates a new class object; instances built earlier keep
            # the old methods, so the store is rebuilt from the reloaded class.
            # NOTE(review): rag_system from Step 6 was constructed with the
            # previous store instance - confirm whether it must be rebuilt too.
            from vector_store import MultimodalVectorStore
            vector_store = MultimodalVectorStore()
    else:
        print("‚úÖ Numpy array fix already applied")
except Exception as e:
    print(f"‚ö†Ô∏è Could not apply fix: {e}")
    import traceback
    traceback.print_exc()

# Upload PDF file(s) through the Colab upload widget
print("üì§ Upload your PDF file...")
uploaded = files.upload()

# Process every uploaded PDF into chunks
processor = MultimodalPDFProcessor(processing_mode=config.PDF_PROCESSING_MODE)

all_chunks = []
for filename in uploaded:
    # Compare the extension case-insensitively so "REPORT.PDF" is processed
    # too, and say so when a file is skipped instead of dropping it silently.
    if not filename.lower().endswith('.pdf'):
        print(f"Skipping {filename}: not a .pdf file")
        continue

    print(f"\nüìÑ Processing {filename}...")
    chunks = processor.process_pdf(filename)
    all_chunks.extend(chunks)
    print(f"‚úÖ Extracted {len(chunks)} chunks from {filename}")

# Add the extracted chunks to the vector store. If the store still contains a
# numpy truth-value check, patch its source, reload it and retry once.
if all_chunks:
    print(f"\nüíæ Adding {len(all_chunks)} chunks to vector store...")
    try:
        vector_store.add_documents(all_chunks)
        print("‚úÖ Documents added successfully!")
    except ValueError as e:
        if "truth value of an array" not in str(e):
            raise

        print("‚ùå Error: Numpy array truth value issue detected.")
        print("üîß Attempting to fix vector_store.py and retry...")

        # Patch the file the module was really loaded from: that is the file
        # importlib.reload() reads again, and it is not necessarily
        # ./vector_store.py.
        loaded_module = sys.modules.get('vector_store')
        module_path = getattr(loaded_module, '__file__', None) or 'vector_store.py'
        with open(module_path, 'r') as f:
            vs_content = f.read()
        vs_content = re.sub(
            r'if\s+image_embedding\s*:',
            'if image_embedding is not None:',
            vs_content
        )
        with open(module_path, 'w') as f:
            f.write(vs_content)

        # Reload and retry with a store built from the patched class.
        if loaded_module is not None:
            importlib.reload(loaded_module)
        from vector_store import MultimodalVectorStore
        # NOTE(review): rag_system from Step 6 was constructed with the previous
        # store instance - confirm whether it must be rebuilt as well.
        vector_store = MultimodalVectorStore()
        vector_store.add_documents(all_chunks)
        print("‚úÖ Fixed and documents added successfully!")
else:
    print("‚ö†Ô∏è No chunks extracted from PDF")


## Step 8: Ask Questions


In [None]:
import traceback

# The question to put to the RAG system; edit this string to ask something else.
question = "tell me about the image in doc"  # Change this to your question

print(f"‚ùì Question: {question}\n")
print("ü§î Thinking...\n")

# Any failure is reported with its traceback instead of stopping the cell.
try:
    answer = rag_system.query(question)
    print(f"\nüí¨ Answer:\n{answer}")
except Exception as query_error:
    print(f"\n‚ùå Error: {query_error}")
    traceback.print_exc()


## Step 9: Interactive Chat (Optional)


In [None]:
# Simple interactive chat loop
print("üí¨ Chat with your documents (type 'quit' to exit)\n")

# (question, answer) pairs collected during this session.
chat_history = []

while True:
    try:
        # Strip once so that "quit " or " q" still exits and blank input is
        # ignored.
        question = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupt at the prompt: leave the loop
        # cleanly instead of ending the cell with a traceback.
        print()
        print("üëã Goodbye!")
        break

    if not question:
        continue

    if question.lower() in ('quit', 'exit', 'q'):
        print("üëã Goodbye!")
        break

    try:
        print("ü§î Thinking...")
        answer = rag_system.query(question)
        print(f"\nü§ñ Assistant: {answer}")
        chat_history.append((question, answer))
    except Exception as e:
        # Keep the session alive; report the failure and prompt again.
        print(f"\n‚ùå Error: {e}")
