In [None]:
import os
import base64
import streamlit as st
from PIL import Image
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf

# Ensure the image output directory exists. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("figures", exist_ok=True)

class MultiModalRAG:
    """Retrieval-augmented question answering over a PDF's text and images.

    Text chunks are stored in the vector index as-is. Images are first turned
    into a text description by the chat model so that they can be embedded and
    retrieved through the same similarity search.
    """

    # Number of chunks sent to the vector store per add_texts() call.
    _INDEX_BATCH_SIZE = 100

    def __init__(self):
        # We use GPT-4o for both vision and reasoning
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.vectorstore = Chroma(
            collection_name="enterprise_rag",
            embedding_function=OpenAIEmbeddings(),
            persist_directory="./chroma_db"
        )

    @staticmethod
    def encode_image(image_path):
        """Return the bytes of the file at *image_path* as a base64 string."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    @staticmethod
    def _image_mime_type(image_path):
        """Guess the image MIME type of *image_path* from its extension.

        Returns ``"image/jpeg"`` when the extension is missing, unknown, or
        not an image type, which matches the value previously hard-coded.
        """
        import mimetypes

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return "image/jpeg"

    def summarize_image(self, image_path):
        """Generate a searchable text description for an image.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The content of the chat model's response.
        """
        b64_image = self.encode_image(image_path)
        # Label the data URL with the real image type; PNG files are accepted
        # as queries, so a fixed "image/jpeg" label would be wrong for them.
        mime_type = self._image_mime_type(image_path)
        response = self.llm.invoke([
            HumanMessage(content=[
                {"type": "text", "text": "Analyze this image from a document. Provide a detailed summary of its content, charts, or data for indexing."},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}}
            ])
        ])
        return response.content

    def process_pdf(self, file_path):
        """Extract text and images from a PDF and store them in the index.

        Args:
            file_path: Path of the PDF to partition. It is also recorded as
                the ``source`` metadata of every stored chunk.
        """
        # NOTE(review): with chunking_strategy set, image elements may be
        # folded into composite chunks instead of being returned on their
        # own -- confirm the image branch below is actually reached.
        elements = partition_pdf(
            filename=file_path,
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=1000,
            image_output_dir_path="figures"
        )

        texts, metadatas = [], []
        for el in elements:
            metadata = {"source": file_path, "type": "text"}
            if "Image" in str(type(el)):
                img_path = el.metadata.image_path
                if not img_path:
                    # No file was written for this image, so there is
                    # nothing to summarize.
                    continue
                content = self.summarize_image(img_path)
                metadata.update({"type": "image", "image_path": img_path})
            else:
                content = el.text

            if content:
                texts.append(content)
                metadatas.append(metadata)

            # Write in batches rather than one add_texts() call per chunk.
            if len(texts) >= self._INDEX_BATCH_SIZE:
                self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
                texts, metadatas = [], []

        if texts:
            self.vectorstore.add_texts(texts=texts, metadatas=metadatas)

    def get_final_answer(self, query, context_docs):
        """Ask the LLM to answer *query* using only the retrieved context.

        Args:
            query: The question text.
            context_docs: Retrieved documents; their ``page_content`` values
                are joined to form the context.

        Returns:
            The content of the chat model's response.
        """
        context_text = "\n\n".join(d.page_content for d in context_docs)

        prompt = [
            SystemMessage(content="You are a helpful assistant. Answer the question using ONLY the provided context. If the answer isn't in the context, say you don't know."),
            HumanMessage(content=f"Context:\n{context_text}\n\nQuestion: {query}")
        ]
        response = self.llm.invoke(prompt)
        return response.content

    def query_system(self, user_input, is_image=False):
        """Answer a text or image query.

        Args:
            user_input: The question text, or the path of an image file when
                *is_image* is true.
            is_image: Whether *user_input* is an image path. The image is
                summarized first and the summary is used as the search query.

        Returns:
            A ``(answer, docs)`` tuple: the generated answer and the retrieved
            documents it was based on.
        """
        search_query = user_input
        if is_image:
            search_query = self.summarize_image(user_input)
            st.info("Image search initialized...")

        # 1. Retrieve the most relevant 3 documents (text or image summaries)
        docs = self.vectorstore.similarity_search(search_query, k=3)

        # 2. Generate a natural language answer based on those docs
        final_answer = self.get_final_answer(search_query, docs)

        return final_answer, docs

# --- STREAMLIT UI ---
st.set_page_config(page_title="Multimodal RAG", layout="wide")
st.title("📑 Multi-Modal Enterprise Intelligence")

# Keep one engine per browser session so it is not rebuilt on every rerun.
if "rag" not in st.session_state:
    st.session_state.rag = MultiModalRAG()

with st.sidebar:
    st.header("Upload Knowledge Base")
    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf and st.button("Index Document"):
        with st.spinner("Indexing..."):
            # The partitioner reads from a file path, so spool the upload to disk.
            with open("temp.pdf", "wb") as f:
                f.write(uploaded_pdf.getbuffer())
            st.session_state.rag.process_pdf("temp.pdf")
            st.success("Indexing Complete!")

st.subheader("Query the Document")
col1, col2 = st.columns([2, 1])
with col2:
    uploaded_query_img = st.file_uploader("Search using an image?", type=["jpg", "png", "jpeg"])
with col1:
    user_text = st.chat_input("Ask a question about the PDF...")

# Input Processing Logic
query_input = None
is_image_query = False

# A submitted question wins over a lingering upload: the chat input only
# carries a value on the rerun in which it was sent, while the uploader keeps
# its file, so checking the image first would ignore every typed question.
if user_text:
    query_input, is_image_query = user_text, False
elif uploaded_query_img:
    # The file name comes from the client; keep only its final component so
    # it cannot point outside the figures directory.
    safe_name = os.path.basename(uploaded_query_img.name.replace("\\", "/"))
    img_path = f"figures/query_{safe_name}"
    with open(img_path, "wb") as f:
        f.write(uploaded_query_img.getbuffer())
    query_input, is_image_query = img_path, True

# Display logic
if query_input:
    with st.spinner("Thinking..."):
        answer, matched_docs = st.session_state.rag.query_system(query_input, is_image=is_image_query)

        # Show the conversational answer
        st.markdown(f"### Answer:\n{answer}")

        # Show relevant images found during the search
        st.markdown("---")
        st.markdown("#### Supporting Evidence from PDF:")

        if not matched_docs:
            # st.columns() rejects a count of zero, e.g. for an empty index.
            st.caption("No matching content found in the index.")
        else:
            cols = st.columns(len(matched_docs))
            for col, doc in zip(cols, matched_docs):
                with col:
                    if doc.metadata.get("type") == "image":
                        image_path = doc.metadata.get("image_path")
                        # The index is persisted, so it can reference an
                        # image file that is no longer on disk.
                        if image_path and os.path.exists(image_path):
                            st.image(image_path, caption="Found Image Context")
                        else:
                            st.caption("Image Context Found (file unavailable)")
                    else:
                        st.caption("Text Context Found")
                    with st.expander("View Source Content"):
                        st.write(doc.page_content)

In [None]:
# # 📑 Multimodal Enterprise Intelligence RAG
# > **An end-to-end RAG pipeline capable of querying unstructured data (PDFs, Images, and Tables) using Cross-Modal Retrieval.**

# ## 🚀 Overview
# Traditional RAG systems are limited to text. This project implements a **Multi-Vector Retrieval** architecture that allows users to query a document using either **text** or **images**. The system "sees" the charts, tables, and images within a PDF, indexes them semantically, and retrieves them contextually.



# ## ✨ Key Features
# * **Multimodal Ingestion:** Uses `Unstructured.io` to partition complex PDFs into text chunks, structured tables, and high-resolution images.
# * **Vision-Language Indexing:** Leverages **GPT-4o** to generate searchable text summaries of visual data, enabling images to be retrieved via standard vector similarity.
# * **Cross-Modal Querying:** Users can upload a separate image (e.g., a photo of a graph) to find matching or semantically related content within the indexed PDF.
# * **Unified Context:** The system provides "Ground Truth" by displaying the original source image alongside the AI-generated text answer.
# * **Industry-Ready UI:** Clean, interactive **Streamlit** interface for easy document indexing and real-time chat.

# ## 🏗️ Technical Architecture
# The system follows a 4-stage pipeline to ensure accuracy and explainability:

# 1.  **Partitioning:** PDF elements are separated. Images are saved to a local store while tables are preserved in HTML/Markdown format.
# 2.  **Summarization:** Each image is sent to the **GPT-4o Vision API** to generate a "description proxy" that captures data trends and visual context.
# 3.  **Vector Store:** All content (text, tables, and image summaries) is converted into embeddings using `OpenAIEmbeddings` and stored in **ChromaDB**.
# 4.  **Retrieval Logic:**
#     * **Text Query:** Uses cosine similarity to find relevant chunks.
#     * **Image Query:** The user's uploaded image is first "translated" into a textual description by the LLM, which is then used to perform a similarity search against the index.



# ## 🛠️ Tech Stack
# * **LLM:** OpenAI GPT-4o (Vision & Text capabilities)
# * **Orchestration:** LangChain
# * **Vector Database:** ChromaDB (Persistent)
# * **Data Extraction:** Unstructured.io
# * **Frontend:** Streamlit
# * **Language:** Python 3.10+

# ## 📥 Installation & Setup

# ### 1. Prerequisites
# Ensure you have an OpenAI API Key and `Poppler` (required for PDF image extraction) installed on your system.

# ### 2. Clone & Install
# ```bash
# git clone https://github.com/yourusername/multimodal-rag-enterprise.git
# cd multimodal-rag-enterprise
# pip install langchain langchain-openai langchain-community "unstructured[pdf]" chromadb pillow streamlit rapidocr-onnxruntime
# ```

# ### 3. Set Environment Variables
# ```bash
# export OPENAI_API_KEY='your_api_key_here'
# ```

# ### 4. Run the App
# ```bash
# streamlit run app.py
# ```

In [None]:
import os
import base64
import shutil
import streamlit as st
from PIL import Image
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf

# --- CONFIGURATION & DIRECTORIES ---
CHROMA_PATH = "./chroma_db"
IMAGE_DIR = "figures"

# exist_ok=True avoids the check-then-create race of testing
# os.path.exists() before calling os.makedirs().
for folder in (IMAGE_DIR, CHROMA_PATH):
    os.makedirs(folder, exist_ok=True)

class MultiModalRAG:
    """Retrieval-augmented question answering over a PDF's text and images.

    Text chunks are stored in the vector index as-is. Images are first turned
    into a text description by the chat model so that they can be embedded and
    retrieved through the same similarity search.
    """

    # Number of chunks sent to the vector store per add_texts() call.
    _INDEX_BATCH_SIZE = 100

    def __init__(self):
        # One chat model handles both image description and answering.
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
        self.vectorstore = Chroma(
            collection_name="enterprise_rag",
            embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
            persist_directory=CHROMA_PATH
        )

    @staticmethod
    def encode_image(image_path):
        """Return the bytes of the file at *image_path* as a base64 string."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    @staticmethod
    def _image_mime_type(image_path):
        """Guess the image MIME type of *image_path* from its extension.

        Returns ``"image/jpeg"`` when the extension is missing, unknown, or
        not an image type, which matches the value previously hard-coded.
        """
        import mimetypes

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return "image/jpeg"

    def summarize_image(self, image_path):
        """Turn an image into searchable text.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The content of the chat model's response.
        """
        b64_image = self.encode_image(image_path)
        # Label the data URL with the real image type; PNG files are accepted
        # as queries, so a fixed "image/jpeg" label would be wrong for them.
        mime_type = self._image_mime_type(image_path)
        response = self.llm.invoke([
            HumanMessage(content=[
                {"type": "text", "text": "Analyze this image from a document. Describe charts, data points, or visual content for a search index."},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}}
            ])
        ])
        return response.content

    def process_pdf(self, file_path):
        """Partition a PDF and store its text and image summaries in the index.

        Args:
            file_path: Path of the PDF to partition. It is also recorded as
                the ``source`` metadata of every stored chunk.
        """
        # NOTE(review): with chunking_strategy set, image elements may be
        # folded into composite chunks instead of being returned on their
        # own -- confirm the image branch below is actually reached.
        elements = partition_pdf(
            filename=file_path,
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=1000,
            image_output_dir_path=IMAGE_DIR
        )

        texts, metadatas = [], []
        for el in elements:
            metadata = {"source": file_path, "type": "text"}
            if "Image" in str(type(el)):
                img_path = el.metadata.image_path
                if not img_path:
                    # No file was written for this image, so there is
                    # nothing to summarize.
                    continue
                content = self.summarize_image(img_path)
                metadata.update({"type": "image", "image_path": img_path})
            else:
                content = el.text

            if content:
                texts.append(content)
                metadatas.append(metadata)

            # Write in batches rather than one add_texts() call per chunk.
            if len(texts) >= self._INDEX_BATCH_SIZE:
                self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
                texts, metadatas = [], []

        if texts:
            self.vectorstore.add_texts(texts=texts, metadatas=metadatas)

    def query_system(self, user_input, is_image=False):
        """Answer a text or image query from the indexed content.

        Args:
            user_input: The question text, or the path of an image file when
                *is_image* is true.
            is_image: Whether *user_input* is an image path. The image is
                summarized first and the summary is used as the search query.

        Returns:
            A ``(answer, docs)`` tuple: the generated answer and the retrieved
            documents it was based on.
        """
        search_query = user_input
        if is_image:
            search_query = self.summarize_image(user_input)

        # Retrieve top matches
        docs = self.vectorstore.similarity_search(search_query, k=3)

        # Formulate answer based on context
        context_text = "\n\n".join(d.page_content for d in docs)
        prompt = [
            SystemMessage(content="You are a helpful assistant. Answer the question using ONLY the provided context and if an image is relevant, mention it. If the answer isn't in the context, say you don't know."),
            HumanMessage(content=f"Context:\n{context_text}\n\nQuestion: {search_query}")
        ]
        answer = self.llm.invoke(prompt).content
        return answer, docs

# --- STREAMLIT DASHBOARD ---
st.set_page_config(page_title="Multimodal RAG", layout="wide")
st.title("📑 Multi-Modal Enterprise Intelligence")

# Keep one engine per browser session so it is not rebuilt on every rerun.
if "rag" not in st.session_state:
    st.session_state.rag = MultiModalRAG()

with st.sidebar:
    st.header("Admin Controls")
    uploaded_pdf = st.file_uploader("Upload Knowledge Base", type="pdf")

    if uploaded_pdf and st.button("Index Document"):
        with st.spinner("Indexing..."):
            # The partitioner reads from a file path, so spool the upload to disk.
            with open("temp.pdf", "wb") as f:
                f.write(uploaded_pdf.getbuffer())
            st.session_state.rag.process_pdf("temp.pdf")
            st.success("PDF Content Embedded!")

    if st.button("🗑️ Reset System", help="Clears the Vector DB and Images"):
        # Empty the index through the live client instead of deleting the
        # database directory underneath it, and drop the session's engine so
        # the next run builds a fresh one rather than reusing a stale handle.
        rag = st.session_state.pop("rag", None)
        if rag is not None:
            rag.vectorstore.delete_collection()
        shutil.rmtree(IMAGE_DIR, ignore_errors=True)
        st.cache_resource.clear()
        st.rerun()

# --- CHAT & INTERACTION ---
st.subheader("Interactive Knowledge Retrieval")
user_text = st.chat_input("Ask a question about the document...")
uploaded_query_img = st.file_uploader("Or upload an image to search by visual context", type=["jpg", "png", "jpeg"])

# Retrieval Logic
query_val = None
is_img = False

# A submitted question wins over a lingering upload: the chat input only
# carries a value on the rerun in which it was sent, while the uploader keeps
# its file, so checking the image first would ignore every typed question.
if user_text:
    query_val, is_img = user_text, False
elif uploaded_query_img:
    # The file name comes from the client; keep only its final component so
    # it cannot point outside the image directory.
    safe_name = os.path.basename(uploaded_query_img.name.replace("\\", "/"))
    img_save_path = os.path.join(IMAGE_DIR, f"query_{safe_name}")
    with open(img_save_path, "wb") as f:
        f.write(uploaded_query_img.getbuffer())
    query_val, is_img = img_save_path, True

if query_val:
    with st.spinner("Analyzing context..."):
        ans, results = st.session_state.rag.query_system(query_val, is_image=is_img)

        st.markdown(f"### 🤖 Response\n{ans}")

        st.markdown("---")
        st.markdown("#### 🖼️ Evidence & Sources")
        cols = st.columns(3)
        for i, doc in enumerate(results):
            with cols[i % 3]:
                doc_type = doc.metadata.get("type", "text")
                image_path = doc.metadata.get("image_path")
                # Reset removes the image files, and the index is persisted,
                # so a stored path may no longer exist on disk.
                if doc_type == "image" and image_path and os.path.exists(image_path):
                    st.image(image_path, use_container_width=True)
                st.info(f"Source Type: {doc_type.upper()}")
                with st.expander("Show Raw Context"):
                    st.write(doc.page_content)

In [None]:
import os
import base64
import shutil
import streamlit as st
from PIL import Image
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf

# --- 1. SETTINGS & DIRECTORIES ---
# Folder that receives the images extracted from PDFs; created up front.
IMAGE_DIR = "extracted_figures"
os.makedirs(name=IMAGE_DIR, exist_ok=True)

# Folder passed to Chroma as its persist directory.
CHROMA_PATH = "./chroma_db"

class HybridMultimodalRAG:
    """Multimodal RAG engine using an OpenAI chat model and local embeddings.

    Text chunks are stored in the vector index as-is. Images are first turned
    into a text description by the chat model so that they can be embedded and
    retrieved through the same similarity search.
    """

    # Number of chunks sent to the vector store per add_texts() call.
    _INDEX_BATCH_SIZE = 100

    def __init__(self):
        # One chat model handles both image description and answering.
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

        # LOW COST: Using free local HuggingFace embeddings instead of OpenAI
        # This reduces embedding costs to $0.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

        self.vectorstore = Chroma(
            collection_name="enterprise_inventory",
            embedding_function=self.embeddings,
            persist_directory=CHROMA_PATH
        )

    @staticmethod
    def encode_image(image_path):
        """Convert image to base64 for LLM vision processing."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    @staticmethod
    def _image_mime_type(image_path):
        """Guess the image MIME type of *image_path* from its extension.

        Returns ``"image/jpeg"`` when the extension is missing, unknown, or
        not an image type, which matches the value previously hard-coded.
        """
        import mimetypes

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return "image/jpeg"

    def describe_image(self, image_path):
        """Use the vision LLM to turn an image into searchable text.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The content of the chat model's response.
        """
        b64_image = self.encode_image(image_path)
        # Label the data URL with the real image type; PNG files are accepted
        # as queries, so a fixed "image/jpeg" label would be wrong for them.
        mime_type = self._image_mime_type(image_path)
        response = self.llm.invoke([
            HumanMessage(content=[
                {"type": "text", "text": "Describe this document image/chart in detail for indexing. Focus on data, labels, and trends."},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}}
            ])
        ])
        return response.content

    def ingest_pdf(self, pdf_path):
        """Extract text and images, summarize the images, and index everything.

        Args:
            pdf_path: Path of the PDF to partition. It is also recorded as
                the ``source`` metadata of every stored chunk.
        """
        # Partitioning PDF with image extraction
        # NOTE(review): with chunking_strategy set, image elements may be
        # folded into composite chunks instead of being returned on their
        # own -- confirm the image branch below is actually reached.
        elements = partition_pdf(
            filename=pdf_path,
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=1000,
            image_output_dir_path=IMAGE_DIR
        )

        texts, metadatas = [], []
        for el in elements:
            metadata = {"source": pdf_path}

            if "Image" in str(type(el)):
                # Handle Image
                img_path = el.metadata.image_path
                if not img_path:
                    # No file was written for this image, so there is
                    # nothing to describe.
                    continue
                content = self.describe_image(img_path)
                metadata.update({"type": "image", "image_path": img_path})
            else:
                # Handle Text
                content = el.text
                metadata.update({"type": "text"})

            # Skip empty chunks instead of indexing blank entries.
            if content:
                texts.append(content)
                metadatas.append(metadata)

            # Write in batches rather than one add_texts() call per chunk.
            if len(texts) >= self._INDEX_BATCH_SIZE:
                self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
                texts, metadatas = [], []

        if texts:
            self.vectorstore.add_texts(texts=texts, metadatas=metadatas)

    def query(self, user_input, is_image=False):
        """Answer a text or image query from the indexed content.

        Args:
            user_input: The question text, or the path of an image file when
                *is_image* is true.
            is_image: Whether *user_input* is an image path. The image is
                described first and the description is used as the search
                query.

        Returns:
            A ``(answer, docs)`` tuple: the generated answer and the retrieved
            documents it was based on.
        """
        search_text = user_input

        # If user uploads an image to ask a question
        if is_image:
            search_text = self.describe_image(user_input)
            st.toast(f"Searching for visual context: {search_text[:50]}...")

        # Search top 3 relevant chunks
        docs = self.vectorstore.similarity_search(search_text, k=3)

        # Generate final answer based on context
        context = "\n\n".join(d.page_content for d in docs)
        prompt = [
            SystemMessage(content="Answer based ONLY on the provided context. If an image is relevant, mention it."),
            HumanMessage(content=f"Context: {context}\n\nQuestion: {search_text}")
        ]
        answer = self.llm.invoke(prompt).content
        return answer, docs

# --- 2. STREAMLIT INTERFACE ---
st.set_page_config(page_title="MultiModal RAG", layout="wide")
st.title("📑 Smart Multimodal Enterprise RAG")
st.markdown("---")

# Sidebar Setup (API key first: the engine cannot be built without it)
with st.sidebar:
    st.header("Admin Panel")
    openai_key = st.text_input("OpenAI API Key", type="password")
    if openai_key and os.environ.get("OPENAI_API_KEY") != openai_key:
        os.environ["OPENAI_API_KEY"] = openai_key
        # An engine created under a different key keeps using it; rebuild.
        st.session_state.pop("rag_engine", None)

# Building the engine before a key exists fails during construction, which
# would stop the script before the key field above is ever rendered.
if not os.environ.get("OPENAI_API_KEY"):
    st.warning("Enter your OpenAI API key in the sidebar to get started.")
    st.stop()

# Initialize Session
if "rag_engine" not in st.session_state:
    st.session_state.rag_engine = HybridMultimodalRAG()

with st.sidebar:
    uploaded_file = st.file_uploader("Upload PDF Knowledge Base", type="pdf")
    if uploaded_file and st.button("🔥 Start Indexing"):
        with st.spinner("Processing PDF (Extraction & Vision Summary)..."):
            # The partitioner reads from a file path, so spool the upload to disk.
            with open("temp_doc.pdf", "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.session_state.rag_engine.ingest_pdf("temp_doc.pdf")
            st.success("Indexing Complete!")

    if st.button("🗑️ Reset All Data"):
        # Empty the index through the live client instead of deleting the
        # database directory underneath it, and drop the session's engine so
        # the next run builds a fresh one rather than reusing a stale handle.
        engine = st.session_state.pop("rag_engine", None)
        if engine is not None:
            engine.vectorstore.delete_collection()
        shutil.rmtree(IMAGE_DIR, ignore_errors=True)
        st.rerun()

# Main Search Area
col1, col2 = st.columns([3, 1])
with col1:
    user_query = st.chat_input("Ask about your document...")
with col2:
    query_img = st.file_uploader("Upload image query", type=['png', 'jpg', 'jpeg'])

# Execution Logic
if user_query or query_img:
    # A submitted question wins over a lingering upload: the chat input only
    # carries a value on the rerun in which it was sent, while the uploader
    # keeps its file, so preferring the image would ignore typed questions.
    if user_query:
        input_val = user_query
        is_img_mode = False
    else:
        # Keep the upload's real extension so a PNG is not stored as .jpg;
        # anything unexpected falls back to the previous fixed name.
        ext = os.path.splitext(query_img.name)[1].lower()
        if ext not in (".png", ".jpg", ".jpeg"):
            ext = ".jpg"
        input_val = f"query_image{ext}"
        with open(input_val, "wb") as f:
            f.write(query_img.getbuffer())
        is_img_mode = True

    with st.spinner("Generating Answer..."):
        ans, sources = st.session_state.rag_engine.query(input_val, is_image=is_img_mode)

        st.markdown("### 🤖 Assistant Answer")
        st.write(ans)

        st.markdown("### 🔍 Source Evidence")
        grid = st.columns(3)
        for col, doc in zip(grid, sources):
            with col:
                image_path = doc.metadata.get("image_path")
                # Reset removes the image files, and the index is persisted,
                # so a stored path may no longer exist on disk.
                if doc.metadata.get("type") == "image" and image_path and os.path.exists(image_path):
                    st.image(image_path, caption="Visual Context")
                st.caption(f"Match Type: {doc.metadata.get('type')}")
                with st.expander("View Context Text"):
                    st.write(doc.page_content)