In [22]:
%pwd

'/home/ahmed/CV_ADD_PROJECT/AIEnginear/AI-Medical-Assistant'

In [23]:
import os
os.chdir("../")

In [24]:
%pwd

'/home/ahmed/CV_ADD_PROJECT/AIEnginear'

https://drive.google.com/file/d/1iVvTfm3sxA1xmaXbcWHmbcGNJbghi3xn/view?usp=drive_link

https://drive.google.com/file/d/10BZvKanGhfBhaB5nSbtFOuFpKD2731IZ/view?usp=sharing

https://drive.google.com/file/d/1obGHxfCcPx0tm5rjkcSNbsVK3PH0UYnv/view?usp=sharing

In [25]:
# ! gdown 1iVvTfm3sxA1xmaXbcWHmbcGNJbghi3xn

In [26]:
# ! pip install -U \
#   langchain \
#   langchain-openai \
#   langchain-community \
#   chromadb \
#   openai \
#   unstructured \
#   "unstructured[pdf]" \
#   pillow==10.2.0\
#   pydantic \
#   python-dotenv \
#   pytesseract \
#   opencv-python


In [27]:
import os
import base64
import shutil
# import streamlit as st
from PIL import Image
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf

# --- CONFIGURATION & DIRECTORIES ---
CHROMA_PATH = "./chroma_db"
IMAGE_DIR = "figures"



for folder in [IMAGE_DIR, CHROMA_PATH]:
    if not os.path.exists(folder):
        os.makedirs(folder)

class MultiModalRAG:
    def __init__(self):
        # High-reasoning model (Cheap & Fast)
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
        self.vectorstore = Chroma(
            collection_name="multirag",
            embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
            persist_directory=CHROMA_PATH
        )

    @staticmethod
    def encode_image(image_path):
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def summarize_image(self, image_path):
        """Standardizes visual data into searchable text."""
        b64_image = self.encode_image(image_path)
        response = self.llm.invoke([
            HumanMessage(content=[
                {"type": "text", "text": "Analyze this image from a document. Describe charts, data points, or visual content for a search index."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}}
            ])
        ])
        return response.content

    def process_pdf(self, file_path):
        """Industry-standard partitioning for PDF extraction."""
        elements = partition_pdf(
            filename=file_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=1500,
            image_output_dir_path=IMAGE_DIR
        )
        
        for el in elements:
            metadata = {"source": file_path, "type": "text"}
            if "Image" in str(type(el)):
                img_path = el.metadata.image_path
                content = self.summarize_image(img_path)
                metadata.update({"type": "image", "image_path": img_path})
            else:
                content = el.text
            
            if content:
                self.vectorstore.add_texts(texts=[content], metadatas=[metadata])
        
        return elements

    def query_system_image(self, user_input, is_image=False):
        """Cross-modal retrieval logic."""
        search_query = user_input
        if is_image:
            search_query = self.summarize_image(user_input)
        
        # Retrieve top matches
        docs = self.vectorstore.similarity_search(search_query, k=3)
        
        # Formulate answer based on context
        context_text = "\n\n".join([d.page_content for d in docs])
        prompt = [
            # SystemMessage(content="Answer the question using ONLY the provided context. Show expertise and clarity."),
            # SystemMessage(content="Answer based ONLY on the provided context. If an image is relevant, mention it."),
            SystemMessage(content="You are a helpful assistant. Answer the question using ONLY the provided context and if an image is relevant, mention it. If the answer isn't in the context, say you don't know."),

            HumanMessage(content=f"Context:\n{context_text}\n\nQuestion: {search_query}")
        ]
        answer = self.llm.invoke(prompt).content
        return answer, docs


    def query_system_all(self, user_input, is_image=False):
        """
        Cross-modal retrieval logic:
        - Text query → direct vector search
        - Image query → image → text summary → vector search
        """

        # 1️⃣ Decide search query
        if is_image:
            if not os.path.exists(user_input):
                raise ValueError(f"Image file not found: {user_input}")

            # Convert image → searchable text
            search_query = self.summarize_image(user_input)
        else:
            # Plain text query
            search_query = user_input

        # 2️⃣ Retrieve relevant chunks
        docs = self.vectorstore.similarity_search(search_query, k=3)

        # 3️⃣ Build context safely
        if not docs:
            return "I don't know. No relevant context was found.", []

        context_text = "\n\n".join(
            [f"[Chunk {i+1}]\n{d.page_content}" for i, d in enumerate(docs)]
        )

        # 4️⃣ LLM prompt
        prompt = [
            SystemMessage(
                content=(
                    "You are a helpful assistant. "
                    "Answer the question using ONLY the provided context. "
                    "If the answer is not in the context, say 'I don't know'. "
                    "If an image is relevant, mention it."
                )
            ),
            HumanMessage(
                content=f"Context:\n{context_text}\n\nQuestion:\n{search_query}"
            )
        ]

        # 5️⃣ Generate answer
        answer = self.llm.invoke(prompt).content

        return answer, docs



In [28]:
from dotenv import load_dotenv
import os
load_dotenv
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [None]:
rag = MultiModalRAG()
elements = rag.process_pdf(file_path='Data/image_table_columns.pdf')


  self.vectorstore = Chroma(




The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [9]:
print(len(elements))

TypeError: object of type 'NoneType' has no len()

In [None]:
query = 'figures/figure-1-6.jpg'
answer, docs = rag.query_system_all(user_input=query,is_image=True)

In [15]:
answer

'The image features a bar chart displaying data over a five-year period from 2020 to 2024, with an upward trend indicating growth. The y-axis ranges from 0 to 25, and the bars, colored orange, represent data points for each year, showing consistent increases in values from approximately 5 in 2020 to close to 20 in 2024.'

In [16]:
docs

[Document(metadata={'type': 'text', 'source': 'Data/image_table_columns.pdf'}, page_content='>\n\nFINANGI AL\n\nSTATEMENT\n\nExplore our financial performance through balance sheets, income, and cash flow statements.\n\nDELAITTE\n\nStartupAI boasts an impressive return on investment (ROI), demonstrating its financial acumen and ability to generate substantial profits for its stakeholders through strategic decisions and operational excellence.\n\nGROSS INCOME 22,000,000 $ TOTAL EXPENSES 2.000,000 $ TAXES 5,000.000 $ NET INCOME 15,000,000 $\n\n33% ROI\n\nStartupAI has achieved a remarkable $22 million in sales, showcasing its market dominance and strong customer appeal.\n\n25 20 15 10 5 0\n\n2020 2021 2022 2023 2024\n\nwww.startupAI.com\n\n+123-456-7890')]

In [37]:
import os
CHROMA_PATH = "./chroma_db"
print(os.path.abspath(CHROMA_PATH))
print(os.listdir(CHROMA_PATH))



/home/ahmed/CV_ADD_PROJECT/AIEnginear/chroma_db


FileNotFoundError: [Errno 2] No such file or directory: './chroma_db'

In [31]:
import shutil
shutil.rmtree(CHROMA_PATH, ignore_errors=True)


In [32]:
rag = MultiModalRAG()
elements = rag.process_pdf(file_path='Data/image_table_columns.pdf')


InternalError: Database error: error returned from database: (code: 1) no such table: tenants

In [None]:
print(len(elements))