<a href="https://colab.research.google.com/github/Fatmaaai/AGF-x-ZAKA-Coursework/blob/main/graduation_project_multimodal_rag_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is the collaboration notebook for our graduation project, a multimodal RAG system. In this notebook, we explore the different components of the system and run it using Streamlit.

**Team Members:**

* Fatma Egal
* Mai Ahmed


**Supervised under:**
* Dr. Karim & Naim



---


**Project Goals:**




1.   Develop a robust and efficient multimodal RAG system.
2.   Experiment with different LLM models, embedding models, and retrieval strategies.
3.   Explore and implement advanced features.
4.   Design and implement a user-friendly Streamlit interface.



---

**Project Structure:**

This notebook will be organized into different sections:

1. **Data Loading and Preprocessing:**
    * Data loading and cleaning.
    * Text splitting and chunking.
2. **Embedding Generation:**
    * Embedding model selection and experimentation.
    * Embedding generation and storage.
3. **Retrieval:**
    * Index creation, management, and retrieval.
4. **Generation:**
    * LLM model selection and prompting.
    * Response generation and evaluation.
5. **User Interface (Streamlit):**
    * UI design and development.
    * User interaction and feedback mechanisms.



---


**Collaboration Guidelines:**

* Use comments extensively to explain your code and document your findings.
* Make sure to save the changes you have made in the Colab notebook.
* Clearly communicate and discuss any challenges or roadblocks encountered in our meetings.





**Time to learn, learn & learn together!**

# **MAIN**

In [None]:
import streamlit as st
from multimodal_rag import MultimodalRAGWithModelSelection
import tempfile
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Re-export the Hugging Face token when one is configured.
# NOTE(review): this writes the variable back under the same name it was read
# from, so it has no visible effect -- confirm which variable name the
# Hugging Face libraries in use actually expect.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if hf_token:
    os.environ["HUGGINGFACE_TOKEN"] = hf_token

# Page chrome: browser tab title and a wide layout.
st.set_page_config(layout="wide", page_title="Multimodal RAG System")

# Heading and short usage instructions.
st.title("Multimodal RAG System")
st.write("Upload a PDF document and ask questions about its content.  For LM Studio models, ensure LM Studio is running with the selected model and the server is active.")

# Per-session state: the RAG system is created only when the key is missing,
# so script reruns within a session reuse the same instance.
if "rag_system" not in st.session_state:
    st.session_state["rag_system"] = MultimodalRAGWithModelSelection()

# Tracks whether a PDF has already been indexed in this session.
if "pdf_processed" not in st.session_state:
    st.session_state["pdf_processed"] = False

# Sidebar for model selection.
# Maps the label shown in the UI to the identifier handed to the RAG system.
st.sidebar.title("Model Selection")
model_options = {
    "T5 Small (Hugging Face)": "google/flan-t5-small",
    "Phi-3.5-mini-instruct (LM Studio)": "Phi-3.5-mini-instruct",  #  Adjust names as needed
    "Falcon-7B (LM Studio)": "Falcon-7B",
    "Llama 3.1 (LM Studio)": "Llama 3.1"
}

selected_model = st.sidebar.selectbox(
    "Select LLM Model",
    list(model_options.keys()),
    index=0
)

# Change model if the selection differs from the one remembered for this
# session (this is also true on the very first render, when nothing is
# remembered yet).
if st.session_state.get('selected_model') != selected_model:
    st.session_state.selected_model = selected_model
    model_name = model_options[selected_model]
    rag_system = st.session_state.get('rag_system')
    if rag_system is not None:
        # Remember what was loaded before so we only announce a change when
        # the model really is different (on first render it usually is not).
        previous_model = getattr(rag_system, 'current_model', None)
        try:
            rag_system.change_model(model_name)
        except Exception as e:
            st.error(f"Error changing model: {e}")
            st.stop()

        # The RAG system reports load failures itself and leaves ``llm`` as
        # None, so a success banner is only correct when a model is present.
        if rag_system.llm is not None and previous_model != model_name:
            st.success(f"Model changed to {model_name}")

# File uploader
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    # Identify the upload by name and size so that choosing a different PDF
    # triggers re-indexing instead of silently reusing the previous document.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_file_key") != file_key:
        st.session_state.pdf_processed = False

    # Process the PDF (only when it has not been indexed yet, so script
    # reruns do not keep writing new temporary copies).
    if not st.session_state.pdf_processed:
        # The RAG system works from a file path, so spill the upload to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            pdf_path = tmp_file.name

        try:
            st.session_state.rag_system.process_pdf(pdf_path)
            st.session_state.pdf_processed = True
            st.session_state.processed_file_key = file_key
            st.success("PDF processed successfully")
        except Exception as e:
            st.error(f"Error processing PDF: {e}")
            st.stop()
        finally:
            # The temporary copy is only read while process_pdf runs; remove
            # it so uploads do not accumulate in the temp directory.
            try:
                os.remove(pdf_path)
            except OSError:
                pass


# Question box: shown only once a PDF has been indexed and a language model
# is available on the RAG system.
ready_for_questions = (
    st.session_state.pdf_processed
    and st.session_state.rag_system.llm is not None
)

if ready_for_questions:
    st.subheader("Ask a Question")
    query = st.text_input("Enter your question about the document")

    if query:
        try:
            with st.spinner(f"Generating answer using {selected_model}..."):
                result = st.session_state.rag_system.query(query)

            # Generated answer.
            st.subheader("Answer")
            st.write(result["answer"])

            # Retrieved passages, one collapsible panel each, numbered from 1.
            st.subheader("Sources")
            for number, source in enumerate(result["retrieved_docs"], start=1):
                label = f"Source {number} ({source['type']}, Page {source['page']})"
                with st.expander(label):
                    st.write(source["content"])
        except ValueError as ve:
            # Raised by the RAG system when it is not ready to answer.
            st.error(f"RAG system error: {ve}")
        except Exception as e:
            st.error(f"An unexpected error occurred: {e}")

# Short description of each selectable model, rendered in the sidebar.
model_notes = """
- **T5 Small (Hugging Face)**: Lightweight model, fastest but less capable (use with caution)
- **Phi-3.5-mini-instruct (LM Studio)**: Microsoft's compact but powerful model (via LM Studio)
- **Falcon-7B (LM Studio)**: Larger model with good performance (via LM Studio)
- **Llama 3.1 (LM Studio)**: Meta's latest model, most capable but requires more resources (via LM Studio)
"""

st.sidebar.markdown("---")
st.sidebar.subheader("Model Information")
st.sidebar.markdown(model_notes)

# Page footer.
st.markdown("---")
st.markdown("Multimodal RAG System Demo - WAI Team 3")



# **MULTIMODAL RAG**

In [None]:
import os
import re
from typing import List, Dict, Any

import torch
import numpy as np
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

import streamlit as st
import traceback

# LangChain imports
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Loaders
from langchain_community.document_loaders import PyPDFLoader

# Embeddings and models
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.llms import OpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOpenAI
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

from transformers import pipeline, AutoTokenizer

# Locate the Tesseract executable used for OCR.
# Priority: the TESSERACT_CMD environment variable, then the default Windows
# install location if it exists on this machine. Otherwise pytesseract's own
# default is left untouched, so a non-Windows host is not pointed at a
# Windows-only path.
_DEFAULT_WINDOWS_TESSERACT = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
_tesseract_cmd = os.getenv("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_DEFAULT_WINDOWS_TESSERACT):
    _tesseract_cmd = _DEFAULT_WINDOWS_TESSERACT
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

class MultimodalRAGWithModelSelection:
    """Question answering over a single PDF with a switchable language model.

    The PDF is indexed twice: once from its extracted text (split into
    chunks) and once from OCR run on an image of every page. Both sets of
    documents are embedded with the same sentence-transformers model and
    stored in one FAISS index that feeds a LangChain retrieval chain.

    Progress and errors are reported through Streamlit calls, so instances
    are meant to be used from inside a Streamlit script.
    """

    # Maximum number of characters of a retrieved passage returned by query().
    _PREVIEW_CHARS = 200

    def __init__(self, model_name="google/flan-t5-small"):
        """Load the embedding model and the initial language model.

        Args:
            model_name: Identifier of the language model to load first.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        st.write(f"Using device: {self.device}")

        self.pdf_path = None
        # Vector stores are created by _create_vector_stores(); declare them
        # here so the attributes exist before any PDF has been processed.
        self.text_vectors = None
        self.image_vectors = None
        self.combined_vectors = None
        self.llm = None
        self.retriever = None
        self.rag_chain = None
        self.current_model = model_name

        with st.spinner("Loading embedding model..."):
            self.text_embedder = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        with st.spinner(f"Loading language model: {model_name}..."):
            self._initialize_llm(model_name)

    def _initialize_llm(self, model_name):
        """Create ``self.llm`` for ``model_name`` and refresh the RAG chain.

        Names containing "llama", "phi" or "falcon" are served through an
        OpenAI-compatible endpoint on localhost:1234; "google/flan-t5-small"
        is loaded as a local Hugging Face pipeline. Any other name, or any
        error while loading, leaves ``self.llm`` set to None and reports the
        problem through Streamlit instead of raising.
        """
        self.current_model = model_name

        try:
            if any(keyword in model_name.lower() for keyword in ["llama", "phi", "falcon"]):
                self.llm = ChatOpenAI(
                    model_name=model_name,
                    openai_api_key="DUMMY_KEY",
                    openai_api_base="http://localhost:1234/v1",
                    temperature=0.7,
                    max_tokens=512
                )
                st.success(f"Connected to LM Studio with model: {model_name}")
            elif model_name == "google/flan-t5-small":
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                gen_pipeline = pipeline(
                    "text2text-generation",
                    model=model_name,
                    tokenizer=tokenizer,
                    max_length=512,
                    device=0 if self.device == "cuda" else -1
                )
                self.llm = HuggingFacePipeline(pipeline=gen_pipeline)
                st.success(f"Loaded Hugging Face model: {model_name}")
            else:
                self.llm = None
                st.error(f"Model {model_name} not supported.")
        except Exception as e:
            st.error(f"Error initializing model: {e}")
            self.llm = None

        if self.llm is None:
            # Never keep a chain that is wired to a previous model, and never
            # try to build one around a missing model.
            self.rag_chain = None
        elif self.combined_vectors is not None:
            # A PDF is already indexed: rebuild the chain around the new model.
            self._build_rag_chain()

    def change_model(self, new_model_name):
        """Switch to ``new_model_name`` unless it is already the current model."""
        if new_model_name != self.current_model:
            with st.spinner(f"Loading new model: {new_model_name}..."):
                self._initialize_llm(new_model_name)

    def process_pdf(self, pdf_path: str):
        """Index the PDF at ``pdf_path`` and prepare the RAG chain.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A dict with the number of text chunks (``"text_docs"``) and the
            number of OCR documents (``"image_docs"``) that were indexed.

        Raises:
            ValueError: If neither text nor OCR content could be extracted.
        """
        self.pdf_path = pdf_path
        st.write("Processing PDF...")

        with st.spinner("Extracting text..."):
            text_docs = self._extract_text()
            st.write(f"Extracted {len(text_docs)} text chunks")

        with st.spinner("Extracting images and performing OCR..."):
            image_docs = self._extract_images()
            st.write(f"Extracted {len(image_docs)} image documents")

        with st.spinner("Creating vector database..."):
            self._create_vector_stores(text_docs, image_docs)

        with st.spinner("Building RAG chain..."):
            self._build_rag_chain()

        st.success("PDF processing complete!")

        return {
            "text_docs": len(text_docs),
            "image_docs": len(image_docs)
        }

    def _extract_text(self) -> "List[Document]":
        """Load the PDF text page by page and split it into chunks.

        Every page is tagged with the source path, a 1-based page number and
        ``type="text"``; lines that look like Markdown headings are collected
        under ``"headings"``. Errors are shown in the UI and an empty list is
        returned.
        """
        try:
            loader = PyPDFLoader(self.pdf_path)
            documents = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
                keep_separator=True
            )

            for i, doc in enumerate(documents):
                doc.metadata["source"] = self.pdf_path
                doc.metadata["page"] = i + 1
                doc.metadata["type"] = "text"

                headings = re.findall(r'^(#+)\s+(.+)$', doc.page_content, re.MULTILINE)
                if headings:
                    doc.metadata["headings"] = [h[1] for h in headings]

            return text_splitter.split_documents(documents)

        except Exception as e:
            st.error(f"Error in _extract_text: {e}")
            st.error(traceback.format_exc())
            return []

    def _extract_images(self) -> "List[Document]":
        """Run OCR on an image of every page and wrap the text in documents.

        Pages whose OCR text is 20 characters or shorter after stripping are
        skipped. As in ``_extract_text``, errors are shown in the UI and an
        empty list is returned, so a failure in the OCR step does not prevent
        indexing of the extracted text.
        """
        try:
            images = convert_from_path(self.pdf_path)
            image_docs = []

            for i, img in enumerate(images):
                ocr_text = pytesseract.image_to_string(img)
                if len(ocr_text.strip()) > 20:
                    image_docs.append(
                        Document(
                            page_content=ocr_text,
                            metadata={
                                "source": self.pdf_path,
                                "page": i + 1,
                                "type": "image"
                            }
                        )
                    )

            return image_docs

        except Exception as e:
            st.error(f"Error in _extract_images: {e}")
            st.error(traceback.format_exc())
            return []

    def _create_vector_stores(self, text_docs, image_docs):
        """Embed both document lists and expose them as one FAISS index.

        Args:
            text_docs: Chunks produced by ``_extract_text``.
            image_docs: OCR documents produced by ``_extract_images``.

        Raises:
            ValueError: If both lists are empty, since there is nothing to
                index.
        """
        if not text_docs and not image_docs:
            raise ValueError("No text or image content could be extracted from the PDF.")

        # Build an index only for non-empty inputs; an empty list cannot be
        # turned into a FAISS index.
        self.text_vectors = (
            FAISS.from_documents(text_docs, self.text_embedder) if text_docs else None
        )
        self.image_vectors = (
            FAISS.from_documents(image_docs, self.text_embedder) if image_docs else None
        )

        if self.text_vectors is None:
            self.combined_vectors = self.image_vectors
        else:
            # The combined store is the text store itself, extended in place
            # with the OCR vectors when there are any.
            self.combined_vectors = self.text_vectors
            if self.image_vectors is not None:
                self.combined_vectors.merge_from(self.image_vectors)

    def _build_rag_chain(self):
        """Create the retriever and, when a model is loaded, the RAG chain.

        The retriever returns the 5 most similar documents. Without a loaded
        language model only the retriever is set and ``self.rag_chain`` is
        left as None; ``_initialize_llm`` builds the chain once a model
        becomes available.

        Raises:
            ValueError: If no vector store has been created yet.
        """
        if self.combined_vectors is None:
            raise ValueError("Vector store not initialized. Process a PDF first.")

        prompt = ChatPromptTemplate.from_template("""
        Answer the question based ONLY on the provided context.

        Context:
        {context}

        Question: {input}

        Instructions:
        1. Only use information from the provided context
        2. If the context contains tables or charts, analyze them
        3. If you don't know the answer based on the context, say "I don't have enough information to answer this question."
        4. Provide a detailed answer

        Answer:
        """)

        self.retriever = self.combined_vectors.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        )

        if self.llm is None:
            self.rag_chain = None
            return

        document_chain = create_stuff_documents_chain(self.llm, prompt)
        self.rag_chain = create_retrieval_chain(self.retriever, document_chain)

    def query(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` from the indexed PDF.

        Args:
            question: The user's question.

        Returns:
            A dict with ``"answer"`` (the generated text) and
            ``"retrieved_docs"``, a list of dicts holding a preview of each
            retrieved passage (``"content"``, at most 200 characters followed
            by "..."), its ``"type"`` and its ``"page"``.

        Raises:
            ValueError: If no language model is loaded or no PDF has been
                processed yet.
        """
        # Check the model first: without one there is never a chain, and the
        # "select a valid model" message is the accurate one in that case.
        if self.llm is None:
            raise ValueError("Language model is not initialized. Please select a valid model.")
        if self.rag_chain is None:
            raise ValueError("RAG chain not initialized. Process a PDF first.")

        response = self.rag_chain.invoke({"input": question})

        retrieved_docs = []
        for doc in response.get("context", []):
            content = doc.page_content
            if len(content) > self._PREVIEW_CHARS:
                content = content[:self._PREVIEW_CHARS] + "..."
            retrieved_docs.append({
                "content": content,
                "type": doc.metadata.get("type", "unknown"),
                "page": doc.metadata.get("page", "unknown")
            })

        return {
            "answer": response.get("answer", "No answer generated"),
            "retrieved_docs": retrieved_docs
        }


# **RUN**

In [None]:
import os
import sys
from dotenv import load_dotenv
import asyncio
import sys

import sys

# Workaround for a torch introspection bug seen with Windows + Streamlit:
# forget the ``torch._classes`` module entry if one has been registered.
sys.modules.pop("torch._classes", None)

# On Windows, switch asyncio to the selector-based event loop policy.
_on_windows = sys.platform == "win32"
if _on_windows:
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

import sys
# Get the absolute path to the project root directory (where run.py is)
project_root = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, project_root)

# Make the modules inside app/ importable by their bare names. The path is
# anchored to the project root rather than the current working directory, so
# the launcher also works when started from another directory.
sys.path.append(os.path.join(project_root, 'app'))

# Load environment variables
load_dotenv()

# Import the main app. Importing the module executes the Streamlit script,
# which is what renders the page.
from app.main import *

# streamlit run run.py

#streamlit run app/main.py