In [1]:
import os
import faiss
import numpy as np
import fitz  
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

### DOCUMENT PROCESSING ###

# Sentence-embedding model used for both document indexing and query encoding
TEXT_MODEL_NAME = "all-MiniLM-L6-v2"
text_model = SentenceTransformer(TEXT_MODEL_NAME)

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    Each page's text is followed by a newline; leading and trailing
    whitespace of the combined result is removed.
    """
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "".join(chunk + "\n" for chunk in page_texts).strip()

def preprocess_text(text):
    """Condense a document into its distinct content terms, highest TF-IDF weight first.

    English stop words are removed and at most 5000 terms are kept. The terms
    are ordered by descending TF-IDF weight (ties stay alphabetical), so the
    weighting actually influences the string that gets embedded. Previously the
    weight matrix was computed and thrown away, and the terms were returned in
    alphabetical order.

    Args:
        text: Raw document text.

    Returns:
        A space-separated string of terms. If the text yields no vocabulary
        (empty, or stop words only) the input text is returned unchanged
        instead of raising.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    try:
        weights = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # scikit-learn raises "empty vocabulary" for empty / stop-word-only text;
        # one unreadable PDF should not abort the whole indexing run.
        return text

    terms = vectorizer.get_feature_names_out()
    # Stable sort on the negated weights: heaviest terms first, alphabetical among ties.
    order = np.argsort(-weights, kind="stable")
    return " ".join(terms[order])

# Locations of the indexed corpus
base_dir = "business_flowcharts"
pdf_dir = os.path.join(base_dir, "documents")

# PDFs are indexed in sorted filename order, so FAISS row i belongs to pdf_filenames[i]
pdf_filenames = sorted(f for f in os.listdir(pdf_dir) if f.endswith(".pdf"))

pdf_texts = {}       # filename -> raw extracted text (later used as LLM context)
pdf_embeddings = []  # one embedding per PDF, aligned with pdf_filenames
for file in pdf_filenames:
    raw_text = extract_text_from_pdf(os.path.join(pdf_dir, file))
    processed_text = preprocess_text(raw_text)
    pdf_texts[file] = raw_text
    pdf_embeddings.append(text_model.encode(processed_text))

# Stack into a float32 matrix and scale every row to unit length (IMPORTANT)
pdf_embeddings = np.array(pdf_embeddings, dtype="float32")
row_norms = np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
pdf_embeddings /= row_norms

# Flat L2 index over the document embeddings
embedding_dim = pdf_embeddings.shape[1]
text_index = faiss.IndexFlatL2(embedding_dim)
text_index.add(pdf_embeddings)

print("✅ Stored document text embeddings in FAISS database.")


✅ Stored document text embeddings in FAISS database.


In [2]:
import torch
import clip
from PIL import Image
from torchvision import transforms

### FLOWCHART IMG PROCESSING ###

# Run CLIP on the GPU when one is available, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def preprocess_image(image_path):
    """Load an image and turn it into a 1x3x224x224 tensor for CLIP.

    The image is resized so its longer side is 224 pixels while keeping the
    aspect ratio, then padded with white on the right and bottom to a
    224x224 square, converted to a tensor and normalized.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A float tensor with a leading batch dimension of 1.
    """
    target_size = 224

    # convert() returns a fully loaded copy, so the file handle can be closed right away.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Resize while keeping aspect ratio. The shorter side is clamped to at least
    # one pixel: very wide or very tall images would otherwise round down to 0
    # and make the resize fail.
    aspect_ratio = image.width / image.height
    if aspect_ratio > 1:
        new_width = target_size
        new_height = max(1, int(target_size / aspect_ratio))
    else:
        new_height = target_size
        new_width = max(1, int(target_size * aspect_ratio))

    transform = transforms.Compose([
        transforms.Resize((new_height, new_width)),  # Maintain aspect ratio
        # Padding order is (left, top, right, bottom): fill only right/bottom with white
        transforms.Pad(
            (0, 0, target_size - new_width, target_size - new_height),
            fill=(255, 255, 255),
        ),
        transforms.ToTensor(),
        # CLIP normalization (also important: index and query images must match)
        transforms.Normalize(mean=[0.481, 0.457, 0.408], std=[0.268, 0.261, 0.275]),
    ])

    return transform(image).unsqueeze(0)

def get_image_embedding(image_path):
    """Encode a flowchart image with CLIP and return the result as a flat NumPy vector.

    The vector is returned as produced by the model; it is not normalized here.
    """
    batch = preprocess_image(image_path).to(device)
    with torch.no_grad():
        features = clip_model.encode_image(batch)
    return features.cpu().numpy().flatten()

# Flowchart images live next to the documents
image_dir = os.path.join(base_dir, "flowcharts")
image_filenames = sorted(f for f in os.listdir(image_dir) if f.endswith(".png"))

# One CLIP embedding per image, aligned with image_filenames
image_embeddings = np.array(
    [get_image_embedding(os.path.join(image_dir, name)) for name in image_filenames],
    dtype="float32",
)
image_row_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
image_embeddings /= image_row_norms  # Normalize

# Flat L2 index over the image embeddings
image_dim = image_embeddings.shape[1]
image_index = faiss.IndexFlatL2(image_dim)
image_index.add(image_embeddings)

print("✅ Stored flowchart image embeddings in FAISS database.")


✅ Stored flowchart image embeddings in FAISS database.


In [3]:
import pytesseract
from PIL import Image

def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text without surrounding whitespace."""
    rgb_image = Image.open(image_path).convert("RGB")
    return pytesseract.image_to_string(rgb_image).strip()

def get_text_embedding(text):
    """Encode a piece of text with the shared sentence-embedding model."""
    embedding = text_model.encode(text)
    return embedding

print("Done")

Done


In [4]:
def get_query_image_embedding(image_path):
    """Generate a unit-length float32 CLIP embedding for a query image.

    Reuses get_image_embedding so query images and indexed images go through
    the same preprocessing and encoder. The vector is cast to float32 before
    it is normalized: the model may emit half-precision features on GPU, while
    the index rows are stored as float32.

    Args:
        image_path: Path of the query image.

    Returns:
        A 1-D float32 vector scaled to unit L2 norm. An all-zero embedding is
        returned unchanged instead of dividing by zero.
    """
    embedding = get_image_embedding(image_path).astype("float32")
    norm = np.linalg.norm(embedding)
    if norm == 0:
        return embedding
    return embedding / norm  # Normalize

def _search_index(index, vector, names, top_k):
    """Search a FAISS index with a unit-normalized query and return (name, distance) pairs."""
    vector = np.asarray(vector, dtype="float32").reshape(1, -1)
    norm = np.linalg.norm(vector)
    if norm > 0:
        # Index rows are unit length; queries must be too for distances to be comparable.
        vector = vector / norm
    distances, indices = index.search(vector, top_k)
    return [
        (names[idx], distances[0][rank])
        for rank, idx in enumerate(indices[0])
        if idx >= 0  # FAISS fills missing results with -1 when fewer than top_k rows exist
    ]


def _best_unique_matches(scored_names, limit):
    """Keep every name once at its smallest distance and return the `limit` closest names."""
    best = {}
    for name, distance in scored_names:
        distance = float(distance)
        if name not in best or distance < best[name]:
            best[name] = distance
    ranked = sorted(best.items(), key=lambda item: item[1])
    return [name for name, _ in ranked[:limit]]


def retrieve_relevant_data(query, flowchart_imgs=None, user_provided_pdfs=None, top_k=2):
    """Retrieve the most relevant documents & images for the query using FAISS.

    Args:
        query: User question; always searched against the document index.
        flowchart_imgs: Optional image path or list of paths. Images whose
            basename is indexed are OCR'd (text searched against the document
            index) and CLIP-encoded (searched against the image index). Images
            that are not indexed are only labelled as new.
        user_provided_pdfs: Optional PDF path or list of paths. Indexed PDFs
            are added as exact matches (distance 0); others are labelled as new.
        top_k: Number of neighbours per search and maximum number of indexed
            results returned per list.

    Returns:
        (retrieved_pdfs, retrieved_images): lists of filenames ordered by
        ascending distance, each filename appearing once, followed by
        "[NEW PDF] ..." / "[NEW FLOWCHART] ..." entries for unknown inputs.
    """
    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]

    if user_provided_pdfs and isinstance(user_provided_pdfs, str):
        user_provided_pdfs = [user_provided_pdfs]

    # Search the document index with the query itself
    pdf_candidates = _search_index(text_index, text_model.encode(query), pdf_filenames, top_k)

    image_candidates = []
    unknown_images = []
    unknown_pdfs = []

    indexed_flowcharts = set(image_filenames)
    indexed_pdfs = set(pdf_filenames)

    # Handle flowchart images
    for flowchart_img in flowchart_imgs or []:
        if os.path.basename(flowchart_img) not in indexed_flowcharts:
            print(f"🆕 Marking '{flowchart_img}' as [NEW FLOWCHART] (not in FAISS index)")
            unknown_images.append(flowchart_img)
            continue

        # Use the text inside the flowchart as an additional document query.
        # Skip it when OCR finds nothing: an empty string would only add arbitrary matches.
        extracted_text = extract_text_from_image(flowchart_img)
        if extracted_text:
            pdf_candidates += _search_index(
                text_index, get_text_embedding(extracted_text), pdf_filenames, top_k
            )

        image_matches = _search_index(
            image_index, get_query_image_embedding(flowchart_img), image_filenames, top_k
        )
        if image_matches:
            image_candidates += image_matches
        else:
            unknown_images.append(flowchart_img)

    # Handle user-provided PDFs
    for user_pdf in user_provided_pdfs or []:
        if os.path.basename(user_pdf) not in indexed_pdfs:
            print(f"🆕 Marking '{user_pdf}' as [NEW PDF] (not in FAISS index)")
            unknown_pdfs.append(user_pdf)
        else:
            pdf_candidates.append((os.path.basename(user_pdf), 0))  # Mark as exact match (0 distance)

    # Rank by distance; de-duplicate by filename so one file cannot occupy several slots
    retrieved_pdfs = _best_unique_matches(pdf_candidates, top_k)
    retrieved_images = _best_unique_matches(image_candidates, top_k)

    retrieved_images += [f"[NEW FLOWCHART] {img}" for img in unknown_images]
    retrieved_pdfs += [f"[NEW PDF] {pdf}" for pdf in unknown_pdfs]

    return retrieved_pdfs, retrieved_images



print("✅ Retrieval function V2 (With user PDF) is ready.")


✅ Retrieval function V2 (With user PDF) is ready.


In [5]:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# Load the Qwen vision-language model and its matching processor from one checkpoint id
QWEN_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
qwen_processor = AutoProcessor.from_pretrained(QWEN_MODEL_ID)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
import re

def _run_qwen_chat(content, images=None):
    """Send one user turn to Qwen and return only the newly generated reply text.

    Args:
        content: Chat-template content list for the user message.
        images: Optional list of PIL images referenced by `content`.

    Returns:
        The reply as a stripped string, or None when the processor output has
        no `input_ids`.
    """
    text_input = qwen_processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True
    )
    processor_kwargs = {"text": [text_input], "return_tensors": "pt"}
    if images:
        processor_kwargs.update(images=images, padding=True)
    inputs = qwen_processor(**processor_kwargs).to(qwen_model.device)

    if not hasattr(inputs, "input_ids"):
        return None

    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=1024)

    # generate() returns prompt + completion. Slicing off the prompt tokens is
    # more reliable than searching the decoded text for "assistant\n" (which can
    # also occur inside the supplied documents), and skipping special tokens
    # removes the trailing "<|im_end|>" marker from the answer.
    prompt_length = inputs.input_ids.shape[1]
    reply_ids = output_ids[:, prompt_length:]
    return qwen_processor.batch_decode(reply_ids, skip_special_tokens=True)[0].strip()


def query_qwen_with_rag(query, flowchart_imgs=None, user_provided_pdfs=None, top_k=1):
    """Answer a query with Qwen, adding retrieved and user-provided context when relevant.

    Args:
        query: The user's question.
        flowchart_imgs: Optional image path or list of paths supplied by the user.
        user_provided_pdfs: Optional PDF path or list of paths supplied by the user.
        top_k: Number of indexed documents/images to retrieve.

    Returns:
        (response, retrieved_pdfs, retrieved_images). For queries that contain
        no work-related keyword AND come without attachments, retrieval is
        skipped and both lists are empty.
    """
    # The keyword gate keeps the model from fetching documents for prompts that
    # have nothing to do with work. Feel free to comment it out if needed.
    work_related_keywords = [
        "flowchart", "document", "database", "pdf", "incident",
        "management", "network", "troubleshooting", "cybersecurity",
        "procedure", "process", "policy", "workflow", "information"
    ]

    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]

    if user_provided_pdfs and isinstance(user_provided_pdfs, str):
        user_provided_pdfs = [user_provided_pdfs]

    lowered_query = query.lower()
    is_work_related = any(keyword in lowered_query for keyword in work_related_keywords)

    # Plain chat only when there is neither a keyword nor an attachment: files the
    # caller passed in must never be silently ignored because of the query wording.
    if not is_work_related and not flowchart_imgs and not user_provided_pdfs:
        reply = _run_qwen_chat([{"type": "text", "text": query}])
        if reply is None:
            print("⚠️ Error: Inputs are incorrectly formatted. Skipping generation.")
            return "Error: Invalid input formatting", [], []
        return reply, [], []

    retrieved_pdfs, retrieved_images = retrieve_relevant_data(
        query=query,
        flowchart_imgs=flowchart_imgs,
        user_provided_pdfs=user_provided_pdfs,
        top_k=top_k
    )

    # Extract text from retrieved PDFs, ignoring "[NEW PDF]" entries
    context = "\n".join([pdf_texts[pdf] for pdf in retrieved_pdfs if not pdf.startswith("[NEW PDF]")])

    # Extract and explicitly include text from user-provided PDFs
    user_pdf_texts = []
    descriptions = []

    if user_provided_pdfs:
        for user_pdf in user_provided_pdfs:
            pdf_name = os.path.basename(user_pdf)

            if pdf_name not in pdf_filenames:
                descriptions.append(f"🆕 This is a newly provided PDF: {pdf_name}")
                extracted_text = extract_text_from_pdf(user_pdf)
                user_pdf_texts.append(
                    f"[NEW PDF] User-Provided PDF '{pdf_name}':\n\n{extracted_text}\n\n"
                    "[NOTE: This PDF is explicitly NOT part of the existing database.]"
                )
            else:
                # Already indexed: reuse the text extracted at indexing time
                extracted_text = pdf_texts[pdf_name]
                user_pdf_texts.append(f"User-Provided PDF '{pdf_name}':\n\n{extracted_text}")

    user_pdf_section = "\n\n".join(user_pdf_texts)

    # Extract flowchart texts and load the images that are sent to the model
    flowchart_texts = []
    valid_images = []

    if flowchart_imgs:
        for img_path in flowchart_imgs:
            extracted_text = extract_text_from_image(img_path)

            if os.path.basename(img_path) not in image_filenames:
                descriptions.append(f"🆕 This is a newly provided flowchart: {os.path.basename(img_path)}")
                flowchart_texts.append(
                    f"[NEW FLOWCHART] Flowchart {len(flowchart_texts) + 1} (User-Provided):\n\n{extracted_text}\n\n"
                    "[NOTE: This flowchart is explicitly NOT part of the existing database.]"
                )
            else:
                flowchart_texts.append(f"Flowchart {len(flowchart_texts) + 1}:\n\n{extracted_text}")

            # convert() yields a loaded copy, so the file handle can be released immediately
            with Image.open(img_path) as source:
                valid_images.append(source.convert("RGB"))

    flowchart_section = "\n\n".join(flowchart_texts)

    # PROMPT
    content = [
        *([{"type": "image", "image": img} for img in valid_images]),
        *([{"type": "text", "text": desc} for desc in descriptions]),
        {"type": "text", "text": (
            "Carefully review the provided context below:\n\n"
            f"Indexed Documents:\n{context}\n\n"
            f"User-Provided PDF Context:\n{user_pdf_section}\n\n"
            f"Flowchart Information:\n{flowchart_section}\n\n"
            "Important clarifications:\n"
            "- Documents labeled '[NEW PDF]' or '[NEW FLOWCHART]' are NOT part of the database.\n"
            "- Documents without these labels ARE part of the database.\n\n"
            "Your task:\n"
            "1. Clearly state whether the relevant document or flowchart is currently in the database.\n"
            "2. Provide a summary explaining its key contents and purpose, explicitly referencing the context provided. Pay attention if the user wants it brief or detailed.\n\n"
            f"Query: {query}"
        )},
    ]

    cleaned_response = _run_qwen_chat(content, valid_images)
    if cleaned_response is None:
        print("⚠️ Error: Inputs are incorrectly formatted. Skipping generation.")
        return "Error: Invalid input formatting", retrieved_pdfs, retrieved_images

    return cleaned_response, retrieved_pdfs, retrieved_images


print("✅ Qwen RAG V2 (with user-provided PDFs) is ready.")


✅ Qwen RAG V2 (with user-provided PDFs) is ready.


In [7]:
# PROMPT

import time

# First prompt is small talk (no retrieval expected); second one is work-related
user_query = "Hello there!"
# user_query_2 = "What is your function?"
user_query_2 = "Can you give me information about handling incident management?"

for current_query in (user_query, user_query_2):
    start_time = time.time()

    # Query Qwen with RAG
    qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(current_query)

    end_time = time.time()
    execution_time = end_time - start_time

    # Print results
    print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
    print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
    print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
    print(f"⏳ Execution Time: {execution_time:.2f} seconds")


🤖 Qwen's Response (With RAG):
Hello! How can I assist you today?<|im_end|>
🔍 Retrieved Documents: []
🖼️ Retrieved Flowcharts: []
⏳ Execution Time: 0.32 seconds

🤖 Qwen's Response (With RAG):
### Relevant Document or Flowchart in the Database

**Document Name:** Incident Management Flowchart

### Summary of Key Contents and Purpose

The Incident Management Flowchart outlines a structured approach to handling IT issues efficiently. It includes the following key steps:

1. **User Reports Issue**: The process begins when a user identifies an issue and reports it to the IT department through various channels such as a ticketing system, phone call, or automated monitoring alerts.

2. **Classify Incident**: The support team categorizes the issue based on its nature, urgency, and complexity to determine the appropriate response.

3. **High Impact?**: The severity of the incident is evaluated. If it is a high-impact issue affecting multiple users or critical systems, it is escalated immediatel

In [8]:
# IMAGE + PROMPT

import time

# Test query plus one flowchart image from the flowcharts folder
user_query = "Explain what this flowchart is about in detail."
user_flowchart_img = "business_flowcharts/flowcharts/13_machine_maintenance.png"

# Time the full RAG round trip
start_time = time.time()
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(
    query=user_query,
    flowchart_imgs=user_flowchart_img,
)
end_time = time.time()
execution_time = end_time - start_time

# Report the answer, what was retrieved, and how long it took
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")


🤖 Qwen's Response (With RAG):
This flowchart outlines a structured approach to machine maintenance and troubleshooting within an industrial setting. It begins with the identification of a machine issue through formal reporting by operators or monitoring systems. The next step involves diagnosing the problem to determine the root cause. Depending on whether the issue is minor or requires further investigation, different actions are taken. For minor issues, quick fixes such as recalibration or replacing minor parts are performed. If the issue cannot be resolved quickly, the machine is tested under operational conditions to ensure the problem has been fully addressed. If the issue persists, a major repair is scheduled, which involves taking the machine offline for comprehensive repairs or component replacements. Once the repairs are completed and verified, the machine returns to operational status. This flowchart serves as a guide for technical teams to systematically detect and resolve 

In [9]:
# IMAGE + PDF + PROMPT

import time

# Test query plus a flowchart image and a PDF supplied by the user
user_query = "Check this new incident management flowchart and pdf I provided. Is it an improvement from the one saved in the database?"
user_flowchart_img = "6_incident_management_v2.png"
user_pdf = "6_Incident_Management_V2.pdf"

# Time the full RAG round trip
start_time = time.time()
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(
    query=user_query,
    flowchart_imgs=user_flowchart_img,
    user_provided_pdfs=user_pdf,
)
end_time = time.time()
execution_time = end_time - start_time

# Report the answer, what was retrieved, and how long it took
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")

🆕 Marking '6_incident_management_v2.png' as [NEW FLOWCHART] (not in FAISS index)
🆕 Marking '6_Incident_Management_V2.pdf' as [NEW PDF] (not in FAISS index)

🤖 Qwen's Response (With RAG):
### Summary of the New Incident Management Process

#### Key Contents:
1. **Start**: The incident management process begins when an issue is reported or detected.
2. **User Reports Issue**: A user experiences a problem and submits an incident report. Details like error messages, timestamps, and impact assessments are crucial for efficient resolution.
3. **Initial Triage**: The support team performs an initial assessment to determine the urgency, scope, and potential resolution path.
4. **Classify Incident**: The incident is categorized based on its nature (e.g., hardware failure, software bug, network issue). Proper classification ensures the correct resolution protocol is followed.
5. **Automated Resolution?**: A decision is made regarding whether the issue can be resolved automatically. Common automa

In [10]:
# PDF + PROMPT

import time

# Test query plus a user-supplied PDF; no flowchart image in this run
user_query = "Is this incident management pdf document an improvement from the one saved in the database? Explain in detail"
user_flowchart_img = []
user_pdf = "6_Incident_Management_V2.pdf"

# Time the full RAG round trip
start_time = time.time()
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(
    query=user_query,
    flowchart_imgs=user_flowchart_img,
    user_provided_pdfs=user_pdf,
)
end_time = time.time()
execution_time = end_time - start_time

# Report the answer, what was retrieved, and how long it took
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")

🆕 Marking '6_Incident_Management_V2.pdf' as [NEW PDF] (not in FAISS index)

🤖 Qwen's Response (With RAG):
### 1. Is this incident management PDF document an improvement from the one saved in the database?

**Yes, this incident management PDF document is an improvement from the one saved in the database.**

**Explanation:**
The new document, "6_Incident_Management_V2.pdf," provides a more detailed and structured approach to incident management compared to the previous version saved in the database. Here are the key improvements and additions:

- **Enhanced Workflow:** The new document includes a more comprehensive flowchart that outlines the entire incident management process, from reporting to resolution and escalation.
  
- **Structured Triage:** It introduces an "Initial Triage" step, which helps in quickly assessing the urgency and scope of the incident, allowing for more efficient resource allocation.

- **Automated Resolution Pathways:** The document specifies how to determine if 

In [11]:
# PDF + PROMPT

import time

# Differently worded query against the same user-supplied PDF; no flowchart image
user_query = "Read out this pdf. Is it better than the old incident management pdf?"
user_flowchart_img = []
user_pdf = "6_Incident_Management_V2.pdf"

# Time the full RAG round trip
start_time = time.time()
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(
    query=user_query,
    flowchart_imgs=user_flowchart_img,
    user_provided_pdfs=user_pdf,
)
end_time = time.time()
execution_time = end_time - start_time

# Report the answer, what was retrieved, and how long it took
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")

🆕 Marking '6_Incident_Management_V2.pdf' as [NEW PDF] (not in FAISS index)

🤖 Qwen's Response (With RAG):
### Summary of the New Incident Management PDF

**Document Title:** Incident Management Process Flowchart (V2)

**Purpose:** 
This document provides an improved incident management process aimed at enhancing the efficiency and effectiveness of resolving reported issues. It includes structured triage, automated resolution pathways, impact classification, and follow-up mechanisms to ensure that resources are allocated appropriately and that the appropriate teams are assigned for resolution.

**Key Contents:**

1. **Start:**
   - The incident management process begins when an issue is reported or detected, triggering the workflow.

2. **User Reports Issue:**
   - Users submit incident reports through various channels like ticketing systems, emails, or direct calls to the support team. The details provided by the user, including error messages, timestamps, and impact assessments, are c

In [12]:
import re

def compare_query_qwen_with_rag(query, flowchart_imgs=None, user_provided_pdfs=None, top_k=1):
    """Retrieve relevant document and flowchart data, and query Qwen for a comparison-focused response.

    Flowcharts whose filename is indexed are listed under the database section
    of the prompt; all others are listed under the user-provided section, so
    the two sections the model is asked to compare no longer contain the same
    text.

    Args:
        query: The user's comparison request.
        flowchart_imgs: Optional image path or list of paths. Missing files are skipped.
        user_provided_pdfs: Optional PDF path or list of paths. Missing files are skipped.
        top_k: Number of indexed documents/images to retrieve.

    Returns:
        (response, retrieved_pdfs, retrieved_images).
    """
    # Ensure inputs are in list format
    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]
    if user_provided_pdfs and isinstance(user_provided_pdfs, str):
        user_provided_pdfs = [user_provided_pdfs]

    # Retrieve relevant PDFs and flowcharts
    retrieved_pdfs, retrieved_images = retrieve_relevant_data(
        query=query,
        flowchart_imgs=flowchart_imgs,
        user_provided_pdfs=user_provided_pdfs,
        top_k=top_k
    )

    # Extract context from indexed PDFs (exclude [NEW PDF] labels)
    context = "\n".join([pdf_texts[pdf] for pdf in retrieved_pdfs if not pdf.startswith("[NEW PDF]")])

    # Extract user-provided PDF text
    user_pdf_texts = []
    descriptions = []

    if user_provided_pdfs:
        for pdf_path in user_provided_pdfs:
            if not os.path.exists(pdf_path):
                print(f"⚠️ Warning: PDF '{pdf_path}' not found. Skipping.")
                continue

            is_new_pdf = os.path.basename(pdf_path) not in pdf_filenames
            extracted_text = extract_text_from_pdf(pdf_path)

            label = "[NEW PDF] " if is_new_pdf else ""
            note = "\n[NOTE: This PDF is not in the existing database.]" if is_new_pdf else ""
            user_pdf_texts.append(
                f"{label}User-Provided PDF '{os.path.basename(pdf_path)}':\n\n{extracted_text}{note}"
            )
            if is_new_pdf:
                descriptions.append(f"🆕 This is a newly provided PDF: {os.path.basename(pdf_path)}")

    user_pdf_section = "\n\n".join(user_pdf_texts)

    # Extract flowchart OCR text, keeping database and user-provided flowcharts apart
    database_flowchart_texts = []
    user_flowchart_texts = []
    valid_images = []

    if flowchart_imgs:
        for img_path in flowchart_imgs:
            if not os.path.exists(img_path):
                print(f"⚠️ Warning: Image '{img_path}' not found. Skipping.")
                continue

            is_new_flowchart = os.path.basename(img_path) not in image_filenames
            extracted_text = extract_text_from_image(img_path)

            # Number flowcharts in the order they were supplied, across both
            # sections, so "first" and "second" in the task text stay meaningful.
            flowchart_number = len(valid_images) + 1

            if is_new_flowchart:
                descriptions.append(f"🆕 This is a newly provided flowchart: {os.path.basename(img_path)}")
                user_flowchart_texts.append(
                    f"[NEW FLOWCHART] Flowchart {flowchart_number} (User-Provided):\n\n{extracted_text}\n\n"
                    "[NOTE: This flowchart is NOT in the existing database.]"
                )
            else:
                database_flowchart_texts.append(f"Flowchart {flowchart_number}:\n\n{extracted_text}")

            # convert() yields a loaded copy, so the file handle can be released immediately
            with Image.open(img_path) as source:
                valid_images.append(source.convert("RGB"))

    database_flowchart_section = "\n\n".join(database_flowchart_texts)
    user_flowchart_section = "\n\n".join(user_flowchart_texts)

    # PROMPT
    content = [
        *([{"type": "image", "image": img} for img in valid_images]),
        *([{"type": "text", "text": desc} for desc in descriptions]),
        {"type": "text", "text": (
            "You are provided with the following information for comparison:\n\n"

            "### SECTION 1: Content from the Existing Database\n"
            "**Database PDFs:**\n"
            f"{context if context.strip() else '[No indexed documents retrieved]'}\n\n"

            "**Database Flowcharts (OCR Extracted Text):**\n"
            f"{database_flowchart_section if database_flowchart_section.strip() else '[No indexed flowcharts retrieved]'}\n\n"

            "### SECTION 2: User-Provided Content for Comparison\n"
            "**User PDFs:**\n"
            f"{user_pdf_section if user_pdf_section.strip() else '[No user PDFs provided]'}\n\n"

            "**User Flowcharts (OCR Extracted Text):**\n"
            f"{user_flowchart_section if user_flowchart_section.strip() else '[No user flowcharts provided]'}\n\n"

            "### Task:\n"
            "- Carefully compare the content in Section 2 (user-provided) against Section 1 (database).\n"
            "- If multiple flowcharts are provided, **compare the second one to the first**.\n"
            "- Highlight step-by-step differences, changes in structure, terminology, or flow.\n"
            "- Summarize each major difference clearly.\n\n"
            f"{query}"
        )}
    ]

    if not valid_images:
        print("🔹 No images detected, processing as a pure text query.")

    text_input = qwen_processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True
    )
    processor_kwargs = {"text": [text_input], "return_tensors": "pt"}
    if valid_images:
        processor_kwargs.update(images=valid_images, padding=True)
    inputs = qwen_processor(**processor_kwargs).to(qwen_model.device)

    # Sanity check before generating
    if not hasattr(inputs, "input_ids"):
        print("⚠️ Error: Inputs are incorrectly formatted. Skipping generation.")
        return "Error: Invalid input formatting", retrieved_pdfs, retrieved_images

    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=1024)

    # Decode only the newly generated tokens. Searching the full decoded text for
    # "assistant\n" can match inside the supplied documents, and keeping special
    # tokens leaves a trailing "<|im_end|>" in the answer.
    prompt_length = inputs.input_ids.shape[1]
    reply_ids = output_ids[:, prompt_length:]
    cleaned_response = qwen_processor.batch_decode(reply_ids, skip_special_tokens=True)[0].strip()

    return cleaned_response, retrieved_pdfs, retrieved_images

print("✅ Improved Comparison RAG is set!")


✅ Improved Comparison RAG is set!


In [13]:
# Comparison run: the first flowchart comes from the flowcharts folder, the second is user-supplied
user_query = "Can you compare these flowcharts and pdfs in detail? What are the differences between the first and the second one?"

flowchart_imgs = [
    "business_flowcharts/flowcharts/6_incident_management.png",
    "6_incident_management_v2.png",
]

# Alternative pair:
# flowchart_imgs = [
#     "business_flowcharts/flowcharts/9_cybersecurity_incident_response.png",
#     "9_cybersecurity_incident_response_v2.png"
# ]

user_pdf = "6_Incident_Management_V2.pdf"

# Time the comparison round trip
start_time = time.time()
qwen_response_rag, retrieved_pdfs, retrieved_images = compare_query_qwen_with_rag(
    user_query,
    flowchart_imgs,
    user_pdf,
)
end_time = time.time()

# Report the answer, what was retrieved, and how long it took
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {end_time - start_time:.2f} seconds")


🆕 Marking '6_incident_management_v2.png' as [NEW FLOWCHART] (not in FAISS index)
🆕 Marking '6_Incident_Management_V2.pdf' as [NEW PDF] (not in FAISS index)

🤖 Qwen's Response (With RAG):
Certainly! Let's carefully compare the content in Section 2 (user-provided) against Section 1 (database), focusing on the differences, changes in structure, terminology, or flow.

### Differences Between the First and Second Flowcharts

#### 1. **Initial Triage**
   - **Database Flowchart:** No specific mention of "Initial Triage."
   - **User Provided Flowchart:** Added "Initial Triage" as a step after "User Reports Issue."

#### 2. **Automated Resolution?**
   - **Database Flowchart:** No specific mention of "Automated Resolution?"
   - **User Provided Flowchart:** Added "Automated Resolution?" as a step after "Classify Incident."

#### 3. **High Impact?**
   - **Database Flowchart:** No specific mention of "High Impact?"
   - **User Provided Flowchart:** Added "High Impact?" as a step after "Classif

In [14]:
# # Test saved documents in FAISS database
# def view_stored_pdfs():
#     """Display all stored PDFs and their extracted text."""
#     for filename, text in pdf_texts.items():
#         print(f"📄 PDF: {filename}\n")
#         print(f"Extracted Content:\n{text[:1000]}")  # Show first 1000 characters
#         print("="*80)

# view_stored_pdfs()


In [15]:
# import matplotlib.pyplot as plt

# # Test saved images in FAISS database

# def view_stored_flowcharts():
#     """Display all stored flowchart images and their filenames."""
#     for filename in image_filenames:
#         image_path = os.path.join(image_dir, filename)
#         image = Image.open(image_path)
        
#         plt.figure(figsize=(5, 5))
#         plt.imshow(image)
#         plt.axis("off")
#         plt.title(f"🖼️ Flowchart: {filename}")
#         plt.show()

# view_stored_flowcharts()


In [16]:
# def check_pdf_embedding(index=11):
#     """Check stored text embedding by retrieving the closest match for a given PDF."""
#     query_embedding = pdf_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = text_index.search(query_embedding, 1)
    
#     original_pdf = pdf_filenames[index]
#     matched_pdf = pdf_filenames[retrieved_indices[0][0]]
    
#     print(f"📄 Original PDF: {original_pdf}")
#     print(f"🔍 Closest Match: {matched_pdf}")
#     print(f"Similarity Score: {np.dot(pdf_embeddings[index], pdf_embeddings[retrieved_indices[0][0]])}")
    
# check_pdf_embedding()


In [17]:
# import matplotlib.pyplot as plt

# def check_image_embedding(index=3):
#     """Check stored image embedding by retrieving the closest match for a given flowchart."""
#     query_embedding = image_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = image_index.search(query_embedding, 1)
    
#     original_image = image_filenames[index]
#     matched_image = image_filenames[retrieved_indices[0][0]]
    
#     print(f"🖼️ Original Flowchart: {original_image}")
#     print(f"🔍 Closest Match: {matched_image}")
#     print(f"Similarity Score: {np.dot(image_embeddings[index], image_embeddings[retrieved_indices[0][0]])}")
    
#     # Show both images
#     fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    
#     ax[0].imshow(Image.open(os.path.join(image_dir, original_image)))
#     ax[0].set_title("Original Image")
#     ax[0].axis("off")
    
#     ax[1].imshow(Image.open(os.path.join(image_dir, matched_image)))
#     ax[1].set_title("Closest Match")
#     ax[1].axis("off")
    
#     plt.show()

# check_image_embedding()


In [18]:
# # Check if all image embeddings are unique
# unique_embeddings = np.unique(image_embeddings, axis=0)

# if unique_embeddings.shape[0] == 1:
#     print("⚠️ WARNING: All image embeddings are identical! FAISS cannot differentiate them.")
# else:
#     print(f"✅ FAISS has {unique_embeddings.shape[0]} unique image embeddings.")


In [19]:
# # Print shapes of stored image embeddings
# print(f"Stored Image Embeddings Shape: {image_embeddings.shape}")

# # Generate a query embedding for comparison
# query_embedding = get_query_image_embedding(os.path.join(image_dir, image_filenames[0]))  # Use any image as query
# print(f"Query Image Embedding Shape: {query_embedding.shape}")

# # Print first stored embedding vs query embedding
# print(f"\nFirst Stored Embedding:\n{image_embeddings[0][:10]}")  # Print first 10 values
# print(f"\nQuery Embedding:\n{query_embedding[:10]}")  # Print first 10 values


In [20]:
# def debug_faiss_retrieval(index=3):
#     """Check if FAISS is properly differentiating images."""
#     query_embedding = image_embeddings[index].reshape(1, -1)  # Use stored image for retrieval test
#     _, retrieved_indices = image_index.search(query_embedding, 5)  # Top 5 results

#     print(f"🖼️ Original Flowchart: {image_filenames[index]}")
#     print(f"\n🔍 Closest Matches:")
#     for rank, idx in enumerate(retrieved_indices[0]):
#         matched_image = image_filenames[idx]
#         similarity_score = np.dot(image_embeddings[index], image_embeddings[idx])  # Cosine similarity
#         print(f"{rank + 1}. {matched_image} (Score: {similarity_score:.6f})")

# debug_faiss_retrieval()
