In [1]:
import os
import faiss
import numpy as np
import fitz  
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

### DOCUMENT PROCESSING ###

# Sentence-BERT checkpoint shared by every text embedding in this notebook
# (PDF documents, user queries, and OCR text taken from flowcharts).
TEXT_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
text_model = SentenceTransformer(TEXT_EMBEDDING_MODEL_NAME)

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are read in document order, each page's text is followed by a
    newline, and surrounding whitespace is stripped from the combined result.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_chunks = [page.get_text("text") + "\n" for page in pdf_document]
    return "".join(page_chunks).strip()

def preprocess_text(text):
    """Reduce a document to its distinct non-stop-word terms.

    A ``TfidfVectorizer`` (English stop words, at most 5000 features) is
    fitted on ``text`` alone and the learned vocabulary is returned as one
    space-separated string, in the order ``get_feature_names_out`` yields it.

    NOTE(review): because the vectorizer only ever sees a single document,
    the TF-IDF weights themselves are not used here -- only the vocabulary.
    Word order and repetition of the original text are lost.

    Args:
        text: Raw document text.

    Returns:
        The space-joined vocabulary, or ``""`` when ``text`` is empty or
        contains nothing but stop words / non-word characters.
    """
    # Image-only PDFs extract to an empty string; there is nothing to fit.
    if not text or not text.strip():
        return ""

    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    try:
        # Only the fitted vocabulary is needed, so the (sparse) matrix is
        # discarded instead of being densified into an unused array.
        vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError when no terms survive tokenization
        # and stop-word removal ("empty vocabulary"); treat as no content.
        return ""
    return " ".join(vectorizer.get_feature_names_out())

# Directory setup
base_dir = "business_flowcharts"
pdf_dir = os.path.join(base_dir, "documents")

# Sorted so that row i of the FAISS index always corresponds to pdf_filenames[i].
pdf_filenames = sorted(f for f in os.listdir(pdf_dir) if f.endswith(".pdf"))
if not pdf_filenames:
    # Without this guard an empty folder only fails further down, as an
    # opaque NumPy axis error while normalizing an empty array.
    raise FileNotFoundError(f"No .pdf files found in '{pdf_dir}'; nothing to index.")

# Read and process PDFs
pdf_texts = {}  # filename -> raw extracted text (used later as LLM context)
pdf_embedding_rows = []  # kept separate from `pdf_embeddings` so the cell is re-runnable
for file in pdf_filenames:
    pdf_path = os.path.join(pdf_dir, file)
    raw_text = extract_text_from_pdf(pdf_path)
    processed_text = preprocess_text(raw_text)
    pdf_texts[file] = raw_text
    pdf_embedding_rows.append(text_model.encode(processed_text))

# Convert to FAISS-compatible format: float32 matrix with unit-length rows.
# Queries are normalized the same way before searching, so L2 distances
# between them and these rows are comparable.
pdf_embeddings = np.array(pdf_embedding_rows, dtype="float32")
pdf_embeddings /= np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)

# Create FAISS index (flat L2 index sized to the embedding dimension)
text_index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
text_index.add(pdf_embeddings)

print("✅ Stored document text embeddings in FAISS database.")


✅ Stored document text embeddings in FAISS database.


In [2]:
import torch
import clip
from PIL import Image
from torchvision import transforms

### FLOWCHART IMG PROCESSING ###

# Choose the compute device once; the CLIP model and every image tensor
# built in this notebook are placed on it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the model together with a second object, bound here to
# `preprocess`. The visible cells build their own transform in
# preprocess_image() and do not call `preprocess`.
CLIP_MODEL_NAME = "ViT-B/32"
clip_model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)

def preprocess_image(image_path):
    """Load an image and letterbox it into a normalized 224x224 tensor.

    The image is converted to RGB, scaled so that its longer side is 224
    pixels while keeping the aspect ratio, padded with white on the right and
    bottom to a 224x224 canvas, converted to a tensor and normalized
    per channel.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A tensor with a leading batch dimension of 1 (``unsqueeze(0)``),
        ready to be moved to ``device`` and passed to ``clip_model``.
    """
    side = 224  # input edge length used for both resize target and padding

    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Resize while keeping aspect ratio. The shorter side is clamped to at
    # least 1 pixel: for very elongated images the integer truncation would
    # otherwise yield 0, which is not a valid resize size.
    aspect_ratio = image.width / image.height
    if aspect_ratio > 1:
        new_width = side
        new_height = max(1, int(side / aspect_ratio))
    else:
        new_height = side
        new_width = max(1, int(side * aspect_ratio))

    transform = transforms.Compose([
        transforms.Resize((new_height, new_width)),  # Maintain aspect ratio
        # Padding tuple is (left, top, right, bottom): the image stays in the
        # top-left corner and white fills the remainder of the square.
        transforms.Pad((0, 0, side - new_width, side - new_height), fill=(255, 255, 255)),
        transforms.ToTensor(),
        # Per-channel mean/std intended as CLIP's normalization statistics.
        # Must stay identical for indexed and query images.
        transforms.Normalize(mean=[0.481, 0.457, 0.408], std=[0.268, 0.261, 0.275]),
    ])

    return transform(image).unsqueeze(0)

def get_image_embedding(image_path):
    """Embed one flowchart image with CLIP and return it as a flat NumPy vector.

    The vector is returned as produced by the model; it is not normalized here.
    """
    batch = preprocess_image(image_path).to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping
        features = clip_model.encode_image(batch)
    return features.cpu().numpy().flatten()

# Directory setup
image_dir = os.path.join(base_dir, "flowcharts")

# Sorted so that row i of the FAISS index always corresponds to image_filenames[i].
image_filenames = sorted(f for f in os.listdir(image_dir) if f.endswith(".png"))
if not image_filenames:
    # Without this guard an empty folder only fails further down, as an
    # opaque NumPy axis error while normalizing an empty array.
    raise FileNotFoundError(f"No .png files found in '{image_dir}'; nothing to index.")

# Process images: one CLIP embedding per flowchart, stacked into a float32
# matrix and scaled to unit-length rows (query embeddings are normalized too).
image_embedding_rows = [
    get_image_embedding(os.path.join(image_dir, file)) for file in image_filenames
]
image_embeddings = np.array(image_embedding_rows, dtype="float32")
image_embeddings /= np.linalg.norm(image_embeddings, axis=1, keepdims=True)

# Create FAISS index (flat L2 index sized to the embedding dimension)
image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
image_index.add(image_embeddings)

print("✅ Stored flowchart image embeddings in FAISS database.")


✅ Stored flowchart image embeddings in FAISS database.


In [3]:
import pytesseract
from PIL import Image

def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path``.

    The image is converted to RGB first; the recognized text is returned
    with leading and trailing whitespace removed.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    return pytesseract.image_to_string(rgb_image).strip()

def get_text_embedding(text):
    """Encode ``text`` with the shared ``text_model`` and return the embedding."""
    embedding = text_model.encode(text)
    return embedding

print("Done")

Done


In [4]:
def get_query_image_embedding(image_path):
    """Generate a unit-length float32 CLIP embedding for a query image.

    Args:
        image_path: Path of the image to embed.

    Returns:
        A flat ``float32`` NumPy vector scaled to unit L2 norm, matching the
        dtype and normalization of the vectors stored in ``image_index``.
        An all-zero embedding is returned unchanged rather than divided by 0.
    """
    image_tensor = preprocess_image(image_path).to(device)

    with torch.no_grad():
        features = clip_model.encode_image(image_tensor)

    # The model output may be half precision when running on CUDA; cast to
    # float32 *before* normalizing so the norm is computed at full precision
    # and the result matches the float32 vectors stored in the index.
    embedding = features.cpu().numpy().astype("float32").flatten()

    norm = np.linalg.norm(embedding)
    if norm == 0:
        return embedding
    return embedding / norm  # Normalize

def retrieve_relevant_data(query, flowchart_imgs=None, top_k=2):
    """Retrieve the documents and flowcharts most relevant to a query using FAISS.

    Args:
        query: Free-text question. It is embedded with ``text_model`` and
            searched against ``text_index``.
        flowchart_imgs: Optional path, or list of paths, to flowchart images.
            An image whose basename appears in ``image_filenames`` is OCR'd
            (the text drives an additional document search) and CLIP-embedded
            (for the image search). Any other image is reported as a new
            flowchart and skipped for retrieval.
        top_k: Number of neighbours requested from every FAISS search, and
            the maximum number of documents / indexed flowcharts returned.

    Returns:
        ``(retrieved_pdfs, retrieved_images)``. Both lists hold unique
        filenames ordered by ascending L2 distance. ``retrieved_images`` is
        followed by one ``"[NEW FLOWCHART] <path>"`` entry per input image
        that is not in the index (or produced no valid match).
    """
    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]  # Ensure list format

    def _unit_row(vector):
        """Return ``vector`` as a (1, dim) float32 row scaled to unit norm."""
        row = np.asarray(vector, dtype="float32").reshape(1, -1)
        norm = np.linalg.norm(row)
        return row / norm if norm > 0 else row

    def _search(index, vector, names):
        """Search ``index`` and return ``(name, distance)`` for valid hits only."""
        distances, indices = index.search(_unit_row(vector), top_k)
        # FAISS pads the result with index -1 when fewer than top_k vectors
        # exist; indexing `names[-1]` would silently return the last file.
        return [
            (names[idx], float(dist))
            for dist, idx in zip(distances[0], indices[0])
            if idx >= 0
        ]

    def _closest_unique(hits):
        """Keep each name once (at its best distance), closest first, top_k max."""
        best = {}
        for name, dist in hits:
            if name not in best or dist < best[name]:
                best[name] = dist
        ranked = sorted(best.items(), key=lambda item: item[1])
        return [name for name, _ in ranked[:top_k]]

    # Search FAISS text database with the query itself (Retrieve relevant PDFs)
    pdf_hits = _search(text_index, text_model.encode(query), pdf_filenames)

    image_hits = []
    unknown_images = []  # Store unrecognized images
    indexed_flowcharts = set(image_filenames)  # All pre-indexed flowcharts

    for flowchart_img in flowchart_imgs or []:
        # Images are recognized by basename; anything else is marked as NEW.
        if os.path.basename(flowchart_img) not in indexed_flowcharts:
            print(f"🆕 Marking '{flowchart_img}' as [NEW FLOWCHART] (not in FAISS index)")
            unknown_images.append(flowchart_img)
            continue  # Skip FAISS retrieval

        # Retrieve documents based on the text OCR'd from the image. The
        # embedding goes through the same normalization as the query so the
        # distances from both searches can be ranked together.
        extracted_text = extract_text_from_image(flowchart_img)
        pdf_hits += _search(text_index, get_text_embedding(extracted_text), pdf_filenames)

        # Retrieve similar images from FAISS (with distance scores)
        matches = _search(image_index, get_query_image_embedding(flowchart_img), image_filenames)
        if matches:
            image_hits += matches
        else:
            unknown_images.append(flowchart_img)  # Mark as unknown

    # Rank by distance; a file found by several searches is listed only once.
    retrieved_pdfs = _closest_unique(pdf_hits)
    retrieved_images = _closest_unique(image_hits)

    # Append unknown images explicitly labeled with the "[NEW FLOWCHART]" prefix
    retrieved_images += [f"[NEW FLOWCHART] {img}" for img in unknown_images]

    return retrieved_pdfs, retrieved_images


print("✅ Retrieval function V2 is ready.")


✅ Retrieval function V2 is ready.


In [5]:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# One identifier for both loads, so the model weights and the processor
# always come from the same checkpoint.
QWEN_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# Load the vision-language model in bfloat16 with FlashAttention 2 and
# automatic device placement.
qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_MODEL_ID,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Matching processor (chat template, tokenization, image preprocessing).
qwen_processor = AutoProcessor.from_pretrained(QWEN_MODEL_ID)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
# def debug_query_qwen_with_rag(query, flowchart_img=None, top_k=2):
#     """DEBUG MODE - Retrieve relevant flowchart data (text & images) and query Qwen for an AI-generated response."""
    
#     # Retrieve relevant PDFs (text) & Flowcharts (images)
#     retrieved_pdfs, retrieved_images = retrieve_relevant_data(query, flowchart_img, top_k)
    
#     # Extract text from retrieved PDFs
#     context = "\n".join([pdf_texts[pdf] for pdf in retrieved_pdfs])

#     # Use retrieved flowchart image if `flowchart_img` is None
#     image_path = flowchart_img if flowchart_img else os.path.join(image_dir, retrieved_images[0])
#     image = Image.open(image_path).convert("RGB")
    
#     # Define the user message (Injecting retrieved context)
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "image", "image": image},
#                 {"type": "text", "text": f"Attached is document providing context. Based on the provided image and textual information, please analyze the content and generate a response that accurately addresses the user's inquiry.\n\nContext:\n{context}\n\nQuery: {query}"},
#             ],
#         }
#     ]
    
#     # Format input for Qwen
#     text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = qwen_processor(
#         text=[text],
#         images=[image],
#         padding=True,
#         return_tensors="pt",
#     ).to(qwen_model.device)
    
#     # Generate response
#     with torch.no_grad():
#         output_ids = qwen_model.generate(inputs, max_new_tokens=512)
    
#     # Decode response
#     response_text = "========\n\n".join(qwen_processor.batch_decode(output_ids, skip_special_tokens=False))
    
#     return response_text, retrieved_pdfs, retrieved_images

# print("✅ DEBUG Qwen RAG system is ready.")


In [7]:
import re

def query_qwen_with_rag(query, flowchart_imgs=None, top_k=2):
    """Answer ``query`` with Qwen, grounded in retrieved documents and flowcharts.

    Args:
        query: The user's question.
        flowchart_imgs: Optional path, or list of paths, to flowchart images
            that should be attached to the prompt.
        top_k: Forwarded to ``retrieve_relevant_data``.

    Returns:
        ``(response_text, retrieved_pdfs, retrieved_images)`` where
        ``response_text`` is the model's answer only (no prompt echo and no
        special tokens) and the other two values come straight from
        ``retrieve_relevant_data``.
    """
    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]  # Ensure list format

    # Retrieve relevant PDFs (text) & Flowcharts (images)
    retrieved_pdfs, retrieved_images = retrieve_relevant_data(query, flowchart_imgs, top_k)

    # Full text of the retrieved PDFs becomes the grounding context
    context = "\n".join(pdf_texts[pdf] for pdf in retrieved_pdfs)

    flowchart_texts = []  # one OCR section per flowchart
    descriptions = []     # notes about flowcharts that are only described
    valid_images = []     # images actually attached to the prompt

    for img_path in flowchart_imgs or []:
        # Only fires when the caller passes an already-labelled entry such as
        # those returned in `retrieved_images`; such entries are described,
        # not loaded.
        if img_path.startswith("[NEW FLOWCHART]"):
            descriptions.append(f"⚠️ This is a newly provided flowchart: {img_path.replace('[NEW FLOWCHART] ', '')}")
            flowchart_texts.append(f"Flowchart {len(flowchart_texts) + 1} (User-Provided Flowchart):\n\n[Unable to extract full text, refer to image]")
            continue

        if not os.path.exists(img_path):
            print(f"⚠️ Warning: Image '{img_path}' not found. Skipping.")
            continue

        # Extract OCR text
        extracted_text = extract_text_from_image(img_path)
        flowchart_texts.append(f"Flowchart {len(flowchart_texts) + 1}:\n\n{extracted_text}")

        # Load valid image
        valid_images.append(Image.open(img_path).convert("RGB"))

    # Each flowchart keeps its own numbered section
    flowchart_section = "\n\n".join(flowchart_texts)

    # Build user message content: images first, then notes, then the prompt
    content = [
        *[{"type": "image", "image": img} for img in valid_images],
        *[{"type": "text", "text": desc} for desc in descriptions],
        {"type": "text", "text": f"Context:\n{context}\n\n{flowchart_section}\n\n{query}"},
    ]

    if not valid_images:
        print("🔹 No images detected, processing as a pure text query.")

    # The same retrieval-augmented message is used with or without images, so
    # a text-only question is still answered from the retrieved context.
    text_input = qwen_processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )

    processor_kwargs = {"text": [text_input], "padding": True, "return_tensors": "pt"}
    if valid_images:
        processor_kwargs["images"] = valid_images  # Provide images only if available
    inputs = qwen_processor(**processor_kwargs).to(qwen_model.device)

    # Check if inputs is correctly formatted before generating
    if not hasattr(inputs, "input_ids"):
        print("⚠️ Error: Inputs are incorrectly formatted. Skipping generation.")
        return "Error: Invalid input formatting", retrieved_pdfs, retrieved_images

    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=1024)

    # `generate` returns prompt + completion. Slice off the prompt tokens and
    # drop special tokens, rather than regex-matching "assistant" in the
    # decoded text (which breaks if the context contains that word and leaves
    # end-of-turn markers in the answer).
    prompt_length = inputs.input_ids.shape[1]
    generated_ids = output_ids[:, prompt_length:]
    response_text = qwen_processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return response_text.strip(), retrieved_pdfs, retrieved_images


print("✅ Qwen RAG V2 system is ready.")


✅ Qwen RAG V2 system is ready.


In [8]:
import time

# Define the test query and flowchart image
# (alternative prompts / images are kept commented out for quick switching)
# user_query = "Can you see what flowchart this is? Can you list out the steps and give an explanation?"
# flowchart_img = "business_flowcharts/flowcharts/14_quality_control.png"
# flowchart_img = "business_flowcharts/flowcharts/17_medical_diagnosis.png"
user_query = "Who classifies the incidents here?"
flowchart_img = "business_flowcharts/flowcharts/6_incident_management.png"
# In this cybersecurity flowchart, explain the difference in action should the threat be labelled as critical or not.

# Measure execution time with a monotonic clock; `time` is already imported
# at the top of this cell, so it is not imported again here.
start_time = time.perf_counter()

# Query Qwen with RAG
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(user_query, flowchart_img)

# Calculate total time taken
end_time = time.perf_counter()
execution_time = end_time - start_time

# Print results
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")


🤖 Qwen's Response (With RAG):
In the Incident Management flowchart provided, the classification of incidents is performed by the support team. Specifically, the "Classify Incident" step involves the support team categorizing the issue based on its nature, urgency, and complexity to determine the appropriate response.<|im_end|>
🔍 Retrieved Documents: ['6_Incident_Management.pdf', '10_Help_Desk_Ticketing.pdf']
🖼️ Retrieved Flowcharts: ['6_incident_management.png', '1_customer_support.png']
⏳ Execution Time: 2.00 seconds


In [9]:
import re

def compare_query_qwen_with_rag(query, flowchart_imgs=None, top_k=2):
    """Ask Qwen to compare the given flowcharts step by step.

    The prompt contains the attached images, the OCR text of each flowchart
    and a fixed comparison instruction followed by ``query``. Retrieval is
    still performed and its results are returned, but the retrieved PDF text
    is not injected into the comparison prompt.

    Args:
        query: The user's comparison request.
        flowchart_imgs: Optional path, or list of paths, to the flowchart
            images to compare.
        top_k: Forwarded to ``retrieve_relevant_data``.

    Returns:
        ``(response_text, retrieved_pdfs, retrieved_images)`` where
        ``response_text`` is the model's answer only (no prompt echo and no
        special tokens).
    """
    if flowchart_imgs and isinstance(flowchart_imgs, str):
        flowchart_imgs = [flowchart_imgs]  # Ensure list format

    # Retrieve relevant PDFs (text) & Flowcharts (images); reported to the caller
    retrieved_pdfs, retrieved_images = retrieve_relevant_data(query, flowchart_imgs, top_k)

    flowchart_texts = []  # one OCR section per flowchart
    descriptions = []     # notes about flowcharts that are only described
    valid_images = []     # images actually attached to the prompt

    for img_path in flowchart_imgs or []:
        # Only fires when the caller passes an already-labelled entry such as
        # those returned in `retrieved_images`; such entries are described,
        # not loaded.
        if img_path.startswith("[NEW FLOWCHART]"):
            descriptions.append(f"⚠️ This is a newly provided flowchart: {img_path.replace('[NEW FLOWCHART] ', '')}")
            flowchart_texts.append(f"Flowchart {len(flowchart_texts) + 1} (User-Provided Flowchart):\n\n[Unable to extract full text, refer to image]")
            continue

        if not os.path.exists(img_path):
            print(f"⚠️ Warning: Image '{img_path}' not found. Skipping.")
            continue

        # Extract OCR text
        extracted_text = extract_text_from_image(img_path)
        flowchart_texts.append(f"Flowchart {len(flowchart_texts) + 1}:\n\n{extracted_text}")

        # Load valid image
        valid_images.append(Image.open(img_path).convert("RGB"))

    # Each flowchart keeps its own numbered section so they can be told apart
    flowchart_section = "\n\n".join(flowchart_texts)

    if valid_images:
        # Images first, then notes, then the comparison instruction + query
        content = [
            *[{"type": "image", "image": img} for img in valid_images],
            *[{"type": "text", "text": desc} for desc in descriptions],
            {"type": "text", "text": f"Below are the details extracted from the flowcharts provided:\n\n{flowchart_section}\n\nCompare the flowcharts carefully. Provide step-by-step differences and explain any structural changes.\n\n{query}"},
        ]
        processor_kwargs = {"images": valid_images, "padding": True}
    else:
        # Nothing to compare visually: fall back to the bare query
        print("🔹 No images detected, processing as a pure text query.")
        content = [{"type": "text", "text": query}]
        processor_kwargs = {}

    text_input = qwen_processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = qwen_processor(
        text=[text_input],  # Ensure list format
        return_tensors="pt",
        **processor_kwargs,
    ).to(qwen_model.device)

    # Check if inputs is correctly formatted before generating
    if not hasattr(inputs, "input_ids"):
        print("⚠️ Error: Inputs are incorrectly formatted. Skipping generation.")
        return "Error: Invalid input formatting", retrieved_pdfs, retrieved_images

    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=1024)

    # `generate` returns prompt + completion. Slice off the prompt tokens and
    # drop special tokens, rather than regex-matching "assistant" in the
    # decoded text (which breaks if the OCR text contains that word and leaves
    # end-of-turn markers in the answer).
    prompt_length = inputs.input_ids.shape[1]
    generated_ids = output_ids[:, prompt_length:]
    response_text = qwen_processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return response_text.strip(), retrieved_pdfs, retrieved_images

print("✅ Comparison RAG IS FIXED I HOPE")



✅ Comparison RAG IS FIXED I HOPE


In [10]:
import time

# Define the test query and multiple flowchart images
# user_query = "Can you compare these flowcharts in detail? What are the difference between the first and the second one. I'm working on the second one to give more detail."
user_query = "Can you compare these flowcharts in detail? What are the difference between the first and the second one. I'm working on the second one to give more detail. Summarize each comparison."

# First entry is an indexed flowchart; the second is a revised version that
# is resolved relative to the notebook's working directory.
flowchart_imgs = [
    "business_flowcharts/flowcharts/6_incident_management.png",
    "6_incident_management_v2.png"
]

# flowchart_imgs = [
#     "business_flowcharts/flowcharts/9_cybersecurity_incident_response.png",
#     "9_cybersecurity_incident_response_v2.png"
# ]

# Measure execution time with a monotonic clock, which is unaffected by
# system clock adjustments during the run.
start_time = time.perf_counter()

# Query Qwen with RAG
qwen_response_rag, retrieved_pdfs, retrieved_images = compare_query_qwen_with_rag(user_query, flowchart_imgs)

# Calculate total time taken
end_time = time.perf_counter()
execution_time = end_time - start_time

# Print results
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")


🆕 Marking '6_incident_management_v2.png' as [NEW FLOWCHART] (not in FAISS index)

🤖 Qwen's Response (With RAG):
Certainly! Let's compare the two flowcharts step by step:

### Flowchart 1:
1. **Start**
2. **User Reports Issue**
3. **Classify Incident**
4. **High Impact?**
   - If Yes: Escalate to Higher Support
   - If No: Assign to Support Team
5. **Resolve Issue**
6. **Issue Resolved?**
   - If Yes: Close Ticket
   - If No: Escalate to Higher Support

### Flowchart 2:
1. **Start**
2. **User Reports Issue**
3. **Initial Triage**
4. **Classify Incident**
5. **Automated Resolution?**
   - If Yes: Resolve Issue
   - If No:
     - High Impact?
       - If Yes: Escalate to Higher Support
       - If No: Assign to Support Team
     - If Yes: Critical?
       - If Yes: Escalate to Higher Support
       - If No: Assign to Support Team
6. **Resolve Issue**
7. **Issue Resolved?**
   - If Yes: Close Ticket
   - If No: Follow-up Needed?

### Differences:

1. **Initial Triage**:
   - **Flowchart 1*

In [11]:
# # Test saved documents in FAISS database
# def view_stored_pdfs():
#     """Display all stored PDFs and their extracted text."""
#     for filename, text in pdf_texts.items():
#         print(f"📄 PDF: {filename}\n")
#         print(f"Extracted Content:\n{text[:1000]}")  # Show first 1000 characters
#         print("="*80)

# view_stored_pdfs()


In [12]:
# import matplotlib.pyplot as plt

# # Test saved images in FAISS database

# def view_stored_flowcharts():
#     """Display all stored flowchart images and their filenames."""
#     for filename in image_filenames:
#         image_path = os.path.join(image_dir, filename)
#         image = Image.open(image_path)
        
#         plt.figure(figsize=(5, 5))
#         plt.imshow(image)
#         plt.axis("off")
#         plt.title(f"🖼️ Flowchart: {filename}")
#         plt.show()

# view_stored_flowcharts()


In [13]:
# def check_pdf_embedding(index=11):
#     """Check stored text embedding by retrieving the closest match for a given PDF."""
#     query_embedding = pdf_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = text_index.search(query_embedding, 1)
    
#     original_pdf = pdf_filenames[index]
#     matched_pdf = pdf_filenames[retrieved_indices[0][0]]
    
#     print(f"📄 Original PDF: {original_pdf}")
#     print(f"🔍 Closest Match: {matched_pdf}")
#     print(f"Similarity Score: {np.dot(pdf_embeddings[index], pdf_embeddings[retrieved_indices[0][0]])}")
    
# check_pdf_embedding()


In [14]:
# import matplotlib.pyplot as plt

# def check_image_embedding(index=3):
#     """Check stored image embedding by retrieving the closest match for a given flowchart."""
#     query_embedding = image_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = image_index.search(query_embedding, 1)
    
#     original_image = image_filenames[index]
#     matched_image = image_filenames[retrieved_indices[0][0]]
    
#     print(f"🖼️ Original Flowchart: {original_image}")
#     print(f"🔍 Closest Match: {matched_image}")
#     print(f"Similarity Score: {np.dot(image_embeddings[index], image_embeddings[retrieved_indices[0][0]])}")
    
#     # Show both images
#     fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    
#     ax[0].imshow(Image.open(os.path.join(image_dir, original_image)))
#     ax[0].set_title("Original Image")
#     ax[0].axis("off")
    
#     ax[1].imshow(Image.open(os.path.join(image_dir, matched_image)))
#     ax[1].set_title("Closest Match")
#     ax[1].axis("off")
    
#     plt.show()

# check_image_embedding()


In [15]:
# # Check if all image embeddings are unique
# unique_embeddings = np.unique(image_embeddings, axis=0)

# if unique_embeddings.shape[0] == 1:
#     print("⚠️ WARNING: All image embeddings are identical! FAISS cannot differentiate them.")
# else:
#     print(f"✅ FAISS has {unique_embeddings.shape[0]} unique image embeddings.")


In [16]:
# # Print shapes of stored image embeddings
# print(f"Stored Image Embeddings Shape: {image_embeddings.shape}")

# # Generate a query embedding for comparison
# query_embedding = get_query_image_embedding(os.path.join(image_dir, image_filenames[0]))  # Use any image as query
# print(f"Query Image Embedding Shape: {query_embedding.shape}")

# # Print first stored embedding vs query embedding
# print(f"\nFirst Stored Embedding:\n{image_embeddings[0][:10]}")  # Print first 10 values
# print(f"\nQuery Embedding:\n{query_embedding[:10]}")  # Print first 10 values


In [17]:
# def debug_faiss_retrieval(index=3):
#     """Check if FAISS is properly differentiating images."""
#     query_embedding = image_embeddings[index].reshape(1, -1)  # Use stored image for retrieval test
#     _, retrieved_indices = image_index.search(query_embedding, 5)  # Top 5 results

#     print(f"🖼️ Original Flowchart: {image_filenames[index]}")
#     print(f"\n🔍 Closest Matches:")
#     for rank, idx in enumerate(retrieved_indices[0]):
#         matched_image = image_filenames[idx]
#         similarity_score = np.dot(image_embeddings[index], image_embeddings[idx])  # Cosine similarity
#         print(f"{rank + 1}. {matched_image} (Score: {similarity_score:.6f})")

# debug_faiss_retrieval()
