# Part 3: Multimodal & Interface Upgrade (Advanced continuation)

## Project Overview
This notebook demonstrates the creation of both Streamlit and Gradio web applications for our multimodal search engine. The search engine can:
- Find images using text descriptions (text-to-image search)
- Find text descriptions using uploaded images (image-to-text search)


## 1. System Information and Setup


In [3]:
# Import necessary libraries
import streamlit as st
import gradio as gr
import torch
import torchvision.transforms as transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import pandas as pd
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import platform
import psutil
import sys
from datetime import datetime
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Display comprehensive system information
print("=" * 80)
print("🔍 MULTIMODAL INTERFACE - SYSTEM STATUS")
print("=" * 80)
print(f"📅 Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# System Information
print("🖥️  SYSTEM INFORMATION")
print("-" * 40)
print(f"Platform: {platform.platform()}")
print(f"Architecture: {platform.architecture()[0]}")
print(f"Processor: {platform.processor()}")
print(f"Python Version: {sys.version.split()[0]}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Streamlit Version: {st.__version__}")
print()

# Hardware Information
print("⚡ HARDWARE INFORMATION")
print("-" * 40)
print(f"CPU Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
print(f"RAM: {psutil.virtual_memory().total / (1024**3):.1f} GB total, {psutil.virtual_memory().available / (1024**3):.1f} GB available")
print(f"RAM Usage: {psutil.virtual_memory().percent:.1f}%")

# GPU Information
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"cuDNN Version: {torch.backends.cudnn.version()}")
else:
    print("GPU: Not available (using CPU)")
print()

# Project Status
print("📁 PROJECT STATUS")
print("-" * 40)

# Check if data exists
data_path = '../data/'
if os.path.exists(data_path):
    print("✅ Data directory found")
    if os.path.exists('../data/images/'):
        image_files = [f for f in os.listdir('../data/images/') if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        image_count = len(image_files)
        print(f"✅ {image_count} images found")
        if image_count > 0:
            total_size = sum(os.path.getsize(os.path.join('../data/images/', f)) for f in image_files) / (1024**2)
            print(f"   Total image size: {total_size:.1f} MB")
    if os.path.exists('../data/captions.txt'):
        with open('../data/captions.txt', 'r') as f:
            caption_count = sum(1 for line in f)
        caption_size = os.path.getsize('../data/captions.txt') / 1024
        print(f"✅ {caption_count} captions found ({caption_size:.1f} KB)")
    if os.path.exists('../data/Flickr8k.token.txt'):
        token_size = os.path.getsize('../data/Flickr8k.token.txt') / 1024
        print(f"✅ Flickr8k token file found ({token_size:.1f} KB)")
else:
    print("❌ Data directory not found!")

# Check if embeddings exist
embeddings_path = '../embeddings/'
if os.path.exists(embeddings_path):
    print("✅ Embeddings directory found")
    if os.path.exists('../embeddings/image_embeddings.npy'):
        image_emb_size = os.path.getsize('../embeddings/image_embeddings.npy') / (1024**2)
        print(f"✅ Image embeddings found ({image_emb_size:.1f} MB)")
    if os.path.exists('../embeddings/text_embeddings.npy'):
        text_emb_size = os.path.getsize('../embeddings/text_embeddings.npy') / (1024**2)
        print(f"✅ Text embeddings found ({text_emb_size:.1f} MB)")
    if os.path.exists('../embeddings/metadata.csv'):
        metadata_size = os.path.getsize('../embeddings/metadata.csv') / 1024
        print(f"✅ Metadata found ({metadata_size:.1f} KB)")
    if os.path.exists('../embeddings/model_info.json'):
        print("✅ Model info found")
else:
    print("❌ Embeddings directory not found - please run Part 1 first!")

print()
print("🚀 READY TO BUILD MULTIMODAL INTERFACE")
print("=" * 80)


🔍 MULTIMODAL INTERFACE - SYSTEM STATUS
📅 Timestamp: 2025-09-10 21:19:50

🖥️  SYSTEM INFORMATION
----------------------------------------
Platform: Windows-11-10.0.26100-SP0
Architecture: 64bit
Processor: Intel64 Family 6 Model 151 Stepping 5, GenuineIntel
Python Version: 3.12.9
PyTorch Version: 2.8.0+cpu
Streamlit Version: 1.49.1

⚡ HARDWARE INFORMATION
----------------------------------------
CPU Cores: 6 physical, 12 logical
RAM: 15.8 GB total, 2.0 GB available
RAM Usage: 87.7%
Device: cpu
GPU: Not available (using CPU)

📁 PROJECT STATUS
----------------------------------------
✅ Data directory found
✅ 8091 images found
   Total image size: 1063.1 MB
✅ 40460 captions found (3355.2 KB)
✅ Flickr8k token file found (3315.7 KB)
✅ Embeddings directory found
✅ Image embeddings found (1.0 MB)
✅ Text embeddings found (1.0 MB)
✅ Metadata found (58.3 KB)
✅ Model info found

🚀 READY TO BUILD MULTIMODAL INTERFACE


## 2. Load Model and Data


In [4]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🖥️  Using device: {device}")

# Load CLIP model
print("🔄 Loading CLIP model...")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
print("✅ CLIP model loaded successfully!")

# Load embeddings data
print("🔄 Loading embeddings data...")
image_embeddings = np.load('../embeddings/image_embeddings.npy')
text_embeddings = np.load('../embeddings/text_embeddings.npy')
metadata = pd.read_csv('../embeddings/metadata.csv')

# Load model info
with open('../embeddings/model_info.json', 'r') as f:
    model_info = json.load(f)

print("✅ Embeddings data loaded successfully!")
print(f"📊 Image embeddings shape: {image_embeddings.shape}")
print(f"📊 Text embeddings shape: {text_embeddings.shape}")
print(f"📊 Metadata shape: {metadata.shape}")
print(f"📊 Model info: {model_info}")


🖥️  Using device: cpu
🔄 Loading CLIP model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ CLIP model loaded successfully!
🔄 Loading embeddings data...
✅ Embeddings data loaded successfully!
📊 Image embeddings shape: (500, 512)
📊 Text embeddings shape: (500, 512)
📊 Metadata shape: (500, 3)
📊 Model info: {'model_name': 'openai/clip-vit-base-patch32', 'embedding_dim': 512, 'num_samples': 500, 'num_images': 8091, 'total_embeddings': 500, 'device_used': 'cpu', 'processing_date': '2025-09-10', 'dataset': 'Flickr8k'}


## 3. Search Functions


In [5]:
# Text-to-Image Search Function
def text_to_image_search(query_text, top_k=5):
    """Search for images based on text query"""
    # Generate embedding for text query
    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        query_embedding = model.get_text_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all image embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), image_embeddings)[0]
    
    # Get top-k most similar images
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

# Image-to-Text Search Function
def image_to_text_search(uploaded_image, top_k=5):
    """Search for text descriptions based on uploaded image"""
    # Generate embedding for uploaded image
    inputs = processor(images=uploaded_image, return_tensors="pt").to(device)
    
    with torch.no_grad():
        query_embedding = model.get_image_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all text embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), text_embeddings)[0]
    
    # Get top-k most similar text descriptions
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

print("✅ Search functions defined successfully!")


✅ Search functions defined successfully!


## 4. Test Search Functions


In [6]:
# Test text-to-image search
print("🔍 Testing text-to-image search...")
test_query = "a dog playing"
results = text_to_image_search(test_query, top_k=3)

print(f"Query: '{test_query}'")
print(f"Found {len(results)} results:")
for i, result in enumerate(results, 1):
    print(f"{i}. Similarity: {result['similarity']:.3f}")
    print(f"   Caption: {result['caption']}")
    print(f"   Image ID: {result['image_id']}")
    print()

print("✅ Text-to-image search test completed!")


🔍 Testing text-to-image search...
Query: 'a dog playing'
Found 3 results:
1. Similarity: 0.324
   Caption: A black and white dog catches a toy in midair .
   Image ID: 1072153132_53d2bb1b60

2. Similarity: 0.324
   Caption: A dog leaps while chasing a tennis ball through a grassy field .
   Image ID: 1072153132_53d2bb1b60

3. Similarity: 0.324
   Caption: A dog and a tennis ball .
   Image ID: 1072153132_53d2bb1b60

✅ Text-to-image search test completed!


## 5. Create Standalone Streamlit App


In [7]:
# Create the complete Streamlit app code
streamlit_code = '''
import streamlit as st
import torch
import torchvision.transforms as transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import pandas as pd
import os
import json
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load CLIP model
@st.cache_resource
def load_clip_model():
    """Load CLIP model and processor"""
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    return model, processor

# Load embeddings data
@st.cache_data
def load_embeddings_data():
    """Load pre-computed embeddings and metadata"""
    # Load embeddings
    image_embeddings = np.load('../embeddings/image_embeddings.npy')
    text_embeddings = np.load('../embeddings/text_embeddings.npy')
    
    # Load metadata
    metadata = pd.read_csv('../embeddings/metadata.csv')
    
    # Load model info
    with open('../embeddings/model_info.json', 'r') as f:
        model_info = json.load(f)
    
    return image_embeddings, text_embeddings, metadata, model_info

# Load model and data
model, processor = load_clip_model()
image_embeddings, text_embeddings, metadata, model_info = load_embeddings_data()

# Text-to-Image Search Function
def text_to_image_search(query_text, top_k=5):
    """Search for images based on text query"""
    # Generate embedding for text query
    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        query_embedding = model.get_text_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all image embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), image_embeddings)[0]
    
    # Get top-k most similar images
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

# Image-to-Text Search Function
def image_to_text_search(uploaded_image, top_k=5):
    """Search for text descriptions based on uploaded image"""
    # Generate embedding for uploaded image
    inputs = processor(images=uploaded_image, return_tensors="pt").to(device)
    
    with torch.no_grad():
        query_embedding = model.get_image_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all text embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), text_embeddings)[0]
    
    # Get top-k most similar text descriptions
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

# Main Streamlit app
def main():
    st.set_page_config(
        page_title="🔍 Search Engine",
        page_icon="🔍",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    
    # Header
    st.title("🔍 Search Engine")
    st.markdown("A powerful multimodal search engine using OpenAI CLIP")
    
    # Sidebar
    st.sidebar.header("Search Options")
    
    # Search type selection
    search_type = st.sidebar.radio(
        "Choose search type:",
        ["Text-to-Image Search", "Image-to-Text Search"]
    )
    
    # Number of results
    top_k = st.sidebar.slider(
        "Number of results:",
        min_value=1,
        max_value=20,
        value=5,
        help="Number of top results to display"
    )
    
    # Popular searches
    st.sidebar.markdown("---")
    st.sidebar.markdown("### 🔥 Popular Searches")
    
    popular_searches = [
        "dog playing", "children smiling", "red car", "food cooking",
        "person running", "cat sleeping", "blue sky", "water beach",
        "house building", "tree nature", "person walking", "animal pet"
    ]
    
    # Create clickable search suggestions
    for i, search in enumerate(popular_searches):
        if st.sidebar.button(f"🔍 {search}", key=f"popular_{i}"):
            st.session_state.popular_search = search
            st.session_state.auto_search = True
    
    # Display dataset info
    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 Dataset Information")
    
    # Get values and format properly
    num_images = model_info.get('num_images', 'Unknown')
    num_embeddings = model_info.get('total_embeddings', model_info.get('num_samples', 'Unknown'))
    embedding_dim = model_info.get('embedding_dim', 'Unknown')
    model_name = model_info.get('model_name', 'Unknown')
    dataset = model_info.get('dataset', 'Unknown')
    processing_date = model_info.get('processing_date', datetime.now().strftime('%Y-%m-%d'))
    
    # Format numbers properly
    images_text = f"{num_images:,}" if isinstance(num_images, int) else str(num_images)
    embeddings_text = f"{num_embeddings:,}" if isinstance(num_embeddings, int) else str(num_embeddings)
    model_display = model_name.split('/')[-1] if '/' in model_name else model_name
    
    st.sidebar.metric("Total Images", images_text)
    st.sidebar.metric("Total Embeddings", embeddings_text)
    st.sidebar.metric("Embedding Dimension", f"{embedding_dim}D")
    st.sidebar.metric("Model", model_display)
    st.sidebar.metric("Dataset", dataset)
    st.sidebar.metric("Processing Date", processing_date)
    
    # Check if this is a demo dataset
    num_images = model_info.get('num_images', len(metadata))
    if isinstance(num_images, int) and num_images < 1000:
        st.warning(f"⚠️ **Demo Mode**: You're using a small subset ({num_images:,} images) of the full Flickr8k dataset. For production use, run the full dataset processing in Part 1 to get all 8,091 images.")
    
    # Main content area
    if search_type == "Text-to-Image Search":
        st.header("🔤 Text-to-Image Search")
        st.markdown("Enter a text description to find similar images:")
        
        # Search suggestions
        st.markdown("#### 💡 Search Tips")
        col1, col2 = st.columns(2)
        
        with col1:
            st.markdown("**Try searching for:**")
            st.markdown("• Animals: 'dog', 'cat', 'bird'")
            st.markdown("• Activities: 'playing', 'running', 'cooking'")
            st.markdown("• Objects: 'car', 'house', 'food'")
            st.markdown("• Emotions: 'smiling', 'happy', 'sad'")
        
        with col2:
            st.markdown("**Example queries:**")
            if st.button("🐕 A dog playing", key="example1"):
                st.session_state.example_query = "a dog playing"
                st.session_state.auto_search = True
            if st.button("👶 Children smiling", key="example2"):
                st.session_state.example_query = "children smiling"
                st.session_state.auto_search = True
            if st.button("🚗 Red car", key="example3"):
                st.session_state.example_query = "red car"
                st.session_state.auto_search = True
            if st.button("🍕 Food cooking", key="example4"):
                st.session_state.example_query = "food cooking"
                st.session_state.auto_search = True
        
        # Text input with better placeholder
        query_text = st.text_input(
            "🔍 Enter your search query:",
            placeholder="Describe what you're looking for... (e.g., 'a dog playing in the park', 'children smiling', 'red car on street')",
            help="💡 Be specific! Try describing objects, actions, colors, or emotions. The more descriptive, the better the results!",
            value=st.session_state.get('example_query', st.session_state.get('popular_search', '')),
            key="search_input"
        )
        
        # Clear example queries after use
        if 'example_query' in st.session_state:
            del st.session_state.example_query
        if 'popular_search' in st.session_state:
            del st.session_state.popular_search
        
        # Check if we should auto-search (from popular searches or example queries)
        should_search = st.session_state.get('auto_search', False)
        if should_search:
            st.session_state.auto_search = False  # Reset the flag
            # Use example query if available, otherwise use popular search
            query_text = st.session_state.get('example_query', st.session_state.get('popular_search', query_text))
        
        if st.button("🔍 Search Images", type="primary") or should_search:
            if query_text:
                with st.spinner("Searching for images..."):
                    results = text_to_image_search(query_text, top_k)
                
                if results:
                    st.success(f"Found {len(results)} results for: '{query_text}'")
                    
                    # Display results in columns
                    cols = st.columns(min(3, len(results)))
                    for i, result in enumerate(results):
                        with cols[i % 3]:
                            try:
                                image_path = result['image_path']
                                # Fix path - remove ../ if present
                                if image_path.startswith('../'):
                                    image_path = image_path[3:]  # Remove ../
                                
                                if os.path.exists(image_path):
                                    image = Image.open(image_path)
                                    st.image(image, caption=f"Similarity: {result['similarity']:.3f}", use_container_width=True)
                                    
                                    # Display details
                                    st.markdown(f"**Image ID:** {result['image_id']}")
                                    st.markdown(f"**Caption:** {result['caption']}")
                                    st.markdown(f"**Similarity:** {result['similarity']:.3f}")
                                else:
                                    st.error(f"Image not found: {image_path}")
                            except Exception as e:
                                st.error(f"Error loading image: {e}")
                    else:
                        st.warning("No results found. Try a different search query.")
            else:
                st.warning("Please enter a search query.")
    
    else:  # Image-to-Text Search
        st.header("🖼️ Image-to-Text Search")
        st.markdown("Upload an image to find similar text descriptions:")
        
        # Upload guidance
        st.markdown("#### 📋 Upload Guidelines")
        col1, col2 = st.columns(2)
        
        with col1:
            st.markdown("**Supported formats:**")
            st.markdown("• JPG, JPEG")
            st.markdown("• PNG")
            st.markdown("• BMP, GIF")
        
        with col2:
            st.markdown("**Best results with:**")
            st.markdown("• Clear, well-lit images")
            st.markdown("• Single main subject")
            st.markdown("• Good contrast")
        
        # Image upload
        uploaded_file = st.file_uploader(
            "📁 Choose an image file:",
            type=['jpg', 'jpeg', 'png', 'bmp', 'gif'],
            help="💡 Upload a clear image with a main subject for best search results!",
            label_visibility="collapsed"
        )
        
        if uploaded_file is not None:
            # Display uploaded image
            uploaded_image = Image.open(uploaded_file)
            st.image(uploaded_image, caption="Uploaded Image", use_container_width=True)
            
            if st.button("🔍 Search Descriptions", type="primary"):
                with st.spinner("Searching for similar descriptions..."):
                    results = image_to_text_search(uploaded_image, top_k)
                
                if results:
                    st.success(f"Found {len(results)} similar descriptions")
                    
                    # Display results in columns
                    cols = st.columns(min(3, len(results)))
                    for i, result in enumerate(results):
                        with cols[i % 3]:
                            try:
                                image_path = result['image_path']
                                # Fix path - remove ../ if present
                                if image_path.startswith('../'):
                                    image_path = image_path[3:]  # Remove ../
                                
                                if os.path.exists(image_path):
                                    original_image = Image.open(image_path)
                                    st.image(original_image, caption="Original Image", use_container_width=True)
                                else:
                                    st.error(f"Original image not found: {image_path}")
                            except Exception as e:
                                st.error(f"Error loading original image: {e}")
                            
                            # Display details
                            st.markdown(f"**Image ID:** {result['image_id']}")
                            st.markdown(f"**Caption:** {result['caption']}")
                            st.markdown(f"**Similarity:** {result['similarity']:.3f}")
                else:
                    st.warning("No results found. Try a different image.")

if __name__ == "__main__":
    main()
'''

# Write Streamlit app to file
with open('../streamlit_app.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_code)

print("✅ Streamlit app created successfully!")
print("📁 File saved as: ../streamlit_app.py")
print("🚀 To run: streamlit run ../streamlit_app.py")


✅ Streamlit app created successfully!
📁 File saved as: ../streamlit_app.py
🚀 To run: streamlit run ../streamlit_app.py


## 6. Create Standalone Gradio App


In [8]:
# Create the complete Gradio app code
gradio_code = '''
import gradio as gr
import torch
import torchvision.transforms as transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import pandas as pd
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load CLIP model
@gr.cache()
def load_clip_model():
    """Load CLIP model and processor"""
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    return model, processor

# Load embeddings data
@gr.cache()
def load_embeddings_data():
    """Load pre-computed embeddings and metadata"""
    # Load embeddings
    image_embeddings = np.load('../embeddings/image_embeddings.npy')
    text_embeddings = np.load('../embeddings/text_embeddings.npy')
    
    # Load metadata
    metadata = pd.read_csv('../embeddings/metadata.csv')
    
    # Load model info
    with open('../embeddings/model_info.json', 'r') as f:
        model_info = json.load(f)
    
    return image_embeddings, text_embeddings, metadata, model_info

# Load model and data
model, processor = load_clip_model()
image_embeddings, text_embeddings, metadata, model_info = load_embeddings_data()

# Text-to-Image Search Function
def text_to_image_search(query_text, top_k=5):
    """Search for images based on text query"""
    # Generate embedding for text query
    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        query_embedding = model.get_text_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all image embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), image_embeddings)[0]
    
    # Get top-k most similar images
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

# Image-to-Text Search Function
def image_to_text_search(uploaded_image, top_k=5):
    """Search for text descriptions based on uploaded image"""
    # Generate embedding for uploaded image
    inputs = processor(images=uploaded_image, return_tensors="pt").to(device)
    
    with torch.no_grad():
        query_embedding = model.get_image_features(**inputs)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)
    
    # Calculate similarities with all text embeddings
    similarities = cosine_similarity(query_embedding.cpu().numpy(), text_embeddings)[0]
    
    # Get top-k most similar text descriptions
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        result = {
            'image_id': metadata.iloc[idx]['image_id'],
            'image_path': metadata.iloc[idx]['image_path'],
            'caption': metadata.iloc[idx]['caption'],
            'similarity': similarities[idx]
        }
        results.append(result)
    
    return results

# Text-to-Image Search Interface
def search_images(query, num_results):
    """Gradio interface for text-to-image search"""
    if not query.strip():
        return [], "Please enter a search query."
    
    try:
        results = text_to_image_search(query, num_results)
        
        if not results:
            return [], "No results found. Try a different search query."
        
        # Prepare images and captions for display
        images = []
        captions = []
        
        for result in results:
            image_path = result['image_path']
            # Fix path - remove ../ if present
            if image_path.startswith('../'):
                image_path = image_path[3:]  # Remove ../
            
            if os.path.exists(image_path):
                images.append(image_path)
                captions.append(f"Similarity: {result['similarity']:.3f}\\nCaption: {result['caption']}")
            else:
                images.append(None)
                captions.append(f"Image not found: {image_path}")
        
        return images, f"Found {len(results)} results for: '{query}'"
    
    except Exception as e:
        return [], f"Error during search: {str(e)}"

# Image-to-Text Search Interface
def search_descriptions(image, num_results):
    """Gradio interface for image-to-text search"""
    if image is None:
        return [], "Please upload an image."
    
    try:
        results = image_to_text_search(image, num_results)
        
        if not results:
            return [], "No results found. Try a different image."
        
        # Prepare images and captions for display
        images = []
        captions = []
        
        for result in results:
            image_path = result['image_path']
            # Fix path - remove ../ if present
            if image_path.startswith('../'):
                image_path = image_path[3:]  # Remove ../
            
            if os.path.exists(image_path):
                images.append(image_path)
                captions.append(f"Similarity: {result['similarity']:.3f}\\nCaption: {result['caption']}")
            else:
                images.append(None)
                captions.append(f"Image not found: {image_path}")
        
        return images, f"Found {len(results)} similar descriptions"
    
    except Exception as e:
        return [], f"Error during search: {str(e)}"

# Create Gradio interface
def create_gradio_app():
    """Create the Gradio web application"""
    
    # Project description
    description = """
    # 🔍 Multimodal Search Engine
    
    A powerful search engine that can find images using text descriptions and find text descriptions using images.
    
    **Technology Stack:**
    - **Model**: OpenAI CLIP (Contrastive Language-Image Pre-training)
    - **Framework**: Gradio for web interface
    - **Dataset**: Flickr8k (8,091 images with captions)
    - **Embeddings**: 512-dimensional vector representations
    - **Similarity**: Cosine similarity for matching
    
    **Features:**
    - Text-to-Image Search: Describe what you're looking for
    - Image-to-Text Search: Upload an image to find similar descriptions
    - Real-time similarity scoring
    - Interactive web interface
    """
    
    # Popular search suggestions
    popular_searches = [
        "dog playing", "children smiling", "red car", "food cooking",
        "person running", "cat sleeping", "blue sky", "water beach"
    ]
    
    with gr.Blocks(title="🔍 Search Engine", theme=gr.themes.Soft()) as app:
        gr.Markdown(description)
        
        # Dataset information
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(f"""
                ### 📊 Dataset Information
                - **Total Images**: {model_info.get('num_images', 'Unknown'):,}
                - **Total Embeddings**: {model_info.get('total_embeddings', model_info.get('num_samples', 'Unknown'):,}
                - **Embedding Dimension**: {model_info.get('embedding_dim', 'Unknown')}D
                - **Model**: {model_info.get('model_name', 'Unknown').split('/')[-1]}
                - **Dataset**: {model_info.get('dataset', 'Unknown')}
                - **Processing Date**: {model_info.get('processing_date', 'Unknown')}
                """)
            
            with gr.Column(scale=1):
                gr.Markdown(f"""
                ### 🔥 Popular Searches
                Click any suggestion to search:
                """)
                # Create clickable search suggestions
                for i, search in enumerate(popular_searches):
                    if i % 2 == 0:
                        with gr.Row():
                            gr.Button(f"🔍 {search}", size="sm").click(
                                lambda s=search: s, outputs=gr.Textbox(visible=False)
                            ).then(
                                search_images, 
                                inputs=[gr.Textbox(value=search, visible=False), gr.Slider(1, 20, 5)],
                                outputs=[gr.Gallery(), gr.Textbox()]
                            )
                    else:
                        gr.Button(f"🔍 {search}", size="sm").click(
                            lambda s=search: s, outputs=gr.Textbox(visible=False)
                        ).then(
                            search_images,
                            inputs=[gr.Textbox(value=search, visible=False), gr.Slider(1, 20, 5)],
                            outputs=[gr.Gallery(), gr.Textbox()]
                        )
        
        # Main search interface
        with gr.Tabs():
            # Text-to-Image Search Tab
            with gr.Tab("🔤 Text-to-Image Search"):
                gr.Markdown("Enter a text description to find similar images:")
                
                with gr.Row():
                    with gr.Column(scale=3):
                        text_query = gr.Textbox(
                            label="Search Query",
                            placeholder="e.g., 'a dog playing in the park' or 'children smiling'",
                            info="Describe what you're looking for in the images"
                        )
                        num_results_text = gr.Slider(
                            label="Number of Results",
                            minimum=1,
                            maximum=20,
                            value=5,
                            step=1
                        )
                        search_btn = gr.Button("🔍 Search Images", variant="primary")
                    
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### 💡 Search Tips
                        **Try searching for:**
                        - Animals: 'dog', 'cat', 'bird'
                        - Activities: 'playing', 'running', 'cooking'
                        - Objects: 'car', 'house', 'food'
                        - Emotions: 'smiling', 'happy', 'sad'
                        """)
                
                # Results
                text_results = gr.Gallery(
                    label="Search Results",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    rows=2,
                    object_fit="contain",
                    height="auto"
                )
                text_status = gr.Textbox(label="Status", interactive=False)
                
                # Connect search button
                search_btn.click(
                    search_images,
                    inputs=[text_query, num_results_text],
                    outputs=[text_results, text_status]
                )
            
            # Image-to-Text Search Tab
            with gr.Tab("🖼️ Image-to-Text Search"):
                gr.Markdown("Upload an image to find similar text descriptions:")
                
                with gr.Row():
                    with gr.Column(scale=3):
                        image_input = gr.Image(
                            label="Upload Image",
                            type="pil",
                            info="Upload a clear image with a main subject for best results"
                        )
                        num_results_image = gr.Slider(
                            label="Number of Results",
                            minimum=1,
                            maximum=20,
                            value=5,
                            step=1
                        )
                        search_img_btn = gr.Button("🔍 Search Descriptions", variant="primary")
                    
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### 📋 Upload Guidelines
                        **Supported formats:**
                        - JPG, JPEG
                        - PNG
                        - BMP, GIF
                        
                        **Best results with:**
                        - Clear, well-lit images
                        - Single main subject
                        - Good contrast
                        """)
                
                # Results
                image_results = gr.Gallery(
                    label="Search Results",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    rows=2,
                    object_fit="contain",
                    height="auto"
                )
                image_status = gr.Textbox(label="Status", interactive=False)
                
                # Connect search button
                search_img_btn.click(
                    search_descriptions,
                    inputs=[image_input, num_results_image],
                    outputs=[image_results, image_status]
                )
        
        # Footer
        gr.Markdown("""
        ---
        **🔍 Search Engine** - Built with Gradio and OpenAI CLIP
        """)
    
    return app

# Create and launch the app
if __name__ == "__main__":
    app = create_gradio_app()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
'''

# Write Gradio app to file
with open('../gradio_app.py', 'w', encoding='utf-8') as f:
    f.write(gradio_code)

print("✅ Gradio app created successfully!")
print("📁 File saved as: ../gradio_app.py")
print("🚀 To run: python ../gradio_app.py")


✅ Gradio app created successfully!
📁 File saved as: ../gradio_app.py
🚀 To run: python ../gradio_app.py


## 7. Framework Comparison and Launch Instructions


In [9]:
# Framework Comparison
print("🔍 MULTIMODAL SEARCH ENGINE - FRAMEWORK COMPARISON")
print("=" * 60)

print("\n📊 STREAMLIT vs GRADIO")
print("-" * 30)
print("| Feature                | Streamlit | Gradio |")
print("|------------------------|-----------|--------|")
print("| Learning Curve         | Easier    | Medium |")
print("| UI Style               | Traditional| Modern|")
print("| Layout                 | Column-based| Tab-based|")
print("| Interactions           | Form-based| Event-driven|")
print("| Customization          | High      | Medium |")
print("| Performance            | Good      | Good   |")
print("| Community              | Large     | Growing|")
print("| Documentation          | Excellent | Good   |")
print("| Deployment             | Easy      | Easy   |")

print("\n🚀 LAUNCH INSTRUCTIONS")
print("-" * 30)
print("1. STREAMLIT APP (Port 8501):")
print("   streamlit run streamlit_app.py")
print()
print("2. GRADIO APP (Port 7860):")
print("   python gradio_app.py")
print()
print("3. RUN BOTH SIMULTANEOUSLY:")
print("   - Open two terminal windows")
print("   - Run each command in separate terminal")
print("   - Access Streamlit at: http://localhost:8501")
print("   - Access Gradio at: http://localhost:7860")

print("\n✅ BOTH APPS INCLUDE:")
print("-" * 30)
print("• Text-to-Image Search")
print("• Image-to-Text Search") 
print("• Popular search suggestions")
print("• Dataset information display")
print("• Search tips and guidelines")
print("• Real-time similarity scoring")
print("• Responsive image galleries")
print("• Error handling and validation")

print("\n🎯 RECOMMENDATION:")
print("-" * 30)
print("• Use Streamlit for: Traditional web apps, data science projects")
print("• Use Gradio for: AI demos, quick prototypes, modern interfaces")
print("• Both are excellent choices for this project!")


🔍 MULTIMODAL SEARCH ENGINE - FRAMEWORK COMPARISON

📊 STREAMLIT vs GRADIO
------------------------------
| Feature                | Streamlit | Gradio |
|------------------------|-----------|--------|
| Learning Curve         | Easier    | Medium |
| UI Style               | Traditional| Modern|
| Layout                 | Column-based| Tab-based|
| Interactions           | Form-based| Event-driven|
| Customization          | High      | Medium |
| Performance            | Good      | Good   |
| Community              | Large     | Growing|
| Documentation          | Excellent | Good   |
| Deployment             | Easy      | Easy   |

🚀 LAUNCH INSTRUCTIONS
------------------------------
1. STREAMLIT APP (Port 8501):
   streamlit run streamlit_app.py

2. GRADIO APP (Port 7860):
   python gradio_app.py

3. RUN BOTH SIMULTANEOUSLY:
   - Open two terminal windows
   - Run each command in separate terminal
   - Access Streamlit at: http://localhost:8501
   - Access Gradio at: http://localhos