In [None]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
# This may fail on some runtimes but try:
!nvidia-smi || true



In [None]:
!pip install -q transformers torchvision timm accelerate Pillow



In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Drive files can now be read and written under /content/drive/MyDrive/


In [None]:
!unzip -q "/content/drive/MyDrive/Test images.zip" -d .
!ls -la test_images | sed -n '1,40p'


In [None]:
# Recursive listing of the extracted archive, to see its folder layout.
!ls -R test_images


In [None]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch, os
import matplotlib.pyplot as plt

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# BLIP captioning checkpoint. The processor turns a PIL image into model inputs
# and decodes generated token ids back to text.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

test_folder = "test_images/Test images"

for fname in sorted(os.listdir(test_folder)):
    path = os.path.join(test_folder, fname)
    if not os.path.isfile(path):  # only process files, skip folders
        continue

    # os.listdir returns every entry, not just pictures. Pillow signals an
    # undecodable file with an OSError subclass; report it and carry on rather
    # than aborting the whole run on one stray file.
    try:
        with Image.open(path) as source_image:
            image = source_image.convert("RGB")
    except OSError as err:
        print(f"\nSkipping {fname}: not a readable image ({err})")
        continue

    # Unprompted captioning, capped at 50 newly generated tokens.
    inputs = processor(images=image, return_tensors="pt").to(device)
    out = model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"\n{fname} -> {caption}")
    plt.imshow(image)
    plt.axis('off')
    plt.show()


In [None]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch, glob
import matplotlib.pyplot as plt

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Recursively find all candidate files. glob gives no ordering guarantee, so
# sort to make "the first 10" the same ten files on every run.
image_paths = sorted(glob.glob("test_images/Test images/**/*.*", recursive=True))
print(f"Found {len(image_paths)} images")

# Loop through first 10 images
for path in image_paths[:10]:
    # The "*.*" pattern also matches non-image files and dotted folder names;
    # Pillow (and the OS) report those as OSError, so skip them with a notice.
    try:
        with Image.open(path) as source_image:
            image = source_image.convert("RGB")
    except OSError as err:
        print(f"\nSkipping {path}: not a readable image ({err})")
        continue

    # Prompted captioning: generation continues from the text "A photo of".
    inputs = processor(images=image, text="A photo of", return_tensors="pt").to(device)
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        repetition_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )
    caption = processor.decode(out[0], skip_special_tokens=True)

    print(f"\n📷 {path} -> {caption}")
    plt.imshow(image)
    plt.axis('off')
    plt.show()



In [None]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch, glob
import matplotlib.pyplot as plt

import re

# ---------- CATEGORY MAPPING ----------
# Categories are tested in the order written here and the first match wins,
# so a caption mentioning both a fruit and a spice is labelled "Fruit".
CATEGORY_KEYWORDS = {
    "Fruit": ["apple", "banana", "mango", "orange", "grape", "tomato", "pineapple", "papaya", "watermelon", "strawberry", "citrus"],
    "Vegetable": ["carrot", "onion", "potato", "broccoli", "lettuce", "spinach", "cauliflower", "beans", "peas"],
    "Spice": ["mustard", "cardamom", "cinnamon", "clove", "nutmeg", "fennel", "fenugreek", "bay leaf", "coriander", "turmeric", "pepper", "chilli"],
    "Dry Fruit": ["almond", "cashew", "raisin", "pistachio", "walnut", "hazelnut", "dates", "fig", "apricot"],
    "Pulse": ["lentil", "gram", "bean", "dhal", "dal", "pea"]
}

def classify_caption(caption):
    """Map a caption to the first category whose keyword it mentions.

    Keywords are matched as whole words, with an optional "s"/"es" plural
    suffix, instead of as raw substrings. Substring matching mislabels
    captions such as "a man speaking" ("pea") or "a pair of sandals" ("dal").

    Args:
        caption: Caption text; matching is case-insensitive.

    Returns:
        A key of CATEGORY_KEYWORDS, or "Unknown" when nothing matches or the
        caption is empty/None.
    """
    if not caption:
        return "Unknown"
    caption_lower = caption.lower()
    for category, keywords in CATEGORY_KEYWORDS.items():
        for keyword in keywords:
            # \b anchors both ends to word boundaries; (?:e?s)? keeps plain
            # plurals ("bananas", "tomatoes") matching their singular keyword.
            if re.search(rf"\b{re.escape(keyword)}(?:e?s)?\b", caption_lower):
                return category
    return "Unknown"

# ---------- SETUP MODEL ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ---------- LOAD IMAGES ----------
# Sorted so the processing order, and therefore the log file, is the same on every run.
image_paths = sorted(glob.glob("test_images/Test images/**/*.*", recursive=True))
print(f"Found {len(image_paths)} images")

# ---------- PROCESS & LOG ----------
# One "path | caption | category" line per image; the file is recreated on each run.
with open("vlm_results.txt", "w", encoding="utf-8") as log_file:
    for path in image_paths:
        # "*.*" also matches non-image files; Pillow reports those as OSError.
        # Skip them with a notice instead of losing the rest of the batch.
        try:
            with Image.open(path) as source_image:
                image = source_image.convert("RGB")
        except OSError as err:
            print(f"\nSkipping {path}: not a readable image ({err})")
            continue

        # Prompt-based captioning to avoid loops
        inputs = processor(images=image, text="A photo of", return_tensors="pt").to(device)
        out = model.generate(**inputs, max_new_tokens=50, repetition_penalty=2.0, num_beams=5, early_stopping=True)
        caption = processor.decode(out[0], skip_special_tokens=True)

        # Classify
        category = classify_caption(caption)

        # Print & show
        print(f"\n📷 {path}")
        print(f"🧠 Caption: {caption}")
        print(f"🏷️ Category: {category}")
        plt.imshow(image)
        plt.axis('off')
        plt.title(f"{category}: {caption}")
        plt.show()

        # Log to file
        log_file.write(f"{path} | {caption} | {category}\n")

In [None]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, AutoModelForCausalLM
import torch, glob
import matplotlib.pyplot as plt

import re

# ---------- CATEGORY MAPPING with plurals & synonyms ----------
# Categories are tested in the order written here and the first match wins.
# "bean(s)" and "pea(s)" are listed under both Vegetable and Pulse, so they
# always resolve to Vegetable.
# NOTE(review): "cardamoa" looks like a typo of "cardamom"; it may have been
# added to catch a misspelling seen in model output - confirm before removing.
CATEGORY_KEYWORDS = {
    "Fruit": ["apple", "apples", "banana", "bananas", "mango", "mangos", "mangoes", "orange", "oranges",
              "grape", "grapes", "tomato", "tomatoes", "pineapple", "papaya", "watermelon", "strawberry", "strawberries", "citrus", "lemon", "lemons", "lime", "limes"],
    "Vegetable": ["carrot", "carrots", "onion", "onions", "potato", "potatoes", "broccoli", "lettuce", "spinach", "cauliflower", "beans", "bean", "peas", "pea"],
    "Spice": ["mustard", "cardamom", "cardamoa", "cinnamon", "clove", "nutmeg", "fennel", "fenugreek", "bay leaf", "bay leaves", "coriander", "turmeric", "pepper", "chilli", "chillies"],
    "Dry Fruit": ["almond", "almonds", "cashew", "cashews", "raisin", "raisins", "pistachio", "pistachios", "walnut", "walnuts", "hazelnut", "hazelnuts", "dates", "date", "fig", "figs", "apricot", "apricots"],
    "Pulse": ["lentil", "lentils", "gram", "grams", "bean", "beans", "dhal", "dal", "pea", "peas"]
}

def classify_caption(caption):
    """Map a caption to the first category whose keyword it mentions.

    Keywords are matched as whole words, with an optional "s"/"es" plural
    suffix, instead of as raw substrings. Substring matching mislabels
    captions such as "a person speaking" ("pea") or "a figurine" ("fig").

    Args:
        caption: Caption text; matching is case-insensitive.

    Returns:
        A key of CATEGORY_KEYWORDS, or "Unknown" when nothing matches or the
        caption is empty/None.
    """
    if not caption:
        return "Unknown"
    caption_lower = caption.lower()
    for category, keywords in CATEGORY_KEYWORDS.items():
        for keyword in keywords:
            # The optional plural suffix keeps unlisted plurals such as
            # "watermelons" matching, as they did under substring search.
            if re.search(rf"\b{re.escape(keyword)}(?:e?s)?\b", caption_lower):
                return category
    return "Unknown"

# ---------- Setup Models ----------
# Choose the device once; both models below are moved onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# BLIP (first-choice captioner)
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = blip_model.to(device)

# GIT-VLM (fallback)
git_model_name = "microsoft/git-base"
git_processor = AutoProcessor.from_pretrained(git_model_name)
git_model = AutoModelForCausalLM.from_pretrained(git_model_name)
git_model = git_model.to(device)

# ---------- Caption Functions ----------
def blip_caption(image, prompt="A photo of"):
    """Caption `image` with BLIP, conditioning generation on `prompt`.

    Args:
        image: Image accepted by `blip_processor` (callers pass RGB PIL images).
        prompt: Text given to the processor alongside the image.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
        NOTE(review): the decoded text presumably still begins with the prompt - confirm.
    """
    encoded = blip_processor(images=image, text=prompt, return_tensors="pt")
    encoded = encoded.to(device)
    # Beam search with a repetition penalty, capped at 50 new tokens.
    generation_options = dict(
        max_new_tokens=50,
        repetition_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )
    token_ids = blip_model.generate(**encoded, **generation_options)
    first_sequence = token_ids[0]
    return blip_processor.decode(first_sequence, skip_special_tokens=True)

def git_caption(image):
    """Caption `image` with the GIT model, used as the fallback captioner.

    Args:
        image: Image accepted by `git_processor` (callers pass RGB PIL images).

    Returns:
        The first decoded caption string, special tokens removed.
    """
    batch = git_processor(images=image, return_tensors="pt")
    pixel_values = batch.pixel_values.to(device)
    # Only pixel values are supplied (no text prompt); total length capped at 50.
    generated_ids = git_model.generate(pixel_values=pixel_values, max_length=50)
    decoded = git_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Function words that legitimately recur in a normal caption
# ("a photo of a bowl of fruit on a table"); on their own they do not
# indicate that the captioner got stuck in a loop.
_CAPTION_STOP_WORDS = frozenset({
    "a", "an", "the", "of", "in", "on", "at", "to", "and", "or",
    "with", "for", "by", "is", "are",
})

def is_repetitive(caption, threshold=3):
    """Return True when a caption looks like a degenerate, looping generation.

    A caption is repetitive when either
      * any word occurs `threshold` times in a row, or
      * any word outside _CAPTION_STOP_WORDS occurs `threshold` times in total.

    Counting every word flags ordinary captions with three "a"s and sends them
    through needless retries. Surrounding punctuation is stripped so "rice,"
    and "rice" count as the same word.

    Args:
        caption: Caption text; comparison is case-insensitive.
        threshold: Number of occurrences at which a word counts as repeated.

    Returns:
        bool
    """
    from collections import Counter

    tokens = [word.strip(".,;:!?\"'()") for word in caption.lower().split()]
    tokens = [token for token in tokens if token]

    # Back-to-back repeats are a loop even for function words ("of of of").
    run_length = 1
    for previous, current in zip(tokens, tokens[1:]):
        run_length = run_length + 1 if current == previous else 1
        if run_length >= threshold:
            return True

    counts = Counter(token for token in tokens if token not in _CAPTION_STOP_WORDS)
    return any(count >= threshold for count in counts.values())

# ---------- Load Images ----------
# Sorted so the processing order, and therefore the log file, is the same on every run.
image_paths = sorted(glob.glob("test_images/Test images/**/*.*", recursive=True))
print(f"Found {len(image_paths)} images")

# ---------- Process & Log ----------
# One "path | caption | category" line per image; the file is recreated on each run.
with open("vlm_results_v2.txt", "w", encoding="utf-8") as log_file:
    for path in image_paths:
        # "*.*" also matches non-image files; Pillow reports those as OSError.
        # Skip them with a notice instead of losing the rest of the batch.
        try:
            with Image.open(path) as source_image:
                image = source_image.convert("RGB")
        except OSError as err:
            print(f"\nSkipping {path}: not a readable image ({err})")
            continue

        # Step 1: Try BLIP
        caption = blip_caption(image)

        # Step 2: Retry with stronger prompt if repetitive
        if is_repetitive(caption):
            caption = blip_caption(image, prompt="Describe the main object in this image")

        # Step 3: If still repetitive or too short, use GIT-VLM
        if is_repetitive(caption) or len(caption.split()) < 3:
            caption = git_caption(image)

        # Step 4: Classify
        category = classify_caption(caption)

        # Step 5: Show & Log
        print(f"\n📷 {path}")
        print(f"🧠 Caption: {caption}")
        print(f"🏷️ Category: {category}")
        plt.imshow(image)
        plt.axis('off')
        plt.title(f"{category}: {caption}")
        plt.show()

        log_file.write(f"{path} | {caption} | {category}\n")


In [None]:
# Gradio provides the web UI built in the following cells.
%pip install gradio


In [None]:
import gradio as gr
from PIL import Image

def process_image(image):
    """Caption one uploaded image, classify the caption, and log the result.

    Captioning cascade: BLIP with its default prompt, then BLIP with a more
    descriptive prompt if the first caption is repetitive, then GIT if the
    caption is still repetitive or shorter than three words.

    Args:
        image: PIL image supplied by the Gradio image input.

    Returns:
        (caption, category) as two strings.
    """
    text = blip_caption(image)
    if is_repetitive(text):
        text = blip_caption(image, prompt="Describe the main object in this image")

    too_short = len(text.split()) < 3
    if too_short or is_repetitive(text):
        text = git_caption(image)

    label = classify_caption(text)

    # Append-only log shared by every UI request.
    with open("vlm_results_ui.txt", "a") as log:
        print(f"Uploaded Image | {text} | {label}", file=log)

    return text, label

# Create the Gradio interface
# One image input and two text outputs, in the same order as the
# (caption, category) tuple returned by process_image.
image_input = gr.Image(type="pil")
text_outputs = [gr.Textbox(label=field) for field in ("Caption", "Category")]

iface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=text_outputs,
    title="VLM Playground",
    description="Upload an image to get a caption and category using BLIP + fallback GIT-VLM.",
)

iface.launch()


In [None]:
def process_image_with_agents(image):
    """Caption and classify one image, recording which step ran and why.

    Steps: BLIP with the default prompt; a BLIP retry with a more descriptive
    prompt when the caption is repetitive or under three words; GIT as a last
    resort under the same condition. The final caption is classified and one
    line is appended to vlm_results_ui.txt.

    Args:
        image: PIL image from the Gradio input, or None when the user
            triggered processing without uploading anything.

    Returns:
        (caption, category, agent_trace) as three strings. For a None image
        this is a placeholder result; no model is called and nothing is logged.
    """
    # Without this guard a None image reaches the BLIP processor and surfaces
    # in the UI as an opaque error.
    if image is None:
        return "No image provided", "Unknown", "InputAgent: no image provided"

    agent_steps = []

    # Agent 1: Caption Agent (BLIP)
    agent_steps.append("CaptionAgent: BLIP (initial)")
    caption = blip_caption(image)
    if is_repetitive(caption) or len(caption.split()) < 3:
        agent_steps.append("CaptionAgent: BLIP retry with stronger prompt")
        caption = blip_caption(image, prompt="Describe the main object in this image")

    # If still poor, fall back to GIT. A GIT failure is recorded in the trace
    # and replaced by a sentinel caption instead of failing the request.
    if is_repetitive(caption) or len(caption.split()) < 3:
        agent_steps.append("CaptionAgent: Fallback -> GIT")
        try:
            caption = git_caption(image)
        except Exception as e:
            agent_steps.append(f"CaptionAgent: GIT failed ({str(e)})")
            caption = "Caption generation failed"

    # Agent 2: Classifier Agent
    category = classify_caption(caption)
    agent_steps.append(f"ClassifierAgent: matched -> {category}")

    # Agent 3: Logger Agent. The logged trace is joined with " > " and is
    # written before the logger's own step is appended.
    log_line = f"Uploaded Image | {caption} | {category} | Trace: {' > '.join(agent_steps)}"
    with open("vlm_results_ui.txt", "a") as f:
        f.write(log_line + "\n")
    agent_steps.append("LoggerAgent: logged to vlm_results_ui.txt")

    # The trace returned to the UI is joined with " | " and includes the logger step.
    agent_trace = " | ".join(agent_steps)
    return caption, category, agent_trace


In [None]:
# Rebuild the interface around the tracing pipeline: one image input and three
# text outputs, in the order of the (caption, category, agent_trace) tuple.
image_input = gr.Image(type="pil")
text_outputs = [gr.Textbox(label=field) for field in ("Caption", "Category", "Agent Trace")]

iface = gr.Interface(
    fn=process_image_with_agents,
    inputs=image_input,
    outputs=text_outputs,
    title="VLM Playground (Agentic Trace)",
    description="Upload an image to get a caption, category, and agent trace (BLIP/GIT fallback).",
)
iface.launch()


In [None]:
# gradio is already installed by an earlier cell; this line additionally
# installs pandas, which the batch mode uses to write its results CSV.
%pip install gradio pandas


In [None]:
import pandas as pd
import tempfile, os, zipfile

# -------- Batch Processing Function --------
def process_zip(zip_file):
    """Caption and classify every image in an uploaded ZIP; return a CSV path.

    Args:
        zip_file: Path to the ZIP archive as a string (what
            gr.File(type="filepath") delivers), or a file-like object whose
            `.name` attribute holds that path.

    Returns:
        Path of a CSV file, written into a fresh temporary directory, with
        the columns "Image Path", "Caption", "Category" and "Agent Trace".
        The header row is written even when the archive has no readable image.

    Raises:
        ValueError: if no file was supplied.
        zipfile.BadZipFile: if the upload is not a valid ZIP archive.
    """
    if zip_file is None:
        raise ValueError("No ZIP file was uploaded.")

    # A str path has no `.name`; accept both a path and an object exposing one.
    zip_path = getattr(zip_file, "name", zip_file)

    # Create temp folder
    temp_dir = tempfile.mkdtemp()

    # Extract uploaded ZIP
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Find candidate files recursively; sorted so the CSV row order is stable.
    image_paths = sorted(glob.glob(os.path.join(temp_dir, "**", "*.*"), recursive=True))
    columns = ["Image Path", "Caption", "Category", "Agent Trace"]
    results = []

    for path in image_paths:
        # Best-effort: anything Pillow cannot open is skipped. `Exception`
        # (unlike a bare except) still lets KeyboardInterrupt stop the batch.
        try:
            with Image.open(path) as source_image:
                image = source_image.convert("RGB")
        except Exception:
            continue  # skip non-images

        # Step 1: Caption. The trace is recorded as each step runs, so it
        # reflects what was executed, not properties of the final caption.
        trace = ["BLIP"]
        caption = blip_caption(image)
        if is_repetitive(caption):
            trace.append("BLIP retry")
            caption = blip_caption(image, prompt="Describe the main object in this image")
        if is_repetitive(caption) or len(caption.split()) < 3:
            trace.append("GIT fallback")
            caption = git_caption(image)

        # Step 2: Classify
        category = classify_caption(caption)
        trace.append(f"Classified: {category}")

        results.append({
            "Image Path": os.path.basename(path),
            "Caption": caption,
            "Category": category,
            "Agent Trace": " > ".join(trace)
        })

    # Create DataFrame & save as CSV. Explicit columns keep the header present
    # when `results` is empty.
    df = pd.DataFrame(results, columns=columns)
    csv_path = os.path.join(temp_dir, "batch_results.csv")
    df.to_csv(csv_path, index=False)

    return csv_path

# -------- Gradio UI --------
# Two-tab app: one tab runs process_image_with_agents on a single image, the
# other runs process_zip on an uploaded archive. Components are created inside
# the context managers, so statement order here is the on-page order.
with gr.Blocks() as demo:
    gr.Markdown("# 📷 VLM Playground — Single & Batch Mode")
    gr.Markdown("Upload a single image **or** a ZIP of images to get captions, categories, and agent traces.")

    with gr.Tab("Single Image Mode"):
        single_img = gr.Image(type="pil", label="Upload an Image")
        caption_out = gr.Textbox(label="Caption")
        category_out = gr.Textbox(label="Category")
        trace_out = gr.Textbox(label="Agent Trace")
        btn_single = gr.Button("Process Image")
        # The three outputs are listed in the order of the handler's return tuple.
        btn_single.click(process_image_with_agents, inputs=single_img, outputs=[caption_out, category_out, trace_out])

    with gr.Tab("Batch Mode"):
        # NOTE(review): type="filepath" presumably hands process_zip a plain
        # str path rather than a file object - confirm against the installed Gradio.
        zip_in = gr.File(label="Upload ZIP of Images", type="filepath")
        csv_out = gr.File(label="Download Results CSV")
        btn_batch = gr.Button("Process Batch")
        # process_zip returns the path of the CSV it wrote, wired to the download component.
        btn_batch.click(process_zip, inputs=zip_in, outputs=csv_out)

demo.launch()
