<a href="https://colab.research.google.com/github/Bhuvana2488/airl-cv/blob/main/q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Required Packages
# This cell shells out to pip (Colab-style `!` commands) and must run before any
# of the imports in the following cells.
# NOTE(review): none of the installs below pin a version, so a fresh runtime may
# resolve different releases than the ones this notebook was last run with.
print("🚀 Installing SAM 2 and dependencies...")

# SAM 2 is installed directly from its GitHub repository (no tag or commit given).
!pip install -q git+https://github.com/facebookresearch/segment-anything-2.git
# Detection/annotation helpers: supervision and the GroundingDINO package.
!pip install -q supervision groundingdino-py
# General imaging / numeric stack used by the rest of the notebook.
!pip install -q opencv-python matplotlib torch torchvision pillow numpy

print("✅ Installation complete!")

🚀 Installing SAM 2 and dependencies...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
✅ Installation complete!


In [2]:
import torch
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image
import supervision as sv
from pathlib import Path
import os

# Prefer the GPU whenever PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🎮 Using device: {device}")
if device.type == "cuda":
    # Report which accelerator was picked up and its total memory (bytes / 1e9).
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

🎮 Using device: cuda
   GPU: Tesla T4
   Memory: 15.83 GB


In [3]:
import os
from pathlib import Path

checkpoint_dir = Path("checkpoints")
checkpoint_dir.mkdir(exist_ok=True)

model_cfg = "sam2_hiera_small.yaml"
checkpoint_path = checkpoint_dir / "sam2_hiera_small.pt"

if not checkpoint_path.exists():
    print("📥 Downloading SAM 2 model checkpoint...")
    !wget -q https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt -P checkpoints/
    print("✅ Download complete!")
else:
    print("✅ Checkpoint already exists!")

print(f"Model config: {model_cfg}")
print(f"Checkpoint: {checkpoint_path}")

✅ Checkpoint already exists!
Model config: sam2_hiera_small.yaml
Checkpoint: checkpoints/sam2_hiera_small.pt


In [4]:
# Cell 4: Initialize SAM 2 Model (CORRECTED)
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Build the SAM 2 network from its config + checkpoint and wrap it in the
# single-image predictor that the segmentation functions below call.
print("🔧 Initializing SAM 2...")

# Use the correct config path
# (this overrides the `model_cfg` value assigned in the checkpoint cell above)
model_cfg = "sam2_hiera_s.yaml"  # Changed from sam2_hiera_small.yaml

# `checkpoint_path` is a pathlib.Path created in the checkpoint cell; it is
# passed as a plain string here. `device` comes from the device-selection cell.
sam2_model = build_sam2(model_cfg, str(checkpoint_path), device=device)
sam2_predictor = SAM2ImagePredictor(sam2_model)

print("✅ SAM 2 model loaded successfully!")

🔧 Initializing SAM 2...
✅ SAM 2 model loaded successfully!


In [5]:
# Cell 5: Initialize GroundingDINO (CORRECTED)
from groundingdino.util.inference import load_model, predict
import os

# Fetch the GroundingDINO config + weights (once) and load the text-conditioned
# detector that turns prompts into bounding boxes.
print("🔧 Initializing GroundingDINO...")

# Download config and weights
os.makedirs("gdino_checkpoints", exist_ok=True)

# Download GroundingDINO config
# NOTE(review): `wget -q` is silent on failure and nothing below verifies that
# the files exist after the download; a failed fetch would only show up as an
# error from load_model.
if not os.path.exists("gdino_checkpoints/GroundingDINO_SwinT_OGC.py"):
    !wget -q https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py -P gdino_checkpoints/

# Download GroundingDINO weights
if not os.path.exists("gdino_checkpoints/groundingdino_swint_ogc.pth"):
    !wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth -P gdino_checkpoints/

# Load model with correct paths
# NOTE(review): no device is passed here, so load_model's own default applies —
# confirm that matches `device` when running without a GPU.
grounding_model = load_model(
    model_config_path="gdino_checkpoints/GroundingDINO_SwinT_OGC.py",
    model_checkpoint_path="gdino_checkpoints/groundingdino_swint_ogc.pth"
)

print("✅ GroundingDINO loaded successfully!")

# Detection thresholds, read as module-level globals by text_to_boxes and
# recorded in the metadata written by save_segmentation_results.
# test_thresholds temporarily overrides BOX_THRESHOLD.
BOX_THRESHOLD = 0.25
TEXT_THRESHOLD = 0.25

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


🔧 Initializing GroundingDINO...
final text_encoder_type: bert-base-uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ GroundingDINO loaded successfully!


In [6]:
def load_image(image_path):
    """Load an image from a local path or an HTTP(S) URL as an RGB array.

    Args:
        image_path: Filesystem path (str or path-like) or a URL starting with
            'http'. URLs are first downloaded to 'temp_image.jpg' in the
            current working directory.

    Returns:
        The decoded image as a numpy array, converted from OpenCV's BGR
        channel order to RGB.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    # Accept pathlib.Path as well as str; `.startswith` below needs a string.
    image_path = str(image_path)
    if image_path.startswith('http'):
        import urllib.request
        urllib.request.urlretrieve(image_path, 'temp_image.jpg')
        image_path = 'temp_image.jpg'

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise OSError(f"Could not read image (missing or not decodable): {image_path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def text_to_boxes(image, text_prompt, model):
    """Detect objects matching a text prompt and return pixel-space boxes.

    The RGB array is converted to the normalized tensor that GroundingDINO's
    `predict` consumes (same resize/normalize pipeline as the later "FIXED"
    definition of this helper), and inference runs on the notebook's `device`.

    Args:
        image: RGB image as an (H, W, 3) uint8 numpy array, as returned by
            load_image.
        text_prompt: Caption describing the object(s) to find.
        model: Loaded GroundingDINO model.

    Returns:
        Tuple (boxes_xyxy, confidences, phrases): boxes as an (N, 4) numpy
        array of (x1, y1, x2, y2) in pixels of `image`, the matching logits as
        a numpy array, and the list of matched phrases.
    """
    # Imported locally because this cell does not import the transforms module.
    import groundingdino.datasets.transforms as T

    preprocess = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    image_tensor, _ = preprocess(Image.fromarray(image).convert("RGB"), None)

    boxes, logits, phrases = predict(
        model=model,
        image=image_tensor,
        caption=text_prompt,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        device=device
    )

    # Boxes come back normalized as (cx, cy, w, h); scale to pixels of the
    # original image (not the resized tensor).
    h, w = image.shape[:2]
    boxes = boxes * torch.tensor([w, h, w, h], dtype=boxes.dtype, device=boxes.device)

    # Convert from centre/size to corner format.
    centers, sizes = boxes[:, :2], boxes[:, 2:]
    boxes_xyxy = torch.cat([centers - sizes / 2, centers + sizes / 2], dim=1)

    return boxes_xyxy.cpu().numpy(), logits.cpu().numpy(), phrases

def visualize_results(image, masks, boxes, labels, title="Segmentation Results"):
    """Plot the original image, the detected boxes and the mask overlay in one row.

    Args:
        image: RGB image array. It is copied before any drawing, so the
            caller's array is left untouched.
        masks: Sequence of per-object masks. Each mask is used as a boolean
            index into the image, so it must match the image height and width.
        boxes: Sequence of numpy boxes as (x1, y1, x2, y2) in pixels.
        labels: One text label per box, drawn just above its rectangle.
        title: Figure-level title.

    Returns:
        The mask-overlay image, matching what the later redefinition of this
        helper returns (this version previously returned None).
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Panel 1: the untouched input.
    axes[0].imshow(image)
    axes[0].set_title("Original Image", fontsize=14, fontweight='bold')
    axes[0].axis('off')

    # Panel 2: bounding boxes with their labels.
    img_boxes = image.copy()
    for box, label in zip(boxes, labels):
        x1, y1, x2, y2 = box.astype(int)
        cv2.rectangle(img_boxes, (x1, y1), (x2, y2), (0, 255, 0), 3)
        cv2.putText(img_boxes, label, (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
    axes[1].imshow(img_boxes)
    axes[1].set_title("Detected Objects", fontsize=14, fontweight='bold')
    axes[1].axis('off')

    # Panel 3: each mask blended 50/50 with its own colour plus an outline.
    img_masked = image.copy()
    if len(masks) > 0:
        colors = plt.cm.tab10(np.linspace(0, 1, len(masks)))[:, :3] * 255
        for i, mask in enumerate(masks):
            color = colors[i % len(colors)]
            mask_bool = mask.astype(bool)
            img_masked[mask_bool] = img_masked[mask_bool] * 0.5 + color * 0.5

            contours, _ = cv2.findContours(mask.astype(np.uint8),
                                           cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            cv2.drawContours(img_masked, contours, -1, color.tolist(), 3)

    axes[2].imshow(img_masked.astype(np.uint8))
    axes[2].set_title("Final Segmentation", fontsize=14, fontweight='bold')
    axes[2].axis('off')

    plt.suptitle(title, fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

    return img_masked

print("✅ Helper functions defined!")

✅ Helper functions defined!


In [7]:
#Main Text-Driven Segmentation Pipeline
def text_driven_segmentation(image_path, text_prompt):
    """Run the full text → boxes → masks pipeline on one image and plot it.

    Args:
        image_path: Local path or URL accepted by load_image.
        text_prompt: Description of the object(s) to segment.

    Returns:
        (masks_array, boxes, labels) on success, where masks_array stacks one
        mask per detected box; (None, None, None) when nothing is detected.
    """
    divider = "-" * 60
    print(f"🎯 Processing: '{text_prompt}'")
    print(divider)

    # Stage 1: read the image.
    print("📸 Step 1/4: Loading image...")
    image = load_image(image_path)
    print(f"   Image shape: {image.shape}")

    # Stage 2: GroundingDINO turns the prompt into candidate boxes.
    print(f"🔍 Step 2/4: Detecting '{text_prompt}' in image...")
    boxes, confidences, labels = text_to_boxes(image, text_prompt, grounding_model)
    detection_count = len(boxes)
    print(f"   Found {detection_count} object(s)")

    if detection_count == 0:
        print("   ⚠️  No objects detected! Try a different prompt or lower threshold.")
        return None, None, None

    for number, (label, conf) in enumerate(zip(labels, confidences), start=1):
        print(f"   - Object {number}: {label} (confidence: {conf:.2f})")

    # Stage 3: hand the image to the SAM 2 predictor once, before prompting it.
    print("🎨 Step 3/4: Preparing SAM 2...")
    sam2_predictor.set_image(image)

    # Stage 4: one box-prompted prediction per detection; keep its first mask.
    print("✨ Step 4/4: Generating segmentation masks...")
    masks_array = np.array([
        sam2_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=box,
            multimask_output=False
        )[0][0]
        for box in boxes
    ])
    print(f"   Generated {len(masks_array)} mask(s)")
    print(divider)
    print("✅ Segmentation complete!")

    # Show original / boxes / overlay side by side.
    visualize_results(image, masks_array, boxes, labels,
                         title=f"Text-Driven Segmentation: '{text_prompt}'")

    return masks_array, boxes, labels

print("✅ Pipeline function ready!")

✅ Pipeline function ready!


In [8]:
# Helper Functions (FIXED)
import torch
from PIL import Image as PILImage
from groundingdino.util.inference import Model, predict # Import predict here
import groundingdino.datasets.transforms as T
import cv2
import numpy as np
import matplotlib.pyplot as plt

def load_image(image_path):
    """Load an image from a local file or an HTTP(S) URL as an RGB array.

    Args:
        image_path: Filesystem path (str or path-like) or a URL starting with
            'http'. URLs are first downloaded to 'temp_image.jpg' in the
            current working directory.

    Returns:
        The decoded image as a numpy array, converted from OpenCV's BGR
        channel order to RGB.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    # Accept pathlib.Path as well as str; `.startswith` below needs a string.
    image_path = str(image_path)
    if image_path.startswith('http'):
        import urllib.request
        urllib.request.urlretrieve(image_path, 'temp_image.jpg')
        image_path = 'temp_image.jpg'

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise OSError(f"Could not read image (missing or not decodable): {image_path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def text_to_boxes(image, text_prompt, model):
    """Convert a text prompt to pixel-space bounding boxes using GroundingDINO.

    Args:
        image: RGB image as an (H, W, 3) uint8 numpy array, as returned by
            load_image.
        text_prompt: Caption describing the object(s) to find.
        model: Loaded GroundingDINO model.

    Returns:
        Tuple (boxes_xyxy, confidences, phrases): boxes as an (N, 4) numpy
        array of (x1, y1, x2, y2) in pixels of `image`, the matching logits as
        a numpy array, and the list of matched phrases.
    """
    # Build the PIL image in memory. Writing a temporary JPEG and reading it
    # back would add compression artefacts and leave a stray file on disk.
    image_pil = PILImage.fromarray(image).convert("RGB")

    # Resize / normalize into the tensor layout that `predict` takes.
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    image_transformed, _ = transform(image_pil, None)

    # Thresholds are read from the module-level globals at call time, which is
    # what lets test_thresholds vary BOX_THRESHOLD between calls.
    boxes, logits, phrases = predict(
        model=model,
        image=image_transformed,
        caption=text_prompt,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        device=device
    )

    # Boxes come back normalized as (cx, cy, w, h); scale to pixels of the
    # original image (not the resized tensor).
    h, w = image.shape[:2]
    boxes = boxes * torch.tensor([w, h, w, h], dtype=boxes.dtype, device=boxes.device)

    # Convert from cxcywh to xyxy format.
    centers, sizes = boxes[:, :2], boxes[:, 2:]
    boxes_xyxy = torch.cat([centers - sizes / 2, centers + sizes / 2], dim=1)

    return boxes_xyxy.cpu().numpy(), logits.cpu().numpy(), phrases

def visualize_results(image, masks, boxes, labels, title="Segmentation Results"):
    """Show the input, the detections and the mask overlay side by side.

    Args:
        image: RGB image array; copies are drawn on, the input is not modified.
        masks: Sequence of per-object masks, each usable as a boolean index
            into the image.
        boxes: Sequence of numpy boxes as (x1, y1, x2, y2) in pixels.
        labels: One text label per box.
        title: Figure-level title.

    Returns:
        The image with all masks blended in and outlined.
    """
    fig, (ax_input, ax_boxes, ax_overlay) = plt.subplots(1, 3, figsize=(18, 6))
    panel_title = dict(fontsize=14, fontweight='bold')

    # Left panel: the untouched input.
    ax_input.imshow(image)
    ax_input.set_title("Original Image", **panel_title)
    ax_input.axis('off')

    # Middle panel: green rectangles with the label just above each one.
    green = (0, 255, 0)
    img_boxes = image.copy()
    for box, label in zip(boxes, labels):
        left, top, right, bottom = box.astype(int)
        cv2.rectangle(img_boxes, (left, top), (right, bottom), green, 3)
        cv2.putText(img_boxes, label, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, green, 2)
    ax_boxes.imshow(img_boxes)
    ax_boxes.set_title("Detected Objects", **panel_title)
    ax_boxes.axis('off')

    # Right panel: every mask blended 50/50 with its own tab10 colour and
    # outlined with the same colour.
    img_masked = image.copy()
    mask_count = len(masks)
    if mask_count > 0:
        palette = plt.cm.tab10(np.linspace(0, 1, mask_count))[:, :3] * 255
        for color, mask in zip(palette, masks):
            inside = mask.astype(bool)
            img_masked[inside] = img_masked[inside] * 0.5 + color * 0.5

            outlines, _ = cv2.findContours(mask.astype(np.uint8),
                                           cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            cv2.drawContours(img_masked, outlines, -1, color.tolist(), 3)

    ax_overlay.imshow(img_masked.astype(np.uint8))
    ax_overlay.set_title("Final Segmentation", **panel_title)
    ax_overlay.axis('off')

    plt.suptitle(title, fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

    return img_masked

print("✅ Helper functions defined!")

✅ Helper functions defined!


In [9]:
# Advanced - Multiple Object Segmentation
def multi_object_segmentation(image_path, text_prompts):
    """Segment several kinds of object in one image, one text prompt at a time.

    Args:
        image_path: Local path or URL accepted by load_image.
        text_prompts: Iterable of prompt strings; each is detected separately
            and every resulting box is segmented with SAM 2.

    Returns:
        (masks_array, boxes_array, all_labels) with exactly one label per
        box/mask, or (None, None, None) when no prompt produced a detection.
    """
    print(f"🎯 Multi-Object Segmentation")
    print(f"   Prompts: {', '.join(text_prompts)}")
    print("="*60)

    image = load_image(image_path)

    all_masks = []
    all_boxes = []
    all_labels = []

    # Set image for SAM once; every prompt below reuses it.
    sam2_predictor.set_image(image)

    for prompt in text_prompts:
        print(f"\n🔍 Processing '{prompt}'...")

        boxes, confidences, labels = text_to_boxes(image, prompt, grounding_model)

        if len(boxes) == 0:
            print(f"   ⚠️  No '{prompt}' detected")
            continue

        print(f"   Found {len(boxes)} instance(s)")

        # Pair each box with its own label. Extending with the whole `labels`
        # list once per box would duplicate labels and misalign them with the
        # boxes and masks.
        for box, label in zip(boxes, labels):
            masks, scores, _ = sam2_predictor.predict(
                point_coords=None,
                point_labels=None,
                box=box,
                multimask_output=False
            )
            all_masks.append(masks[0])
            all_boxes.append(box)
            all_labels.append(label)

    if len(all_masks) == 0:
        print("\n❌ No objects detected with any prompt!")
        return None, None, None

    print(f"\n✅ Total objects segmented: {len(all_masks)}")
    print("="*60)

    masks_array = np.array(all_masks)
    boxes_array = np.array(all_boxes)
    visualize_results(image, masks_array, boxes_array, all_labels,
                         title=f"Multi-Object: {', '.join(text_prompts)}")

    return masks_array, boxes_array, all_labels

print("✅ Multi-object function ready!")

# Example usage
print("\n💡 Try it:")
print("   image_url = 'https://images.unsplash.com/photo-1449965408869-eaa3f722e40d?w=800'")
print("   prompts = ['car', 'person']")
print("   multi_object_segmentation(image_url, prompts)")

✅ Multi-object function ready!

💡 Try it:
   image_url = 'https://images.unsplash.com/photo-1449965408869-eaa3f722e40d?w=800'
   prompts = ['car', 'person']
   multi_object_segmentation(image_url, prompts)


In [10]:
# Export & Save Results
import json
from datetime import datetime
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import shutil

def save_segmentation_results(image, masks, boxes, labels, output_name="result"):
    """Write the overlay, per-object masks and metadata to disk, then zip them.

    Files go to the directory 'outputs_<output_name>' and the archive is
    written next to it as '<output_name>_results.zip'.

    Args:
        image: RGB image array the masks belong to.
        masks: Sequence of per-object masks matching the image height/width.
        boxes: Boxes as a numpy array or as a list (including a list of numpy
            arrays); stored in the metadata as nested lists.
        labels: JSON-serialisable labels, stored in the metadata unchanged.
        output_name: Base name used for the directory, files and archive.

    Returns:
        The output directory path.
    """
    print(f"💾 Saving results as '{output_name}'...")

    # Create output directory
    output_dir = f"outputs_{output_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Overlay image: each mask blended 50/50 with its own colour plus outline.
    img_overlay = image.copy()
    colors = plt.cm.tab10(np.linspace(0, 1, len(masks)))[:, :3] * 255

    for color, mask in zip(colors, masks):
        mask_bool = mask.astype(bool)
        img_overlay[mask_bool] = img_overlay[mask_bool] * 0.5 + color * 0.5

        contours, _ = cv2.findContours(mask.astype(np.uint8),
                                       cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(img_overlay, contours, -1, color.tolist(), 3)

    overlay_path = os.path.join(output_dir, f"{output_name}_overlay.png")
    # OpenCV writes BGR, so convert back from the notebook's RGB convention.
    cv2.imwrite(overlay_path, cv2.cvtColor(img_overlay.astype(np.uint8), cv2.COLOR_RGB2BGR))
    print(f"   ✅ Saved overlay: {overlay_path}")

    # Individual masks as 0/255 single-channel images, numbered from 1.
    for i, mask in enumerate(masks):
        mask_path = os.path.join(output_dir, f"{output_name}_mask_{i+1}.png")
        cv2.imwrite(mask_path, (mask * 255).astype(np.uint8))
    print(f"   ✅ Saved {len(masks)} individual mask(s)")

    # Metadata. np.asarray(...).tolist() yields plain nested lists whether the
    # boxes arrive as one array or as a list of arrays, which json cannot
    # serialise directly.
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "image_shape": list(image.shape),
        "num_objects": len(masks),
        "labels": labels,
        "boxes": np.asarray(boxes).tolist(),
        "parameters": {
            "box_threshold": BOX_THRESHOLD,
            "text_threshold": TEXT_THRESHOLD,
            "model": "SAM2-Hiera-Small",
            "detector": "GroundingDINO"
        }
    }

    metadata_path = os.path.join(output_dir, f"{output_name}_metadata.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"   ✅ Saved metadata: {metadata_path}")

    # Create zip file (make_archive appends the '.zip' suffix itself).
    zip_path = f"{output_name}_results"
    shutil.make_archive(zip_path, 'zip', output_dir)
    print(f"\n📦 Created zip file: {zip_path}.zip")

    return output_dir

print("✅ Save function ready!")

# Example usage
print("\n💡 After running segmentation, save with:")
print("   save_segmentation_results(image, masks, boxes, labels, 'my_segmentation')")

✅ Save function ready!

💡 After running segmentation, save with:
   save_segmentation_results(image, masks, boxes, labels, 'my_segmentation')


In [11]:
# Segmentation Quality Analysis
def analyze_segmentation_quality(masks, boxes, image_shape):
    """Print simple size and shape statistics for each mask/box pair.

    For every object this reports the mask area, the box area, the mask/box
    fill ratio and a compactness score 4*pi*A / P**2, where A is the mask
    area and P is the summed perimeter of all external contours of the mask.

    Args:
        masks: Sequence of per-object masks (non-zero inside the object).
        boxes: Sequence of (x1, y1, x2, y2) boxes, paired with `masks` by
            position.
        image_shape: Shape of the source image; only the first two entries
            (height, width) are used.

    Returns:
        None. Results are printed.
    """
    print("\n📊 Segmentation Quality Analysis")
    print("="*60)

    h, w = image_shape[:2]
    total_pixels = h * w

    for i, (mask, box) in enumerate(zip(masks, boxes)):
        print(f"\n🔍 Object {i+1}:")

        # Mask statistics
        mask_area = np.sum(mask)
        mask_percentage = (mask_area / total_pixels) * 100

        # Box statistics
        x1, y1, x2, y2 = box
        box_area = (x2 - x1) * (y2 - y1)
        box_percentage = (box_area / total_pixels) * 100

        # Mask-to-box ratio (guarded against degenerate zero-area boxes)
        fill_ratio = (mask_area / box_area) * 100 if box_area > 0 else 0

        print(f"   Mask Area: {mask_area:,} pixels ({mask_percentage:.2f}% of image)")
        print(f"   Box Area: {box_area:,.0f} pixels ({box_percentage:.2f}% of image)")
        print(f"   Fill Ratio: {fill_ratio:.1f}% (mask/box)")

        # Shape compactness
        contours, _ = cv2.findContours(mask.astype(np.uint8),
                                       cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        if contours:
            # The area above covers the whole mask, so the perimeter must cover
            # every external contour too, not just whichever one is listed first.
            perimeter = sum(cv2.arcLength(contour, True) for contour in contours)
            compactness = (4 * np.pi * mask_area) / (perimeter ** 2) if perimeter > 0 else 0
            print(f"   Compactness: {compactness:.3f} (1.0 = perfect circle)")

            # Quality indicators
            print(f"   Quality Indicators:")
            if fill_ratio > 80:
                print(f"      ✅ High fill ratio - tight segmentation")
            elif fill_ratio > 50:
                print(f"      ⚠️  Moderate fill ratio")
            else:
                print(f"      ❌ Low fill ratio - may need refinement")

    print("\n" + "="*60)

# Cell-level banner: belongs at top level (as in the other cells) so it prints
# once when the cell runs instead of at the end of every analysis call.
print("✅ Analysis function ready!")

# Example usage
print("\n💡 After segmentation, analyze with:")
print("   analyze_segmentation_quality(masks, boxes, image.shape)")

In [12]:
#  Create Comparison Grid
def create_comparison_grid(image_path, text_prompts, save=True):
    """Plot the original image next to one segmentation panel per text prompt.

    Args:
        image_path: Local path or URL accepted by load_image.
        text_prompts: Sequence of prompt strings; a single string is treated
            as one prompt. An empty sequence shows only the original image.
        save: When True, also write the figure to 'comparison_grid.png'.

    Returns:
        None. The figure is shown (and optionally saved).
    """
    print(f"🎨 Creating comparison grid...")

    # A bare string would otherwise be iterated character by character.
    if isinstance(text_prompts, str):
        text_prompts = [text_prompts]

    image = load_image(image_path)
    n_prompts = len(text_prompts)

    # squeeze=False keeps `axes` indexable even when there is a single panel
    # (no prompts); by default a 1x1 grid returns a bare Axes object.
    fig, axes = plt.subplots(1, n_prompts + 1, figsize=(6 * (n_prompts + 1), 6),
                             squeeze=False)
    axes = axes[0]

    # Original image
    axes[0].imshow(image)
    axes[0].set_title("Original Image", fontsize=14, fontweight='bold')
    axes[0].axis('off')

    # Set image for SAM once; every prompt below reuses it.
    sam2_predictor.set_image(image)

    # All masks in this grid share one overlay colour (red).
    color = np.array([255, 0, 0])

    # Process each prompt
    for idx, prompt in enumerate(text_prompts):
        print(f"   Processing '{prompt}'...")

        boxes, confidences, labels = text_to_boxes(image, prompt, grounding_model)

        img_result = image.copy()
        if len(boxes) > 0:
            for box in boxes:
                masks, scores, _ = sam2_predictor.predict(
                    point_coords=None,
                    point_labels=None,
                    box=box,
                    multimask_output=False
                )

                mask = masks[0]
                mask_bool = mask.astype(bool)
                img_result[mask_bool] = img_result[mask_bool] * 0.5 + color * 0.5

                contours, _ = cv2.findContours(mask.astype(np.uint8),
                                               cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)
                cv2.drawContours(img_result, contours, -1, (255, 0, 0), 2)

        axes[idx + 1].imshow(img_result.astype(np.uint8))
        axes[idx + 1].set_title(f"'{prompt}'\n{len(boxes)} detected",
                               fontsize=12, fontweight='bold')
        axes[idx + 1].axis('off')

    plt.suptitle("Text Prompt Comparison", fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()

    if save:
        plt.savefig("comparison_grid.png", dpi=150, bbox_inches='tight')
        print(f"   ✅ Saved: comparison_grid.png")

    plt.show()

print("✅ Comparison function ready!")

# Example usage
print("\n💡 Try it:")
print("   image_url = 'https://images.unsplash.com/photo-1583511655857-d19b40a7a54e?w=800'")
print("   prompts = ['dog', 'animal', 'pet']")
print("   create_comparison_grid(image_url, prompts)")

✅ Comparison function ready!

💡 Try it:
   image_url = 'https://images.unsplash.com/photo-1583511655857-d19b40a7a54e?w=800'
   prompts = ['dog', 'animal', 'pet']
   create_comparison_grid(image_url, prompts)


In [13]:
# Interactive Segmentation Demo
def run_interactive_demo():
    """Run three canned examples: single object, multiple prompts, prompt comparison.

    Downloads the sample images from the hard-coded URLs below, so it needs
    network access. Returns None; all results are printed or plotted.
    """
    print("="*70)
    print("  🎯 INTERACTIVE SEGMENTATION DEMO")
    print("="*70)

    # Example 1: Single object
    print("\n📌 EXAMPLE 1: Single Object Segmentation")
    print("-"*70)
    image_url = "https://images.unsplash.com/photo-1583511655857-d19b40a7a54e?w=800"
    masks, boxes, labels = text_driven_segmentation(image_url, "dog")

    if masks is not None:
        # `masks` is stacked as (N, H, W) and each mask is used elsewhere as a
        # boolean index into the image, so its trailing dimensions are the
        # image height/width. Reading them here avoids downloading and
        # decoding the image a second time just to get its shape.
        analyze_segmentation_quality(masks, boxes, masks.shape[1:])

    # Example 2: Multiple prompts
    print("\n📌 EXAMPLE 2: Multiple Objects")
    print("-"*70)
    street_url = "https://images.unsplash.com/photo-1449965408869-eaa3f722e40d?w=800"
    multi_object_segmentation(street_url, ['car', 'person'])

    # Example 3: Comparison
    print("\n📌 EXAMPLE 3: Prompt Comparison")
    print("-"*70)
    create_comparison_grid(image_url, ['dog', 'animal'], save=False)

    print("\n" + "="*70)
    print("  ✅ DEMO COMPLETE!")
    print("="*70)
    print("\n💡 Now try with your own images and prompts!")

print("✅ Interactive demo ready!")
print("\n▶️  Run: run_interactive_demo()")

✅ Interactive demo ready!

▶️  Run: run_interactive_demo()


In [14]:
#  Threshold Testing
def test_thresholds(image_path, text_prompt, thresholds=[0.20, 0.25, 0.30]):
    """Report how many detections each box threshold yields for one prompt.

    text_to_boxes reads the module-level BOX_THRESHOLD, so this function
    overrides that global for each trial and always restores the original
    value afterwards, even if detection raises part-way through.

    Args:
        image_path: Local path or URL accepted by load_image.
        text_prompt: Prompt to detect.
        thresholds: Box-threshold values to try, in order. The default list is
            only read, never modified.

    Returns:
        None. Results are printed.
    """
    global BOX_THRESHOLD

    print(f"🔬 Threshold Sensitivity Analysis: '{text_prompt}'")
    print("="*60)

    image = load_image(image_path)

    original_box_threshold = BOX_THRESHOLD # Store original threshold

    try:
        for thresh in thresholds:
            BOX_THRESHOLD = thresh

            boxes, confidences, labels = text_to_boxes(image, text_prompt, grounding_model)
            print(f"   Threshold {thresh:.2f}: {len(boxes)} detections")

            if len(boxes) > 0:
                avg_conf = np.mean(confidences)
                print(f"      Average confidence: {avg_conf:.3f}")
    finally:
        # Reset to original threshold so a failure here cannot leave the rest
        # of the notebook running with a trial value.
        BOX_THRESHOLD = original_box_threshold

    print("\n💡 Recommendation:")
    print("   - Lower threshold (0.20): More detections")
    print("   - Default (0.25): Balanced")
    print("   - Higher threshold (0.30): Fewer but more confident")
    print("="*60)

print("✅ Threshold testing ready!")

# Example usage
print("\n💡 Test with:")
print("   test_thresholds('image_url', 'dog', [0.20, 0.25, 0.30])")

✅ Threshold testing ready!

💡 Test with:
   test_thresholds('image_url', 'dog', [0.20, 0.25, 0.30])


In [None]:
# BONUS - Upload and Process Video
from google.colab import files
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import time # Import time for potential delay

# Interactive Colab cell: upload a video, inspect it, ask for a prompt.
# The segmentation step itself is still a placeholder (see below).
print("="*70)
print("  🎬 BONUS: TEXT-DRIVEN VIDEO SEGMENTATION")
print("="*70)
print("\n📋 Instructions:")
print("   1. Upload a video file (10-30 seconds recommended)")
print("   2. Enter the object you want to track")
print("   3. Wait for processing (may take 30-60 seconds)")
print("   4. Watch the segmented video!\n")
print("💡 Tips:")
print("   - Use clear, stable videos for best results")
print("   - Object should be visible in the first frame")
print("   - Shorter videos = faster processing")
print("   - Try: 'person', 'car', 'dog', 'ball', etc.\n")
print("="*70 + "\n")

# Upload video
print("📤 Upload your video file:")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is used.
    video_path = list(uploaded.keys())[0]
    print(f"\n✅ Video uploaded: {video_path}")

    # Get video info
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Error: Could not open video file {video_path}")
    else:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report an FPS of 0; never divide by it.
        duration = total_frames / fps if fps > 0 else 0
        cap.release()

        print(f"📊 Video Info:")
        print(f"   - Total frames: {total_frames}")
        print(f"   - FPS: {fps:.1f}")
        print(f"   - Duration: {duration:.1f} seconds")

        # Get text prompt
        text_prompt = input("\n🎯 Enter object to track (e.g., 'person', 'car', 'dog'), then PRESS ENTER: ").strip()

        if not text_prompt:
            print("❌ No prompt entered!")
        else:
            # Process video (limit to 90 frames for Colab T4)
            max_frames = min(90, total_frames)

            if total_frames > 90:
                print(f"\n⚠️  Video has {total_frames} frames. Processing first {max_frames} frames.")
                # Same FPS guard as for `duration` above.
                if fps > 0:
                    print(f"   This is ~{max_frames/fps:.1f} seconds of video.")

            print(f"\n⏳ Processing '{text_prompt}'... This may take 30-60 seconds...\n")

            # TODO: text_driven_video_segmentation is not defined anywhere in
            # this notebook. Once it exists, the intended call is:
            # output = text_driven_video_segmentation(video_path, text_prompt, max_frames=max_frames)
            print("🚧 Video processing function (text_driven_video_segmentation) is not defined in the current notebook state.")
            print("   Please ensure this function is defined and runnable.")
            output = None # Set output to None as processing didn't happen

            if output:
                print(f"\n🎉 SUCCESS! Video segmentation complete!")
                print(f"📁 Output saved as: {output}")
                print(f"\n💾 Download the video from the file browser (left sidebar)")
            else:
                print(f"\n❌ Video segmentation failed or function not found. Try a different prompt or video.")
else:
    print("❌ No video uploaded!")
    print("\n💡 Alternative: Use sample video URL")
    print("   You can also download a video first:")
    print("   !wget https://example.com/sample_video.mp4 -O sample.mp4")
    print("   Then run: text_driven_video_segmentation('sample.mp4', 'person', 60)")

  🎬 BONUS: TEXT-DRIVEN VIDEO SEGMENTATION

📋 Instructions:
   1. Upload a video file (10-30 seconds recommended)
   2. Enter the object you want to track
   3. Wait for processing (may take 30-60 seconds)
   4. Watch the segmented video!

💡 Tips:
   - Use clear, stable videos for best results
   - Object should be visible in the first frame
   - Shorter videos = faster processing
   - Try: 'person', 'car', 'dog', 'ball', etc.


📤 Upload your video file:


Saving 8833602-uhd_4096_2160_25fps.mp4 to 8833602-uhd_4096_2160_25fps.mp4

✅ Video uploaded: 8833602-uhd_4096_2160_25fps.mp4
📊 Video Info:
   - Total frames: 398
   - FPS: 25.0
   - Duration: 15.9 seconds


In [None]:
# Cell 16: BONUS - Fast Video Processing (OPTIMIZED)
from google.colab import files
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import time # Import time for potential delay

# Interactive Colab cell ("fast mode"): upload a video, pick a frame budget
# from its duration, ask for a prompt. The segmentation step itself is still a
# placeholder (see below).
print("="*70)
print("  🎬 BONUS: TEXT-DRIVEN VIDEO SEGMENTATION (FAST MODE)")
print("="*70)
print("\n📋 Instructions:")
print("   1. Upload a SHORT video (5-10 seconds recommended)")
print("   2. Enter the object you want to track")
print("   3. Processing time: ~15-30 seconds")
print("\n💡 Tips for FAST processing:")
print("   - Use videos under 10 seconds")
print("   - Lower resolution = faster processing")
print("   - We'll process every 2nd frame for speed\n")
print("="*70 + "\n")

# Upload video
print("📤 Upload your video file:")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is used.
    video_path = list(uploaded.keys())[0]
    print(f"\n✅ Video uploaded: {video_path}")

    # Get video info
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Error: Could not open video file {video_path}")
    else:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report an FPS of 0; never divide by it.
        duration = total_frames / fps if fps > 0 else 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()

        print(f"📊 Video Info:")
        print(f"   - Duration: {duration:.1f}s")
        print(f"   - Total frames: {total_frames}")
        print(f"   - Resolution: {width}x{height}")
        print(f"   - FPS: {fps:.1f}")

        # Auto-calculate the frame limit for fast processing. Every budget is
        # clamped to total_frames: a long but low-FPS clip can contain fewer
        # frames than the fixed limit.
        if duration > 10:
            max_frames = min(total_frames, 60)
            print(f"\n⚠️  Long video detected! Processing first 2 seconds only.")
        elif duration > 5:
            max_frames = min(total_frames, 90)
            print(f"\n✅ Processing first 3 seconds")
        else:
            max_frames = min(total_frames, 150)  # All frames if short
            print(f"\n✅ Processing entire video")

        # Same FPS guard as for `duration` above.
        if fps > 0:
            print(f"   Will process: {max_frames} frames (~{max_frames/fps:.1f}s)")
        else:
            print(f"   Will process: {max_frames} frames")

        # Get text prompt
        text_prompt = input("\n🎯 Enter object to track, then PRESS ENTER: ").strip()

        if not text_prompt:
            print("❌ No prompt entered!")
        else:
            print(f"\n⏳ Processing '{text_prompt}'... Please wait 15-30 seconds...\n")

            # TODO: text_driven_video_segmentation is not defined anywhere in
            # this notebook. Once it exists, the intended calls are:
            # output = text_driven_video_segmentation(video_path, text_prompt, max_frames=max_frames)
            # if output:
            #     create_video_summary(video_path, output)
            print("🚧 Video processing function (text_driven_video_segmentation) is not defined in the current notebook state.")
            print("   Please ensure this function is defined and runnable.")
            output = None # Set output to None as processing didn't happen
            create_video_summary_placeholder = True # Flag to indicate that summary would be called

            if output:
                print(f"\n🎉 SUCCESS! Video segmentation complete!")
                print(f"📁 Output: {output}")

                # Create comparison
                if create_video_summary_placeholder:
                     print(f"\n📊 Creating before/after comparison...")
                     print("🚧 Video summary function (create_video_summary) is not defined in the current notebook state.")
                     print("   Please ensure this function is defined and runnable.")

                print(f"\n💾 To download:")
                print(f"   - Click folder icon (left sidebar)")
                print(f"   - Right-click '{output}' → Download")
            else:
                print(f"\n❌ Failed. Try:")
                print(f"   - Different prompt (e.g., 'person', 'car', 'dog')")
                print(f"   - Shorter video")
                print(f"   - Object must be visible in first frame")
else:
    print("❌ No video uploaded!")

In [None]:
# Cell 18: BONUS - Video Segmentation Summary

def create_video_summary(video_path, output_path):
    """Save and show a 2x3 grid comparing original and segmented video frames.

    The first, middle and last frames are sampled from the range both videos
    share, so the comparison stays aligned when the segmented clip holds fewer
    frames than the source (the upload cells cap processing at `max_frames`).
    The figure is written to 'video_summary.png'.

    Args:
        video_path: Path to the original video.
        output_path: Path to the segmented video.

    Returns:
        None. Prints a message and returns early if either file is missing.
    """
    if not os.path.exists(video_path) or not os.path.exists(output_path):
        print("❌ Video files not found!")
        return

    print("📊 Creating video comparison summary...")

    cap_orig = cv2.VideoCapture(video_path)
    cap_seg = cv2.VideoCapture(output_path)

    try:
        # Restrict sampling to frames that exist in both videos. Counts of 0
        # (unknown) are ignored; if neither count is known, fall back to frame 0.
        frame_counts = [int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) for cap in (cap_orig, cap_seg)]
        known_counts = [count for count in frame_counts if count > 0]
        shared_frames = min(known_counts) if known_counts else 1
        frame_indices = [0, shared_frames // 2, shared_frames - 1]

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle("Video Segmentation: Before & After", fontsize=16, fontweight='bold')

        titles = ['First Frame', 'Middle Frame', 'Last Frame']
        sources = ((cap_orig, "Original"), (cap_seg, "Segmented"))

        for idx, frame_idx in enumerate(frame_indices):
            for row, (cap, kind) in enumerate(sources):
                ax = axes[row, idx]
                # Hide the axes up front so a failed read leaves a blank panel
                # rather than an empty plot with ticks.
                ax.axis('off')
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if ret:
                    ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    ax.set_title(f"{kind} - {titles[idx]}", fontweight='bold')
    finally:
        # Release the capture handles even if reading or plotting fails.
        cap_orig.release()
        cap_seg.release()

    plt.tight_layout()
    plt.savefig("video_summary.png", dpi=150, bbox_inches='tight')
    print("✅ Summary saved as: video_summary.png")
    plt.show()

# Cell-level banner: belongs at top level (as in the other cells) so it prints
# once when the cell runs instead of at the end of every summary call.
print("✅ Video summary function ready!")
print("\n💡 After video segmentation, create summary with:")
print("   create_video_summary('input_video.mp4', 'segmented_video.mp4')")