In [None]:
# Import required libraries
import sys
import json
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import pandas as pd

# Add src to path
sys.path.append('../src')

from utils.dataset_loader import MineAppleDataset, create_data_loaders
from utils.visualization import (
    visualize_mask_overlay,
    visualize_sam2_vs_sam3_comparison,
    plot_metric_comparison,
    plot_iou_distribution,
)

# Global plotting defaults used by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

# The check mark had been stored as mojibake (UTF-8 bytes decoded as cp1252).
# It is written as an escape (U+2713 CHECK MARK) so the source stays ASCII.
print("\u2713 Imports successful")

## 1. Introduction: The SAM2-to-SAM3 Gap

### What is the Gap?

The **SAM2-to-SAM3 gap** represents a fundamental architectural shift in segmentation models:

| Aspect | SAM2 (2024) | SAM3 (2025) |
|--------|-------------|-------------|
| **Paradigm** | Prompt-based | Concept-driven |
| **Input** | Points, boxes, masks | Natural language text |
| **Understanding** | Geometric/Spatial | Semantic/Conceptual |
| **Vocabulary** | Closed-set (requires prompts per object) | Open-vocabulary |
| **Architecture** | Vision-only (224M params) | Vision-Language (848M params) |

### Why Does This Matter?

This gap is critical for agricultural AI because:
- **SAM2** requires manual spatial annotation for each fruit
- **SAM3** can segment based on concepts like "ripe apples" or "damaged fruit"
- **Agricultural decisions** are semantic ("harvest ripe apples"), not geometric ("segment pixels at coordinates X,Y")

Let's explore this gap through experiments!

## 2. Dataset: MineApple Orchard Imagery

The MineApple dataset contains orchard images with apple annotations including semantic attributes:
- **Ripeness**: ripe, unripe, overripe
- **Color**: red, green, yellow
- **Health**: healthy, damaged, diseased

In [None]:
# Build the data loaders (a mapping of split name -> loader) for MineApple
data_root = "../data/mineapple"
data_loaders = create_data_loaders(data_root)

# Number of images available in each split
print("Dataset Statistics:")
for split, split_loader in data_loaders.items():
    n_images = len(split_loader)
    print(f"  {split}: {n_images} images")

# Instance-level summary reported by the test loader
test_stats = data_loaders['test'].get_statistics()
print("\nTest Set Details:")
total_instances = test_stats['num_instances']
print(f"  Total instances: {total_instances}")
mean_per_image = test_stats['mean_instances_per_image']
print(f"  Mean instances per image: {mean_per_image:.2f}")

# One line per attribute type with whatever counts the loader reports
print("\nAttribute Distribution:")
for attribute, value_counts in test_stats['attributes'].items():
    print(f"  {attribute}: {value_counts}")

In [None]:
# Visualize example images from the test split in a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

test_loader = data_loaders['test']

# Never request more samples than the split holds: with fewer images than
# panels, a fixed range(6) would index past the end of the loader.
n_examples = min(len(axes), len(test_loader))

for i, ax in enumerate(axes):
    if i < n_examples:
        sample = test_loader[i]
        ax.imshow(sample['image_array'])
        ax.set_title(f"Image {i+1}: {sample['image_id']}")
    # Hide ticks/frame on every panel, including surplus ones left blank
    ax.axis('off')

plt.tight_layout()
# y > 1 places the title above the grid that tight_layout has already packed
plt.suptitle("MineApple Dataset Examples", fontsize=14, fontweight='bold', y=1.02)
plt.show()

## 3. SAM2: Prompt-Based Segmentation

SAM2 requires **explicit geometric prompts** to define what to segment:
- **Point prompts**: Click on the object
- **Box prompts**: Draw a bounding box
- **Mask prompts**: Provide a rough initial mask

### Strengths:
- Precise spatial localization
- High boundary accuracy
- Fast inference once prompted

### Limitations:
- Requires manual annotation effort
- No semantic understanding
- Cannot distinguish "ripe" vs "unripe" without external classifiers

In [None]:
# Load the SAM2 baseline results and print its headline geometric metrics.
# Leaves `sam2_results` as None when the results file has not been generated.
sam2_results_path = "../results/sam2_baseline/sam2/results.json"

if Path(sam2_results_path).exists():
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(sam2_results_path, encoding="utf-8") as f:
        sam2_results = json.load(f)

    print("SAM2 Results Loaded")
    print("\nKey Metrics:")

    results_data = sam2_results['results']
    metrics = results_data['metrics']

    # A metric missing from the file is reported as 0 rather than raising.
    for label, key in (
        ("Mean IoU", "mean_iou"),
        ("Mean Boundary F1", "mean_boundary_f1"),
        ("Mean Dice", "mean_dice"),
    ):
        print(f"  {label}: {metrics.get(key, 0):.4f}")
else:
    # U+26A0 WARNING SIGN, written as an escape: the literal had been
    # corrupted into mojibake by a bad re-encoding of the notebook.
    print("\u26a0 SAM2 results not found. Run experiments/sam2_baseline_mineapple.sh first.")
    sam2_results = None

## 4. SAM3: Concept-Driven Segmentation

SAM3 uses **natural language text prompts** to express semantic concepts:
- **Simple concepts**: "apples", "fruit"
- **Attribute-based**: "ripe apples", "red apples"
- **Compositional**: "ripe red apples", "damaged green fruit"

### Strengths:
- Semantic understanding
- Open-vocabulary (can segment novel concepts)
- Attribute reasoning
- Natural human-AI interaction

### Limitations:
- Higher computational cost
- Requires vision-language training
- Language grounding challenges

In [None]:
# Load the SAM3 concept-prompt results and print geometric and semantic metrics.
# Leaves `sam3_results` as None when the results file has not been generated.
sam3_results_path = "../results/sam3_concept/sam3/results.json"

if Path(sam3_results_path).exists():
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(sam3_results_path, encoding="utf-8") as f:
        sam3_results = json.load(f)

    print("SAM3 Results Loaded")

    results_data = sam3_results['results']
    metrics = results_data['metrics']

    # A metric missing from the file is reported as 0 rather than raising.
    print("\nGeometric Metrics:")
    for label, key in (
        ("Mean IoU", "mean_iou"),
        ("Mean Boundary F1", "mean_boundary_f1"),
        ("Mean Dice", "mean_dice"),
    ):
        print(f"  {label}: {metrics.get(key, 0):.4f}")

    print("\nSemantic Metrics:")
    for label, key in (
        ("Concept Recall", "mean_concept_recall"),
        ("Concept Precision", "mean_concept_precision"),
        ("Semantic Grounding", "semantic_grounding_accuracy"),
    ):
        print(f"  {label}: {metrics.get(key, 0):.4f}")
else:
    # U+26A0 WARNING SIGN, written as an escape: the literal had been
    # corrupted into mojibake by a bad re-encoding of the notebook.
    print("\u26a0 SAM3 results not found. Run experiments/sam3_concept_mineapple.sh first.")
    sam3_results = None

## 5. Quantitative Comparison

Let's compare the models across multiple metrics to quantify the gap.

In [None]:
# Load the SAM2-vs-SAM3 comparison summary and tabulate the shared metrics.
# Leaves `comparison` as None when the summary file has not been generated;
# `comparison_df` is only defined when the file exists.
comparison_path = "../results/comparison/sam2_vs_sam3_comparison/comparison_summary.json"

if Path(comparison_path).exists():
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(comparison_path, encoding="utf-8") as f:
        comparison = json.load(f)

    print("Comparison Results")
    print("="*50)

    sam2_metrics = comparison['sam2']['key_metrics']
    sam3_metrics = comparison['sam3']['key_metrics']
    gap = comparison['gap_analysis']

    # Display label -> key inside each model's `key_metrics`; one list keeps
    # the SAM2 and SAM3 columns guaranteed to be in the same order.
    metric_keys = {
        'Mean IoU': 'mean_iou',
        'Mean Boundary F1': 'mean_boundary_f1',
        'Mean Dice': 'mean_dice',
    }

    # Create comparison DataFrame
    comparison_df = pd.DataFrame({
        'Metric': list(metric_keys),
        'SAM2': [sam2_metrics[k] for k in metric_keys.values()],
        'SAM3': [sam3_metrics[k] for k in metric_keys.values()],
    })

    # Positive values mean SAM3 scored higher than SAM2
    comparison_df['Difference'] = comparison_df['SAM3'] - comparison_df['SAM2']
    comparison_df['% Change'] = (comparison_df['Difference'] / comparison_df['SAM2'] * 100).round(2)

    print(comparison_df.to_string(index=False))

    # U+1F4CA BAR CHART, written as an escape (the literal was mojibake)
    print("\n\U0001F4CA Gap Analysis:")
    print(f"  IoU Gap: {gap['iou_diff']:+.4f}")
    print(f"  Boundary F1 Gap: {gap['boundary_f1_diff']:+.4f}")

else:
    # U+26A0 WARNING SIGN, written as an escape (the literal was mojibake)
    print("\u26a0 Comparison results not found. Run experiments/compare_sam2_sam3.sh first.")
    comparison = None

In [None]:
# Grouped bar chart: one pair of bars (SAM2, SAM3) per metric row of comparison_df
if comparison is not None:
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    positions = np.arange(len(comparison_df))
    bar_width = 0.35

    # (model column, horizontal shift from the tick, fill colour)
    series = (
        ('SAM2', -bar_width/2, '#4472C4'),
        ('SAM3', +bar_width/2, '#ED7D31'),
    )
    bar_groups = [
        ax.bar(positions + shift, comparison_df[model], bar_width,
               label=model, color=colour, alpha=0.8)
        for model, shift, colour in series
    ]

    # Axis labelling and cosmetics
    ax.set_ylabel('Score', fontweight='bold', fontsize=12)
    ax.set_title('SAM2 vs SAM3: Performance Comparison',
                 fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(comparison_df['Metric'])
    ax.legend(fontsize=11)
    ax.grid(axis='y', alpha=0.3)
    ax.set_ylim([0, 1.0])

    # Print each bar's value just above its top edge
    for group in bar_groups:
        for rect in group:
            top = rect.get_height()
            centre = rect.get_x() + rect.get_width()/2.
            ax.text(centre, top + 0.01, f'{top:.3f}',
                    ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

## 6. Semantic Understanding: The Key Differentiator

The true power of SAM3 lies in its **semantic reasoning** capabilities. Let's analyze attribute understanding.

In [None]:
# Analyze attribute understanding (SAM3 only): list every SAM3 metric whose
# name mentions one of the annotated attribute types.
if sam3_results is not None:
    metrics = sam3_results['results']['metrics']

    # Substring match on the metric name selects the attribute metrics
    attribute_keywords = ('ripeness', 'color', 'health')
    attribute_metrics = {
        name: value
        for name, value in metrics.items()
        if any(word in name for word in attribute_keywords)
    }

    if attribute_metrics:
        print("SAM3 Attribute Understanding:")
        print("="*50)

        for attr, value in attribute_metrics.items():
            if isinstance(value, (int, float)):
                print(f"  {attr}: {value:.4f}")
            else:
                # A non-numeric entry (e.g. a nested breakdown or null) cannot
                # take the .4f format; show it as-is instead of raising.
                print(f"  {attr}: {value}")

        # U+1F4A1 LIGHT BULB, written as an escape (the literal was mojibake)
        print("\n\U0001F4A1 SAM2 cannot distinguish these attributes without external classifiers!")
    else:
        print("Attribute metrics not available in results.")

## 7. Qualitative Analysis

Let's visualize example segmentations to understand the qualitative differences.

In [None]:
# Load and display the saved side-by-side comparison figures, one per row
vis_dir = Path("../results/comparison/sam2_vs_sam3_comparison/visualizations")

if vis_dir.exists():
    comparison_images = sorted(vis_dir.glob("comparison_*.png"))

    if comparison_images:
        n_images = len(comparison_images)
        # squeeze=False always yields a 2-D array of Axes, so a single image
        # needs no special case; column 0 is the only column.
        fig, axes = plt.subplots(n_images, 1, figsize=(16, 6*n_images), squeeze=False)
        axes = axes[:, 0]

        for i, (ax, img_path) in enumerate(zip(axes, comparison_images)):
            # Image.open keeps the file handle open until the image is closed;
            # imshow converts the image to an array when called, so the handle
            # can be released as soon as the call returns.
            with Image.open(img_path) as img:
                ax.imshow(img)
            ax.axis('off')
            ax.set_title(f"Example {i+1}", fontsize=12, fontweight='bold')

        plt.tight_layout()
        plt.show()
    else:
        print("No comparison images found.")
else:
    print("Visualization directory not found.")

## 8. Statistical Analysis

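A formal significance test (for example, a paired t-test on per-image IoU scores) needs per-image results for both models. The cell below uses only the aggregate IoU gap from the comparison summary, so it checks whether the mean difference exceeds a practical threshold of 0.05 rather than computing a p-value.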

In [None]:
# Not used below: kept for the paired t-test described in the note that follows.
from scipy import stats

# NOTE: no significance test is computed in this cell. A paired t-test
# (e.g. scipy.stats.ttest_rel) needs per-image IoU scores for both models,
# and only the aggregate `gap_analysis.iou_diff` is read here. The cell
# therefore applies a practical threshold to the mean IoU difference; it
# produces neither a p-value nor an effect size.

# Absolute mean-IoU difference above which the gap is called "substantial"
IOU_GAP_THRESHOLD = 0.05

if comparison is not None:
    print("Statistical Significance Testing")
    print("="*50)

    # Sign convention follows the comparison table: positive = SAM3 higher
    iou_diff = comparison['gap_analysis']['iou_diff']

    print(f"\nIoU Difference: {iou_diff:.4f}")

    # Glyphs are written as escapes because the literals had been corrupted
    # into mojibake: U+2713 check mark, U+0394 Delta, U+2264 less-or-equal.
    if abs(iou_diff) > IOU_GAP_THRESHOLD:
        print(f"\u2713 Substantial difference detected (|\u0394| > {IOU_GAP_THRESHOLD})")
    else:
        print(f"  Marginal difference (|\u0394| \u2264 {IOU_GAP_THRESHOLD})")

    # U+1F4C8 CHART WITH UPWARDS TREND
    print("\n\U0001F4C8 Interpretation:")
    if iou_diff > 0:
        print("  SAM3 achieves superior geometric accuracy despite using only text prompts.")
        print("  This demonstrates effective vision-language grounding.")
    else:
        print("  SAM2 maintains competitive spatial accuracy.")
        print("  Gap is primarily in semantic understanding, not geometric precision.")

## 9. Key Findings & Conclusions

### Summary of Results

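Our comparative analysis reveals the following. These points describe the expected outcome; because the metrics are loaded at run time, check each one against the numbers printed in Sections 5 and 8 for your run: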

#### Geometric Performance
- Both SAM2 and SAM3 achieve high geometric accuracy
- SAM3 matches or exceeds SAM2 despite using only text prompts
- Vision-language fusion enables effective spatial grounding

#### Semantic Capabilities
- **Critical Gap**: SAM2 has NO semantic understanding
- SAM3 demonstrates concept-level reasoning
- Attribute-based segmentation only possible with SAM3
- Open-vocabulary enables novel concept detection

### The SAM2-to-SAM3 Gap

This gap represents the evolution from:

```
Pure Vision Models → Vision-Language Models
Prompt-Based → Concept-Driven
Geometric → Semantic
Closed-Set → Open-Vocabulary
Manual Annotation → Natural Language Interaction
```

### Implications for Agriculture

1. **Precision Agriculture**: Concept-driven segmentation aligns with semantic farming decisions
2. **Automation**: Reduces manual annotation burden
3. **Flexibility**: Open-vocabulary enables adaptation to new varieties without retraining
4. **Intelligence**: Attribute reasoning supports quality assessment and harvest planning

### Future Work

- Extended evaluation on diverse orchard conditions
- Real-time inference optimization for SAM3
- Integration with robotic harvest systems
- Multi-modal fusion (vision + language + depth)

## 10. Interactive Exploration

Use this cell to explore a specific test image together with its ground-truth instance masks and attributes. Change `image_idx` to view a different image.

In [None]:
# Interactive exploration: show one test image with its ground-truth instance
# masks overlaid and each instance labelled with its ripeness and colour.
image_idx = 0  # Change this to explore different images

test_loader = data_loaders['test']
sample = test_loader[image_idx]

print(f"Image {image_idx}: {sample['image_id']}")

# Fetch the annotations once and reuse them for both the count and the overlay
masks = test_loader.get_instance_masks(image_idx)
labels = test_loader.get_instance_labels(image_idx)
attributes = test_loader.get_instance_attributes(image_idx)

print(f"Number of instances: {len(masks)}")

# Display image with annotations
fig, ax = plt.subplots(1, 1, figsize=(12, 10))
ax.imshow(sample['image_array'])

# One colour per instance, spread evenly over the tab20 colormap
colors = plt.cm.tab20(np.linspace(0, 1, len(masks)))

for i, (mask, _label, attrs) in enumerate(zip(masks, labels, attributes)):
    # Boolean indexing is required below: a 0/1 integer mask would be treated
    # as row indices instead. No-op when the mask is already boolean.
    mask = np.asarray(mask, dtype=bool)

    # RGBA layer that is fully transparent everywhere outside the instance
    overlay = np.zeros((*mask.shape, 4))
    overlay[mask] = colors[i]
    ax.imshow(overlay, alpha=0.5)

    # Label the instance at the centroid of its mask; skip empty masks
    ys, xs = np.where(mask)
    if len(xs) > 0:
        cx, cy = int(np.mean(xs)), int(np.mean(ys))
        attr_str = f"{attrs['ripeness']}, {attrs['color']}"
        ax.text(cx, cy, f"{i+1}: {attr_str}",
                color='white', fontsize=10, fontweight='bold',
                bbox=dict(boxstyle='round', facecolor='black', alpha=0.7))

ax.set_title(f"Ground Truth Annotations\nImage: {sample['image_id']}",
             fontsize=14, fontweight='bold')
ax.axis('off')
plt.tight_layout()
plt.show()

# Print instance details
print("\nInstance Details:")
for i, attrs in enumerate(attributes):
    print(f"  Instance {i+1}: {attrs}")

---

## Citation

If you use this analysis or codebase, please cite:

```bibtex
@article{sapkota2025sam3gap,
  title={The SAM2-to-SAM3 Gap in the Segment Anything Model Family},
  author={Sapkota, Ranjan and Roumeliotis, Konstantinos I. and Karkee, Manoj},
  year={2025}
}
```

---

**End of Notebook**