In [1]:
# notebooks/01_data_acquisition_and_exploration.ipynb
# This notebook will guide you through downloading and initial exploration.

import os
import json
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from collections import defaultdict
import numpy as np

# Import configurations
from config.dataset_config import DATA_ROOT, TACO_ANNOTATION_PATH, TACO_IMAGES_DIR, \
                                   TRASH_ICRA19_ROOT, TRASH_ICRA19_IMAGES_DIR, \
                                   TRASH_ICRA19_ANNOTATIONS_DIR, TRASH_ICRA19_CLASS_NAMES_PATH, \
                                   TRASH_ICRA19_CLASSES, TACO_TO_ICRA19_CLASS_MAP

# Import parsers
from utils.dataset_parsers import parse_taco_annotations, parse_trash_icra19_annotations

print("--- 01_data_acquisition_and_exploration.ipynb ---")
print("Current Project Root:", os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# --- 1. Dataset Acquisition (Manual Step Guidance) ---

# TACO Dataset:
print("\n--- TACO Dataset Acquisition ---")
print("1. Download TACO annotations (all.json): Go to http://tacodataset.org/ -> Download -> annotations/all.json")
print(f"   Save it to: {os.path.dirname(TACO_ANNOTATION_PATH)}")
print("2. Download TACO images: The images are hosted on Flickr. The TACO dataset website provides a `download.py` script.")
print("   Follow instructions on http://tacodataset.org/ to download images. Create a folder named 'images' inside TACO dir.")
print(f"   Images should be saved to: {TACO_IMAGES_DIR}")
print("   Alternatively, you might find pre-packaged TACO datasets on Kaggle or other platforms that include images.")

if not os.path.exists(TACO_ANNOTATION_PATH):
    print(f"WARNING: TACO annotation file not found at {TACO_ANNOTATION_PATH}. Please download it.")
if not os.path.exists(TACO_IMAGES_DIR):
    print(f"WARNING: TACO images directory not found at {TACO_IMAGES_DIR}. Please download images.")

# Trash-ICRA19 Dataset:
print("\n--- Trash-ICRA19 Dataset Acquisition ---")
print("Assuming 'trash_ICRA19' folder is already in your project root.")
print(f"Expected path: {TRASH_ICRA19_ROOT}")
print("Verify the internal structure matches:")
print(f"  Images expected in: {TRASH_ICRA19_IMAGES_DIR}")
print(f"  Annotations expected in: {TRASH_ICRA19_ANNOTATIONS_DIR}")
print(f"  Class names file expected at: {TRASH_ICRA19_CLASS_NAMES_PATH}")

if not os.path.exists(TRASH_ICRA19_IMAGES_DIR):
    print(f"WARNING: Trash-ICRA19 images directory not found at {TRASH_ICRA19_IMAGES_DIR}.")
if not os.path.exists(TRASH_ICRA19_ANNOTATIONS_DIR):
    print(f"WARNING: Trash-ICRA19 annotations directory not found at {TRASH_ICRA19_ANNOTATIONS_DIR}.")
if not os.path.exists(TRASH_ICRA19_CLASS_NAMES_PATH):
    print(f"WARNING: Trash-ICRA19 classes.txt not found at {TRASH_ICRA19_CLASS_NAMES_PATH}.")

# --- 2. Initial Data Exploration ---

print("\n--- Exploring TACO Dataset ---")
taco_raw_data = None
if os.path.exists(TACO_ANNOTATION_PATH):
    try:
        with open(TACO_ANNOTATION_PATH, 'r') as f:
            taco_raw_data = json.load(f)
        print(f"Successfully loaded TACO annotations: {len(taco_raw_data['images'])} images, {len(taco_raw_data['annotations'])} annotations.")

        # Display TACO original categories
        taco_categories = {cat['id']: cat['name'] for cat in taco_raw_data['categories']}
        print("\nTACO Original Categories (first 20):")
        for i, (cat_id, cat_name) in enumerate(taco_categories.items()):
            print(f"- {cat_id}: {cat_name}")
            if i >= 19: break

        # Calculate category distribution
        taco_cat_counts = defaultdict(int)
        for ann in taco_raw_data['annotations']:
            taco_cat_counts[taco_categories[ann['category_id']]] += 1

        print("\nTACO Category Distribution (Top 10):")
        sorted_taco_cats = sorted(taco_cat_counts.items(), key=lambda item: item[1], reverse=True)
        for cat_name, count in sorted_taco_cats[:10]:
            print(f"- {cat_name}: {count} instances")

    except Exception as e:
        print(f"Error loading/parsing TACO JSON: {e}")
else:
    print("TACO annotation file not found, skipping detailed exploration.")

print("\n--- Exploring Trash-ICRA19 Dataset ---")
if os.path.exists(TRASH_ICRA19_IMAGES_DIR) and os.path.exists(TRASH_ICRA19_ANNOTATIONS_DIR):
    # Using the parser to get structured data
    icra19_data_train = parse_trash_icra19_annotations(os.path.join(TRASH_ICRA19_IMAGES_DIR, 'train'),
                                                        os.path.join(TRASH_ICRA19_ANNOTATIONS_DIR, 'train'),
                                                        TRASH_ICRA19_CLASS_NAMES_PATH)
    icra19_data_val = parse_trash_icra19_annotations(os.path.join(TRASH_ICRA19_IMAGES_DIR, 'val'),
                                                      os.path.join(TRASH_ICRA19_ANNOTATIONS_DIR, 'val'),
                                                      TRASH_ICRA19_CLASS_NAMES_PATH)
    icra19_data_test = parse_trash_icra19_annotations(os.path.join(TRASH_ICRA19_IMAGES_DIR, 'test'),
                                                       os.path.join(TRASH_ICRA19_ANNOTATIONS_DIR, 'test'),
                                                       TRASH_ICRA19_CLASS_NAMES_PATH)

    print(f"Trash-ICRA19 Images: Train={len(icra19_data_train)}, Val={len(icra19_data_val)}, Test={len(icra19_data_test)}")
    print(f"Trash-ICRA19 Classes: {TRASH_ICRA19_CLASSES}")

    icra19_cat_counts = defaultdict(int)
    for data_list in [icra19_data_train, icra19_data_val, icra19_data_test]:
        for item in data_list:
            for label_id in item['labels']:
                for class_name, class_idx in TRASH_ICRA19_CLASSES.items():
                    if class_idx == label_id:
                        icra19_cat_counts[class_name] += 1
                        break
    print("\nTrash-ICRA19 Category Distribution:")
    for cat_name, count in icra19_cat_counts.items():
        print(f"- {cat_name}: {count} instances")

    # --- 3. Visualize Sample Images with Annotations ---

    def visualize_annotations(image_path, boxes, labels, class_names, title=""):
        """Displays an image with bounding box annotations."""
        fig, ax = plt.subplots(1, figsize=(10, 10))
        img = Image.open(image_path).convert("RGB")
        ax.imshow(img)
        ax.set_title(title)
        ax.axis('off')

        for bbox, label_id in zip(boxes, labels):
            # Convert [xmin, ymin, w, h] to [xmin, ymin, xmax, ymax] for drawing,
            # if the parser provides w,h. If it's xmax,ymax already, no change needed.
            # Our parsers convert to [xmin, ymin, w, h]
            xmin, ymin, width, height = bbox
            xmax, ymax = xmin + width, ymin + height

            rect = patches.Rectangle((xmin, ymin), width, height,
                                     linewidth=2, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
            class_name = class_names.get(label_id, f"Class {label_id}") # Get name from ID
            plt.text(xmin, ymin - 5, class_name, color='red', fontsize=12,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
        plt.show()

    print("\n--- Visualizing Sample Images (Trash-ICRA19) ---")
    num_samples_to_show = 3
    for i, item in enumerate(icra19_data_train[:num_samples_to_show]):
        visualize_annotations(item['image_path'], item['boxes'], item['labels'],
                              TRASH_ICRA19_CLASSES, title=f"Trash-ICRA19 Train Sample {i+1}")

    print("\n--- Visualizing Sample Images (TACO - Mapped) ---")
    # For TACO, we need to ensure images are downloaded to `TACO_IMAGES_DIR`
    if taco_raw_data and os.path.exists(TACO_IMAGES_DIR) and len(taco_raw_data['images']) > 0:
        # Get some random TACO images (with annotations)
        taco_parsed = parse_taco_annotations(TACO_ANNOTATION_PATH, TACO_IMAGES_DIR)
        random.shuffle(taco_parsed)
        for i, item in enumerate(taco_parsed[:num_samples_to_show]):
            # The parser returns [xmin, ymin, w, h] for boxes
            visualize_annotations(item['image_path'], item['boxes'], item['labels'],
                                  TRASH_ICRA19_CLASSES, # Use ICRA19 classes as TACO is mapped
                                  title=f"TACO Sample {i+1} (Mapped to ICRA19)")
            if i >= num_samples_to_show - 1: break
    else:
        print("Skipping TACO visualization: Data or images not found.")

else:
    print("Trash-ICRA19 dataset not found, skipping detailed exploration.")

print("\n--- Exploration Complete ---")
print("Review class distributions and sample images. Pay attention to the domain shift (underwater vs. terrestrial) and annotation quality.")

ModuleNotFoundError: No module named 'config'