# Phase 1: Data Preparation

This notebook downloads the **ikigai Pothole V2** dataset for YOLOv8/v9/v10 training.

**Dataset**: ikigai Pothole V2 (1,481 augmented images, 640×640)

**Outputs saved to Google Drive**: `/MyDrive/PotholeDetection/`

## Prerequisites
1. Roboflow account (free tier)
2. API key from https://app.roboflow.com/settings/api

## 1. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells write their outputs
# beneath this path (see OUTPUT_DIR).
from google.colab import drive
drive.mount('/content/drive')

## 2. Setup Output Directory

In [None]:
import os
import shutil

# Root folder on Drive that receives every artifact produced by this notebook
OUTPUT_DIR = '/content/drive/MyDrive/PotholeDetection'
DATASET_DIR = OUTPUT_DIR + '/dataset'

# Start from a clean slate: a dataset folder left behind by an earlier
# (possibly failed) download is deleted before anything new is fetched.
if os.path.exists(DATASET_DIR):
    print('Removing existing dataset directory...')
    shutil.rmtree(DATASET_DIR)

# The dataset folder itself is not recreated here; only its parent is ensured.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print('Output directory: ' + OUTPUT_DIR)

## 3. Download ikigai Pothole V2 Dataset

**Dataset Details:**
- Source: 617 images → Augmented: 1,481 images
- Classes: 1 (pothole)
- Image size: 640×640 (pre-resized for YOLO)
- License: CC BY 4.0
- Benchmark mAP@50: 75.8%

⚠️ **Replace `YOUR_API_KEY` with your actual Roboflow API key**

In [None]:
# Install the Roboflow client package imported by the download cell (-q asks pip for quiet output).
!pip install -q roboflow

In [None]:
import os
from getpass import getpass

from roboflow import Roboflow

# ========================================
# Roboflow API key
# ========================================
# SECURITY: do not hard-code the key in this notebook. A key saved with the
# notebook is readable by anyone who can open the file, so any key that was
# ever embedded here should be treated as leaked and revoked at
# https://app.roboflow.com/settings/api.
#
# The key is read from the ROBOFLOW_API_KEY environment variable when it is
# set; otherwise it is requested interactively via getpass, which does not
# echo the typed value into the notebook output.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY", "").strip()
if not ROBOFLOW_API_KEY:
    ROBOFLOW_API_KEY = getpass("Roboflow API key: ").strip()
if not ROBOFLOW_API_KEY:
    # Fail early with a clear message instead of an opaque API error later.
    raise ValueError("A Roboflow API key is required to download the dataset")

rf = Roboflow(api_key=ROBOFLOW_API_KEY)

# ikigai Pothole V2 Dataset
# - Workspace: ikigai
# - Project: pothole-v2-m6ldn
# - Version: 22 (with augmentations)
project = rf.workspace("ikigai").project("pothole-v2-m6ldn")

# Export in the "yolov8" format directly into the Drive-backed dataset folder.
dataset = project.version(22).download("yolov8", location=DATASET_DIR)

print(f'\n‚úÖ Dataset downloaded to: {dataset.location}')

## 4. Verify Dataset Structure

In [None]:
import os

dataset_location = dataset.location

print(f"Dataset location: {dataset_location}")
print("\nDataset Structure Verification")
print("=" * 40)

# Show what sits directly inside the download folder
print("Contents:", os.listdir(dataset_location))

stats = {}
for split in ('train', 'valid', 'test'):
    # Candidate (image dir, label dir) pairs, tried in order; the first
    # image dir that exists decides which layout is used for this split.
    layouts = (
        (f'{dataset_location}/{split}/images', f'{dataset_location}/{split}/labels'),
        (f'{dataset_location}/images/{split}', f'{dataset_location}/labels/{split}'),
        (f'{dataset_location}/{split}', f'{dataset_location}/{split}'),
    )

    for image_dir, label_dir in layouts:
        if not os.path.exists(image_dir):
            continue

        image_count = sum(
            1 for name in os.listdir(image_dir)
            if name.endswith(('.jpg', '.jpeg', '.png'))
        )
        # A missing label directory counts as zero labels rather than an error
        if os.path.exists(label_dir):
            label_count = sum(1 for name in os.listdir(label_dir) if name.endswith('.txt'))
        else:
            label_count = 0

        stats[split] = {'images': image_count, 'labels': label_count}
        print(f"{split.upper():6} | Images: {image_count:4} | Labels: {label_count:4}")
        break
    else:
        # No candidate image directory exists for this split
        print(f"{split.upper():6} | Not found")

print("=" * 40)
total_images = sum(entry.get('images', 0) for entry in stats.values())
print(f"TOTAL  | Images: {total_images}")

## 5. Dataset Statistics

In [None]:
import glob
import json
import os

from PIL import Image

# Gather every image below the dataset folder, across all splits
image_files = glob.glob(f'{dataset_location}/**/*.jpg', recursive=True)
image_files += glob.glob(f'{dataset_location}/**/*.jpeg', recursive=True)
image_files += glob.glob(f'{dataset_location}/**/*.png', recursive=True)

print(f"Found {len(image_files)} images")

# Measure a sample (first 100 files) to sanity-check the documented 640x640
# size without opening every image on Drive.
sizes = []
for img_path in image_files[:100]:
    try:
        with Image.open(img_path) as img:
            sizes.append(img.size)
    except (OSError, ValueError) as exc:
        # Best-effort: report the unreadable file and keep going, but let
        # KeyboardInterrupt and programming errors propagate.
        print(f"Warning: could not read image {img_path}: {exc}")

unexpected_sizes = sorted({size for size in sizes if size != (640, 640)})
if unexpected_sizes:
    print(f"Warning: sampled images are not all 640x640: {unexpected_sizes}")

# Count annotations: one non-blank line per object in each label file.
# The classes/readme filter looks at the file name only, so a directory whose
# name happens to contain those words does not exclude real label files.
label_files = [
    path
    for path in glob.glob(f'{dataset_location}/**/*.txt', recursive=True)
    if 'classes' not in os.path.basename(path).lower()
    and 'readme' not in os.path.basename(path).lower()
]

total_annotations = 0
for lbl_path in label_files:
    try:
        with open(lbl_path, 'r') as f:
            total_annotations += sum(1 for line in f if line.strip())
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Warning: could not read label file {lbl_path}: {exc}")

# Build stats
dataset_stats = {
    'name': 'ikigai Pothole V2',
    'source': 'Roboflow Universe',
    'workspace': 'ikigai',
    'project': 'pothole-v2-m6ldn',
    'version': 22,
    'license': 'CC BY 4.0',
    'total_images': len(image_files),
    'total_annotations': total_annotations,
    # max(..., 1) avoids a ZeroDivisionError when no images were found
    'avg_annotations_per_image': round(total_annotations / max(len(image_files), 1), 2),
    'splits': stats,
    'classes': ['pothole'],
    'image_size': '640x640',
    'augmentations': ['horizontal_flip', 'brightness_15pct', 'exposure_15pct']
}

print("\n" + "=" * 40)
print("DATASET STATISTICS")
print("=" * 40)
print(f"Name: {dataset_stats['name']}")
print(f"Total Images: {dataset_stats['total_images']}")
print(f"Total Annotations: {dataset_stats['total_annotations']}")
print(f"Avg Annotations/Image: {dataset_stats['avg_annotations_per_image']}")
print(f"Classes: {dataset_stats['classes']}")
print(f"Image Size: {dataset_stats['image_size']}")
print(f"License: {dataset_stats['license']}")

## 6. Save Statistics to Drive

In [None]:
# Save stats as JSON for DATA_CARD.md
stats_path = f'{OUTPUT_DIR}/dataset_stats.json'
with open(stats_path, 'w') as f:
    json.dump(dataset_stats, f, indent=2, default=str)

print(f'‚úÖ Statistics saved to: {stats_path}')
print('\nüìã Copy these stats to your DATA_CARD.md:')
print(json.dumps(dataset_stats, indent=2))

## 7. Sample Visualization

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import random
import os

def _find_label_path(img_path):
    """Return the path of the YOLO label file for *img_path*, or None.

    Two locations are tried, in order: a ``.txt`` file next to the image,
    then the same file name under a sibling ``labels`` directory (the image's
    last ``/images/`` path component replaced by ``/labels/``).
    """
    stem = os.path.splitext(img_path)[0]
    candidates = [stem + '.txt']

    # Swap only the last '/images/' component and only the real file
    # extension, so '.jpeg' images and paths that merely contain '.jpg',
    # '.png' or '/images/' somewhere else still resolve correctly.
    parent, sep, rest = stem.rpartition('/images/')
    if sep:
        candidates.append(parent + '/labels/' + rest + '.txt')

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def _read_yolo_boxes(label_path, img_w, img_h):
    """Parse a YOLO label file into pixel-space (x, y, width, height) boxes.

    Each usable row is ``class x_center y_center width height`` with the four
    coordinates normalised to the image size. Rows with fewer than five
    fields or non-numeric coordinates are skipped.
    """
    boxes = []
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            try:
                x_center, y_center, width, height = map(float, parts[1:5])
            except ValueError:
                # A malformed row should not abort the whole visualization
                continue
            boxes.append((
                (x_center - width / 2) * img_w,   # left edge in pixels
                (y_center - height / 2) * img_h,  # top edge in pixels
                width * img_w,
                height * img_h,
            ))
    return boxes


def visualize_sample(img_path):
    """Display an image with its YOLO bounding boxes drawn in red.

    Args:
        img_path: Path of the image file to show. The matching label file is
            looked up next to the image and in a sibling ``labels`` directory;
            when none is found the image is shown without boxes.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # The context manager closes the underlying file handle once the pixel
    # data has been handed to matplotlib.
    with Image.open(img_path) as img:
        w, h = img.size
        _, ax = plt.subplots(1, figsize=(8, 8))
        ax.imshow(img)

    label_path = _find_label_path(img_path)
    if label_path is not None:
        for x1, y1, box_w, box_h in _read_yolo_boxes(label_path, w, h):
            rect = patches.Rectangle((x1, y1), box_w, box_h,
                                     linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)

    ax.axis('off')
    ax.set_title(os.path.basename(img_path))
    plt.show()

# Show up to three randomly chosen images (fewer if the dataset is smaller)
if not image_files:
    print("No images found to visualize")
else:
    for chosen_path in random.sample(image_files, min(3, len(image_files))):
        visualize_sample(chosen_path)

## ✅ Phase 1 Complete!

**What's saved to Google Drive:**
- Dataset: `/MyDrive/PotholeDetection/dataset/`
- Stats: `/MyDrive/PotholeDetection/dataset_stats.json`

**Dataset Summary:**
- ikigai Pothole V2 (version 22)
- ~1,481 augmented images (640×640)
- Ready for YOLOv8/v9/v10/v11 training

**Next Step:** Proceed to Phase 2 (Model Training) using `model_comparison.ipynb`