# RDD2022 India Dataset - Verification

**Week 1: Dataset Preparation and Verification**

**Objectives:**
1. Verify dataset structure
2. Count and validate files
3. Check XML annotation format
4. Generate dataset statistics
5. Prepare for Week 2 EDA

**Dataset Information:**
- Source: RDD2022 Competition
- Region: India
- Format: Pascal VOC XML annotations
- Classes: 9 original damage codes grouped into 6 categories
  - Longitudinal (D00, D01)
  - Transverse (D10, D11)
  - Alligator (D20)
  - Pothole (D40)
  - Marking Blur (D43, D44)
  - Other (D50)

**Note:** Test annotations are not provided, which is standard for competition datasets.

In [None]:
# ============================================
# IMPORTS AND SETUP
# ============================================

import os
import sys
import json
from pathlib import Path
from datetime import datetime
import xml.etree.ElementTree as ET
from collections import Counter

# Image validation
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False

# Run banner: notebook title plus the current local time, framed by rules.
_banner_rule = "=" * 60
_banner_lines = (
    _banner_rule,
    "  WEEK 1: DATASET VERIFICATION",
    _banner_rule,
    f"  Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    _banner_rule,
)
for _banner_line in _banner_lines:
    print(_banner_line)

In [None]:
# ============================================
# CONFIGURATION
# ============================================

# Switches for the verification run.
VALIDATE_IMAGES = True    # open and verify each sampled training image
VALIDATE_XMLS = True      # parse the annotation that belongs to each sampled image
VALIDATION_SAMPLE = 100   # max number of training images to validate; falsy = all
GENERATE_STATS = True     # compute and save class-distribution statistics

# Class mapping: original RDD2022 damage codes -> grouped classes.
# NOTE(review): 'D0w0' is presumably a mistyped 'D00' that occurs in the raw
# annotations - confirm against the dataset before removing it.
CLASS_MAPPING = {
    'D00': 'longitudinal', 'D01': 'longitudinal',
    'D10': 'transverse', 'D11': 'transverse',
    'D20': 'alligator',
    'D40': 'pothole',
    'D43': 'marking_blur', 'D44': 'marking_blur',
    'D50': 'other',
    'D0w0': 'longitudinal',
}

# Display order of the grouped classes in reports.
GROUPED_CLASSES = [
    'longitudinal',
    'transverse',
    'alligator',
    'pothole',
    'marking_blur',
    'other',
]

# Echo the effective configuration so it is recorded in the cell output.
_config_rows = (
    ("Image validation", VALIDATE_IMAGES),
    ("XML validation", VALIDATE_XMLS),
    ("Validation sample", VALIDATION_SAMPLE),
    ("Generate statistics", GENERATE_STATS),
    ("Grouped classes", len(GROUPED_CLASSES)),
)

print("\nConfiguration:")
print("-" * 60)
for _config_label, _config_value in _config_rows:
    print(f"  {_config_label}: {_config_value}")
print("=" * 60)

In [None]:
# ============================================
# MOUNT DRIVE AND LOAD CONFIG
# ============================================

print("\n[STEP 1/6] SETUP")
print("-" * 60)

# Colab-only step: expose Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

PROJECT_ROOT = Path("/content/drive/MyDrive/ML/road-damage-detection")
config_file = PROJECT_ROOT / "config.json"

# config.json is optional; when it is absent every path below uses its default.
if not config_file.exists():
    print("[WARNING] No config.json found")
    config = {}
else:
    with open(config_file, 'r') as f:
        config = json.load(f)
    print("[OK] Configuration loaded")

# Paths may be overridden by the 'dataset_raw' / 'results_dir' config keys.
_default_dataset_dir = str(PROJECT_ROOT / 'datasets/rdd2022/India')
_default_results_dir = str(PROJECT_ROOT / 'results')
DATASET_DIR = Path(config.get('dataset_raw', _default_dataset_dir))
RESULTS_DIR = Path(config.get('results_dir', _default_results_dir))

# Work from the notebooks folder of the project.
os.chdir(PROJECT_ROOT / 'notebooks')

for _setup_line in (
    f"[OK] Project root: {PROJECT_ROOT.name}",
    f"[OK] Dataset: {DATASET_DIR}",
    f"[OK] Working directory: {os.getcwd()}",
    "=" * 60,
):
    print(_setup_line)

In [None]:
# ============================================
# VERIFY DIRECTORY STRUCTURE
# ============================================

print("\n[STEP 2/6] VERIFYING STRUCTURE")
print("-" * 60)

# Directories that must be present before any later step can run.
required_paths = {
    'train_images': DATASET_DIR / 'train/images',
    'train_annotations': DATASET_DIR / 'train/annotations/xmls',
    'test_images': DATASET_DIR / 'test/images',
}

# An entry passes only when it is an existing directory; is_dir() is already
# False for a path that does not exist.
all_exist = True
for label, location in required_paths.items():
    if location.is_dir():
        print(f"  [OK] {label}")
        continue
    print(f"  [ERROR] {label} - NOT FOUND")
    all_exist = False

# Test annotations are optional: their absence is reported but is not an error.
test_annotations = DATASET_DIR / 'test/annotations/xmls'
if not test_annotations.exists():
    print("  [INFO] test_annotations - Not provided (expected)")

if not all_exist:
    raise FileNotFoundError("Dataset structure incomplete")

print("\n[OK] Directory structure verified")
print("=" * 60)

In [None]:
# ============================================
# COUNT FILES
# ============================================

print("\n[STEP 3/6] COUNTING FILES")
print("-" * 60)

# Sort the listings: glob order depends on the file system, and later cells
# take the first N images and train_xmls[0], which should be reproducible.
train_imgs = sorted((DATASET_DIR / 'train/images').glob('*.jpg'))
train_xmls = sorted((DATASET_DIR / 'train/annotations/xmls').glob('*.xml'))
test_imgs = sorted((DATASET_DIR / 'test/images').glob('*.jpg'))

# Saved verbatim into the statistics JSON by a later cell.
file_counts = {
    'train': {'images': len(train_imgs), 'annotations': len(train_xmls)},
    'test': {'images': len(test_imgs), 'annotations': 0}
}

print("\nTRAIN:")
print(f"  Images:      {len(train_imgs):>6,}")
print(f"  Annotations: {len(train_xmls):>6,}")

if len(train_imgs) != len(train_xmls):
    diff = abs(len(train_imgs) - len(train_xmls))
    print(f"  [WARNING] Mismatch: {diff} files")

# Equal counts do not prove that every image has its own annotation, so pair
# the files by stem (the same stem rule the validation cell uses).
_img_stems = {p.stem for p in train_imgs}
_xml_stems = {p.stem for p in train_xmls}
imgs_without_xml = sorted(_img_stems - _xml_stems)
xmls_without_img = sorted(_xml_stems - _img_stems)

if imgs_without_xml:
    print(f"  [WARNING] Images without annotation: {len(imgs_without_xml):,} "
          f"(e.g. {imgs_without_xml[0]})")
if xmls_without_img:
    print(f"  [WARNING] Annotations without image: {len(xmls_without_img):,} "
          f"(e.g. {xmls_without_img[0]})")

print("\nTEST:")
print(f"  Images:      {len(test_imgs):>6,}")
print("  Annotations: Not provided")

total_images = len(train_imgs) + len(test_imgs)
total_annotations = len(train_xmls)

print("\nTOTAL:")
print(f"  Images:      {total_images:>6,}")
print(f"  Annotations: {total_annotations:>6,}")

# Nothing downstream can work without at least one image.
if total_images == 0:
    raise ValueError("Dataset appears empty")

print("\n[OK] File counting complete")
print("=" * 60)

In [None]:
# ============================================
# VALIDATE SAMPLE FILES
# ============================================

print("\n[STEP 4/6] VALIDATING SAMPLES")
print("-" * 60)

# Counters and messages; the whole dict is stored in the statistics JSON later.
validation_results = {
    'images_checked': 0,
    'images_valid': 0,
    'images_invalid': 0,
    'xmls_checked': 0,
    'xmls_valid': 0,
    'xmls_invalid': 0,
    'xmls_missing': 0,
    'errors': []
}

# Make a silently disabled check visible instead of reporting "0 checked".
if VALIDATE_IMAGES and not PIL_AVAILABLE:
    print("[WARNING] Pillow is not installed - image validation skipped")

# Validate the first VALIDATION_SAMPLE training images, or all of them when
# the sample size is falsy or not smaller than the number of images.
if VALIDATION_SAMPLE and VALIDATION_SAMPLE < len(train_imgs):
    sample_images = train_imgs[:VALIDATION_SAMPLE]
    print(f"Validating {VALIDATION_SAMPLE} sample files...")
else:
    sample_images = train_imgs
    print(f"Validating all {len(train_imgs):,} files...")

from tqdm import tqdm

_xml_dir = DATASET_DIR / 'train/annotations/xmls'

for img_path in tqdm(sample_images, desc="  Progress"):
    # Validate image
    if VALIDATE_IMAGES and PIL_AVAILABLE:
        validation_results['images_checked'] += 1
        try:
            with Image.open(img_path) as img:
                img.verify()
        except Exception as e:
            # Broad on purpose: any failure to open/verify marks the image bad.
            validation_results['images_invalid'] += 1
            validation_results['errors'].append(
                f"Image error: {img_path.name} - {str(e)[:50]}")
        else:
            validation_results['images_valid'] += 1

    if not VALIDATE_XMLS:
        continue

    # Validate corresponding XML (same stem as the image)
    xml_path = _xml_dir / f"{img_path.stem}.xml"

    if not xml_path.exists():
        # A missing annotation is reported rather than skipped silently.
        validation_results['xmls_missing'] += 1
        validation_results['errors'].append(f"XML missing: {xml_path.name}")
        continue

    validation_results['xmls_checked'] += 1
    try:
        root = ET.parse(xml_path).getroot()
        if root.find('filename') is None:
            raise ValueError("Missing filename tag")
    except (ET.ParseError, OSError, ValueError) as e:
        validation_results['xmls_invalid'] += 1
        error_msg = str(e)[:50]
        validation_results['errors'].append(
            f"XML error: {xml_path.name} - {error_msg}")
    else:
        validation_results['xmls_valid'] += 1

print("\nValidation Results:")
print(f"  Images checked: {validation_results['images_checked']:,}")
print(f"  Images valid: {validation_results['images_valid']:,}")
print(f"  Images invalid: {validation_results['images_invalid']:,}")
print(f"  XMLs checked: {validation_results['xmls_checked']:,}")
print(f"  XMLs valid: {validation_results['xmls_valid']:,}")
print(f"  XMLs invalid: {validation_results['xmls_invalid']:,}")
print(f"  XMLs missing: {validation_results['xmls_missing']:,}")

if validation_results['errors']:
    print(f"\n[WARNING] Found {len(validation_results['errors'])} errors")
    for error in validation_results['errors'][:5]:
        print(f"  {error}")
    if len(validation_results['errors']) > 5:
        print(f"  ... and {len(validation_results['errors']) - 5} more")

# Only claim success when something was actually checked and nothing failed.
if not (validation_results['images_checked'] or validation_results['xmls_checked']):
    print("\n[WARNING] No files were validated")
elif not validation_results['errors']:
    print("\n[OK] All validated files are valid")

print("=" * 60)

In [None]:
# ============================================
# GENERATE STATISTICS WITH CLASS GROUPING
# ============================================

if GENERATE_STATS:
    print("\n[STEP 5/6] GENERATING STATISTICS")
    print("-" * 60)

    # Report skeleton; the distribution fields are filled in after the scan.
    stats = {
        'dataset': 'RDD2022 India',
        'verification_date': datetime.now().isoformat(),
        'file_counts': file_counts,
        'total_images': total_images,
        'total_annotations': total_annotations,
        'validation_results': validation_results,
        'class_mapping': CLASS_MAPPING,
        'grouped_classes': GROUPED_CLASSES,
        'original_class_distribution': {},
        'grouped_class_distribution': {},
        'total_objects': 0,
        'negative_samples': 0,
        'unknown_classes': [],
        'annotation_parse_errors': []
    }

    print("Analyzing annotations...")

    original_counts = Counter()
    grouped_counts = Counter()
    total_objects = 0
    negative_count = 0
    unknown_classes = set()
    parse_errors = []

    for xml_path in tqdm(train_xmls, desc="  Progress"):
        # Read every label of the file before touching the counters, so a
        # file that fails half-way is never partially counted.
        try:
            root = ET.parse(xml_path).getroot()
            labels = []
            for obj in root.findall('object'):
                name_tag = obj.find('name')
                label = (name_tag.text or '').strip() if name_tag is not None else ''
                if not label:
                    raise ValueError("object without a class name")
                labels.append(label)
        except (ET.ParseError, OSError, ValueError) as e:
            # Record the failure instead of dropping the file silently.
            parse_errors.append(f"{xml_path.name}: {str(e)[:50]}")
            continue

        # An annotation without any <object> is a negative sample.
        if not labels:
            negative_count += 1
            continue

        original_counts.update(labels)
        total_objects += len(labels)

        # Map to grouped class; labels outside CLASS_MAPPING are reported.
        for label in labels:
            grouped_class = CLASS_MAPPING.get(label)
            if grouped_class is None:
                unknown_classes.add(label)
            else:
                grouped_counts[grouped_class] += 1

    stats['original_class_distribution'] = dict(original_counts)
    stats['grouped_class_distribution'] = dict(grouped_counts)
    stats['total_objects'] = total_objects
    stats['negative_samples'] = negative_count
    # Sorted so the saved report is identical between runs on the same data.
    stats['unknown_classes'] = sorted(unknown_classes)
    stats['annotation_parse_errors'] = parse_errors

    # Print statistics
    print("\nDataset Statistics:")
    print(f"  Total images: {total_images:,}")
    print(f"  Total objects: {total_objects:,}")
    print(f"  Negative samples: {negative_count:,}")

    print("\nOriginal Class Distribution:")
    for cls in sorted(original_counts):
        count = original_counts[cls]
        pct = (count / total_objects * 100) if total_objects > 0 else 0
        print(f"  {cls}: {count:>6,} ({pct:5.1f}%)")

    print("\nGrouped Class Distribution:")
    for cls in GROUPED_CLASSES:
        count = grouped_counts.get(cls, 0)
        pct = (count / total_objects * 100) if total_objects > 0 else 0
        print(f"  {cls:15s}: {count:>6,} ({pct:5.1f}%)")

    if unknown_classes:
        print(f"\n[WARNING] Unknown classes found: {sorted(unknown_classes)}")

    if parse_errors:
        print(f"\n[WARNING] {len(parse_errors)} annotation files could not be analyzed")
        for error in parse_errors[:5]:
            print(f"  {error}")
        if len(parse_errors) > 5:
            print(f"  ... and {len(parse_errors) - 5} more")

    # Save statistics; parents=True because RESULTS_DIR may come from
    # config.json and its parent folders are not guaranteed to exist.
    stats_file = RESULTS_DIR / 'dataset_verification.json'
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    print(f"\n[OK] Statistics saved: {stats_file.name}")
    print("=" * 60)
else:
    print("\n[STEP 5/6] STATISTICS")
    print("-" * 60)
    print("[SKIP] Statistics generation disabled")
    print("=" * 60)

In [None]:
# ============================================
# SAMPLE XML INSPECTION
# ============================================

print("\n[STEP 6/6] SAMPLE INSPECTION")
print("-" * 60)


def _tag_text(node, tag, default=None):
    """Return the stripped text of child *tag* of *node*, or *default* if absent or empty."""
    child = node.find(tag) if node is not None else None
    if child is None or child.text is None:
        return default
    return child.text.strip() or default


def _tag_int(node, tag):
    """Return child *tag* of *node* as an int, or None if missing or non-numeric.

    Goes through float() because int() rejects strings such as '12.0'.
    """
    raw = _tag_text(node, tag)
    if raw is None:
        return None
    try:
        return int(float(raw))
    except ValueError:
        return None


# Inspect the first annotation file, if there is one and it can be parsed.
sample_xml = train_xmls[0] if train_xmls else None
root = None
if sample_xml is None:
    print("[WARNING] No training annotations found - nothing to inspect")
else:
    try:
        root = ET.parse(sample_xml).getroot()
    except (ET.ParseError, OSError) as e:
        print(f"[WARNING] Could not parse {sample_xml.name}: {e}")

if root is not None:
    filename = _tag_text(root, 'filename', 'unknown')
    size = root.find('size')
    width = _tag_int(size, 'width')
    height = _tag_int(size, 'height')
    objects = root.findall('object')

    print(f"\nSample file: {sample_xml.name}")
    print(f"  Filename: {filename}")
    if width is None or height is None:
        print("  Size: unknown")
    else:
        print(f"  Size: {width}x{height}")
    print(f"  Objects: {len(objects)}")

    if objects:
        print("\n  Sample annotations:")
        # Show at most the first three objects of the file.
        for i, obj in enumerate(objects[:3], 1):
            name = _tag_text(obj, 'name', 'unknown')
            grouped = CLASS_MAPPING.get(name, 'unknown')
            bbox = obj.find('bndbox')
            xmin, ymin, xmax, ymax = (
                _tag_int(bbox, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )

            print(f"    {i}. Original: {name} → Grouped: {grouped}")
            if None in (xmin, ymin, xmax, ymax):
                print("       BBox: missing or malformed")
                continue
            print(f"       BBox: ({xmin}, {ymin}) to ({xmax}, {ymax})")
            print(f"       Size: {xmax-xmin}x{ymax-ymin}")

    print("\n[OK] Sample inspection complete")

print("=" * 60)

In [None]:
# ============================================
# FINAL SUMMARY
# ============================================

print("\n" + "="*60)
print("DATASET PREPARATION - COMPLETE")
print("="*60)

print("\nDataset Overview:")
print(f"   Location: {DATASET_DIR}")
if GENERATE_STATS:
    print(f"   Total Images: {stats['total_images']}")
    print(f"   Total Objects: {stats['total_objects']}")
    print(f"   Negative Samples: {stats['negative_samples']}")
else:
    # `stats` is only built when statistics are enabled; fall back to the
    # file count and say explicitly what was not computed.
    print(f"   Total Images: {total_images}")
    print("   Total Objects: not computed (statistics disabled)")
    print("   Negative Samples: not computed (statistics disabled)")

# Counts are derived from the configuration so the headings cannot drift
# from the lists printed beneath them.
print(f"\nOriginal Classes ({len(CLASS_MAPPING)} types):")
for cls in sorted(CLASS_MAPPING):
    print(f"   • {cls}")

print(f"\nGrouped Classes ({len(GROUPED_CLASSES)} types):")
for cls in GROUPED_CLASSES:
    print(f"   • {cls}")

print("\nVerification Status:")
print("   ✓ Directory structure verified")
print("   ✓ File counts confirmed")
print("   ✓ Annotation validation complete")
if GENERATE_STATS:
    print("   ✓ Statistics generated and saved")
else:
    print("   - Statistics generation skipped")
print("   ✓ Sample inspection verified")

print("\nOutput Files:")
if GENERATE_STATS:
    print(f"   • {RESULTS_DIR / 'dataset_verification.json'}")
else:
    print("   (none - statistics generation disabled)")

print("\n" + "="*60)
print("NEXT STEPS - Week 2: Exploratory Data Analysis")
print("="*60)
print("")
print("Coming up in 02_explore_data.ipynb:")
print("   1. Visualize class distributions")
print("   2. Analyze image properties (size, aspect ratio)")
print("   3. Examine bounding box statistics")
print("   4. Check for class imbalances")
print("   5. Generate sample visualizations")
print("   6. Create comprehensive EDA report")
print("")
print("The dataset is ready for analysis!")
print("="*60)