# RDD2022 India Dataset - Verification

**Week 1: Dataset Preparation and Verification**

**Objectives:**
1. Verify dataset structure
2. Count and validate files
3. Check XML annotation format
4. Generate dataset statistics
5. Prepare for Week 2 EDA

**Dataset Information:**
- Source: RDD2022 Competition
- Region: India
- Format: Pascal VOC XML annotations
- Classes: D00, D10, D20, D40

**Note:** Test annotations are not provided (standard for competitions)

In [ ]:
# ============================================
# CELL 1: IMPORTS AND SETUP
# ============================================
# Standard-library modules used throughout the notebook.
import os
import sys
import json
import logging
from pathlib import Path
from datetime import datetime
import xml.etree.ElementTree as ET

# Pillow is optional: later cells consult PIL_AVAILABLE before touching Image.
try:
    from PIL import Image
except ImportError:
    PIL_AVAILABLE = False
else:
    PIL_AVAILABLE = True

# Banner with the time at which the notebook run started.
_rule = "=" * 60
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(
    _rule,
    "  WEEK 1: DATASET VERIFICATION",
    _rule,
    f"  Date: {_started_at}",
    _rule,
    sep="\n",
)

In [ ]:
# ============================================
# CELL 2: CONFIGURATION
# ============================================
# Validation options
VALIDATE_IMAGES = True      # Check image file integrity
VALIDATE_XMLS = True        # Verify XML structure
VALIDATION_SAMPLE = 100     # Number of files to validate (None = all)
GENERATE_STATS = True       # Create statistics JSON

# Expected dataset properties
EXPECTED_CLASSES = ['D00', 'D10', 'D20', 'D40']

# Echo the active settings; a falsy sample size is shown as "All files".
_sample_label = VALIDATION_SAMPLE or 'All files'
print(
    "\nConfiguration:",
    "-" * 60,
    f"  Image validation: {VALIDATE_IMAGES}",
    f"  XML validation: {VALIDATE_XMLS}",
    f"  Sample size: {_sample_label}",
    f"  Generate statistics: {GENERATE_STATS}",
    "=" * 60,
    sep="\n",
)

In [ ]:
# ============================================
# CELL 3: MOUNT DRIVE AND LOAD CONFIG
# ============================================
print("\n[STEP 1/6] SETUP")
print("-" * 60)

# Mount Google Drive so the project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Load project configuration
PROJECT_ROOT = Path("/content/drive/MyDrive/ML/road-damage-detection")
config_file = PROJECT_ROOT / "config.json"

if not config_file.exists():
    # No config file: fall back to the default paths below.
    print("[WARNING] No config.json found")
    config = {}
else:
    with config_file.open('r') as handle:
        config = json.load(handle)
    print("[OK] Configuration loaded")

# Set paths (config values win; otherwise use locations under PROJECT_ROOT)
_default_dataset = str(PROJECT_ROOT / 'datasets/rdd2022/India')
_default_results = str(PROJECT_ROOT / 'results')
DATASET_DIR = Path(config.get('dataset_raw', _default_dataset))
RESULTS_DIR = Path(config.get('results_dir', _default_results))

# Change to notebooks directory
os.chdir(PROJECT_ROOT / 'notebooks')

print(f"[OK] Project root: {PROJECT_ROOT.name}")
print(f"[OK] Dataset: {DATASET_DIR}")
print(f"[OK] Working directory: {os.getcwd()}")
print("=" * 60)

In [ ]:
# ============================================
# CELL 4: VERIFY DIRECTORY STRUCTURE
# ============================================
print("\n[STEP 2/6] VERIFYING STRUCTURE")
print("-" * 60)

# Directories that must be present for the rest of the notebook to work.
required_paths = {
    'train_images': DATASET_DIR / 'train/images',
    'train_annotations': DATASET_DIR / 'train/annotations/xmls',
    'test_images': DATASET_DIR / 'test/images',
}

structure_info = {}
_missing = []
for label, directory in required_paths.items():
    structure_info[label] = {'exists': directory.exists(), 'path': str(directory)}

    # is_dir() is False for a missing path, so it covers both conditions.
    if directory.is_dir():
        print(f"  [OK] {label}")
        continue

    print(f"  [ERROR] {label} - NOT FOUND")
    _missing.append(label)

all_exist = not _missing

# Test annotations are typically not provided
test_annotations = DATASET_DIR / 'test/annotations/xmls'
if not test_annotations.exists():
    print("  [INFO] test_annotations - Not provided (expected)")

if not all_exist:
    print("\n[ERROR] Required directories missing")
    raise FileNotFoundError("Dataset structure incomplete")

print("\n[OK] Directory structure verified")
print("=" * 60)

In [ ]:
# ============================================
# CELL 5: COUNT FILES
# ============================================
# Counts training/test images and training annotations, and checks that
# images and annotations pair up by file stem.
print("\n[STEP 3/6] COUNTING FILES")
print("-" * 60)

file_counts = {}

# Count training files.
# Sorted so that "first N" sampling and "first XML" inspection in later cells
# pick the same files on every run; glob order is not guaranteed.
train_imgs = sorted((DATASET_DIR / 'train/images').glob('*.jpg'))
train_xmls = sorted((DATASET_DIR / 'train/annotations/xmls').glob('*.xml'))

file_counts['train'] = {
    'images': len(train_imgs),
    'annotations': len(train_xmls)
}

print("\nTRAIN:")
print(f"  Images:      {len(train_imgs):>6,}")
print(f"  Annotations: {len(train_xmls):>6,}")

if len(train_imgs) != len(train_xmls):
    diff = abs(len(train_imgs) - len(train_xmls))
    print(f"  [WARNING] Mismatch: {diff} files")

# Equal counts do not prove that files pair up, so compare stems as well.
_image_stems = {p.stem for p in train_imgs}
_xml_stems = {p.stem for p in train_xmls}
_images_without_xml = _image_stems - _xml_stems
_xmls_without_image = _xml_stems - _image_stems
if _images_without_xml:
    print(f"  [WARNING] Images without annotation: {len(_images_without_xml):,}")
if _xmls_without_image:
    print(f"  [WARNING] Annotations without image: {len(_xmls_without_image):,}")

# Count test files
test_imgs = sorted((DATASET_DIR / 'test/images').glob('*.jpg'))

file_counts['test'] = {
    'images': len(test_imgs),
    'annotations': 0  # Not provided
}

print("\nTEST:")
print(f"  Images:      {len(test_imgs):>6,}")
print("  Annotations: Not provided")

# Total
total_images = len(train_imgs) + len(test_imgs)
total_annotations = len(train_xmls)

print("\nTOTAL:")
print(f"  Images:      {total_images:>6,}")
print(f"  Annotations: {total_annotations:>6,}")

if total_images == 0:
    print("\n[ERROR] No images found")
    raise ValueError("Dataset appears empty")

print("\n[OK] File counting complete")
print("=" * 60)

In [ ]:
# ============================================
# CELL 6: VALIDATE SAMPLE FILES
# ============================================
# Checks a sample of training images (via Pillow) and their matching XML
# annotations, collecting counts and error messages in validation_results.
print("\n[STEP 4/6] VALIDATING SAMPLES")
print("-" * 60)

validation_results = {
    'images_checked': 0,
    'images_valid': 0,
    'images_invalid': 0,
    'xmls_checked': 0,
    'xmls_valid': 0,
    'xmls_invalid': 0,
    'xmls_missing': 0,   # sampled images that have no annotation file
    'errors': []
}

# Make a skipped image check visible instead of reporting zero silently.
if VALIDATE_IMAGES and not PIL_AVAILABLE:
    print("[WARNING] Pillow is not installed - image validation skipped")

# Sample images for validation
if VALIDATION_SAMPLE and VALIDATION_SAMPLE < len(train_imgs):
    sample_images = train_imgs[:VALIDATION_SAMPLE]
    print(f"Validating {VALIDATION_SAMPLE} sample files...")
else:
    sample_images = train_imgs
    print(f"Validating all {len(train_imgs):,} files...")

from tqdm import tqdm

# Loop-invariant: every annotation lives in the same directory.
_xml_dir = DATASET_DIR / 'train/annotations/xmls'

for img_path in tqdm(sample_images, desc="  Progress"):
    # Validate image
    if VALIDATE_IMAGES and PIL_AVAILABLE:
        try:
            with Image.open(img_path) as img:
                img.verify()
            validation_results['images_valid'] += 1
        except Exception:
            # Deliberately broad: any failure to open/verify marks the image
            # invalid, and the failure is recorded rather than swallowed.
            validation_results['images_invalid'] += 1
            validation_results['errors'].append(f"Image error: {img_path.name}")

        validation_results['images_checked'] += 1

    # Validate corresponding XML
    if not VALIDATE_XMLS:
        continue

    xml_path = _xml_dir / f"{img_path.stem}.xml"

    if not xml_path.exists():
        # A missing annotation is a finding, not something to skip silently.
        validation_results['xmls_missing'] += 1
        validation_results['errors'].append(f"XML missing: {xml_path.name}")
        continue

    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Basic validation
        filename = root.find('filename')
        if filename is None:
            raise ValueError("Missing filename tag")

        validation_results['xmls_valid'] += 1

    except Exception as e:
        validation_results['xmls_invalid'] += 1
        error_msg = str(e)[:50]
        validation_results['errors'].append(f"XML error: {xml_path.name} - {error_msg}")

    validation_results['xmls_checked'] += 1

# Print results
print("\nValidation Results:")
print(f"  Images checked: {validation_results['images_checked']:,}")
print(f"  Images valid: {validation_results['images_valid']:,}")
print(f"  Images invalid: {validation_results['images_invalid']:,}")
print(f"  XMLs checked: {validation_results['xmls_checked']:,}")
print(f"  XMLs valid: {validation_results['xmls_valid']:,}")
print(f"  XMLs invalid: {validation_results['xmls_invalid']:,}")
print(f"  XMLs missing: {validation_results['xmls_missing']:,}")

if validation_results['errors']:
    print(f"\n[WARNING] Found {len(validation_results['errors'])} errors")
    if len(validation_results['errors']) <= 5:
        for error in validation_results['errors']:
            print(f"  {error}")
    else:
        print("  First 5 errors:")
        for error in validation_results['errors'][:5]:
            print(f"    {error}")
        print(f"  ... and {len(validation_results['errors']) - 5} more")

# Only claim success when nothing was invalid and nothing was missing.
if (
    validation_results['images_invalid'] == 0
    and validation_results['xmls_invalid'] == 0
    and validation_results['xmls_missing'] == 0
):
    print("\n[OK] All validated files are valid")

print("=" * 60)

In [ ]:
# ============================================
# CELL 7: GENERATE STATISTICS (WITH CACHING)
# ============================================
# Loads class statistics from RESULTS_DIR/statistics_cache.json when present;
# otherwise parses every training XML, counts objects per expected class,
# writes the cache, and finally writes dataset_verification.json.
if GENERATE_STATS:
    print("\n[STEP 5/6] GENERATING STATISTICS")
    print("-" * 60)

    # Imported here so this cell does not rely on the validation cell
    # having been run first.
    from tqdm import tqdm

    # Check if cached statistics exist
    stats_cache_file = RESULTS_DIR / 'statistics_cache.json'

    if stats_cache_file.exists():
        print("[FOUND] Cached statistics exist")
        print(f"  Location: {stats_cache_file.name}")
        print("  Loading from cache (fast)...")
        print()
        print("  To regenerate statistics:")
        print(f"    Delete: {stats_cache_file}")
        print()

        with open(stats_cache_file, 'r') as f:
            stats = json.load(f)

        # Display cached statistics
        print("Cached Dataset Statistics:")
        print(f"  Analysis date: {stats.get('verification_date', 'Unknown')}")
        print(f"  Total images: {stats.get('total_images', 0):,}")
        print(f"  Annotations analyzed: {stats.get('total_annotations', 0):,}")
        print(f"  Total objects: {stats.get('total_objects', 0):,}")
        print(f"  Negative samples: {stats.get('negative_samples', 0):,}")

        if 'class_distribution' in stats:
            print("\nClass Distribution:")
            class_dist = stats['class_distribution']
            total_obj = stats.get('total_objects', 0)
            for cls in EXPECTED_CLASSES:
                count = class_dist.get(cls, 0)
                pct = (count / total_obj * 100) if total_obj > 0 else 0
                print(f"  {cls}: {count:>6,} ({pct:5.1f}%)")

        # Flag a cache that no longer matches the annotations on disk.
        if stats.get('total_annotations') != len(train_xmls):
            print("\n[WARNING] Cache may be stale:")
            print(f"  Cached annotations:  {stats.get('total_annotations', 'Unknown')}")
            print(f"  Annotations on disk: {len(train_xmls):,}")
            print(f"  Delete {stats_cache_file.name} to regenerate")

        print("\n[OK] Statistics loaded from cache (instant)")

    else:
        print("[NOT FOUND] No cached statistics")
        print(f"  Will analyze all {len(train_xmls):,} annotations")
        print("  This will take 8-12 minutes (one time only)")
        print("  Results will be cached for future runs")
        print()

        # Create fresh statistics
        stats = {
            'dataset': 'RDD2022 India',
            'verification_date': datetime.now().isoformat(),
            'file_counts': file_counts,
            'total_images': total_images,
            'total_annotations': total_annotations,
            'validation_results': validation_results,
            'class_distribution': {},
            'total_objects': 0,
            'negative_samples': 0,
            'parse_failures': 0,
            'cache_note': 'Complete analysis of all annotations'
        }

        # Analyze ALL XMLs
        class_counts = {cls: 0 for cls in EXPECTED_CLASSES}
        total_objects = 0
        negative_count = 0
        failed_files = []  # names of annotation files that could not be read

        print("Analyzing all annotations...")
        for xml_path in tqdm(train_xmls, desc="  Progress"):
            try:
                root = ET.parse(xml_path).getroot()
            except Exception:
                # Best-effort scan: one unreadable file must not abort the
                # run, but it is recorded and reported below.
                failed_files.append(xml_path.name)
                continue

            objects = root.findall('object')
            if not objects:
                negative_count += 1
                continue

            for obj in objects:
                # findtext returns None when <name> is absent, so a malformed
                # object is skipped without discarding the rest of the file.
                damage_type = obj.findtext('name')
                if damage_type in class_counts:
                    class_counts[damage_type] += 1
                    total_objects += 1

        stats['class_distribution'] = class_counts
        stats['total_objects'] = total_objects
        stats['negative_samples'] = negative_count
        stats['parse_failures'] = len(failed_files)

        # Save cache for future runs (create missing parent folders too)
        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
        with open(stats_cache_file, 'w') as f:
            json.dump(stats, f, indent=2)

        print("\n[OK] Statistics computed and cached")
        print(f"[SAVED] Cache: {stats_cache_file.name}")

        if failed_files:
            print(f"[WARNING] {len(failed_files):,} annotation files could not be parsed")
            for failed_name in failed_files[:5]:
                print(f"    {failed_name}")

        # Print statistics
        print("\nDataset Statistics:")
        print(f"  Total images: {total_images:,}")
        print(f"  Total objects: {total_objects:,}")
        print(f"  Negative samples: {negative_count:,}")

        print("\nClass Distribution:")
        for cls in EXPECTED_CLASSES:
            count = class_counts[cls]
            pct = (count / total_objects * 100) if total_objects > 0 else 0
            print(f"  {cls}: {count:>6,} ({pct:5.1f}%)")

    # Also save to main verification file (always)
    stats_file = RESULTS_DIR / 'dataset_verification.json'
    with open(stats_file, 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n[OK] Verification report: {stats_file.name}")
    print("=" * 60)
else:
    print("\n[STEP 5/6] STATISTICS")
    print("-" * 60)
    print("[SKIP] Statistics generation disabled")
    print("=" * 60)

In [ ]:
# ============================================
# CELL 8: SAMPLE XML INSPECTION
# ============================================
# Prints the filename, image size and up to three bounding boxes from the
# first training annotation, tolerating missing elements.
print("\n[STEP 6/6] SAMPLE INSPECTION")
print("-" * 60)


def _to_int(text):
    """Convert an XML number string to int, accepting decimals like '12.0'."""
    return int(float(text))


if not train_xmls:
    # The file-counting step only rejects a dataset with no images, so the
    # annotation list can still be empty here.
    print("\n[SKIP] No training annotations available to inspect")
else:
    # Show sample XML structure
    sample_xml = train_xmls[0]
    print(f"\nSample file: {sample_xml.name}")

    tree = ET.parse(sample_xml)
    root = tree.getroot()

    # Basic info
    filename = root.findtext('filename', default='(missing)')
    print(f"  Filename: {filename}")

    size = root.find('size')
    try:
        width = _to_int(size.findtext('width'))
        height = _to_int(size.findtext('height'))
    except (AttributeError, TypeError, ValueError):
        # <size> absent, or width/height absent or non-numeric.
        print("  Size: (missing)")
    else:
        print(f"  Size: {width}x{height}")

    # Objects
    objects = root.findall('object')
    print(f"  Objects: {len(objects)}")

    if objects:
        print("\n  Sample annotations:")
        for i, obj in enumerate(objects[:3], 1):
            name = obj.findtext('name', default='(missing)')
            print(f"    {i}. Class: {name}")

            bbox = obj.find('bndbox')
            try:
                xmin = _to_int(bbox.findtext('xmin'))
                ymin = _to_int(bbox.findtext('ymin'))
                xmax = _to_int(bbox.findtext('xmax'))
                ymax = _to_int(bbox.findtext('ymax'))
            except (AttributeError, TypeError, ValueError):
                # <bndbox> absent, or a coordinate absent or non-numeric.
                print("       BBox: (missing or malformed)")
                continue

            print(f"       BBox: ({xmin}, {ymin}) to ({xmax}, {ymax})")
            print(f"       Size: {xmax-xmin}x{ymax-ymin}")

    print("\n[OK] Sample inspection complete")

print("=" * 60)

In [ ]:
# ============================================
# CELL 9: FINAL SUMMARY
# ============================================
# Collect the report line by line, then emit it in a single print call.
_rule = "=" * 60

_summary = [
    "\n" + _rule,
    "  DATASET VERIFICATION COMPLETE",
    _rule,
    "\nDataset Summary:",
    f"  Location: {DATASET_DIR}",
    f"  Training images: {len(train_imgs):,}",
    f"  Training annotations: {len(train_xmls):,}",
    f"  Test images: {len(test_imgs):,}",
    "  Format: Pascal VOC XML",
    "\nVerification Status:",
    "  [OK] Structure verified",
    "  [OK] Files counted",
    "  [OK] Samples validated",
]

# The statistics line appears only when that step was enabled.
if GENERATE_STATS:
    _summary.append("  [OK] Statistics generated")

_summary += [
    "\nWeek 1 Status:",
    "  [COMPLETE] Dataset preparation",
    "\nNext Steps:",
    "  1. Create 02_explore_data.ipynb",
    "  2. Begin Week 2: Exploratory Data Analysis",
    "  3. Parse all XMLs and create visualizations",
    "\n" + _rule,
]

print("\n".join(_summary))