# Dataset Splitting

This notebook splits the processed dataset into train/val/test sets with the following fixed image counts (2,003 images total):
- Train: 1645 images
- Val: 151 images
- Test: 207 images

In [1]:
import os
import shutil
import random
from pathlib import Path
from tqdm import tqdm

In [2]:
# Dataset locations: inputs are read from PROCESSED_PATH, outputs are written under OUTPUT_PATH
PROCESSED_PATH = Path('../processed_dataset')
OUTPUT_PATH = Path('../fire_detection_dataset')

# Output layout: one folder per split, each holding images, masks, and labels
splits = ['train', 'val', 'test']
subdirs = ['images', 'masks', 'labels']

# Build every <split>/<subdir> folder; re-running is safe because existing folders are kept
for split in splits:
    split_root = OUTPUT_PATH / split
    for subdir in subdirs:
        split_root.joinpath(subdir).mkdir(parents=True, exist_ok=True)

In [5]:
def split_dataset(train_size=1645, val_size=151, test_size=207, seed=42):
    """Split the processed dataset into train/val/test sets.

    Every ``*.jpg`` under ``PROCESSED_PATH / 'images'`` is collected, shuffled,
    and partitioned into three disjoint groups. Each selected image is copied
    together with its mask (``masks/<stem>.png``) and label
    (``labels/<stem>.txt``) into ``OUTPUT_PATH / <split> / <subdir>``.
    Images beyond ``train_size + val_size + test_size`` are left out.

    Args:
        train_size: Number of images placed in the train split.
        val_size: Number of images placed in the val split.
        test_size: Number of images placed in the test split.
        seed: Seed for the shuffle. With a fixed seed a re-run reproduces the
            same assignment and simply overwrites the same files; without one,
            a re-run would drop images into different splits next to the
            previous run's copies, leaking data between splits. Pass ``None``
            for a non-deterministic shuffle.

    Raises:
        ValueError: If a split size is negative, or fewer images are available
            than the three sizes require.
        FileNotFoundError: If any selected image has no mask or label file.
            Raised before anything is copied, so the output is never left
            half-populated.
    """
    sizes = {'train': train_size, 'val': val_size, 'test': test_size}
    negative = [name for name, size in sizes.items() if size < 0]
    if negative:
        raise ValueError(f"Split sizes must be non-negative, got negative size for: {negative}")

    # Sort first: glob order is filesystem-dependent, so a seeded shuffle is
    # only reproducible when it starts from a stable ordering.
    image_files = sorted((PROCESSED_PATH / 'images').glob('*.jpg'))

    required = sum(sizes.values())
    if len(image_files) < required:
        raise ValueError(
            f"Need {required} images for the requested splits but found only "
            f"{len(image_files)} in {PROCESSED_PATH / 'images'}"
        )

    # Private RNG so the notebook's global random state is left untouched.
    random.Random(seed).shuffle(image_files)

    # Carve consecutive, non-overlapping slices out of the shuffled list.
    split_mapping = {}
    start = 0
    for split, size in sizes.items():
        split_mapping[split] = image_files[start:start + size]
        start += size

    # Files that accompany every image: (subdirectory, file extension).
    companions = (('masks', '.png'), ('labels', '.txt'))

    # Verify all companion files up front instead of failing mid-copy.
    missing = [
        PROCESSED_PATH / subdir / f"{image_path.stem}{suffix}"
        for files in split_mapping.values()
        for image_path in files
        for subdir, suffix in companions
        if not (PROCESSED_PATH / subdir / f"{image_path.stem}{suffix}").is_file()
    ]
    if missing:
        preview = ', '.join(str(path) for path in missing[:5])
        raise FileNotFoundError(
            f"{len(missing)} mask/label file(s) missing, e.g.: {preview}"
        )

    # Copy files to respective directories
    for split, files in split_mapping.items():
        print(f"Processing {split} split...")
        split_root = OUTPUT_PATH / split
        for subdir in ('images', 'masks', 'labels'):
            # Harmless if the folders already exist; keeps the function self-contained.
            (split_root / subdir).mkdir(parents=True, exist_ok=True)

        for image_path in tqdm(files):
            shutil.copy2(image_path, split_root / 'images' / image_path.name)
            for subdir, suffix in companions:
                file_name = f"{image_path.stem}{suffix}"
                shutil.copy2(
                    PROCESSED_PATH / subdir / file_name,
                    split_root / subdir / file_name
                )

In [6]:
# Run the split: copies each selected image, together with its mask and label,
# from PROCESSED_PATH into OUTPUT_PATH/<split>/{images,masks,labels}.
split_dataset()

Processing train split...


100%|██████████| 1645/1645 [00:01<00:00, 936.89it/s]


Processing val split...


100%|██████████| 151/151 [00:00<00:00, 968.02it/s]


Processing test split...


100%|██████████| 207/207 [00:00<00:00, 1007.83it/s]


In [7]:
# Create YAML configuration file describing the dataset layout.
# resolve() is used instead of absolute() because absolute() keeps the '..'
# segment of OUTPUT_PATH verbatim; resolve() emits a normalized root path.
dataset_root = OUTPUT_PATH.resolve()

yaml_content = f"""path: {dataset_root}  # dataset root dir
train: train/images  # train images
val: val/images  # val images
test: test/images  # test images

# Classes
names:
    0: fire  # fire class
"""

# Explicit encoding so the file is written identically regardless of the
# platform's default locale.
with open(OUTPUT_PATH / 'dataset.yaml', 'w', encoding='utf-8') as f:
    f.write(yaml_content)