In [3]:
import splitfolders

print("Starting stratified split...")

# splitfolders processes each class sub-directory of `input` separately, so the
# 80/20 ratio is applied per class (i.e. the split is stratified).
splitfolders.ratio(
    input="data_raw",
    output="dataset",    # Created automatically if it does not exist
    seed=42,             # Fixed seed -> the same files land in the same split on every run
    ratio=(0.8, 0.2),    # 80% train, 20% validation (two values -> no test split)
    group_prefix=None,
    # Copy instead of move: move=True takes the files out of data_raw, so the
    # cell could not be re-run (Restart & Run All) and the raw data would be gone.
    move=False,
)

print("Split complete! Check the new 'dataset' folder.")


Starting stratified split...
Split complete! Check the new 'dataset' folder.


In [None]:
import os
import shutil
import re

# Root of the split dataset. Resolved from the working directory so it points
# at the same relative "dataset" folder the splitfolders cell writes to, rather
# than at one user's absolute Desktop path.
BASE_DIR = os.path.abspath("dataset")

# Sub-folders of BASE_DIR that get consolidated.
SPLITS = ['train', 'val']

def clean_and_merge_directories(base_path, splits):
    """Merge numbered "angle" folders into one folder per building, in place.

    Inside every ``<base_path>/<split>`` directory, each sub-folder whose name
    ends in whitespace/underscores followed by digits (``"Hall_1"``,
    ``"Hall 2"``) is merged into the folder named by the part before that
    suffix (``"Hall"``). Files are moved, not copied, and renamed to
    ``"<source folder>_<filename>"`` so equally named files coming from
    different source folders cannot overwrite each other. Folders whose name
    is unchanged by the suffix removal are left untouched.

    A source folder is only deleted once it is empty. Folders that still hold
    sub-directories, and folders whose name consists of nothing but the
    numeric suffix, are reported with a warning and kept.

    Args:
        base_path: Directory that contains the split folders.
        splits: Iterable of split folder names, e.g. ``['train', 'val']``.
            Splits that are not existing directories are reported and skipped.

    Returns:
        None. Progress and per-split totals are printed.
    """
    print("Starting Directory Consolidation Protocol...\n")

    # Trailing "<spaces/underscores><digits>" marks an angle-folder.
    suffix_pattern = re.compile(r'[\s_]+[0-9]+$')

    for split in splits:
        split_dir = os.path.join(base_path, split)

        # isdir rather than exists: a stray *file* named like a split must not
        # reach os.listdir below.
        if not os.path.isdir(split_dir):
            print(f"Warning: Directory not found: {split_dir}")
            continue

        print(f"Scanning {split.upper()} directory: {split_dir}")
        # Snapshot the folder list up front so target folders created during
        # the merge are not revisited in this pass.
        current_folders = [
            f for f in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, f))
        ]

        folders_deleted = 0
        images_moved = 0

        for folder in current_folders:
            base_building_name = suffix_pattern.sub('', folder).strip()
            if base_building_name == folder:
                continue  # name unchanged -> nothing to merge

            if not base_building_name:
                # e.g. "_12": nothing is left after stripping, so the files
                # would land directly in the split root instead of a class folder.
                print(f"Warning: Skipping '{folder}': no building name left after removing the numeric suffix.")
                continue

            source_folder_path = os.path.join(split_dir, folder)
            target_folder_path = os.path.join(split_dir, base_building_name)

            os.makedirs(target_folder_path, exist_ok=True)

            for filename in os.listdir(source_folder_path):
                source_file = os.path.join(source_folder_path, filename)

                if os.path.isfile(source_file):
                    # Prefix with the source folder so names stay unique after merging.
                    safe_filename = f"{folder}_{filename}"
                    target_file = os.path.join(target_folder_path, safe_filename)

                    shutil.move(source_file, target_file)
                    images_moved += 1

            # Only regular files were moved. os.rmdir raises on a non-empty
            # folder, which would abort the whole run half-way through.
            if os.listdir(source_folder_path):
                print(f"Warning: '{folder}' still contains non-file entries; folder kept.")
                continue

            os.rmdir(source_folder_path)
            folders_deleted += 1

        print(f"-> Result for {split.upper()}: Moved {images_moved} images and consolidated {folders_deleted} angle-folders.\n")
    print("Data consolidation complete. Your dataset is now ready for production training.")

# Consolidate every split listed in SPLITS, in place under BASE_DIR.
clean_and_merge_directories(base_path=BASE_DIR, splits=SPLITS)

Starting Directory Consolidation Protocol...

Scanning TRAIN directory: C:\Users\Shahbaz\Desktop\dl\dataset\train
-> Result for TRAIN: Moved 1487 images and consolidated 16 angle-folders.

Scanning VAL directory: C:\Users\Shahbaz\Desktop\dl\dataset\val
-> Result for VAL: Moved 380 images and consolidated 16 angle-folders.

Data consolidation complete. Your dataset is now ready for production training.


In [4]:
import os

# Resolve the split folders from the working directory (the same relative
# "dataset" folder the splitfolders cell writes to) instead of a user-specific
# absolute path.
dataset_dir = os.path.abspath("dataset")
train_dir = os.path.join(dataset_dir, "train")
val_dir = os.path.join(dataset_dir, "val")


def count_files(root_dir):
    """Return the number of files under ``root_dir``, including all sub-folders.

    Every file is counted regardless of its extension, so non-image files are
    included in the total. os.walk ignores listing errors by default, which
    means a missing directory yields 0 rather than an exception.
    """
    return sum(len(files) for _, _, files in os.walk(root_dir))


train_total = count_files(train_dir)
val_total = count_files(val_dir)

print(f"Total Train Images: {train_total}")
print(f"Total Val Images:   {val_total}")

Total Train Images: 2734
Total Val Images:   688
