In [3]:
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split

def create_directory_structure(base_path):
    """Make sure the per-split output folders exist under ``base_path``.

    For each split (``train``, ``test``, ``val``) an ``images`` and a
    ``masks`` folder is created, i.e. ``<base_path>/<split>/<kind>``.
    Folders that already exist are left untouched.
    """
    target_dirs = [
        os.path.join(base_path, split_name, kind)
        for split_name in ('train', 'test', 'val')
        for kind in ('images', 'masks')
    ]

    for target in target_dirs:
        # exist_ok=True keeps repeated runs from failing on existing folders.
        os.makedirs(target, exist_ok=True)

def copy_files(image_name, src_image_path, src_mask_path, dest_base_path, split):
    """Copy one image and its matching mask into a split's output folders.

    The image is read from ``<src_image_path>/<image_name>.bmp``. The mask
    file name is built from the second underscore-separated field of
    ``image_name`` as ``mask_<field>.jpg`` and read from ``src_mask_path``.
    Both files keep their names and are written to
    ``<dest_base_path>/<split>/images`` and ``<dest_base_path>/<split>/masks``,
    which must already exist.

    Args:
        image_name: Image name without extension.
        src_image_path: Folder containing the source ``.bmp`` images.
        src_mask_path: Folder containing the source ``.jpg`` masks.
        dest_base_path: Root folder of the organized dataset.
        split: Name of the split sub-folder to copy into.

    Returns:
        True if both files were found and copied. False, with nothing
        copied, if ``image_name`` has no second underscore-separated field
        or if either source file does not exist.
    """
    name_fields = image_name.split('_')
    if len(name_fields) < 2:
        # No mask file name can be derived; report failure like a missing
        # file instead of raising IndexError and aborting the whole run.
        return False

    # Derive each file name once so source and destination cannot diverge.
    image_file = f"{image_name}.bmp"
    mask_file = f"mask_{name_fields[1]}.jpg"

    image_src = os.path.join(src_image_path, image_file)
    mask_src = os.path.join(src_mask_path, mask_file)

    # Only copy complete pairs: an image without its mask is skipped.
    if not (os.path.exists(image_src) and os.path.exists(mask_src)):
        return False

    shutil.copy2(image_src, os.path.join(dest_base_path, split, 'images', image_file))
    shutil.copy2(mask_src, os.path.join(dest_base_path, split, 'masks', mask_file))
    return True

def organize_dataset(excel_path, src_image_path, src_mask_path, dest_base_path, num_images=250):
    """Select caries-only images from the annotation sheet and split them on disk.

    Rows of the Excel sheet are kept when ``Caries == 1`` and ``Abscess``,
    ``Cyst`` and ``Granuloma`` are all ``0``. At most the first
    ``num_images`` of those rows (in sheet order) are used. Their
    ``image_name`` values are split into train/val/test sets, the output
    folders are created, and every image/mask pair is copied with
    ``copy_files``. A summary of the copied counts is printed.

    The split is computed before any file is looked up, so an image whose
    files cannot be copied is skipped and the copied counts may be smaller
    than the split sizes.

    Args:
        excel_path: Path of the annotation Excel file.
        src_image_path: Folder containing the source images.
        src_mask_path: Folder containing the source masks.
        dest_base_path: Root folder in which the split folders are created.
        num_images: Maximum number of caries-only images to use.
    """
    df = pd.read_excel(excel_path)

    # Boolean row mask: caries present and none of the other findings.
    caries_only = (
        (df['Caries'] == 1)
        & (df['Abscess'] == 0)
        & (df['Cyst'] == 0)
        & (df['Granuloma'] == 0)
    )

    # Slicing is a no-op when fewer than num_images rows matched, so no
    # explicit length check is needed.
    selected_images = df.loc[caries_only, 'image_name'].tolist()[:num_images]

    # Create train/val/test splits (70%/15%/15%): 30% is held out first and
    # then halved. The fixed random_state keeps the split reproducible.
    train_images, holdout_images = train_test_split(
        selected_images, test_size=0.3, random_state=42
    )
    val_images, test_images = train_test_split(
        holdout_images, test_size=0.5, random_state=42
    )

    create_directory_structure(dest_base_path)

    # Insertion order here is also the order of the summary lines below.
    splits = {
        'train': train_images,
        'test': test_images,
        'val': val_images,
    }

    # copy_files returns True/False, so summing counts the successful copies.
    copied_files = {
        split: sum(
            copy_files(image_name, src_image_path, src_mask_path, dest_base_path, split)
            for image_name in images
        )
        for split, images in splits.items()
    }

    total_copied = sum(copied_files.values())
    skipped = len(selected_images) - total_copied

    # Print summary
    print("\nDataset Organization Summary:")
    print(f"Total images processed: {total_copied}")
    for split, count in copied_files.items():
        print(f"{split.capitalize()} set: {count} images")
    if skipped:
        # Surface pairs that were dropped instead of hiding them.
        print(f"Skipped (image or mask could not be copied): {skipped} images")

# Example usage: organize the caries dataset using paths relative to the
# current working directory.
if __name__ == "__main__":
    excel_path = 'DentalAnnotation.xlsx'  # annotation sheet
    src_image_path = 'caries'             # folder containing original images
    src_mask_path = 'mask'                # folder containing original masks
    dest_base_path = 'dataset'            # where to create the organized dataset

    organize_dataset(
        excel_path=excel_path,
        src_image_path=src_image_path,
        src_mask_path=src_mask_path,
        dest_base_path=dest_base_path,
    )


Dataset Organization Summary:
Total images processed: 233
Train set: 158 images
Test set: 38 images
Val set: 37 images
