In [1]:
import os
import zipfile
import shutil  # For copying files


def create_small_dataset(original_dataset_path, output_path, size_per_class=4740):
    # Path for the consolidated dataset
    dataset_output_path = os.path.join(output_path, 'smaller_dataset')

    # Ensure output directory exists
    os.makedirs(dataset_output_path, exist_ok=True)
    print(f"Created output directory at: {dataset_output_path}")

    # List all class directories in both Train and Test
    class_directories = {}
    for subfolder in ['train', 'test']:
        subfolder_path = os.path.join(original_dataset_path, subfolder)
        if os.path.exists(subfolder_path):
            print(f"Scanning {subfolder_path} for class directories...")
            for class_name in os.listdir(subfolder_path):
                class_path = os.path.join(subfolder_path, class_name)
                if os.path.isdir(class_path):
                    if class_name not in class_directories:
                        class_directories[class_name] = []
                    class_directories[class_name].append(class_path)
            print(f"Found classes in {subfolder}: {list(class_directories.keys())}")

    # Process all classes
    print("Processing each class...")
    for class_name, paths in class_directories.items():
        print(f"Processing class: {class_name}")
        all_files = []
        for path in paths:
            print(f"Collecting files from: {path}")
            files = [os.path.join(path, file) for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
            all_files.extend(files)
        print(f"Total files found for class {class_name}: {len(all_files)}")

        # Shuffle and select up to size_per_class files
        selected_files = all_files[:min(len(all_files), size_per_class)]
        print(f"Selected {len(selected_files)} files for class {class_name}")

        # Create class directory in the consolidated folder
        class_output_path = os.path.join(dataset_output_path, class_name)
        os.makedirs(class_output_path, exist_ok=True)
        print(f"Created directory for class {class_name}: {class_output_path}")

        # Copy selected files, with progress logging
        for idx, file in enumerate(selected_files, 1):  # Add 1 to make it 1-indexed
            output_file_path = os.path.join(class_output_path, os.path.basename(file))
            shutil.copy(file, output_file_path)
            if idx % 100 == 0 or idx == len(selected_files):  # Log every 100 files or at the end
                print(f"Class {class_name}: {idx}/{len(selected_files)} files done")

        print(f"Finished processing class: {class_name}")

    # Create a zip file of the smaller dataset
    zip_file_path = os.path.join(output_path, 'smaller_dataset.zip')
    print(f"Creating zip file: {zip_file_path}")
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(dataset_output_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, dataset_output_path)
                zipf.write(file_path, arcname)
    print(f"Dataset successfully zipped at: {zip_file_path}")

    print(f"Completed dataset creation. Consolidated dataset is available at: {dataset_output_path}")


# Example usage
original_dataset_path = "/kaggle/input/combined-cropped-fer-dataset"  # Adjust this path
output_path = "/kaggle/working/"  # Adjust this path

create_small_dataset(original_dataset_path, output_path, size_per_class=6991)


Created output directory at: /kaggle/working/smaller_dataset
Scanning /kaggle/input/combined-cropped-fer-dataset/train for class directories...
Found classes in train: ['surprise', 'fear', 'angry', 'neutral', 'sad', 'happy']
Scanning /kaggle/input/combined-cropped-fer-dataset/test for class directories...
Found classes in test: ['surprise', 'fear', 'angry', 'neutral', 'sad', 'happy']
Processing each class...
Processing class: surprise
Collecting files from: /kaggle/input/combined-cropped-fer-dataset/train/surprise
Collecting files from: /kaggle/input/combined-cropped-fer-dataset/test/surprise
Total files found for class surprise: 7590
Selected 6991 files for class surprise
Created directory for class surprise: /kaggle/working/smaller_dataset/surprise
Class surprise: 100/6991 files done
Class surprise: 200/6991 files done
Class surprise: 300/6991 files done
Class surprise: 400/6991 files done
Class surprise: 500/6991 files done
Class surprise: 600/6991 files done
Class surprise: 700/699