In [None]:
# Install Hugging Face datasets
!pip install datasets scikit-learn --quiet

# Import required libraries
from datasets import load_dataset
import os
import shutil
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive at /content/drive. The output directory used further
# down (save_dir) lives under /content/drive/MyDrive, so this mount has to
# be in place before any folders are created or images are written.
drive.mount('/content/drive')

# Download (or load from cache) the "cats_vs_dogs" dataset and work with
# its "train" split, which is re-split below into train and test parts.
dataset = load_dataset("cats_vs_dogs")
full_dataset = dataset["train"]

# Class id of every example; the rest of this script treats 0 as cat and
# 1 as dog. Needed so the split below can preserve the class balance.
labels = full_dataset["labels"]

# Split row positions rather than the rows themselves: 90% train, 10% test,
# stratified on the labels, with a fixed seed so the split is reproducible.
all_indices = np.arange(len(full_dataset))
train_indices, test_indices = train_test_split(
    all_indices, test_size=0.1, random_state=42, stratify=labels
)

# Materialise the two subsets from the chosen row positions.
train_dataset, test_dataset = (
    full_dataset.select(indices) for indices in (train_indices, test_indices)
)

# Verify the splits
print(f"Training examples: {len(train_dataset)}")
print(f"Testing examples: {len(test_dataset)}")

# Count class distribution in splits (label 0 = cat, label 1 = dog).
# Only the "labels" column is read here. Looping over whole examples would
# also load and decode the image of every row, four times over, just to look
# at an integer label.
train_split_labels = list(train_dataset["labels"])
test_split_labels = list(test_dataset["labels"])

train_cats = train_split_labels.count(0)
train_dogs = train_split_labels.count(1)
test_cats = test_split_labels.count(0)
test_dogs = test_split_labels.count(1)

print(f"Training set: {train_cats} cats, {train_dogs} dogs")
print(f"Testing set: {test_cats} cats, {test_dogs} dogs")

# Root output folder on the mounted Google Drive.
save_dir = "/content/drive/MyDrive/cats_vs_dogs"

# One folder per (split, class) pair:
#   <save_dir>/train/cats, <save_dir>/train/dogs,
#   <save_dir>/test/cats,  <save_dir>/test/dogs
train_cats_dir, train_dogs_dir, test_cats_dir, test_dogs_dir = (
    os.path.join(save_dir, split_name, class_name)
    for split_name in ("train", "test")
    for class_name in ("cats", "dogs")
)

# Start from empty folders: anything left over from a previous run is
# deleted first, then the folder (and any missing parents) is recreated.
for directory in (train_cats_dir, train_dogs_dir, test_cats_dir, test_dogs_dir):
    already_there = os.path.exists(directory)
    if already_there:
        shutil.rmtree(directory)
    os.makedirs(directory, exist_ok=True)

# Image modes Pillow's JPEG encoder can write directly. Any other mode
# (e.g. "RGBA", "P") makes Image.save(..., "*.jpg") raise OSError.
_JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _save_split(split_dataset, cats_dir, dogs_dir, desc):
    """Write every image of one split to disk as a numbered JPEG file.

    Examples with label 0 are saved as ``cat_<n>.jpg`` inside ``cats_dir``;
    all other examples are saved as ``dog_<n>.jpg`` inside ``dogs_dir``.
    ``<n>`` counts up from 0 independently for each class.

    Args:
        split_dataset: Iterable of examples exposing ``example["labels"]``
            and ``example["image"]`` (a PIL image).
        cats_dir: Existing directory that receives the cat images.
        dogs_dir: Existing directory that receives the dog images.
        desc: Caption shown on the tqdm progress bar.

    Returns:
        Tuple ``(cat_count, dog_count)`` with the number of files written
        for each class.
    """
    cat_count = 0
    dog_count = 0
    for example in tqdm(split_dataset, desc=desc):
        if example["labels"] == 0:  # Cat
            path = os.path.join(cats_dir, f"cat_{cat_count}.jpg")
            cat_count += 1
        else:  # Dog
            path = os.path.join(dogs_dir, f"dog_{dog_count}.jpg")
            dog_count += 1

        image = example["image"]
        # Convert only when the JPEG encoder would otherwise reject the
        # image, so one odd file cannot abort the whole export. Images that
        # already saved fine are written exactly as before.
        if image.mode not in _JPEG_WRITABLE_MODES:
            image = image.convert("RGB")
        image.save(path)
    return cat_count, dog_count


# Save training images
print("Saving training images...")
train_cat_counter, train_dog_counter = _save_split(
    train_dataset, train_cats_dir, train_dogs_dir, "Saving train images"
)
print(f"Saved {train_cat_counter} cats and {train_dog_counter} dogs in training set")

# Save testing images
print("Saving testing images...")
test_cat_counter, test_dog_counter = _save_split(
    test_dataset, test_cats_dir, test_dogs_dir, "Saving test images"
)
print(f"Saved {test_cat_counter} cats and {test_dog_counter} dogs in test set")

# Final sanity check: report how many entries each output folder contains.
for directory in (train_cats_dir, train_dogs_dir, test_cats_dir, test_dogs_dir):
    entries = os.listdir(directory)
    file_count = len(entries)
    print(f"{directory}: {file_count} images")