# Data preparation

The Simpsons and Dogs vs Cats datasets require a **Kaggle API key**. Set up your credentials before running:

1. Copy `.env.template` to `.env`
2. Fill in your `KAGGLE_USERNAME` and `KAGGLE_KEY` (get them from https://www.kaggle.com/settings)

The Hymenoptera and Caltech 256 datasets are downloaded automatically without any credentials.

In [17]:
import os
import shutil
from utils import (
    split_dataset_folder, 
    convert_image_dataset_to_grayscale,
    validate_image_dataset,
    download_hymenoptera, 
    download_caltech256, 
    download_simpsons, 
    download_dogs_vs_cats
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Every dataset lives under this directory.
DATA_ROOT = './data'

# Per-dataset working directories; the derived paths below are built on top of these.
_caltech_dir = os.path.join(DATA_ROOT, 'caltech256')
_simpsons_dir = os.path.join(DATA_ROOT, 'simpsons')
_dogs_cats_dir = os.path.join(DATA_ROOT, 'dogs_vs_cats')

# 1. Download all datasets
# Each download_* helper is handed DATA_ROOT and its return value is used as a path below.

# --- Hymenoptera ---
HYMENOPTERA_ROOT = download_hymenoptera(DATA_ROOT)
HYMENOPTERA_GRAY_ROOT = os.path.join(DATA_ROOT, 'hymenoptera_gray')

# --- Caltech 256 ---
CALTECH256_ORIGINAL = download_caltech256(DATA_ROOT)
CALTECH256_ROOT = os.path.join(_caltech_dir, 'caltech256')
CALTECH256_GRAY_ROOT = os.path.join(_caltech_dir, 'caltech256_gray')

# --- Simpsons (requires Kaggle API key) ---
SIMPSONS_DIR = download_simpsons(DATA_ROOT)
SIMPSONS_RAW = os.path.join(SIMPSONS_DIR, 'simpsons_dataset')
SIMPSONS_SELECTED = os.path.join(_simpsons_dir, 'simpsons_selected')
SIMPSONS_ROOT = os.path.join(_simpsons_dir, 'simpsons')
SIMPSONS_GRAY_ROOT = os.path.join(_simpsons_dir, 'simpsons_gray')

# --- Dogs vs Cats (requires Kaggle API key) ---
DOGS_CATS_DIR = download_dogs_vs_cats(DATA_ROOT)
DOGS_CATS_RAW = os.path.join(DOGS_CATS_DIR, 'kagglecatsanddogs_3367a', 'PetImages')
DOGS_CATS_ORIGINAL = os.path.join(_dogs_cats_dir, 'dogs_vs_cats_original')
DOGS_CATS_ROOT = os.path.join(_dogs_cats_dir, 'dogs_vs_cats')
DOGS_CATS_GRAY_ROOT = os.path.join(_dogs_cats_dir, 'dogs_vs_cats_gray')

Downloading https://download.pytorch.org/tutorial/hymenoptera_data.zip
Extracting files...
Dataset ready at ./data/hymenoptera_data
Downloading Caltech 256 from https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar
Extracting files...
Dataset ready at ./data/256_ObjectCategories
Downloading Simpsons dataset from Kaggle...
Downloaded to cache: /home/u/.cache/kagglehub/datasets/alexattia/the-simpsons-characters-dataset/versions/4
Dataset ready at ./data/simpsons
Downloading Dogs vs Cats dataset from Kaggle...
Downloaded to cache: /home/u/.cache/kagglehub/datasets/karakaggle/kaggle-cat-vs-dog-dataset/versions/1
Dataset ready at ./data/dogs_vs_cats


In [19]:
# 2. Hymenoptera: validate and convert to grayscale
# Validation runs before the conversion; with remove=True the helper presumably
# deletes files it cannot read, so the grayscale pass never sees them
# (TODO confirm against utils.validate_image_dataset).
validate_image_dataset(HYMENOPTERA_ROOT, remove=True, verbose=True)
# The grayscale copy is written under a separate root (HYMENOPTERA_GRAY_ROOT).
# NOTE(review): assumed to leave the files under HYMENOPTERA_ROOT untouched — confirm in utils.
convert_image_dataset_to_grayscale(HYMENOPTERA_ROOT, HYMENOPTERA_GRAY_ROOT)
print("Hymenoptera grayscale ready")

Validation complete: 0/398 bad files found.
Hymenoptera grayscale ready


In [20]:
# 3. Simpsons: select top 20 classes (by number of images) and split
SIMPSONS_CLASSES = [
    'abraham_grampa_simpson', 'apu_nahasapeemapetilon', 'bart_simpson',
    'charles_montgomery_burns', 'chief_wiggum', 'comic_book_guy',
    'edna_krabappel', 'homer_simpson', 'kent_brockman', 'krusty_the_clown',
    'lenny_leonard', 'lisa_simpson', 'marge_simpson', 'mayor_quimby',
    'milhouse_van_houten', 'moe_szyslak', 'ned_flanders', 'nelson_muntz',
    'principal_skinner', 'sideshow_bob',
]

# Copy class by class rather than gating the whole step on SIMPSONS_SELECTED
# existing: a run interrupted mid-copy would otherwise leave a partial folder
# that every later run treats as complete.
os.makedirs(SIMPSONS_SELECTED, exist_ok=True)

# Each class is first copied into a staging directory next to SIMPSONS_SELECTED
# and only renamed into place once the copy finished, so a class folder inside
# SIMPSONS_SELECTED is always a complete copy.
simpsons_staging = SIMPSONS_SELECTED + '.partial'

selected_classes = []  # classes available in SIMPSONS_SELECTED after this cell
missing_classes = []   # classes listed above but absent from SIMPSONS_RAW

for cls in SIMPSONS_CLASSES:
    src = os.path.join(SIMPSONS_RAW, cls)
    dst = os.path.join(SIMPSONS_SELECTED, cls)

    if os.path.isdir(dst):
        # Copied by an earlier run; nothing to do.
        selected_classes.append(cls)
        print(f"  {cls}: already copied ({len(os.listdir(dst))} images)")
        continue

    if not os.path.isdir(src):
        # Keep the original best-effort behaviour (skip), but record it so the
        # summary below does not over-report.
        missing_classes.append(cls)
        continue

    if os.path.isdir(simpsons_staging):
        # Left over from an interrupted run.
        shutil.rmtree(simpsons_staging)
    shutil.copytree(src, simpsons_staging)
    os.rename(simpsons_staging, dst)

    selected_classes.append(cls)
    print(f"  {cls}: {len(os.listdir(dst))} images")

if missing_classes:
    print(f"WARNING: {len(missing_classes)} class folder(s) not found in {SIMPSONS_RAW}: {missing_classes}")
print(f"Simpsons selected: {len(selected_classes)} classes")

split_dataset_folder(SIMPSONS_SELECTED, SIMPSONS_ROOT)
print("Simpsons split ready")

  abraham_grampa_simpson: 913 images
  apu_nahasapeemapetilon: 623 images
  bart_simpson: 1342 images
  charles_montgomery_burns: 1193 images
  chief_wiggum: 986 images
  comic_book_guy: 469 images
  edna_krabappel: 457 images
  homer_simpson: 2246 images
  kent_brockman: 498 images
  krusty_the_clown: 1206 images
  lenny_leonard: 310 images
  lisa_simpson: 1354 images
  marge_simpson: 1291 images
  mayor_quimby: 246 images
  milhouse_van_houten: 1079 images
  moe_szyslak: 1452 images
  ned_flanders: 1454 images
  nelson_muntz: 358 images
  principal_skinner: 1194 images
  sideshow_bob: 877 images
Simpsons selected: 20 classes
Simpsons split ready


In [21]:
# 3 (cont.). Simpsons: validate the split dataset and convert to grayscale
# Validation runs before the conversion; with remove=True the helper presumably
# deletes files it cannot read (TODO confirm against utils.validate_image_dataset).
validate_image_dataset(SIMPSONS_ROOT, remove=True, verbose=True)
# The grayscale copy is written under a separate root (SIMPSONS_GRAY_ROOT).
convert_image_dataset_to_grayscale(SIMPSONS_ROOT, SIMPSONS_GRAY_ROOT)
print("Simpsons grayscale ready")

Validation complete: 0/19548 bad files found.
Simpsons grayscale ready


In [22]:
# 4. Dogs vs Cats: rename Cat/Dog to cats/dogs and split
# Copy class by class rather than gating the whole step on DOGS_CATS_ORIGINAL
# existing: a run interrupted mid-copy would otherwise leave a partial folder
# that every later run treats as complete.
os.makedirs(DOGS_CATS_ORIGINAL, exist_ok=True)

# Each class is first copied into a staging directory next to DOGS_CATS_ORIGINAL
# and only renamed into place once the copy finished, so a class folder inside
# DOGS_CATS_ORIGINAL is always a complete copy.
dogs_cats_staging = DOGS_CATS_ORIGINAL + '.partial'

for src_name, dst_name in [('Cat', 'cats'), ('Dog', 'dogs')]:
    src = os.path.join(DOGS_CATS_RAW, src_name)
    dst = os.path.join(DOGS_CATS_ORIGINAL, dst_name)

    if os.path.isdir(dst):
        # Copied by an earlier run; nothing to do.
        print(f"  {dst_name}: already copied ({len(os.listdir(dst))} images)")
        continue

    if os.path.isdir(dogs_cats_staging):
        # Left over from an interrupted run.
        shutil.rmtree(dogs_cats_staging)
    # As before, a missing source folder is an error: copytree raises if src does not exist.
    shutil.copytree(src, dogs_cats_staging)
    os.rename(dogs_cats_staging, dst)
    print(f"  {dst_name}: {len(os.listdir(dst))} images")

print("Dogs vs Cats original ready")

split_dataset_folder(DOGS_CATS_ORIGINAL, DOGS_CATS_ROOT)
print("Dogs vs Cats split ready")

  cats: 12491 images
  dogs: 12470 images
Dogs vs Cats original ready
Dogs vs Cats split ready


In [23]:
# 4 (cont.). Dogs vs Cats: validate the split dataset and convert to grayscale
# Validation runs before the conversion; with remove=True the helper presumably
# deletes files it cannot read (TODO confirm against utils.validate_image_dataset).
validate_image_dataset(DOGS_CATS_ROOT, remove=True, verbose=True)
# The grayscale copy is written under a separate root (DOGS_CATS_GRAY_ROOT).
convert_image_dataset_to_grayscale(DOGS_CATS_ROOT, DOGS_CATS_GRAY_ROOT)
print("Dogs vs Cats grayscale ready")

Bad image ./data/dogs_vs_cats/dogs_vs_cats/val/dogs/9041.jpg: Truncated File Read
Removed ./data/dogs_vs_cats/dogs_vs_cats/val/dogs/9041.jpg
Validation complete: 1/24961 bad files found.
Dogs vs Cats grayscale ready


In [None]:
# 5. Caltech 256: split (validation and grayscale conversion happen in the next cell)
# Reads the downloaded dataset at CALTECH256_ORIGINAL and writes the split to CALTECH256_ROOT.
# NOTE(review): split layout and ratios are decided inside utils.split_dataset_folder — confirm there.
split_dataset_folder(CALTECH256_ORIGINAL, CALTECH256_ROOT)
print("Caltech 256 split ready")

In [None]:
# 5 (cont.). Caltech 256: validate the split dataset and convert to grayscale
# Validation runs before the conversion; with remove=True the helper presumably
# deletes files it cannot read (TODO confirm against utils.validate_image_dataset).
validate_image_dataset(CALTECH256_ROOT, remove=True, verbose=True)
# The grayscale copy is written under a separate root (CALTECH256_GRAY_ROOT).
convert_image_dataset_to_grayscale(CALTECH256_ROOT, CALTECH256_GRAY_ROOT)
print("Caltech 256 grayscale ready")

Caltech 256 split ready
Validation complete: 0/30608 bad files found.
Caltech 256 grayscale ready
