# Data preparation

The Simpsons and Dogs vs Cats datasets require a **Kaggle API key**. Set up your credentials before running:

1. Copy `.env.template` to `.env`
2. Fill in `KAGGLE_USERNAME` and `KAGGLE_KEY` with the username and key from your Kaggle API token (create one at https://www.kaggle.com/settings)

The Hymenoptera and Caltech 256 datasets are downloaded automatically without any credentials.

In [1]:
import os
import shutil
from utils import (
    split_dataset_folder, 
    convert_image_dataset_to_grayscale,
    download_hymenoptera, 
    download_caltech256, 
    download_simpsons, 
    download_dogs_vs_cats
)

%load_ext autoreload
%autoreload 2

In [2]:
DATA_ROOT = './data'

# Per-dataset working directories under DATA_ROOT; the derived (selected /
# split / grayscale) folders of each dataset are placed inside these.
_CALTECH256_BASE = os.path.join(DATA_ROOT, 'caltech256')
_SIMPSONS_BASE = os.path.join(DATA_ROOT, 'simpsons')
_DOGS_CATS_BASE = os.path.join(DATA_ROOT, 'dogs_vs_cats')

# 1. Download all datasets
# The value returned by each download_* helper is used below as a filesystem
# path (either directly as a dataset root or as the parent of the raw folder).

# --- Hymenoptera (no credentials needed) ---
HYMENOPTERA_ROOT = download_hymenoptera(DATA_ROOT)
HYMENOPTERA_GRAY_ROOT = os.path.join(DATA_ROOT, 'hymenoptera_gray')

# --- Caltech 256 (no credentials needed) ---
CALTECH256_ORIGINAL = download_caltech256(DATA_ROOT)
CALTECH256_ROOT = os.path.join(_CALTECH256_BASE, 'caltech256')
CALTECH256_GRAY_ROOT = os.path.join(_CALTECH256_BASE, 'caltech256_gray')

# --- Simpsons ---
SIMPSONS_DIR = download_simpsons(DATA_ROOT)  # Requires Kaggle API key
SIMPSONS_RAW = os.path.join(SIMPSONS_DIR, 'simpsons_dataset')
SIMPSONS_SELECTED = os.path.join(_SIMPSONS_BASE, 'simpsons_selected')
SIMPSONS_ROOT = os.path.join(_SIMPSONS_BASE, 'simpsons')
SIMPSONS_GRAY_ROOT = os.path.join(_SIMPSONS_BASE, 'simpsons_gray')

# --- Dogs vs Cats ---
DOGS_CATS_DIR = download_dogs_vs_cats(DATA_ROOT)  # Requires Kaggle API key
DOGS_CATS_RAW = os.path.join(DOGS_CATS_DIR, 'kagglecatsanddogs_3367a', 'PetImages')
DOGS_CATS_ORIGINAL = os.path.join(_DOGS_CATS_BASE, 'dogs_vs_cats_original')
DOGS_CATS_ROOT = os.path.join(_DOGS_CATS_BASE, 'dogs_vs_cats')
DOGS_CATS_GRAY_ROOT = os.path.join(_DOGS_CATS_BASE, 'dogs_vs_cats_gray')

Dataset already downloaded in ./data/hymenoptera_data
Dataset already downloaded in ./data/256_ObjectCategories
Dataset already downloaded in ./data/simpsons
Dataset already downloaded in ./data/dogs_vs_cats


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 2. Hymenoptera: convert to grayscale
# Source is HYMENOPTERA_ROOT (the path returned by download_hymenoptera) and the
# target is HYMENOPTERA_GRAY_ROOT. Presumably the helper writes a grayscale copy
# into the target and leaves the source untouched — TODO confirm in utils.
# NOTE(review): unlike the other datasets, no split_dataset_folder call is made
# for Hymenoptera; presumably the download is already split — verify.
convert_image_dataset_to_grayscale(HYMENOPTERA_ROOT, HYMENOPTERA_GRAY_ROOT)
print("Hymenoptera grayscale ready")

Hymenoptera grayscale ready


In [4]:
# 3. Simpsons: select top 20 classes (by number of images) and split
SIMPSONS_CLASSES = [
    'abraham_grampa_simpson', 'apu_nahasapeemapetilon', 'bart_simpson',
    'charles_montgomery_burns', 'chief_wiggum', 'comic_book_guy',
    'edna_krabappel', 'homer_simpson', 'kent_brockman', 'krusty_the_clown',
    'lenny_leonard', 'lisa_simpson', 'marge_simpson', 'mayor_quimby',
    'milhouse_van_houten', 'moe_szyslak', 'ned_flanders', 'nelson_muntz',
    'principal_skinner', 'sideshow_bob',
]

# Copy every selected class folder from the raw download into SIMPSONS_SELECTED.
# The "already done?" check is made per class rather than once for the whole
# directory, so a run that was interrupted halfway is resumed on the next
# execution instead of being mistaken for a finished selection.
os.makedirs(SIMPSONS_SELECTED, exist_ok=True)
copied_classes, missing_classes = [], []
for cls in SIMPSONS_CLASSES:
    src = os.path.join(SIMPSONS_RAW, cls)
    dst = os.path.join(SIMPSONS_SELECTED, cls)
    if os.path.isdir(dst):
        continue  # already selected by a previous run
    if not os.path.isdir(src):
        missing_classes.append(cls)  # reported below instead of silently skipped
        continue
    # Copy into a staging folder and rename once the copy has finished, so
    # `dst` only ever exists when its content is complete.
    staging = dst + '.partial'
    shutil.rmtree(staging, ignore_errors=True)  # leftover of an interrupted run
    shutil.copytree(src, staging)
    os.rename(staging, dst)
    copied_classes.append(cls)
    print(f"  {cls}: {len(os.listdir(dst))} images")

if missing_classes:
    print(f"WARNING: {len(missing_classes)} classes not found in {SIMPSONS_RAW}: {missing_classes}")

if copied_classes or missing_classes:
    # Report the number of classes actually present, not the requested number.
    n_available = len(SIMPSONS_CLASSES) - len(missing_classes)
    print(f"Simpsons selected: {n_available} classes")
else:
    print(f"Simpsons selected already exists at {SIMPSONS_SELECTED}")

split_dataset_folder(SIMPSONS_SELECTED, SIMPSONS_ROOT)
print("Simpsons split ready")

Simpsons selected already exists at ./data/simpsons/simpsons_selected
Simpsons split ready


In [5]:
# Simpsons: convert the split dataset (SIMPSONS_ROOT) to grayscale, targeting
# SIMPSONS_GRAY_ROOT.
# NOTE(review): the saved output of this cell shows a KeyboardInterrupt, so the
# content of SIMPSONS_GRAY_ROOT may be incomplete — whether the helper resumes
# or skips an existing target is not visible here; verify before relying on it.
convert_image_dataset_to_grayscale(SIMPSONS_ROOT, SIMPSONS_GRAY_ROOT)
print("Simpsons grayscale ready")

KeyboardInterrupt: 

In [None]:
# 4. Dogs vs Cats: rename Cat/Dog to cats/dogs and split
# The "already done?" check is made per class folder rather than on
# DOGS_CATS_ORIGINAL itself: that directory is created before any copy starts,
# so a failed or interrupted copy must not make the next run believe the
# dataset is ready.
os.makedirs(DOGS_CATS_ORIGINAL, exist_ok=True)
copied_any = False
for src_name, dst_name in [('Cat', 'cats'), ('Dog', 'dogs')]:
    src = os.path.join(DOGS_CATS_RAW, src_name)
    dst = os.path.join(DOGS_CATS_ORIGINAL, dst_name)
    if os.path.isdir(dst):
        continue  # already copied by a previous run
    # Copy into a staging folder and rename once the copy has finished, so
    # `dst` only ever exists when its content is complete.
    staging = dst + '.partial'
    shutil.rmtree(staging, ignore_errors=True)  # leftover of an interrupted run
    shutil.copytree(src, staging)  # raises if the raw class folder is missing
    os.rename(staging, dst)
    copied_any = True
    print(f"  {dst_name}: {len(os.listdir(dst))} images")

if copied_any:
    print("Dogs vs Cats original ready")
else:
    print(f"Dogs vs Cats original already exists at {DOGS_CATS_ORIGINAL}")

split_dataset_folder(DOGS_CATS_ORIGINAL, DOGS_CATS_ROOT)
print("Dogs vs Cats split ready")

In [None]:
# Dogs vs Cats: convert the split dataset (DOGS_CATS_ROOT) to grayscale,
# targeting DOGS_CATS_GRAY_ROOT. Must run after the split cell, which is what
# populates DOGS_CATS_ROOT.
convert_image_dataset_to_grayscale(DOGS_CATS_ROOT, DOGS_CATS_GRAY_ROOT)
print("Dogs vs Cats grayscale ready")

In [None]:
# 5. Caltech 256: split and convert to grayscale
# CALTECH256_ORIGINAL is the path returned by download_caltech256; the split is
# written to CALTECH256_ROOT, which is then the input of the grayscale step.
# Presumably split_dataset_folder creates train/validation subfolders — the
# split names and ratio are defined in utils, not here; TODO confirm.
split_dataset_folder(CALTECH256_ORIGINAL, CALTECH256_ROOT)
print("Caltech 256 split ready")

# Grayscale conversion runs on the split output, so it has to follow the split.
convert_image_dataset_to_grayscale(CALTECH256_ROOT, CALTECH256_GRAY_ROOT)
print("Caltech 256 grayscale ready")