In [43]:
# --- BASIC SETUP: make sure Python path and required packages are ready ---
import sys, subprocess, pkgutil
print("Using Python:", sys.executable)

# Install/upgrade pip and make sure torch/torchvision/kagglehub are available in THIS Python
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchvision", "kagglehub"])

import torch
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())


Using Python: c:\Users\61459\anaconda3\python.exe
Torch: 2.8.0+cpu | CUDA available: False


In [44]:
# data_preprocessing_script.py
# Mandatory Kaggle download → nested-folder detection → resize/augment → loaders + summary


# === data_preprocessing_script.py ===
# Goal: (1) Download dataset from Kaggle → (2) find correct image folder
#       (3) build image transforms (resize/augment/normalize)
#       (4) build DataLoaders → (5) print a quick summary

from __future__ import annotations
import argparse
import random
from pathlib import Path
from typing import List, Tuple, Optional, Dict

import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

# Kaggle is REQUIRED
try:
    import kagglehub  # pip install kagglehub
except Exception as e:
    raise RuntimeError(
        "kagglehub is required. Install with: pip install kagglehub\n"
        f"Import error: {e}"
    )

    
print("kagglehub successfully imported.")

kagglehub successfully imported.


In [45]:

# --- REPRODUCIBILITY + DEVICE PICKING ---
# set_seed: makes "random" things repeatable so results are consistent
def set_seed(seed: int = 56) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def pick_device() -> torch.device:
    if torch.cuda.is_available():
        d = torch.device("cuda")
        print(f"[Device] CUDA: {torch.cuda.get_device_name(0)}")
        return d
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        print("[Device] MPS (Apple Metal)")
        return torch.device("mps")
    print("[Device] CPU")
    return torch.device("cpu")

In [46]:

# --- FOLDER DISCOVERY ---
# looks_like_class_dir: checks if a folder contains 2+ subfolders (typical class directories)
def looks_like_class_dir(p: Path) -> bool:
    return p.is_dir() and sum(1 for x in p.iterdir() if x.is_dir()) >= 2

# find_image_root: tries common nestings like /images/images then falls back
def find_image_root(start: Path) -> Path:
    """Handles layouts like <root>/images/images, else falls back to a level containing class dirs."""
    for c in [start / "images" / "images", start / "images", start]:
        if c.exists() and looks_like_class_dir(c):
            return c
    for child in start.iterdir():
        if looks_like_class_dir(child):
            return child
    return start

In [47]:
# --- TRANSFORMS (image preparation pipelines) ---
# build_transforms: training gets random crops/flips/jitter (to generalize);
# validation/test get center crop (consistent)
def build_transforms(img_size: int) -> Tuple[transforms.Compose, transforms.Compose]:
    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    train_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ColorJitter(0.1, 0.1, 0.1, 0.03),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    eval_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    return train_tf, eval_tf

In [48]:

# --- DATASET UTILITIES ---
# ext_filter: optional filter so we only accept .png if requested
def ext_filter(only_png: bool):
    return (lambda p: str(p).lower().endswith(".png")) if only_png else None

# --- Add this 3-way splitter (keep your existing 2-way version too if you like) ---
def stratified_split_indices_3(targets: List[int], val_ratio: float, test_ratio: float, seed: int):
    """Stratified split into train/val/test by class."""
    assert 0 < val_ratio < 1 and 0 <= test_ratio < 1 and (val_ratio + test_ratio) < 1, \
        "val_ratio + test_ratio must be < 1"
    rng = np.random.default_rng(seed)
    targets = np.array(targets)

    tr, va, te = [], [], []
    for c in np.unique(targets):
        idx = np.where(targets == c)[0]
        rng.shuffle(idx)
        n = len(idx)
        n_val  = max(1, int(round(n * val_ratio))) if n > 1 else 0
        n_test = max(1, int(round(n * test_ratio))) if (test_ratio > 0 and n - n_val > 1) else 0
        # cap to avoid overrun on tiny classes
        n_val  = min(n_val,  n - 1) if n > 1 else 0
        n_test = min(n_test, n - n_val - 1) if (n - n_val) > 1 else 0

        va += idx[:n_val].tolist()
        te += idx[n_val:n_val + n_test].tolist()
        tr += idx[n_val + n_test:].tolist()

    rng.shuffle(tr); rng.shuffle(va); rng.shuffle(te)
    return tr, va, te



# load_from_folder: handles two layouts
# (A) separate train/val(/test) folders, or
# (B) one big folder → we do a stratified split
# --- Replace your entire load_from_folder with this complete version (no ellipses) ---
def load_from_folder(
    data_root: Path,
    img_size: int,
    val_ratio: float,
    seed: int,
    only_png: bool,
    test_ratio: float = 0.0,  # NEW
) -> Tuple[DataLoader, DataLoader, Optional[DataLoader], Dict]:
    """
    Builds DataLoaders from a folder that is either:
      A) already split as data_root/{train,val, test?}, or
      B) a single folder of class subfolders (we do a stratified split into train/val/(test)).
    """
    # transforms + optional .png filter
    train_tf, eval_tf = build_transforms(img_size)
    filt = ext_filter(only_png)

    # detect pre-split layout
    has_train = (data_root / "train").is_dir()
    has_val   = (data_root / "val").is_dir()
    has_test  = (data_root / "test").is_dir()

    if has_train and has_val:
        # Case A: dataset already has train/val/(test) on disk
        train_ds = datasets.ImageFolder(str(find_image_root(data_root / "train")), transform=train_tf, is_valid_file=filt)
        val_ds   = datasets.ImageFolder(str(find_image_root(data_root / "val")),   transform=eval_tf,   is_valid_file=filt)
        test_ds  = datasets.ImageFolder(str(find_image_root(data_root / "test")),  transform=eval_tf,   is_valid_file=filt) if has_test else None
        classes = train_ds.classes
    else:
        # Case B: one folder; do our own stratified split (now 3-way if test_ratio > 0)
        class_root = find_image_root(data_root)
        raw = datasets.ImageFolder(str(class_root), transform=None, is_valid_file=filt)
        classes = raw.classes
        targets = getattr(raw, "targets", [lbl for _, lbl in raw.samples])

        if test_ratio and test_ratio > 0:
            tr_idx, va_idx, te_idx = stratified_split_indices_3(targets, val_ratio, test_ratio, seed)
        else:
            tr_idx, va_idx = stratified_split_indices(targets, val_ratio, seed)
            te_idx = []

        tr_full = datasets.ImageFolder(str(class_root), transform=train_tf, is_valid_file=filt)
        va_full = datasets.ImageFolder(str(class_root), transform=eval_tf,  is_valid_file=filt)
        te_full = datasets.ImageFolder(str(class_root), transform=eval_tf,  is_valid_file=filt)

        train_ds = Subset(tr_full, tr_idx)
        val_ds   = Subset(va_full, va_idx)
        test_ds  = Subset(te_full, te_idx) if len(te_idx) > 0 else None

    # DataLoaders
    train_ld = DataLoader(train_ds, batch_size=64, shuffle=True,  num_workers=2, pin_memory=True)
    val_ld   = DataLoader(val_ds,   batch_size=64, shuffle=False, num_workers=2, pin_memory=True)
    test_ld  = DataLoader(test_ds,  batch_size=64, shuffle=False, num_workers=2, pin_memory=True) if test_ds else None

    meta = {"classes": classes, "img_size": img_size}
    return train_ld, val_ld, test_ld, meta


In [49]:
# --- KAGGLE DOWNLOAD (fixed dataset helper) ---
# download_kaggle_dataset: grabs a specific recycling dataset and returns the local path
def download_kaggle_dataset() -> Path:
    """
    Downloads the recyclable and household waste dataset from KaggleHub.
    Returns the local dataset path as a Path object.
    """
    import kagglehub
    slug = "alistairking/recyclable-and-household-waste-classification"
    print(f"[Kaggle] Downloading '{slug}' ...")
    path = kagglehub.dataset_download(slug)
    print(f"[Kaggle] Downloaded to: {path}")
    return Path(path)

In [50]:
# CLI
# ------------------------------
def main():
    p = argparse.ArgumentParser(description="Image data preprocessing with mandatory Kaggle download")
    p.add_argument("--kaggle", type=str, required=True, help="Kaggle dataset slug, e.g. 'owner/dataset'")
    p.add_argument("--img-size", type=int, default=224)
    p.add_argument("--val-ratio", type=float, default=0.1)
    p.add_argument("--seed", type=int, default=56)
    p.add_argument("--png-only", action="store_true", help="Filter to .png images only")
    args = p.parse_args()

    set_seed(args.seed)
    device = torch.device("cuda")  # pick_device()  # Device not used in preprocessing

    dl_root = download_kaggle_dataset(args.kaggle)
    data_root = find_image_root(dl_root)

    train_ld, val_ld, test_ld, meta = load_from_folder(
        data_root=data_root,
        img_size=args.img_size,
        val_ratio=args.val_ratio,
        seed=args.seed,
        only_png=args.png_only,
    )


In [51]:
# --- Summary helper (drop-in) ---
from torch.utils.data import Subset

def summarize_loaders(train_ld, val_ld, test_ld, meta, device):
    def ds_len(obj):
        if obj is None:
            return 0
        # Accept DataLoader, Dataset, or Subset
        base = getattr(obj, "dataset", obj)
        return len(base)

    print("\n=== DATA SUMMARY ===")
    print(f"Device: {device.type.upper()}")
    classes = meta.get("classes", [])
    preview = classes[:10]
    print(f"Classes ({len(classes)}): {preview}{' ...' if len(classes) > 10 else ''}")
    print(f"Train: {ds_len(train_ld)} | Val: {ds_len(val_ld)} | Test: {ds_len(test_ld)}")
    try:
        # Peek one batch (won't crash if dataset is empty/corrupt)
        xb, yb = next(iter(train_ld))
        print(f"Sample batch: images={tuple(xb.shape)}  labels={tuple(yb.shape)}")
    except Exception as e:
        print(f"[Warn] Could not fetch a batch: {e}")
    print("====================\n")


In [52]:
# ---- RUN PIPELINE IN NOTEBOOK ----
# Settings you can tweak
IMG_SIZE  = 224  # how big each image becomes (square)
VAL_RATIO = 0.15  # 15% of data for validation
TEST_RATIO = 0.05  # NEW: 5% of data for test (adjust as you like)
SEED      = 56   # repeatable split/transforms
PNG_ONLY  = True  # set False to accept jpg/jpeg too (only accepts png)

# 1) seed + device
set_seed(SEED)
device = torch.device("cuda")  # pick_device()  # Device not used in preprocessing

# 2) download fixed Kaggle dataset (your helper takes no args)
dl_root = download_kaggle_dataset()    # <-- uses the fixed slug inside
data_root = find_image_root(dl_root)   # handles images/images nesting

# 3) build loaders
train_ld, val_ld, test_ld, meta = load_from_folder(
    data_root=data_root,
    img_size=IMG_SIZE,
    val_ratio=VAL_RATIO,
    seed=SEED,
    only_png=PNG_ONLY,
    test_ratio=TEST_RATIO,  # NEW: request a test split when dataset isn't pre-split
)

# 4) print summary
summarize_loaders(train_ld, val_ld, test_ld, meta, device)

#Prints the first 10 class labels so you can sanity-check the naming 
#Meaning: After the 80/20 split (because VAL_RATIO = 0.2), you’ve got 12,000 training images and 3,000 validation images. There’s no test set in this dataset layout, so it shows 0.
#Meaning: We successfully pulled one training batch.
#images=(64, 3, 224, 224) = 64 images per batch, 3 color channels (RGB), and each image is 224×224 pixels.


[Kaggle] Downloading 'alistairking/recyclable-and-household-waste-classification' ...
[Kaggle] Downloaded to: C:\Users\61459\.cache\kagglehub\datasets\alistairking\recyclable-and-household-waste-classification\versions\1

=== DATA SUMMARY ===
Device: CUDA
Classes (30): ['aerosol_cans', 'aluminum_food_cans', 'aluminum_soda_cans', 'cardboard_boxes', 'cardboard_packaging', 'clothing', 'coffee_grounds', 'disposable_plastic_cutlery', 'eggshells', 'food_waste'] ...
Train: 12000 | Val: 2250 | Test: 750
Sample batch: images=(64, 3, 224, 224)  labels=(64,)

