In [8]:
import sys, subprocess, pkgutil
print("Using Python:", sys.executable)

# install into THIS kernel's interpreter
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchvision", "kagglehub"])

import torch
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())


Using Python: d:\downloads_from_edge\engg123\.venv\Scripts\python.exe


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [3]:
# data_preprocessing_script.py
# Mandatory Kaggle download → nested-folder detection → resize/augment → loaders + summary

from __future__ import annotations
import argparse
import random
from pathlib import Path
from typing import List, Tuple, Optional, Dict

import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

# Kaggle is REQUIRED
try:
    import kagglehub  # pip install kagglehub
except Exception as e:
    raise RuntimeError(
        "kagglehub is required. Install with: pip install kagglehub\n"
        f"Import error: {e}"
    )

    
print("kagglehub successfully imported.")

kagglehub successfully imported.


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Reproducibility + device pick
# ------------------------------
def set_seed(seed: int = 56) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def pick_device() -> torch.device:
    if torch.cuda.is_available():
        d = torch.device("cuda")
        print(f"[Device] CUDA: {torch.cuda.get_device_name(0)}")
        return d
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        print("[Device] MPS (Apple Metal)")
        return torch.device("mps")
    print("[Device] CPU")
    return torch.device("cpu")

In [54]:
# Folder discovery
# ------------------------------
def looks_like_class_dir(p: Path) -> bool:
    return p.is_dir() and sum(1 for x in p.iterdir() if x.is_dir()) >= 2

def find_image_root(start: Path) -> Path:
    """Handles layouts like <root>/images/images, else falls back to a level containing class dirs."""
    for c in [start / "images" / "images", start / "images", start]:
        if c.exists() and looks_like_class_dir(c):
            return c
    for child in start.iterdir():
        if looks_like_class_dir(child):
            return child
    return start

In [55]:
# Transforms
# ------------------------------
def build_transforms(img_size: int) -> Tuple[transforms.Compose, transforms.Compose]:
    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    train_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ColorJitter(0.1, 0.1, 0.1, 0.03),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    eval_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    return train_tf, eval_tf

In [56]:
# Dataset utils
# ------------------------------
def ext_filter(only_png: bool):
    return (lambda p: str(p).lower().endswith(".png")) if only_png else None

def stratified_split_indices(targets: List[int], val_ratio: float, seed: int):
    rng = np.random.default_rng(seed)
    targets = np.array(targets)
    tr, va = [], []
    for c in np.unique(targets):
        idx = np.where(targets == c)[0]
        rng.shuffle(idx)
        n_val = max(1, int(round(len(idx) * val_ratio)))
        va += idx[:n_val].tolist()
        tr += idx[n_val:].tolist()
    rng.shuffle(tr); rng.shuffle(va)
    return tr, va

def load_from_folder(
    data_root: Path, img_size: int, val_ratio: float, seed: int, only_png: bool,
) -> Tuple[DataLoader, DataLoader, Optional[DataLoader], Dict]:
    train_tf, eval_tf = build_transforms(img_size)
    filt = ext_filter(only_png)

    has_train = (data_root / "train").is_dir()
    has_val   = (data_root / "val").is_dir()
    has_test  = (data_root / "test").is_dir()

    if has_train and has_val:
        train_ds = datasets.ImageFolder(str(find_image_root(data_root / "train")), transform=train_tf, is_valid_file=filt)
        val_ds   = datasets.ImageFolder(str(find_image_root(data_root / "val")),   transform=eval_tf,   is_valid_file=filt)
        test_ds  = datasets.ImageFolder(str(find_image_root(data_root / "test")),  transform=eval_tf,   is_valid_file=filt) if has_test else None
        classes = train_ds.classes
    else:
        class_root = find_image_root(data_root)
        raw = datasets.ImageFolder(str(class_root), transform=None, is_valid_file=filt)
        classes = raw.classes
        targets = getattr(raw, "targets", [lbl for _, lbl in raw.samples])
        tr_idx, va_idx = stratified_split_indices(targets, val_ratio, seed)
        tr_full = datasets.ImageFolder(str(class_root), transform=train_tf, is_valid_file=filt)
        va_full = datasets.ImageFolder(str(class_root), transform=eval_tf,  is_valid_file=filt)
        train_ds = Subset(tr_full, tr_idx)
        val_ds   = Subset(va_full, va_idx)
        test_ds  = None

    train_ld = DataLoader(train_ds, batch_size=64, shuffle=True,  num_workers=2, pin_memory=True)
    val_ld   = DataLoader(val_ds,   batch_size=64, shuffle=False, num_workers=2, pin_memory=True)
    test_ld  = DataLoader(test_ds,  batch_size=64, shuffle=False, num_workers=2, pin_memory=True) if test_ds else None

    meta = {"classes": classes, "img_size": img_size}
    return train_ld, val_ld, test_ld, meta

In [57]:
# ------------------------------
# Kaggle (mandatory, fixed dataset)
# ------------------------------
def download_kaggle_dataset() -> Path:
    """
    Downloads the recyclable and household waste dataset from KaggleHub.
    Returns the local dataset path as a Path object.
    """
    import kagglehub
    slug = "alistairking/recyclable-and-household-waste-classification"
    print(f"[Kaggle] Downloading '{slug}' ...")
    path = kagglehub.dataset_download(slug)
    print(f"[Kaggle] Downloaded to: {path}")
    return Path(path)

In [61]:
# CLI
# ------------------------------
def main():
    p = argparse.ArgumentParser(description="Image data preprocessing with mandatory Kaggle download")
    p.add_argument("--kaggle", type=str, required=True, help="Kaggle dataset slug, e.g. 'owner/dataset'")
    p.add_argument("--img-size", type=int, default=224)
    p.add_argument("--val-ratio", type=float, default=0.1)
    p.add_argument("--seed", type=int, default=56)
    p.add_argument("--png-only", action="store_true", help="Filter to .png images only")
    args = p.parse_args()

    set_seed(args.seed)
    device = torch.device("cuda")  # pick_device()  # Device not used in preprocessing

    dl_root = download_kaggle_dataset(args.kaggle)
    data_root = find_image_root(dl_root)

    train_ld, val_ld, test_ld, meta = load_from_folder(
        data_root=data_root,
        img_size=args.img_size,
        val_ratio=args.val_ratio,
        seed=args.seed,
        only_png=args.png_only,
    )


In [62]:
# --- Summary helper (drop-in) ---
from torch.utils.data import Subset

def summarize_loaders(train_ld, val_ld, test_ld, meta, device):
    def ds_len(obj):
        if obj is None:
            return 0
        # Accept DataLoader, Dataset, or Subset
        base = getattr(obj, "dataset", obj)
        return len(base)

    print("\n=== DATA SUMMARY ===")
    print(f"Device: {device.type.upper()}")
    classes = meta.get("classes", [])
    preview = classes[:10]
    print(f"Classes ({len(classes)}): {preview}{' ...' if len(classes) > 10 else ''}")
    print(f"Train: {ds_len(train_ld)} | Val: {ds_len(val_ld)} | Test: {ds_len(test_ld)}")
    try:
        # Peek one batch (won't crash if dataset is empty/corrupt)
        xb, yb = next(iter(train_ld))
        print(f"Sample batch: images={tuple(xb.shape)}  labels={tuple(yb.shape)}")
    except Exception as e:
        print(f"[Warn] Could not fetch a batch: {e}")
    print("====================\n")


In [63]:
# ---- RUN PIPELINE IN NOTEBOOK ----
# Settings you can tweak
IMG_SIZE  = 224
VAL_RATIO = 0.2
SEED      = 56
PNG_ONLY  = True  # set False to accept jpg/jpeg too

# 1) seed + device
set_seed(SEED)
device = torch.device("cuda")  # pick_device()  # Device not used in preprocessing

# 2) download fixed Kaggle dataset (your helper takes no args)
dl_root = download_kaggle_dataset()    # <-- uses the fixed slug inside
data_root = find_image_root(dl_root)   # handles images/images nesting

# 3) build loaders
train_ld, val_ld, test_ld, meta = load_from_folder(
    data_root=data_root,
    img_size=IMG_SIZE,
    val_ratio=VAL_RATIO,
    seed=SEED,
    only_png=PNG_ONLY,
)

# 4) print summary
summarize_loaders(train_ld, val_ld, test_ld, meta, device)


[Kaggle] Downloading 'alistairking/recyclable-and-household-waste-classification' ...
[Kaggle] Downloaded to: C:\Users\61459\.cache\kagglehub\datasets\alistairking\recyclable-and-household-waste-classification\versions\1

=== DATA SUMMARY ===
Device: CUDA
Classes (30): ['aerosol_cans', 'aluminum_food_cans', 'aluminum_soda_cans', 'cardboard_boxes', 'cardboard_packaging', 'clothing', 'coffee_grounds', 'disposable_plastic_cutlery', 'eggshells', 'food_waste'] ...
Train: 12000 | Val: 3000 | Test: 0




Sample batch: images=(64, 3, 224, 224)  labels=(64,)

