In [2]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler
import numpy as np
from sklearn.model_selection import train_test_split
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from pathlib import Path

# accept common raster image extensions (png, jpg/jpeg, bmp, webp), case-insensitively
def is_img(p):
    """Return True when ``p`` (a str or path-like) ends with a known image extension.

    The comparison is case-insensitive and is done on ``str(p)``.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
    lowered = str(p).lower()
    return lowered.endswith(image_suffixes)

# Locate the extracted Kaggle dataset (the class folders live under images/images).
# kagglehub's default cache is under the current user's home directory, so the
# path is derived from Path.home() instead of one specific Windows account; the
# original absolute location is kept as a fallback so existing setups still work.
_dataset_rel = Path(
    ".cache/kagglehub/datasets/alistairking/"
    "recyclable-and-household-waste-classification/versions/1"
)
base_path = Path.home() / _dataset_rel
if not base_path.is_dir():
    base_path = Path("C:/Users/22913") / _dataset_rel

# Tried in order: the most deeply nested layout first.
candidates = [
    base_path / "images" / "images",
    base_path / "images",
    base_path,
]

root = None
for c in candidates:
    # is_dir() (rather than exists()) so a stray file never reaches iterdir().
    if c.is_dir() and any(d.is_dir() for d in c.iterdir()):
        root = c
        break

if root is None:
    raise FileNotFoundError(f"Could not find image folder in: {base_path}")

print(f"Using dataset root: {root}")

# Index every image under the chosen root; no transform yet, this copy is only
# used for its file list, class names and labels.
full = datasets.ImageFolder(str(root), transform=None, is_valid_file=is_img)
print(f"Found {len(full.classes)} classes: {full.classes[:5]}...")
targets = [label for _path, label in full.samples]

# stratified 80/20 split over sample indices (fixed seed)
tr_idx, va_idx = train_test_split(
    np.arange(len(targets)),
    stratify=targets,
    test_size=0.2,
    random_state=56,
)

# Training pipeline: random crop / flip / colour jitter, then tensor conversion
# and normalisation with the timm ImageNet default mean and std.
train_tfms = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
    ]
)

# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_tfms = transforms.Compose(
    [
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
    ]
)

# Two views of the same folder, each with its own transform, restricted to the
# indices of its split.
train_ds = Subset(
    dataset=datasets.ImageFolder(root=str(root), transform=train_tfms, is_valid_file=is_img),
    indices=tr_idx,
)
val_ds = Subset(
    dataset=datasets.ImageFolder(root=str(root), transform=val_tfms, is_valid_file=is_img),
    indices=va_idx,
)

# optional balancing: each training sample is drawn with probability
# proportional to 1 / (number of training samples in its class)
counts = np.bincount(np.asarray(targets)[tr_idx])
class_w = 1.0 / np.maximum(counts, 1)  # the floor of 1 avoids dividing by zero
weights = np.take(class_w, np.asarray(targets)[tr_idx])
sampler = WeightedRandomSampler(
    weights=weights,
    num_samples=len(tr_idx),
    replacement=True,
)

train_dl = DataLoader(
    train_ds,
    batch_size=64,
    sampler=sampler,
    num_workers=4,
    pin_memory=True,
)
val_dl = DataLoader(
    val_ds,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
num_classes = len(full.classes)


  from .autonotebook import tqdm as notebook_tqdm


Using dataset root: C:\Users\22913\.cache\kagglehub\datasets\alistairking\recyclable-and-household-waste-classification\versions\1\images\images
Found 30 classes: ['aerosol_cans', 'aluminum_food_cans', 'aluminum_soda_cans', 'cardboard_boxes', 'cardboard_packaging']...


In [3]:
# Sanity-check the loaded dataset and the split sizes
for label, value in (
    ("Number of classes", num_classes),
    ("Total samples", len(full.samples)),
    ("Train samples", len(tr_idx)),
    ("Val samples", len(va_idx)),
    ("No overlap", set(tr_idx).isdisjoint(va_idx)),
):
    print(f"✓ {label}: {value}")

# Quick check for duplicates (sample first 100 files)
from hashlib import md5
def quick_hash_check(indices, n=100):
    """Return the set of MD5 hex digests of the first ``n`` files in ``indices``.

    ``indices`` are positions into ``full.samples``; each referenced file is
    read in full and hashed. Identical files collapse to a single digest.
    """
    def _digest(sample_idx):
        file_path = full.samples[sample_idx][0]
        with open(file_path, 'rb') as fh:
            return md5(fh.read()).hexdigest()

    return {_digest(sample_idx) for sample_idx in indices[:n]}

# Hash a sample of each split and look for files present in both.
train_hashes = quick_hash_check(tr_idx.tolist())
val_hashes = quick_hash_check(va_idx.tolist())
duplicates = train_hashes.intersection(val_hashes)
print(f"✓ Duplicate images in sample (should be 0): {len(duplicates)}")

# A single class usually means ImageFolder was pointed one level too high.
if num_classes == 1:
    print("⚠️  WARNING: Only 1 class detected - check dataset path!")
if duplicates:
    print(f"⚠️  WARNING: Found {len(duplicates)} duplicate images between train/val!")


✓ Number of classes: 30
✓ Total samples: 15000
✓ Train samples: 12000
✓ Val samples: 3000
✓ No overlap: True
✓ Duplicate images in sample (should be 0): 0


In [4]:
import timm, torch
from torch import nn

def make_model(name, num_classes):
    """Create a pretrained timm model called ``name`` with ``num_classes`` outputs.

    Dropout (0.2) and stochastic-depth (0.1) rates are fixed here and passed
    straight through to ``timm.create_model``.
    """
    return timm.create_model(
        name,
        pretrained=True,
        num_classes=num_classes,
        drop_rate=0.2,
        drop_path_rate=0.1,
    )

# Instantiate both MobileNetV3 variants with the same head size.
m_small, m_large = (
    make_model(arch, num_classes)
    for arch in ("mobilenetv3_small_100", "mobilenetv3_large_100")
)


In [5]:
from torch.amp import autocast, GradScaler

def train_model(model, train_dl, val_dl, epochs=15, lr=5e-4, wd=0.05, device="cuda"):
    model.to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.1, total_iters=3)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs-3)
    sched = torch.optim.lr_scheduler.SequentialLR(opt, [warmup, cosine], milestones=[3])
    crit = nn.CrossEntropyLoss(label_smoothing=0.1)
    scaler = GradScaler('cuda', enabled=(device.startswith("cuda")))
    best = {"f1": -1, "state": None}

    for ep in range(epochs):
        model.train()
        for x,y in train_dl:
            x,y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            opt.zero_grad(set_to_none=True)
            with autocast('cuda', enabled=(device.startswith("cuda"))):
                logits = model(x)
                loss = crit(logits, y)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
        sched.step()

        # eval
        model.eval()
        preds, gts = [], []
        with torch.no_grad():
            for x,y in val_dl:
                x = x.to(device, non_blocking=True)
                logits = model(x)
                preds.append(logits.argmax(1).cpu())
                gts.append(y)
        import numpy as np
        from sklearn.metrics import f1_score, accuracy_score
        p = torch.cat(preds).numpy(); g = torch.cat(gts).numpy()
        f1 = f1_score(g, p, average="macro"); acc = accuracy_score(g, p)
        if f1 > best["f1"]:
            best = {"f1": f1, "state": model.state_dict()}
        print(f"ep {ep+1}: acc {acc:.4f}  macroF1 {f1:.4f}")
    model.load_state_dict(best["state"])
    return model


In [6]:
# Add Vision Transformer models
vit_tiny = make_model("vit_tiny_patch16_224", num_classes)
vit_small = make_model("vit_small_patch16_224", num_classes)

# Report the parameter count (in millions) of every candidate model.
for label, net in (
    ("MobileNetV3-Small", m_small),
    ("MobileNetV3-Large", m_large),
    ("ViT-Tiny", vit_tiny),
    ("ViT-Small", vit_small),
):
    n_params = sum(p.numel() for p in net.parameters())
    print(f"✓ {label} params: {n_params/1e6:.2f}M")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✓ MobileNetV3-Small params: 1.55M
✓ MobileNetV3-Large params: 4.24M
✓ ViT-Tiny params: 5.53M
✓ ViT-Small params: 21.68M


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
import torch

# Report whether a CUDA device is visible and, if so, which one.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CPU")


True
NVIDIA GeForce GTX 1650


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# train_model returns the object it was given, so `small` / `large` are the
# same modules as `m_small` / `m_large`, trained in place.
# NOTE(review): in the recorded output the second call stayed at acc 0.0333
# for all 20 epochs (chance level for 30 classes) - investigate before using
# `large`; possibly non-finite values under fp16 autocast, TODO confirm.
small, large = (
    train_model(net, train_dl, val_dl, epochs=20, device=device)
    for net in (m_small, m_large)
)


ep 1: acc 0.5863  macroF1 0.5823
ep 2: acc 0.7217  macroF1 0.7234




ep 3: acc 0.7083  macroF1 0.7077
ep 4: acc 0.7120  macroF1 0.7156
ep 5: acc 0.7173  macroF1 0.7183
ep 6: acc 0.7157  macroF1 0.7283
ep 7: acc 0.7737  macroF1 0.7765
ep 8: acc 0.8003  macroF1 0.8009
ep 9: acc 0.8093  macroF1 0.8089
ep 10: acc 0.8117  macroF1 0.8107
ep 11: acc 0.8130  macroF1 0.8145
ep 12: acc 0.8453  macroF1 0.8454
ep 13: acc 0.8427  macroF1 0.8430
ep 14: acc 0.8620  macroF1 0.8619
ep 15: acc 0.8507  macroF1 0.8507
ep 16: acc 0.8577  macroF1 0.8576
ep 17: acc 0.8673  macroF1 0.8670
ep 18: acc 0.8647  macroF1 0.8642
ep 19: acc 0.8660  macroF1 0.8662
ep 20: acc 0.8640  macroF1 0.8639
ep 1: acc 0.0333  macroF1 0.0022
ep 2: acc 0.0333  macroF1 0.0022




ep 3: acc 0.0333  macroF1 0.0022
ep 4: acc 0.0333  macroF1 0.0022
ep 5: acc 0.0333  macroF1 0.0022
ep 6: acc 0.0333  macroF1 0.0022
ep 7: acc 0.0333  macroF1 0.0022
ep 8: acc 0.0333  macroF1 0.0022
ep 9: acc 0.0333  macroF1 0.0022
ep 10: acc 0.0333  macroF1 0.0022
ep 11: acc 0.0333  macroF1 0.0022
ep 12: acc 0.0333  macroF1 0.0022
ep 13: acc 0.0333  macroF1 0.0022
ep 14: acc 0.0333  macroF1 0.0022
ep 15: acc 0.0333  macroF1 0.0022
ep 16: acc 0.0333  macroF1 0.0022
ep 17: acc 0.0333  macroF1 0.0022
ep 18: acc 0.0333  macroF1 0.0022
ep 19: acc 0.0333  macroF1 0.0022
ep 20: acc 0.0333  macroF1 0.0022


In [None]:
# Train Vision Transformer models
# Note: a lower learning rate than the MobileNet runs is used here; ViT may
# also benefit from a longer warmup.
print("Training ViT-Tiny...")
vit_tiny_trained = train_model(
    vit_tiny,
    train_dl,
    val_dl,
    epochs=20,
    lr=3e-4,
    wd=0.05,
    device=device,
)

print("\nTraining ViT-Small...")
vit_small_trained = train_model(
    vit_small,
    train_dl,
    val_dl,
    epochs=20,
    lr=3e-4,
    wd=0.05,
    device=device,
)


Training ViT-Tiny...
ep 1: acc 0.5583  macroF1 0.5322
ep 2: acc 0.7740  macroF1 0.7717




ep 3: acc 0.7943  macroF1 0.7901
ep 4: acc 0.8073  macroF1 0.8065
ep 5: acc 0.8263  macroF1 0.8266
ep 6: acc 0.8310  macroF1 0.8291
ep 7: acc 0.8460  macroF1 0.8466
ep 8: acc 0.8427  macroF1 0.8413
ep 9: acc 0.8560  macroF1 0.8548
ep 10: acc 0.8583  macroF1 0.8567
ep 11: acc 0.8633  macroF1 0.8624
ep 12: acc 0.8647  macroF1 0.8637
ep 13: acc 0.8713  macroF1 0.8711
ep 14: acc 0.8793  macroF1 0.8776
ep 15: acc 0.8740  macroF1 0.8734
ep 16: acc 0.8830  macroF1 0.8813
ep 17: acc 0.8820  macroF1 0.8812
ep 18: acc 0.8900  macroF1 0.8896
ep 19: acc 0.8867  macroF1 0.8858
ep 20: acc 0.8877  macroF1 0.8870

Training ViT-Small...
ep 1: acc 0.7557  macroF1 0.7481
ep 2: acc 0.8303  macroF1 0.8242




ep 3: acc 0.8410  macroF1 0.8363
ep 4: acc 0.8230  macroF1 0.8236
ep 5: acc 0.8440  macroF1 0.8402
ep 6: acc 0.8530  macroF1 0.8515


In [None]:
# Compare inference speed across all models
import time, torch
def ms_per_image(model, device="cuda", n_warmup=5, n_iters=50):
    """Return the mean forward-pass latency in milliseconds for one 224x224 image.

    The model is switched to eval mode, moved to ``device`` and run on a random
    ``(1, 3, 224, 224)`` input: ``n_warmup`` untimed passes followed by
    ``n_iters`` timed passes.

    Args:
        model: the ``nn.Module`` to time (moved to ``device`` in place).
        device: device string or ``torch.device``.
        n_warmup: number of untimed warm-up passes.
        n_iters: number of timed passes the result is averaged over.

    Returns:
        Average wall-clock milliseconds per forward pass.
    """
    model.eval().to(device)
    # str() + startswith so "cuda:0" and torch.device("cuda") also synchronise.
    on_cuda = str(device).startswith("cuda")
    x = torch.randn(1, 3, 224, 224, device=device)
    # Inference only: without no_grad every pass would also record an autograd graph.
    with torch.no_grad():
        # warmup
        for _ in range(n_warmup):
            model(x)
        if on_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        for _ in range(n_iters):
            model(x)
        # Wait for queued GPU work before stopping the clock.
        if on_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - t0
    return 1000 * elapsed / n_iters

print("=== Inference Speed Comparison ===")
# Prefixes are padded by hand so the timings line up in one column.
for prefix, net in (
    ("MobileNetV3-Small: ", small),
    ("MobileNetV3-Large: ", large),
    ("ViT-Tiny:          ", vit_tiny_trained),
    ("ViT-Small:         ", vit_small_trained),
):
    print(f"{prefix}{ms_per_image(net, device):.2f} ms/img")

# Get final validation metrics for comparison
def get_final_metrics(model, val_dl, device):
    """Evaluate ``model`` on ``val_dl`` and return ``(accuracy, macro_f1)``.

    The model is put in eval mode and moved to ``device``; predictions are the
    argmax over dimension 1 of the model output, collected on the CPU.
    """
    from sklearn.metrics import f1_score, accuracy_score

    model.eval().to(device)
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for images, labels in val_dl:
            batch_logits = model(images.to(device, non_blocking=True))
            pred_chunks.append(batch_logits.argmax(1).cpu())
            label_chunks.append(labels)

    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(label_chunks).numpy()
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return accuracy, macro_f1

print("\n=== Final Validation Performance ===")
# One summary row per trained model: accuracy, macro-F1 and size in millions of parameters.
for name, model in zip(
    ("MobileNetV3-Small", "MobileNetV3-Large", "ViT-Tiny", "ViT-Small"),
    (small, large, vit_tiny_trained, vit_small_trained),
):
    params = sum(p.numel() for p in model.parameters())/1e6
    acc, f1 = get_final_metrics(model, val_dl, device)
    print(f"{name:20s} | Acc: {acc:.4f} | F1: {f1:.4f} | Params: {params:.2f}M")


In [None]:
# Save Vision Transformer models (if they perform well)
# Each checkpoint stores the timm architecture name, the class list and the weights.
for arch, net, out_path in (
    ("vit_tiny_patch16_224", vit_tiny_trained, "vit_tiny.pt"),    # ViT-Tiny
    ("vit_small_patch16_224", vit_small_trained, "vit_small.pt"),  # ViT-Small
):
    torch.save(
        {
            "model": arch,
            "classes": full.classes,
            "state_dict": net.state_dict(),
        },
        out_path,
    )

# Optional: Export best performing ViT to ONNX
# Uncomment to export ViT-Tiny to ONNX
# dummy = torch.randn(1,3,224,224)
# vit_tiny_trained.eval().cpu()
# torch.onnx.export(vit_tiny_trained, dummy, "vit_tiny.onnx", 
#                   input_names=["input"], output_names=["logits"], opset_version=17)

print("✓ Saved ViT models as vit_tiny.pt and vit_small.pt")


In [None]:
import time, torch
def ms_per_image(model, device="cuda", n_warmup=5, n_iters=50):
    """Return the mean forward-pass latency in milliseconds for one 224x224 image.

    The model is switched to eval mode, moved to ``device`` and run on a random
    ``(1, 3, 224, 224)`` input: ``n_warmup`` untimed passes followed by
    ``n_iters`` timed passes.

    Args:
        model: the ``nn.Module`` to time (moved to ``device`` in place).
        device: device string or ``torch.device``.
        n_warmup: number of untimed warm-up passes.
        n_iters: number of timed passes the result is averaged over.

    Returns:
        Average wall-clock milliseconds per forward pass.
    """
    model.eval().to(device)
    # str() + startswith so "cuda:0" and torch.device("cuda") also synchronise.
    on_cuda = str(device).startswith("cuda")
    x = torch.randn(1, 3, 224, 224, device=device)
    # Inference only: without no_grad every pass would also record an autograd graph.
    with torch.no_grad():
        # warmup
        for _ in range(n_warmup):
            model(x)
        if on_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        for _ in range(n_iters):
            model(x)
        # Wait for queued GPU work before stopping the clock.
        if on_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - t0
    return 1000 * elapsed / n_iters

# Latency of the two MobileNetV3 variants on the selected device.
for label, net in (("Small", small), ("Large", large)):
    print(f"{label} ms/img:", ms_per_image(net, device))


Small ms/img: 11.293988227844238
Large ms/img: 12.904634475708008


In [None]:
# Checkpoint: timm architecture name, class list and weights.
torch.save(
    {
        "model": "mobilenetv3_small_100",
        "classes": full.classes,
        "state_dict": small.state_dict(),
    },
    "mobilenetv3_small.pt",
)

# ONNX for CPU apps: the model is moved to the CPU and traced with a single
# 1x3x224x224 dummy input.
small.eval().cpu()
dummy = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    small,
    dummy,
    "mobilenetv3_small.onnx",
    input_names=["input"],
    output_names=["logits"],
    opset_version=17,
)
