In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler
import numpy as np
from sklearn.model_selection import train_test_split
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from pathlib import Path

# allow jpg + png
def is_img(p):
    """Return True when ``p`` ends with a supported image extension.

    ``p`` may be anything ``str()`` can render (a plain string or a ``Path``);
    the comparison is case-insensitive.
    """
    lowered = str(p).lower()
    return any(lowered.endswith(ext) for ext in (".png", ".jpg", ".jpeg", ".bmp", ".webp"))

# FIX: Point to the correct directory with /images/images
# NOTE(review): machine-specific absolute path into a local kagglehub cache;
# it has to be edited before this notebook can run on another machine.
base_path = Path("C:/Users/hoang/.cache/kagglehub/datasets/alistairking/recyclable-and-household-waste-classification/versions/1")
# Probed in order, most nested first; the first usable candidate wins.
candidates = [
    base_path / "images" / "images",
    base_path / "images",
    base_path,
]

# A candidate qualifies when it is a directory holding at least one
# sub-directory. `is_dir()` (rather than `exists()`) keeps a regular file at a
# candidate path from reaching `iterdir()`, which would raise
# NotADirectoryError instead of falling through to the next candidate.
root = None
for c in candidates:
    if c.is_dir() and any(d.is_dir() for d in c.iterdir()):
        root = c
        break

if root is None:
    raise FileNotFoundError(f"Could not find image folder in: {base_path}")

print(f"Using dataset root: {root}")
# Un-transformed view of the whole dataset; used below for class names and labels.
full = datasets.ImageFolder(root=str(root), transform=None, is_valid_file=is_img)
print(f"Found {len(full.classes)} classes: {full.classes[:5]}...")
# Class index of every sample, in `full.samples` order.
targets = [y for _, y in full.samples]

# stratified split
# 80/20 split of the sample indices, stratified on the labels, with a fixed seed.
tr_idx, va_idx = train_test_split(
    np.arange(len(targets)), test_size=0.2, random_state=56, stratify=targets
)

# Training pipeline: light crop / flip / colour augmentation, then tensor
# conversion and normalisation with the ImageNet statistics imported from timm.
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
])
# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
])

# Each split gets its own ImageFolder so it can carry its own transform;
# the split itself is applied through the precomputed index arrays.
train_ds = Subset(
    datasets.ImageFolder(root=str(root), transform=train_tfms, is_valid_file=is_img),
    tr_idx,
)
val_ds = Subset(
    datasets.ImageFolder(root=str(root), transform=val_tfms, is_valid_file=is_img),
    va_idx,
)

# optional balancing
# Every training sample is weighted by the inverse frequency of its class
# within the training split (counts are floored at 1 to avoid dividing by 0).
train_targets = np.array(targets)[tr_idx]
counts = np.bincount(train_targets)
class_w = 1.0 / np.clip(counts, 1, None)
weights = class_w[train_targets]
sampler = WeightedRandomSampler(weights, num_samples=len(tr_idx), replacement=True)

_loader_opts = {"num_workers": 4, "pin_memory": True}
train_dl = DataLoader(train_ds, batch_size=64, sampler=sampler, **_loader_opts)
val_dl = DataLoader(val_ds, batch_size=128, shuffle=False, **_loader_opts)
num_classes = len(full.classes)


  from .autonotebook import tqdm as notebook_tqdm


Using dataset root: C:\Users\hoang\.cache\kagglehub\datasets\alistairking\recyclable-and-household-waste-classification\versions\1\images\images
Found 30 classes: ['aerosol_cans', 'aluminum_food_cans', 'aluminum_soda_cans', 'cardboard_boxes', 'cardboard_packaging']...


In [2]:
# Verify dataset is loaded correctly
# Prints the class count, the total / train / validation sample counts, and
# whether the train and validation index sets are disjoint.
print(f"✓ Number of classes: {num_classes}")
print(f"✓ Total samples: {len(full.samples)}")
print(f"✓ Train samples: {len(tr_idx)}")
print(f"✓ Val samples: {len(va_idx)}")
print(f"✓ No overlap: {set(tr_idx).isdisjoint(set(va_idx))}")

# Quick check for duplicates (sample first 100 files)
from hashlib import md5
def quick_hash_check(indices, n=100):
    """Return the MD5 hex digests of the files behind the first ``n`` indices.

    Each index selects an entry of ``full.samples``; the first element of that
    entry is opened as a file path and its full contents are hashed. Identical
    files collapse to a single digest because the result is a set.
    """
    def _digest(path):
        with open(path, 'rb') as fh:
            return md5(fh.read()).hexdigest()

    return {_digest(full.samples[i][0]) for i in indices[:n]}

# Hash a sample of each split and intersect the digest sets. With
# quick_hash_check's default n=100 only the first 100 indices of each split
# are compared, and only byte-identical files can match, so a result of 0 is
# a spot check, not proof that the splits share no image.
train_hashes = quick_hash_check(tr_idx.tolist())
val_hashes = quick_hash_check(va_idx.tolist())
duplicates = train_hashes & val_hashes
print(f"✓ Duplicate images in sample (should be 0): {len(duplicates)}")

# Warn when ImageFolder found a single class or when the sampled hashes collide.
if num_classes == 1:
    print("⚠️  WARNING: Only 1 class detected - check dataset path!")
if len(duplicates) > 0:
    print(f"⚠️  WARNING: Found {len(duplicates)} duplicate images between train/val!")


✓ Number of classes: 30
✓ Total samples: 15000
✓ Train samples: 12000
✓ Val samples: 3000
✓ No overlap: True
✓ Duplicate images in sample (should be 0): 0


In [3]:
import timm, torch
from torch import nn

def make_model(name, num_classes, pretrained=True, drop_rate=0.2, drop_path_rate=0.1):
    """Create a timm model by name for ``num_classes`` output classes.

    Args:
        name: model name passed to ``timm.create_model``.
        num_classes: forwarded as ``num_classes``.
        pretrained: forwarded as ``pretrained`` (default True, as before).
        drop_rate: forwarded as ``drop_rate`` (default 0.2, as before).
        drop_path_rate: forwarded as ``drop_path_rate`` (default 0.1, as before).

    Returns:
        Whatever ``timm.create_model`` returns for these arguments.
    """
    # The regularisation settings used to be hard-coded; the defaults keep the
    # previous behaviour for existing two-argument callers.
    return timm.create_model(
        name,
        pretrained=pretrained,
        drop_rate=drop_rate,
        drop_path_rate=drop_path_rate,
        num_classes=num_classes,
    )

# Instantiate the two MobileNetV3 variants through make_model (which requests
# pretrained weights) with num_classes outputs each.
m_small = make_model("mobilenetv3_small_100", num_classes)
m_large = make_model("mobilenetv3_large_100", num_classes)


In [4]:
from torch.amp import autocast, GradScaler

def train_model(model, train_dl, val_dl, epochs=15, lr=5e-4, wd=0.05, device="cuda"):
    model.to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.1, total_iters=3)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs-3)
    sched = torch.optim.lr_scheduler.SequentialLR(opt, [warmup, cosine], milestones=[3])
    crit = nn.CrossEntropyLoss(label_smoothing=0.1)
    scaler = GradScaler('cuda', enabled=(device.startswith("cuda")))
    best = {"f1": -1, "state": None}

    for ep in range(epochs):
        model.train()
        for x,y in train_dl:
            x,y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            opt.zero_grad(set_to_none=True)
            with autocast('cuda', enabled=(device.startswith("cuda"))):
                logits = model(x)
                loss = crit(logits, y)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
        sched.step()

        # eval
        model.eval()
        preds, gts = [], []
        with torch.no_grad():
            for x,y in val_dl:
                x = x.to(device, non_blocking=True)
                logits = model(x)
                preds.append(logits.argmax(1).cpu())
                gts.append(y)
        import numpy as np
        from sklearn.metrics import f1_score, accuracy_score
        p = torch.cat(preds).numpy(); g = torch.cat(gts).numpy()
        f1 = f1_score(g, p, average="macro"); acc = accuracy_score(g, p)
        if f1 > best["f1"]:
            best = {"f1": f1, "state": model.state_dict()}
        print(f"ep {ep+1}: acc {acc:.4f}  macroF1 {f1:.4f}")
    model.load_state_dict(best["state"])
    return model


In [5]:
# Add Vision Transformer models
# Built through the same make_model helper as the MobileNets.
vit_tiny = make_model("vit_tiny_patch16_224", num_classes)
vit_small = make_model("vit_small_patch16_224", num_classes)

# Report the total parameter count of all four models, in millions.
print(f"✓ MobileNetV3-Small params: {sum(p.numel() for p in m_small.parameters())/1e6:.2f}M")
print(f"✓ MobileNetV3-Large params: {sum(p.numel() for p in m_large.parameters())/1e6:.2f}M")
print(f"✓ ViT-Tiny params: {sum(p.numel() for p in vit_tiny.parameters())/1e6:.2f}M")
print(f"✓ ViT-Small params: {sum(p.numel() for p in vit_small.parameters())/1e6:.2f}M")


✓ MobileNetV3-Small params: 1.55M
✓ MobileNetV3-Large params: 4.24M
✓ ViT-Tiny params: 5.53M
✓ ViT-Small params: 21.68M


In [6]:
# Print whether CUDA is available, then the name of GPU 0 (or "CPU" if none).
import torch; print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


True
NVIDIA GeForce RTX 4070 Laptop GPU


In [7]:
# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Both MobileNetV3 models are trained one after the other for 20 epochs on the
# same loaders, using train_model's default learning rate and weight decay.
small = train_model(m_small, train_dl, val_dl, epochs=20, device=device)
large = train_model(m_large, train_dl, val_dl, epochs=20, device=device)


ep 1: acc 0.5367  macroF1 0.5302
ep 2: acc 0.6903  macroF1 0.6902




ep 3: acc 0.7007  macroF1 0.6969
ep 4: acc 0.6977  macroF1 0.6912
ep 5: acc 0.7457  macroF1 0.7425
ep 6: acc 0.7717  macroF1 0.7709
ep 7: acc 0.7870  macroF1 0.7881
ep 8: acc 0.7850  macroF1 0.7842
ep 9: acc 0.7993  macroF1 0.7992
ep 10: acc 0.8243  macroF1 0.8213
ep 11: acc 0.8290  macroF1 0.8294
ep 12: acc 0.8347  macroF1 0.8351
ep 13: acc 0.8387  macroF1 0.8391
ep 14: acc 0.8457  macroF1 0.8453
ep 15: acc 0.8597  macroF1 0.8587
ep 16: acc 0.8590  macroF1 0.8588
ep 17: acc 0.8600  macroF1 0.8597
ep 18: acc 0.8627  macroF1 0.8623
ep 19: acc 0.8673  macroF1 0.8670
ep 20: acc 0.8657  macroF1 0.8655
ep 1: acc 0.5953  macroF1 0.5928
ep 2: acc 0.7707  macroF1 0.7695




ep 3: acc 0.8290  macroF1 0.8282
ep 4: acc 0.8410  macroF1 0.8401
ep 5: acc 0.8583  macroF1 0.8588
ep 6: acc 0.8557  macroF1 0.8553
ep 7: acc 0.8670  macroF1 0.8659
ep 8: acc 0.8773  macroF1 0.8770
ep 9: acc 0.8673  macroF1 0.8674
ep 10: acc 0.8727  macroF1 0.8724
ep 11: acc 0.8730  macroF1 0.8728
ep 12: acc 0.8770  macroF1 0.8768
ep 13: acc 0.8823  macroF1 0.8816
ep 14: acc 0.8803  macroF1 0.8802
ep 15: acc 0.8773  macroF1 0.8771
ep 16: acc 0.8733  macroF1 0.8734
ep 17: acc 0.8790  macroF1 0.8785
ep 18: acc 0.8747  macroF1 0.8746
ep 19: acc 0.8807  macroF1 0.8800
ep 20: acc 0.8800  macroF1 0.8793


In [8]:
# Train Vision Transformer models
# Note: ViT may need slightly lower learning rate and longer warmup
# Only the learning rate is actually lowered here (3e-4 instead of
# train_model's default 5e-4). No warmup setting is passed, so both ViTs use
# the same warmup schedule that train_model applies to every model.
print("Training ViT-Tiny...")
vit_tiny_trained = train_model(vit_tiny, train_dl, val_dl, epochs=20, lr=3e-4, wd=0.05, device=device)

print("\nTraining ViT-Small...")
vit_small_trained = train_model(vit_small, train_dl, val_dl, epochs=20, lr=3e-4, wd=0.05, device=device)


Training ViT-Tiny...
ep 1: acc 0.6180  macroF1 0.6017
ep 2: acc 0.7900  macroF1 0.7871




ep 3: acc 0.7993  macroF1 0.8004
ep 4: acc 0.7923  macroF1 0.7814
ep 5: acc 0.8083  macroF1 0.8054
ep 6: acc 0.8290  macroF1 0.8288
ep 7: acc 0.8433  macroF1 0.8427
ep 8: acc 0.8423  macroF1 0.8398
ep 9: acc 0.8423  macroF1 0.8428
ep 10: acc 0.8677  macroF1 0.8680
ep 11: acc 0.8530  macroF1 0.8513
ep 12: acc 0.8623  macroF1 0.8614
ep 13: acc 0.8753  macroF1 0.8744
ep 14: acc 0.8727  macroF1 0.8719
ep 15: acc 0.8843  macroF1 0.8838
ep 16: acc 0.8830  macroF1 0.8820
ep 17: acc 0.8823  macroF1 0.8816
ep 18: acc 0.8840  macroF1 0.8832
ep 19: acc 0.8857  macroF1 0.8853
ep 20: acc 0.8867  macroF1 0.8863

Training ViT-Small...
ep 1: acc 0.7607  macroF1 0.7554
ep 2: acc 0.8480  macroF1 0.8472




ep 3: acc 0.8463  macroF1 0.8445
ep 4: acc 0.8247  macroF1 0.8155
ep 5: acc 0.8470  macroF1 0.8414
ep 6: acc 0.8397  macroF1 0.8381
ep 7: acc 0.8583  macroF1 0.8582
ep 8: acc 0.8563  macroF1 0.8553
ep 9: acc 0.8697  macroF1 0.8692
ep 10: acc 0.8817  macroF1 0.8817
ep 11: acc 0.8710  macroF1 0.8716
ep 12: acc 0.8743  macroF1 0.8743
ep 13: acc 0.8880  macroF1 0.8877
ep 14: acc 0.8900  macroF1 0.8893
ep 15: acc 0.8943  macroF1 0.8934
ep 16: acc 0.8893  macroF1 0.8882
ep 17: acc 0.8930  macroF1 0.8929
ep 18: acc 0.8910  macroF1 0.8903
ep 19: acc 0.8927  macroF1 0.8922
ep 20: acc 0.8937  macroF1 0.8932


In [9]:
# Compare inference speed across all models
import time, torch
def ms_per_image(model, device="cuda", warmup=5, iters=50):
    """Return the mean forward-pass time of ``model`` in ms for one image.

    The model is switched to eval mode, moved to ``device`` and fed the same
    random ``1x3x224x224`` input ``warmup`` times untimed and then ``iters``
    times timed. All passes run under ``torch.no_grad()`` so no autograd graph
    is built, which would otherwise add overhead to the measurement.

    Args:
        model: ``nn.Module`` accepting a ``(1, 3, 224, 224)`` float tensor.
        device: device string or ``torch.device`` to run on.
        warmup: number of untimed passes before timing starts.
        iters: number of timed passes; must be at least 1.

    Returns:
        Average wall-clock milliseconds per forward pass.

    Raises:
        ValueError: if ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError("iters must be >= 1")

    # Covers "cuda", "cuda:0" and torch.device objects alike.
    on_cuda = str(device).startswith("cuda")
    model.eval().to(device)
    x = torch.randn(1, 3, 224, 224, device=device)

    with torch.no_grad():
        # warmup
        for _ in range(warmup):
            model(x)
        # CUDA kernels run asynchronously: wait for them before and after
        # the timed region so the clock covers the real work.
        if on_cuda:
            torch.cuda.synchronize(torch.device(device))
        t0 = time.perf_counter()
        for _ in range(iters):
            model(x)
        if on_cuda:
            torch.cuda.synchronize(torch.device(device))
        elapsed = time.perf_counter() - t0
    return 1000 * elapsed / iters

# Time each trained model with ms_per_image (single-image input on `device`)
# and print the result rounded to two decimals.
print("=== Inference Speed Comparison ===")
print(f"MobileNetV3-Small: {ms_per_image(small, device):.2f} ms/img")
print(f"MobileNetV3-Large: {ms_per_image(large, device):.2f} ms/img")
print(f"ViT-Tiny:          {ms_per_image(vit_tiny_trained, device):.2f} ms/img")
print(f"ViT-Small:         {ms_per_image(vit_small_trained, device):.2f} ms/img")

# Get final validation metrics for comparison
def get_final_metrics(model, val_dl, device):
    """Evaluate ``model`` on ``val_dl`` and return ``(accuracy, macro_f1)``.

    The model is put in eval mode and moved to ``device``. Predictions are the
    argmax over dimension 1 of the model output, collected on the CPU and
    compared with the labels yielded by ``val_dl``.
    """
    model.eval().to(device)
    predicted, labels = [], []
    with torch.no_grad():
        for inputs, batch_labels in val_dl:
            batch_logits = model(inputs.to(device, non_blocking=True))
            predicted.append(batch_logits.argmax(1).cpu())
            labels.append(batch_labels)
    from sklearn.metrics import f1_score, accuracy_score
    y_pred = torch.cat(predicted).numpy()
    y_true = torch.cat(labels).numpy()
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return accuracy, macro_f1

# Re-evaluate every trained model on val_dl and print one row per model:
# accuracy, macro-F1 and parameter count in millions.
print("\n=== Final Validation Performance ===")
for name, model in [("MobileNetV3-Small", small), ("MobileNetV3-Large", large), 
                     ("ViT-Tiny", vit_tiny_trained), ("ViT-Small", vit_small_trained)]:
    acc, f1 = get_final_metrics(model, val_dl, device)
    params = sum(p.numel() for p in model.parameters())/1e6
    print(f"{name:20s} | Acc: {acc:.4f} | F1: {f1:.4f} | Params: {params:.2f}M")


=== Inference Speed Comparison ===
MobileNetV3-Small: 5.70 ms/img
MobileNetV3-Large: 6.50 ms/img
ViT-Tiny:          6.02 ms/img
ViT-Small:         5.42 ms/img

=== Final Validation Performance ===
MobileNetV3-Small    | Acc: 0.8657 | F1: 0.8655 | Params: 1.55M
MobileNetV3-Large    | Acc: 0.8800 | F1: 0.8793 | Params: 4.24M
ViT-Tiny             | Acc: 0.8867 | F1: 0.8863 | Params: 5.53M
ViT-Small            | Acc: 0.8937 | F1: 0.8932 | Params: 21.68M


In [10]:
# Save Vision Transformer models (if they perform well)
# Each checkpoint stores the timm model name, the class list and the trained
# weights; rows are (model name, trained model, output file).
vit_checkpoints = [
    ("vit_tiny_patch16_224", vit_tiny_trained, "vit_tiny.pt"),     # Save ViT-Tiny
    ("vit_small_patch16_224", vit_small_trained, "vit_small.pt"),  # Save ViT-Small
]
for arch_name, trained_net, out_file in vit_checkpoints:
    torch.save(
        {
            "model": arch_name,
            "classes": full.classes,
            "state_dict": trained_net.state_dict(),
        },
        out_file,
    )

# Optional: Export best performing ViT to ONNX
# Uncomment to export ViT-Tiny to ONNX
# dummy = torch.randn(1,3,224,224)
# vit_tiny_trained.eval().cpu()
# torch.onnx.export(vit_tiny_trained, dummy, "vit_tiny.onnx",
#                   input_names=["input"], output_names=["logits"], opset_version=17)

print("✓ Saved ViT models as vit_tiny.pt and vit_small.pt")


✓ Saved ViT models as vit_tiny.pt and vit_small.pt


In [11]:
import time, torch
def ms_per_image(model, device="cuda", warmup=5, iters=50):
    """Return the mean forward-pass time of ``model`` in ms for one image.

    The model is switched to eval mode, moved to ``device`` and fed the same
    random ``1x3x224x224`` input ``warmup`` times untimed and then ``iters``
    times timed. All passes run under ``torch.no_grad()`` so no autograd graph
    is built, which would otherwise add overhead to the measurement.

    Args:
        model: ``nn.Module`` accepting a ``(1, 3, 224, 224)`` float tensor.
        device: device string or ``torch.device`` to run on.
        warmup: number of untimed passes before timing starts.
        iters: number of timed passes; must be at least 1.

    Returns:
        Average wall-clock milliseconds per forward pass.

    Raises:
        ValueError: if ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError("iters must be >= 1")

    # Covers "cuda", "cuda:0" and torch.device objects alike.
    on_cuda = str(device).startswith("cuda")
    model.eval().to(device)
    x = torch.randn(1, 3, 224, 224, device=device)

    with torch.no_grad():
        # warmup
        for _ in range(warmup):
            model(x)
        # CUDA kernels run asynchronously: wait for them before and after
        # the timed region so the clock covers the real work.
        if on_cuda:
            torch.cuda.synchronize(torch.device(device))
        t0 = time.perf_counter()
        for _ in range(iters):
            model(x)
        if on_cuda:
            torch.cuda.synchronize(torch.device(device))
        elapsed = time.perf_counter() - t0
    return 1000 * elapsed / iters

# Benchmark the two MobileNetV3 models only; values are printed unrounded.
print("Small ms/img:", ms_per_image(small, device))
print("Large ms/img:", ms_per_image(large, device))


Small ms/img: 6.42244815826416
Large ms/img: 7.686314582824707


In [14]:
# Checkpoint for the small MobileNetV3: timm model name, class list and weights.
small_checkpoint = {
    "model": "mobilenetv3_small_100",
    "classes": full.classes,
    "state_dict": small.state_dict(),
}
torch.save(small_checkpoint, "mobilenetv3_small.pt")

# ONNX for CPU apps
# The model is switched to eval mode and moved to the CPU (in place) and
# exported with a single 1x3x224x224 random input.
small.eval().cpu()
dummy = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    small,
    dummy,
    "mobilenetv3_small.onnx",
    input_names=["input"],
    output_names=["logits"],
    opset_version=17,
)
