# Garbage Classification — Data Preprocessing & Inspection

In this notebook, we will perform the full **data preprocessing and inspection** steps to make the dataset ready for modeling.  
We aim to satisfy the following criteria:

- Load & inspect the dataset (≈ 13.9k images)  
- Confirm the 6 classes (`plastic`, `metal`, `glass`, `cardboard`, `paper`, `trash`)  
- Verify class balance (≈ 2,300–2,500 images per class)  
- Detect (and optionally remove or flag) duplicates  
- Confirm image sizes and color channels (expected: 256×256, 3 channels RGB)  
- Ensure labels align correctly with image files  
- Split into train / validation / (test) sets, with stratification  
- Normalize / standardize pixel values (record method)  
- (Optional) Set up data augmentation  
- Build a pipeline or loader to ensure a batch can go through a baseline CNN  

We’ll break this into sections.

---

## 0. Download Kaggle Garbage Images Dataset


In [10]:
from google.colab import files
# Upload your kaggle.json
    # Only needs to be done once
    # If you have not uploaded kaggle.json file here before,
        # follow instructions below on acquiring kaggle.json
files.upload()
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot stat 'kaggle.json': No such file or directory


In [11]:
!pip install -q kaggle

import os

# Make sure the Kaggle API key is available
if not os.path.exists("/root/.kaggle/kaggle.json"):
    print("UPLOAD KAGGLE.JSON FILE ABOVE")
    print("WATCH VIDEO I (AGUSTIN) SENT IN GROUP CHAT ON GETTING KAGGLE.JSON")

# Create data directory if not exists
os.makedirs("data", exist_ok=True)

# Download + unzip only if not already present
if not os.path.exists("data/Garbage_Dataset_Classification"):
    !kaggle datasets download -d zlatan599/garbage-dataset-classification -p data/
    !unzip -q data/garbage-dataset-classification.zip -d data/
    print("Dataset downloaded and extracted!")
else:
    print("Dataset already exists, skipping download.")

Dataset already exists, skipping download.


## 1. Setup & Imports (install if not already done)

In [4]:
%pip install imagededup



In [5]:
# Required libraries
import os
from pathlib import Path
from collections import Counter
import random
import hashlib

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# For splitting
from sklearn.model_selection import train_test_split

# For deduplication
from imagededup.methods import PHash

## 2. Define dataset root & discover classes (change path in codeblock)

In [6]:
# Adjust this path to where your dataset is unzipped
dataset_root = Path("/Users/hadibhidya/Desktop/cs325/project/Garbage_Dataset_Classification/images")

assert dataset_root.exists(), f"Dataset root not found: {dataset_root}"

# List subdirectories as candidate classes
classes = [d.name for d in dataset_root.iterdir() if d.is_dir()]
classes = sorted(classes)
print("Found classes:", classes)

AssertionError: Dataset root not found: /Users/hadibhidya/Desktop/cs325/project/Garbage_Dataset_Classification/images

## 3. Confirm expected classes & label consistency

In [None]:
expected = {"plastic", "metal", "glass", "cardboard", "paper", "trash"}
found = set(classes)
print("Expected classes:", expected)
print("Found classes:", found)

if found == expected:
    print("✅ The classes match exactly the expected ones.")
else:
    print("⚠️ Class mismatch.")
    print("Missing:", expected - found)
    print("Extra:", found - expected)


## 4. Count images per class & class balance

In [None]:
# We assume only .jpg and .jpeg
IMG_EXTS = (".jpg", ".jpeg")

class_counts = {}
for cls in classes:
    cls_dir = dataset_root / cls
    imgs = []
    for ext in IMG_EXTS:
        imgs.extend(list(cls_dir.glob(f"*{ext}")))
    # Also check for any unexpected extensions
    other = list(cls_dir.glob("*"))
    others = [p for p in other if p.suffix.lower() not in IMG_EXTS]
    if others:
        print(f"Warning: found {len(others)} files in {cls} with unexpected extension(s): {set(p.suffix for p in others)}")
    class_counts[cls] = len(imgs)

print("Counts per class:")
for cls, cnt in class_counts.items():
    print(f"  {cls}: {cnt}")
counts = np.array(list(class_counts.values()))
print("Total images:", counts.sum())
print("Min , Max , Mean:", counts.min(), ",", counts.max(), ",", counts.mean())


# Step 5: Duplicate Detection, Preview & Cleaned Copy (change path in code block)

In [None]:
# --- Step 5: Duplicate Detection, Preview & Cleaned Copy (self-contained) ---

import shutil
from imagededup.methods import PHash

# Supported extensions
IMG_EXTS = (".jpg", ".jpeg", ".png")

# 1. Run imagededup PHash per class
ph = PHash()
dups_all = {}

for cls in classes:
    cls_dir = dataset_root / cls
    print(f"Encoding class: {cls}")
    encodings = ph.encode_images(image_dir=str(cls_dir), recursive=False)
    dups = ph.find_duplicates(encoding_map=encodings, max_distance_threshold=3) # 3 (≈ 95% similarity)
    dups_all[cls] = dups
    n_dup_keys = len([k for k, v in dups.items() if v])
    print(f"  {n_dup_keys} keys have duplicates in {cls}")

# 2. Collect duplicate pairs across all classes
def collect_duplicate_pairs(dups_dict, cls):
    pairs = []
    for fname, dup_list in dups_dict.items():
        for dup in dup_list:
            pairs.append((cls, fname, dup))
    return pairs

all_pairs = []
for cls, dups in dups_all.items():
    all_pairs.extend(collect_duplicate_pairs(dups, cls))

print(f"\nTotal duplicate pairs found: {len(all_pairs)}")

# 3. Preview first 5 duplicate pairs
def preview_duplicate_pairs(pairs, n=5):
    for idx, (cls, f1, f2) in enumerate(pairs[:n]):
        path1 = dataset_root / cls / f1
        path2 = dataset_root / cls / f2

        fig, axes = plt.subplots(1, 2, figsize=(6, 3))
        try:
            img1 = Image.open(path1)
            img2 = Image.open(path2)

            axes[0].imshow(img1)
            axes[0].set_title(f"{f1}", fontsize=8)
            axes[0].axis("off")

            axes[1].imshow(img2)
            axes[1].set_title(f"{f2}", fontsize=8)
            axes[1].axis("off")

            plt.suptitle(f"Class: {cls} — Duplicate Pair {idx}")
            plt.show()
        except Exception as e:
            print(f"Error loading {f1}, {f2}:", e)

preview_duplicate_pairs(all_pairs, n=5)

# 4. Build cleaned dataset copy
cleaned_root = Path("/Users/hadibhidya/Desktop/cs325/project/Garbage_Dataset_Classification/images_cleaned")
cleaned_root.mkdir(parents=True, exist_ok=True)

# Mark duplicates to remove (always second file in pair)
to_remove = set([p[2] for p in all_pairs])
print("Total duplicate files to remove:", len(to_remove))

# Before counts
print("\nClass counts BEFORE cleaning:")
for cls in classes:
    total = sum(len(list((dataset_root/cls).glob(f"*{ext}"))) for ext in IMG_EXTS)
    print(f"  {cls}: {total}")

# Copy files, skipping duplicates
for cls in classes:
    src_dir = dataset_root / cls
    dst_dir = cleaned_root / cls
    dst_dir.mkdir(parents=True, exist_ok=True)

    for ext in IMG_EXTS:
        for file in src_dir.glob(f"*{ext}"):
            if file.name not in to_remove:
                shutil.copy(file, dst_dir / file.name)

# After counts
print("\nClass counts AFTER cleaning:")
for cls in classes:
    total = sum(len(list((cleaned_root/cls).glob(f"*{ext}"))) for ext in IMG_EXTS)
    print(f"  {cls}: {total}")

print("\n✅ Cleaned dataset created at:", cleaned_root)


# Step 6: Confirm Image Sizes and Color Channels

In [None]:
shape_counter = Counter()
channel_counter = Counter()
bad_images = []

for cls in classes:
    cls_dir = cleaned_root / cls   # use the CLEANED dataset
    for ext in IMG_EXTS:
        for p in cls_dir.glob(f"*{ext}"):
            try:
                with Image.open(p) as img:
                    arr = np.array(img)
                shape_counter[arr.shape] += 1
                if arr.ndim == 3:
                    channel_counter[arr.shape[2]] += 1
                else:
                    channel_counter[1] += 1
            except Exception as e:
                bad_images.append((p, str(e)))

print("Image shape distribution (H, W, [C]):")
for shp, cnt in shape_counter.items():
    print(f"  {shp}: {cnt} images")

print("\nChannel counts:")
for c, cnt in channel_counter.items():
    print(f"  {c} channels: {cnt} images")

print("\nNumber of images that failed to load:", len(bad_images))
if bad_images:
    print("Sample failed images:", bad_images[:5])

# Step 7: Verify Labels Align with Images (using metadata.csv)

In [None]:
import pandas as pd

# 1. Verify every image in cleaned_root is inside the expected class folder
problems = []
for cls in classes:
    cls_dir = cleaned_root / cls
    for ext in IMG_EXTS:
        for p in cls_dir.glob(f"*{ext}"):
            if p.parent.name != cls:
                problems.append((p, p.parent.name, cls))

if problems:
    print(f"⚠️ Found {len(problems)} images in the wrong folder:")
    print(problems[:10])
else:
    print("✅ All images are in the correct class folders.")

# 2. Cross-check with metadata.csv located at the dataset's parent folder
metadata_path = dataset_root.parent / "metadata.csv"

if metadata_path.exists():
    meta = pd.read_csv(metadata_path)
    meta_map = dict(zip(meta.filename, meta.label))

    mismatches = []
    for cls in classes:
        cls_dir = cleaned_root / cls
        for ext in IMG_EXTS:
            for p in cls_dir.glob(f"*{ext}"):
                fname = p.name
                if fname in meta_map and meta_map[fname] != cls:
                    mismatches.append((fname, cls, meta_map[fname]))

    if mismatches:
        print(f"⚠️ Found {len(mismatches)} mismatches with metadata.csv:")
        print(mismatches[:10])
    else:
        print("✅ Folder labels match metadata.csv for all files checked.")
else:
    print("⚠️ metadata.csv not found at:", metadata_path)

# Step 8: Stratified Split using StratifiedShuffleSplit (80/10/10)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter

# Rebuild data list from cleaned dataset
data = []
for cls in classes:
    cls_dir = cleaned_root / cls
    for ext in IMG_EXTS:
        for p in cls_dir.glob(f"*{ext}"):
            data.append((p, cls))

print("Total cleaned samples:", len(data))

paths = [p for p, lbl in data]
labels = [lbl for p, lbl in data]

# Choose your split ratios
test_ratio = 0.10
val_ratio = 0.10
train_ratio = 1.0 - (test_ratio + val_ratio)
assert train_ratio > 0, "Make sure ratios sum to less than 1"

# 1. Split off test set
sss = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=42)
for train_valid_idx, test_idx in sss.split(paths, labels):
    pass

train_valid_paths = [paths[i] for i in train_valid_idx]
train_valid_labels = [labels[i] for i in train_valid_idx]
test_paths = [paths[i] for i in test_idx]
test_labels = [labels[i] for i in test_idx]

# 2. Split train_valid into train + validation
rel_val = val_ratio / (train_ratio + val_ratio)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=rel_val, random_state=42)
for train_idx2, val_idx in sss2.split(train_valid_paths, train_valid_labels):
    pass

train_paths = [train_valid_paths[i] for i in train_idx2]
train_labels = [train_valid_labels[i] for i in train_idx2]
val_paths = [train_valid_paths[i] for i in val_idx]
val_labels = [train_valid_labels[i] for i in val_idx]

# Build final splits
train_set = list(zip(train_paths, train_labels))
valid_set = list(zip(val_paths, val_labels))
test_set = list(zip(test_paths, test_labels))

# Print sizes
print("Total:", len(data))
print("Train:", len(train_set), "Validation:", len(valid_set), "Test:", len(test_set))
print()

def print_dist(split, name):
    c = Counter(lbl for _, lbl in split)
    print(f"{name} class counts:")
    for cls in classes:
        print(f"  {cls}: {c.get(cls, 0)}")
    print()

print_dist(train_set, "Train")
print_dist(valid_set, "Validation")
print_dist(test_set, "Test")
