# Assignment 1 — Fish Dataset (KaggleHub) — EDA + KFold CNN Baseline

This notebook keeps the same workflow as before (EDA → model → KFold evaluation),
with the **minimal changes needed** to use the Kaggle dataset `crowww/a-large-scale-fish-dataset` via `kagglehub`.


In [1]:
# If you get ModuleNotFoundError, run this cell once
%pip -q install kagglehub torch torchvision scikit-learn matplotlib pillow numpy tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import random
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
from torch import nn
from torch.utils.data import DataLoader, Subset
from torchvision import transforms
from torchvision.datasets import ImageFolder

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm.auto import tqdm

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


'cpu'

## 1) Download dataset with KaggleHub
KaggleHub downloads the dataset to a local cache and returns the path.


In [6]:
import kagglehub

path = kagglehub.dataset_download("crowww/a-large-scale-fish-dataset")
dataset_root = Path(path)
print("Path to dataset files:", dataset_root)

# The images are typically under Fish_Dataset/
DATA_DIR = dataset_root / "Fish_Dataset" / "Fish_Dataset"
print("DATA_DIR:", DATA_DIR)
print("DATA_DIR exists:", DATA_DIR.exists())

# If this is False (dataset structure changed), inspect subfolders and set DATA_DIR accordingly.
if not DATA_DIR.exists():
    print("Subfolders under dataset_root:")
    for p in dataset_root.iterdir():
        if p.is_dir():
            print(" -", p.name)

Path to dataset files: /Users/danbrima/.cache/kagglehub/datasets/crowww/a-large-scale-fish-dataset/versions/2
DATA_DIR: /Users/danbrima/.cache/kagglehub/datasets/crowww/a-large-scale-fish-dataset/versions/2/Fish_Dataset/Fish_Dataset
DATA_DIR exists: True


## 2) EDA
We compute:
- number of classes
- total images
- class distribution
- sample image size/mode
- show example images per class


In [7]:
def get_class_names(root_dir: Path):
    return sorted([d.name for d in root_dir.iterdir() if d.is_dir()])


def get_image_paths(root_dir: Path):
    exts = (".jpg", ".jpeg", ".png")
    return [p for p in root_dir.glob("*/*") if p.is_file() and p.suffix.lower() in exts]


def inspect_image(image_path: Path):
    with Image.open(image_path) as img:
        return img.size, img.mode


def plot_class_distribution(counter: Counter, title: str):
    plt.figure(figsize=(12, 4))
    plt.bar(counter.keys(), counter.values())
    plt.xticks(rotation=90)
    plt.title(title)
    plt.ylabel("Number of images")
    plt.tight_layout()
    plt.show()


def show_samples_per_class(root_dir: Path, samples_per_class=2):
    class_dirs = get_class_names(root_dir)
    plt.figure(figsize=(samples_per_class * 3, len(class_dirs) * 3))
    plot_idx = 1

    for cls in class_dirs:
        images = [p for p in (root_dir / cls).iterdir() if p.is_file()
                  and p.suffix.lower() in (".jpg", ".jpeg", ".png")]
        if not images:
            continue
        samples = random.sample(images, min(samples_per_class, len(images)))

        for img_path in samples:
            with Image.open(img_path) as img:
                plt.subplot(len(class_dirs), samples_per_class, plot_idx)
                plt.imshow(img)
                plt.axis("off")
                plt.title(cls)
                plot_idx += 1

    plt.tight_layout()
    plt.show()

In [8]:
all_images = get_image_paths(DATA_DIR)
classes = get_class_names(DATA_DIR)

print("[DATASET SIZE]")
print("Total images:", len(all_images))

print("\n[CLASSES]")
print("Number of classes:", len(classes))
print(classes)

sample_size, sample_mode = inspect_image(all_images[0])
print("\n[SAMPLE IMAGE INFO]")
print("Resolution (W x H):", sample_size)
print("Color mode:", sample_mode)

[DATASET SIZE]
Total images: 0

[CLASSES]
Number of classes: 9
['Black Sea Sprat', 'Gilt-Head Bream', 'Hourse Mackerel', 'Red Mullet', 'Red Sea Bream', 'Sea Bass', 'Shrimp', 'Striped Red Mullet', 'Trout']


IndexError: list index out of range

In [None]:
labels = [p.parent.name for p in all_images]
class_counts = Counter(labels)

print("[CLASS DISTRIBUTION]")
print("Min class size:", min(class_counts.values()))
print("Max class size:", max(class_counts.values()))

plot_class_distribution(class_counts, "Class distribution (Fish dataset)")

In [None]:
show_samples_per_class(DATA_DIR, samples_per_class=2)

## 3) Model + KFold Cross Validation (K ≥ 5)

We keep the same simple CNN baseline. Main changes vs the zip-based dataset:
- use `ImageFolder(DATA_DIR, ...)`
- build train/val subsets using `StratifiedKFold` over `dataset.samples`


In [None]:
IMG_SIZE = 224

train_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    # Random horizontal flipping is used as a data augmentation technique.
    # For fish species classification, left/right orientation does not change the label,
    # so flipping helps the model learn orientation-invariant features.
    # This increases data diversity, reduces overfitting, and improves generalization.
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
])

eval_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

base_ds_for_targets = ImageFolder(DATA_DIR, transform=eval_tf)
num_classes = len(base_ds_for_targets.classes)
print("num_classes:", num_classes)
print("classes:", base_ds_for_targets.classes)

In [None]:
class SmallCNN(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * (IMG_SIZE//8) * (IMG_SIZE//8), 256), nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

In [None]:
def run_epoch(model, loader, criterion, optimizer=None):
    is_train = optimizer is not None
    model.train() if is_train else model.eval()

    losses, preds, targets = [], [], []

    with torch.set_grad_enabled(is_train):
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            losses.append(loss.item())
            preds.append(logits.argmax(dim=1).detach().cpu().numpy())
            targets.append(y.detach().cpu().numpy())

    preds = np.concatenate(preds)
    targets = np.concatenate(targets)
    return float(np.mean(losses)), accuracy_score(targets, preds), preds, targets


def plot_history(hist, title=""):
    plt.figure()
    plt.plot(hist["train_loss"], label="train_loss")
    plt.plot(hist["val_loss"], label="val_loss")
    plt.legend()
    plt.title(title + " loss")
    plt.show()

    plt.figure()
    plt.plot(hist["train_acc"], label="train_acc")
    plt.plot(hist["val_acc"], label="val_acc")
    plt.legend()
    plt.title(title + " accuracy")
    plt.show()

In [None]:
K = 5
epochs = 8
batch_size = 32
lr = 1e-3

targets = np.array([y for _, y in base_ds_for_targets.samples])
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=RANDOM_SEED)

fold_results = []
fold_val_preds = []
fold_val_targets = []

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(targets)), targets), start=1):
    print(f"\n===== Fold {fold}/{K} =====")

    train_ds = ImageFolder(DATA_DIR, transform=train_tf)
    val_ds = ImageFolder(DATA_DIR, transform=eval_tf)

    train_subset = Subset(train_ds, train_idx)
    val_subset = Subset(val_ds, val_idx)

    train_loader = DataLoader(
        train_subset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(
        val_subset, batch_size=batch_size, shuffle=False, num_workers=2)

    model = SmallCNN(num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    hist = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    for ep in range(1, epochs+1):
        tr_loss, tr_acc, * \
            _ = run_epoch(model, train_loader, criterion, optimizer)
        va_loss, va_acc, va_pred, va_true = run_epoch(
            model, val_loader, criterion)

        hist["train_loss"].append(tr_loss)
        hist["train_acc"].append(tr_acc)
        hist["val_loss"].append(va_loss)
        hist["val_acc"].append(va_acc)

        print(f"Epoch {ep:02d}: train acc={tr_acc:.3f} val acc={va_acc:.3f}")

    plot_history(hist, title=f"Fold {fold}")

    fold_results.append({
        "fold": fold,
        "final_train_acc": hist["train_acc"][-1],
        "final_val_acc": hist["val_acc"][-1],
        "final_val_loss": hist["val_loss"][-1],
    })

    fold_val_preds.append(va_pred)
    fold_val_targets.append(va_true)

fold_results

## 4) Misclassification overview (confusion matrix)
Aggregated over all validation predictions across folds.


In [None]:
all_val_pred = np.concatenate(fold_val_preds)
all_val_true = np.concatenate(fold_val_targets)

cm = confusion_matrix(all_val_true, all_val_pred)
plt.figure(figsize=(7, 7))
plt.imshow(cm)
plt.title("Confusion matrix (all folds validation)")
plt.axis("off")
plt.show()

## 5) Optional: show example predictions on one fold
This trains a small model for a few epochs on fold 1 and shows a few validation images with predictions.


In [None]:
def show_examples_from_indices(ds_eval, indices, model, n=8):
    model.eval()
    class_names = ds_eval.classes
    pick = random.sample(list(indices), min(n, len(indices)))

    plt.figure(figsize=(12, 3))
    for j, idx in enumerate(pick):
        img_path, y = ds_eval.samples[idx]
        img = Image.open(img_path)

        x, _ = ds_eval[idx]
        with torch.no_grad():
            probs = torch.softmax(
                model(x.unsqueeze(0).to(device)), dim=1).cpu().numpy()[0]
        yp = int(probs.argmax())
        conf = float(probs.max())

        plt.subplot(1, len(pick), j+1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(f"T:{class_names[y]}\nP:{class_names[yp]}\n{conf:.2f}")
    plt.tight_layout()
    plt.show()


train_idx, val_idx = next(iter(skf.split(np.zeros(len(targets)), targets)))

train_ds_demo = ImageFolder(DATA_DIR, transform=train_tf)
val_ds_demo = ImageFolder(DATA_DIR, transform=eval_tf)

train_loader_demo = DataLoader(Subset(
    train_ds_demo, train_idx), batch_size=batch_size, shuffle=True, num_workers=2)

model_demo = SmallCNN(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_demo.parameters(), lr=lr)

for ep in range(3):
    run_epoch(model_demo, train_loader_demo, criterion, optimizer)

show_examples_from_indices(val_ds_demo, val_idx, model_demo, n=8)