In [1]:
import platform, torch, subprocess
# Environment probe: host name, CUDA visibility from PyTorch, and the first
# five lines of the nvidia-smi banner.
for tag, probe in (
    ("Host:", platform.node),
    ("CUDA:", torch.cuda.is_available),
):
    print(tag, probe())
print(subprocess.getoutput("nvidia-smi | head -n 5"))

Host: gorina6
CUDA: True
Fri Oct  3 14:24:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |


In [2]:
import torch, platform
# Software stack summary: host, PyTorch build, CUDA / cuDNN versions.
print("Host:", platform.node())
print("Torch:", torch.__version__)
print("CUDA (build):", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("cuDNN:", torch.backends.cudnn.version())

# List every GPU that is visible to this process.
gpu_count = torch.cuda.device_count()
print("GPU count:", gpu_count)
for gpu_idx in range(gpu_count):
    print(gpu_idx, torch.cuda.get_device_name(gpu_idx))

Host: gorina6
Torch: 2.7.1+cu118
CUDA (build): 11.8
CUDA available: True
cuDNN: 90100
GPU count: 8
0 Tesla V100-PCIE-32GB
1 Tesla V100-PCIE-32GB
2 Tesla V100-PCIE-32GB
3 Tesla V100-PCIE-32GB
4 Tesla V100-PCIE-32GB
5 Tesla V100-PCIE-32GB
6 Tesla V100-PCIE-32GB
7 Tesla V100-PCIE-32GB


In [3]:
import torch
# mem_get_info() returns (free, total) in bytes for the current CUDA device;
# report both in decimal gigabytes.
free, total = torch.cuda.mem_get_info()
free_gb, total_gb = free / 1e9, total / 1e9
print(f"GPU mem free/total: {free_gb:.2f} / {total_gb:.2f} GB")

GPU mem free/total: 15.83 / 34.07 GB


In [4]:
import torch
# Name of the first CUDA device visible to this process.
first_gpu_name = torch.cuda.get_device_name(0)
print(first_gpu_name)

Tesla V100-PCIE-32GB


In [13]:
import os, torch
# Show the current GPU mask (None means unset) together with what PyTorch sees.
visible_mask = os.environ.get("CUDA_VISIBLE_DEVICES")
print("CUDA_VISIBLE_DEVICES =", visible_mask)
print("Visible count =", torch.cuda.device_count())
print("Using:", torch.cuda.get_device_name(0))

CUDA_VISIBLE_DEVICES = None
Visible count = 8
Using: Tesla V100-PCIE-32GB


In [None]:
import os
# Restrict this process to physical GPU 2.
# NOTE(review): presumably meant to run on a fresh kernel, before any CUDA call.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})


In [3]:
import torch
# Number of GPUs visible after masking; should be 1
visible_gpus = torch.cuda.device_count()
visible_gpus

1

In [4]:
from pathlib import Path
import pandas as pd
metadata_csv = Path("/home/stud/fwag/bhome/ele670_project/data/raw/metadata.csv")
dataset_root = Path("/home/stud/fwag/bhome/ele670_project/data/raw/labelled_images")

# Load the semicolon-separated metadata and trim whitespace from the two
# columns the relative image path is assembled from.
df = pd.read_csv(metadata_csv, sep=";")
for text_col in ("finding_class", "filename"):
    df[text_col] = df[text_col].astype(str).str.strip()

# Relative path mirrors the folder layout: labelled_images/<finding_class>/<filename>
df["image_path"] = df.apply(
    lambda rec: str(Path(rec["finding_class"]) / rec["filename"]), axis=1
)

# Rows whose image file is not present under dataset_root.
found_on_disk = df["image_path"].apply(lambda rel: (dataset_root / rel).exists())
bad = df[~found_on_disk]
print(f"Missing: {len(bad)} / {len(df)}")
print(bad.head())

Missing: 786 / 47248
                       filename          video_id  frame_number  \
6865  eb0203196e284797_1157.jpg  eb0203196e284797          1157   
6866  eb0203196e284797_1158.jpg  eb0203196e284797          1158   
6867  eb0203196e284797_1160.jpg  eb0203196e284797          1160   
6868  eb0203196e284797_1167.jpg  eb0203196e284797          1167   
6869  eb0203196e284797_1168.jpg  eb0203196e284797          1168   

     finding_category     finding_class     x1     y1     x2     y2     x3  \
6865          Anatomy  Ampulla of Vater  238.0  196.0  327.0  196.0  327.0   
6866          Anatomy  Ampulla of Vater  138.0    0.0  251.0    0.0  251.0   
6867          Anatomy  Ampulla of Vater   69.0    0.0  153.0    0.0  153.0   
6868          Anatomy  Ampulla of Vater    4.0  115.0   56.0  115.0   56.0   
6869          Anatomy  Ampulla of Vater   57.0  182.0  137.0  182.0  137.0   

         y3     x4     y4                                  image_path  
6865  289.0  238.0  289.0  Ampulla 

In [None]:
# CELL 1: SCAN + NORMALIZE + SAVE PROCESSED DF
from pathlib import Path
import pandas as pd
import numpy as np

# --- config / paths ---
dataset_root = Path("/home/stud/fwag/bhome/ele670_project/data/raw/labelled_images")
processed_df_pkl = Path("/home/stud/fwag/bhome/ele670_project/data/processed/metadata_cleaned.pkl")
metadata_csv = Path("/home/stud/fwag/bhome/ele670_project/data/raw/metadata.csv")

df = pd.read_csv(metadata_csv, sep=";")

# --- preconditions ---
# Validate the schema BEFORE touching any column so a malformed CSV produces a
# clear ValueError rather than a KeyError. `video_id` is required because it is
# used for the video count right below.
required_cols = {"finding_class", "filename", "video_id"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"`df` is missing required columns: {sorted(missing)}")

# --- normalize the text columns the image path is built from ---
# Stray whitespace would otherwise leak into the persisted frame (and into any
# labels derived from `finding_class` later on).
df["finding_class"] = df["finding_class"].astype(str).str.strip()
df["filename"] = df["filename"].astype(str).str.strip()

n_videos = df["video_id"].nunique()
print(f"Total distinct videos: {n_videos}")

# --- build case-insensitive map of class folders on disk ---
# key: lower-cased folder name, value: exact folder name as it exists on disk
classes_on_disk = {p.name.lower().strip(): p.name for p in dataset_root.iterdir() if p.is_dir()}

# --- check CSV classes vs folders ---
csv_classes = sorted(df["finding_class"].unique())
no_match = [c for c in csv_classes if c.lower().strip() not in classes_on_disk]
print(f"[INFO] Class names with no folder match (case-insensitive): {len(no_match)}")
print(no_match[:20])

# --- optional manual fixes for known variants ---
manual_map = {
    # "CSV value": "Exact folder name on disk"
    # e.g., "Reduced Mucosal View": "Reduced mucosal view",
}

def resolve_class_folder(name: str) -> str:
    """Map a CSV class name to the folder name that exists on disk.

    Resolution order: explicit `manual_map` entry, then a case-insensitive
    match against `classes_on_disk`, then the (stripped) input itself so the
    row is flagged as missing by the path validation below.
    """
    name = str(name).strip()
    if name in manual_map:
        return manual_map[name]
    return classes_on_disk.get(name.lower().strip(), name)

# --- rebuild image_path using resolved folder name ---
# Resolve each distinct class once instead of once per row.
folder_for_class = {c: resolve_class_folder(c) for c in csv_classes}
df["image_path"] = [
    str(Path(folder_for_class[cls_name]) / file_name)
    for cls_name, file_name in zip(df["finding_class"], df["filename"])
]

# --- validate paths on disk (one filesystem check per row: the slow part) ---
bad = df[~df["image_path"].apply(lambda p: (dataset_root / p).exists())]
print(f"[CHECK] Missing after folder mapping: {len(bad)} / {len(df)}")
display(bad.head(10))

# --- persist the processed df so Cell 2 can run without rescanning ---
processed_df_pkl.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(processed_df_pkl)
print(f"[SAVED] Processed DataFrame with image_path → {processed_df_pkl}")

[INFO] Class names with no folder match (case-insensitive): 0
[]
[CHECK] Missing after folder mapping: 0 / 47248


Unnamed: 0,filename,video_id,frame_number,finding_category,finding_class,x1,y1,x2,y2,x3,y3,x4,y4,image_path


[SAVED] Processed DataFrame with image_path → /home/stud/fwag/bhome/ele670_project/data/processed/metadata_cleaned.pkl


In [10]:
# CELL 2: EXPORT METADATA + LABELS (NO RESCAN)
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

processed_df_pkl = Path("/home/stud/fwag/bhome/ele670_project/data/processed/metadata_cleaned.pkl")
out_csv          = Path("/home/stud/fwag/bhome/ele670_project/data/processed/metadata_cleaned.csv")
out_classes      = Path("/home/stud/fwag/bhome/ele670_project/data/processed/label_classes_cleaned.npy")
# Legacy artifact name that this cell has always written alongside `out_classes`;
# kept so existing consumers of label_classes.npy keep working.
legacy_classes   = out_classes.with_name("label_classes.npy")

# Prefer loading the persisted df (so we don't re-run the slow scan)
if processed_df_pkl.exists():
    df_proc = pd.read_pickle(processed_df_pkl)
elif "df" in globals():
    # Fallback: use in-memory df if it already has image_path (e.g., you just ran Cell 1)
    if "image_path" not in df.columns:
        raise ValueError("In-memory `df` has no 'image_path'. Run Cell 1 or load the processed df.")
    df_proc = df
else:
    raise FileNotFoundError(
        f"Processed df not found at {processed_df_pkl}. Run Cell 1 first to generate it."
    )

# Keep only columns needed for training
keep_cols = ["image_path", "finding_class", "video_id"]
missing_keep = [c for c in keep_cols if c not in df_proc.columns]
if missing_keep:
    raise ValueError(f"Processed df missing required columns {missing_keep}. Ensure Cell 1 ran on the correct input.")

df_out = df_proc[keep_cols].copy()

# Encode labels: integer id = position of the class name in the sorted class list
lb = LabelEncoder()
df_out["encoded_label"] = lb.fit_transform(df_out["finding_class"])

# Create the output directory BEFORE writing anything. On the in-memory
# fallback path the directory may not exist yet, which previously made the
# first np.save fail.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(out_csv, index=False)
np.save(out_classes, lb.classes_)
np.save(legacy_classes, lb.classes_)

print(f"[SAVED] Normalized metadata CSV → {out_csv}")
print(f"[SAVED] Label classes (.npy)     → {out_classes}")
display(df_out.head())

[SAVED] Normalized metadata CSV → /home/stud/fwag/bhome/ele670_project/data/processed/metadata_cleaned.csv
[SAVED] Label classes (.npy)     → /home/stud/fwag/bhome/ele670_project/data/processed/label_classes_cleaned.npy


Unnamed: 0,image_path,finding_class,video_id,encoded_label
0,Normal clean mucosa/0728084c8da942d9_22803.jpg,Normal clean mucosa,0728084c8da942d9,9
1,Normal clean mucosa/0728084c8da942d9_22804.jpg,Normal clean mucosa,0728084c8da942d9,9
2,Normal clean mucosa/0728084c8da942d9_22805.jpg,Normal clean mucosa,0728084c8da942d9,9
3,Normal clean mucosa/0728084c8da942d9_22806.jpg,Normal clean mucosa,0728084c8da942d9,9
4,Normal clean mucosa/0728084c8da942d9_22807.jpg,Normal clean mucosa,0728084c8da942d9,9


In [18]:
# CELL 3: FILTER SMALL / SINGLE-VIDEO CLASSES + PRINT STATS + EXPORT + PER-VIDEO INSIGHTS
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# --- config / thresholds ---
min_videos_per_class = 2     # a class must occur in at least this many DISTINCT videos
min_frames_per_class = 20    # a class must have at least this many frames in total

# --- standardized file paths (all artifacts live in one directory) ---
processed_dir = Path("/home/stud/fwag/bhome/ele670_project/data/processed")
processed_df_pkl        = processed_dir / "metadata_cleaned.pkl"                    # cleaned df from Cell 1
out_csv_filtered        = processed_dir / "metadata_filtered.csv"                   # final filtered metadata
out_classes             = processed_dir / "label_classes_filtered.npy"              # label names after filtering
out_stats_csv           = processed_dir / "class_stats_before_filter.csv"           # class-level stats BEFORE filtering
out_per_video_csv       = processed_dir / "per_video_class_counts.csv"              # n_classes per video (pre-filter)
out_per_video_dist_csv  = processed_dir / "per_video_class_count_distribution.csv"  # distribution of n_classes per video (pre-filter)
out_single_class_videos = processed_dir / "single_class_videos.csv"                 # videos containing exactly one class (pre-filter)

# --- load cleaned DataFrame (output of Cell 1) ---
# Order of preference: persisted pickle, then an in-memory `df` that already
# carries 'image_path'; otherwise ask the user to run Cell 1.
if processed_df_pkl.exists():
    df_proc = pd.read_pickle(processed_df_pkl)
elif "df" not in globals():
    raise FileNotFoundError(f"Cleaned df not found at {processed_df_pkl}. Run Cell 1 first.")
elif "image_path" not in df.columns:
    raise ValueError("In-memory `df` has no 'image_path'. Run Cell 1 or load the cleaned df.")
else:
    df_proc = df

# --- sanity check ---
# Report the first key column that is absent; the stats and filters below need all three.
absent_col = next(
    (col for col in ("finding_class", "video_id", "image_path") if col not in df_proc.columns),
    None,
)
if absent_col is not None:
    raise ValueError(f"Column '{absent_col}' missing in df_proc — ensure Cell 1 ran correctly.")

# ===================================================
# 1) PER-CLASS STATS
# ===================================================
# One row per class:
#   frames - number of rows (images) labelled with the class
#   videos - number of DISTINCT video_id values in which the class occurs
class_groups = df_proc.groupby("finding_class")
class_summary = class_groups.agg(
    frames=("image_path", "size"),
    videos=("video_id", "nunique"),
)
stats = class_summary.reset_index().sort_values(["videos", "frames", "finding_class"])

# Keep the pre-filter table on disk for auditing / reproducibility
out_stats_csv.parent.mkdir(parents=True, exist_ok=True)
stats.to_csv(out_stats_csv, index=False)

print("[STATS] Distinct videos and frame counts per class (before filtering):")
stats_table = stats.to_string(index=False, max_rows=40)  # truncated beyond 40 rows
print(stats_table)

# ===================================================
# 2) PER-VIDEO CLASS DIVERSITY
# ===================================================
# Number of DISTINCT classes seen in each video, most diverse videos first
classes_by_video = df_proc.groupby("video_id")["finding_class"]
per_video_classes = classes_by_video.nunique(dropna=True).sort_values(ascending=False)
per_video_classes.to_csv(out_per_video_csv, header=["n_classes"])

# Histogram of that count: how many videos contain 1, 2, 3, ... classes
per_video_dist = per_video_classes.value_counts().sort_index()
per_video_dist.to_csv(out_per_video_dist_csv, header=["n_videos"])

print("\n[INFO] How many classes each video has (top 10):")
top_videos = per_video_classes.head(10)
print(top_videos.to_string())

print("\n[INFO] Distribution of 'number of classes per video':")
print("Classes_per_video | Num_videos")
dist_table = per_video_dist.rename_axis("Classes_per_video").reset_index(name="Num_videos")
print(dist_table.to_string(index=False))

# ===================================================
# 3) NEW: IDENTIFY SINGLE-CLASS VIDEOS
# ===================================================
# Find videos where only ONE class appears (diagnostic only; not filtering them out here)
single_class_videos = per_video_classes[per_video_classes == 1].index.tolist()
print(f"\n[INFO] Videos containing only ONE class: {len(single_class_videos)} found")
if single_class_videos:
    print("[INFO] Example single-class videos (up to 10 shown):")
    print(single_class_videos[:10])

    # For those videos, record WHICH class they contain. Each of these videos
    # has exactly one non-null class, so store it as a plain string via
    # `.first()`; `.unique()` would write a numpy-array repr such as
    # "['Polyp']" into the CSV.
    single_class_info = (
        df_proc[df_proc["video_id"].isin(single_class_videos)]
        .groupby("video_id")["finding_class"]
        .first()
        .reset_index()
    )
    single_class_info.to_csv(out_single_class_videos, index=False)
    print(f"[SAVED] Full list of single-class videos → {out_single_class_videos}")

# Pick one example video (the one with most classes) and list its classes (sanity check).
# Guarded so an empty metadata frame does not raise IndexError on `.index[0]`.
if len(per_video_classes):
    example_video = per_video_classes.index[0]  # sorted descending, so this is the most diverse video
    classes_in_example = (
        df_proc.loc[df_proc["video_id"] == example_video, "finding_class"]
              .dropna().unique()
    )
    print(f"\n[INFO] Classes found in video {example_video}:")
    print(np.sort(classes_in_example))
else:
    print("\n[INFO] No videos available; skipping the example-video listing.")

# ===================================================
# 4) FILTERING BY CLASS SIZE / VIDEO COUNT
# ===================================================
# A class is dropped when it fails EITHER threshold:
#   removed_by_videos - seen in fewer than min_videos_per_class distinct videos
#   removed_by_frames - fewer than min_frames_per_class frames in total
too_few_videos = stats["videos"] < min_videos_per_class
too_few_frames = stats["frames"] < min_frames_per_class
removed_by_videos = stats.loc[too_few_videos, "finding_class"]
removed_by_frames = stats.loc[too_few_frames, "finding_class"]

# Final removal set = union of both failure lists
to_remove = set(removed_by_videos).union(removed_by_frames)

# Report what is removed and for which reason
print(f"\n[CONFIG] min_videos_per_class = {min_videos_per_class}, min_frames_per_class = {min_frames_per_class}")
print(f"[FILTER] Too few videos (<{min_videos_per_class}): {len(removed_by_videos)} classes")
if len(removed_by_videos):
    print(sorted(removed_by_videos.tolist())[:40])
print(f"[FILTER] Too few frames (<{min_frames_per_class}): {len(removed_by_frames)} classes")
if len(removed_by_frames):
    print(sorted(removed_by_frames.tolist())[:40])
print(f"[FILTER] Total unique classes removed: {len(to_remove)}")

# Class-level filter: every frame of a low-support class is dropped
keep_mask = ~df_proc["finding_class"].isin(to_remove)
df_filt = df_proc[keep_mask].copy()

# Before / after summary
print(f"\n[SUMMARY] Frames before: {len(df_proc):,} | after: {len(df_filt):,}")
print(f"[SUMMARY] Classes before: {stats.shape[0]} | after: {df_filt['finding_class'].nunique()}")

# ===================================================
# 5) ENCODE + SAVE FILTERED DATA
# ===================================================
# Downstream only needs the path, the class name and the video id; an integer
# label is added by fitting a fresh encoder on the surviving classes.
export_cols = ["image_path", "finding_class", "video_id"]
df_out = df_filt[export_cols].copy()
lb = LabelEncoder()
lb.fit(df_out["finding_class"])
df_out["encoded_label"] = lb.transform(df_out["finding_class"])

# Write the filtered metadata (CSV) and the class-name lookup (NPY)
out_csv_filtered.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(out_csv_filtered, index=False)
np.save(out_classes, lb.classes_)

print(f"\n[SAVED] Filtered metadata CSV  → {out_csv_filtered}")
print(f"[SAVED] Filtered label classes → {out_classes}")
print("[SAVED] Per-video class counts →", out_per_video_csv)
print("[SAVED] Per-video class-count distribution →", out_per_video_dist_csv)
print("\n[HEAD]")
head_table = df_out.head().to_string(index=False)
print(head_table)

[STATS] Distinct videos and frame counts per class (before filtering):
       finding_class  frames  videos
    Ampulla of Vater      10       1
     Blood - hematin      12       1
               Polyp      55       1
       Blood - fresh     446       2
            Erythema     159       3
    Lymphangiectasia     592       3
               Ulcer     854       3
        Foreign Body     776       4
         Angiectasia     866       6
Reduced Mucosal View    2906       7
             Erosion     507       9
             Pylorus    1538      32
     Ileocecal valve    4189      34
 Normal clean mucosa   34338      37

[INFO] How many classes each video has (top 10):
video_id
04a78ef00c5245e0    6
48579eec79784294    6
64440803f87b4843    6
8ebf0e483cac48d6    6
8885668afb844852    5
5e59c7fdb16c4228    5
eb0203196e284797    5
3ada4222967f421d    4
131368cc17e44240    4
7a47e8eacea04e64    4

[INFO] Distribution of 'number of classes per video':
Classes_per_video | Num_videos
 Classes_

In [7]:
import pandas as pd

# --- Load the dataset ---
csv_path = "/bhome/fwag/ele670_project/data/processed/metadata_normalized.csv"  # <-- change this to your file
df = pd.read_csv(csv_path)

print(f"[INFO] Loaded {len(df)} rows from {csv_path}")
print("[INFO] Columns:", list(df.columns))
print(df.head())

# --- Safety check ---
required_cols = {"video_id", "finding_class"}
missing = required_cols.difference(df.columns)
assert not missing, f"DataFrame is missing required columns: {missing}"

# --- Per-class ↔ videos stats ---
# Distinct video count per class, largest first
videos_by_class = df.groupby("finding_class")["video_id"]
class_to_n_videos = videos_by_class.nunique().sort_values(ascending=False)

print("\n[INFO] Number of DISTINCT videos per class:")
print(class_to_n_videos)

# Per-class table: frame count, distinct video count, single-video flag
class_stats = df.groupby("finding_class").agg(
    n_frames=("finding_class", "size"),
    n_videos=("video_id", "nunique"),
)
class_stats = class_stats.sort_values(["n_videos", "n_frames"], ascending=[False, False])
class_stats["only_one_video"] = class_stats["n_videos"] == 1

print("\n[INFO] Per-class stats (frames, distinct videos, single-video flag):")
print(class_stats)

# Classes whose frames all originate from one video
single_video_classes = class_stats[class_stats["only_one_video"]].copy()
print("\n[INFO] Classes with frames coming from ONLY ONE video:")
if not single_video_classes.empty:
    print(single_video_classes)
else:
    print("None 🎉")

# class name -> sorted list of the distinct video_ids it appears in
videos_list_per_class = (
    df.groupby("finding_class")["video_id"]
      .apply(lambda ids: sorted(ids.unique()))
      .to_dict()
)

# Print every class (no top-N cut-off)
print("\n[INFO] Distinct video_ids for ALL classes:")
for cls in class_stats.index:
    vids = videos_list_per_class.get(cls, [])
    print(f"- {cls}: {len(vids)} video(s) → {vids}")

# --- Optional: per (video_id, class) breakdown ---
pair_sizes = df.groupby(["video_id", "finding_class"]).size()
per_video_class_counts = pair_sizes.rename("n_frames").reset_index()

if not single_video_classes.empty:
    print("\n[INFO] Breakdown for single-video classes:")
    for cls in single_video_classes.index:
        rows = per_video_class_counts[per_video_class_counts["finding_class"] == cls]
        print(f"\nClass: {cls}")
        ranked = rows.sort_values("n_frames", ascending=False)
        print(ranked.head(10).to_string(index=False))

[INFO] Loaded 47248 rows from /bhome/fwag/ele670_project/data/processed/metadata_normalized.csv
[INFO] Columns: ['image_path', 'finding_class', 'video_id', 'encoded_label']
                                       image_path        finding_class  \
0  Normal clean mucosa/0728084c8da942d9_22803.jpg  Normal clean mucosa   
1  Normal clean mucosa/0728084c8da942d9_22804.jpg  Normal clean mucosa   
2  Normal clean mucosa/0728084c8da942d9_22805.jpg  Normal clean mucosa   
3  Normal clean mucosa/0728084c8da942d9_22806.jpg  Normal clean mucosa   
4  Normal clean mucosa/0728084c8da942d9_22807.jpg  Normal clean mucosa   

           video_id  encoded_label  
0  0728084c8da942d9              9  
1  0728084c8da942d9              9  
2  0728084c8da942d9              9  
3  0728084c8da942d9              9  
4  0728084c8da942d9              9  

[INFO] Number of DISTINCT videos per class:
finding_class
Normal clean mucosa     37
Ileocecal valve         34
Pylorus                 32
Erosion             

In [1]:
import os
# Mask the process down to physical GPU 2 before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
n_visible = torch.cuda.device_count()
active_name = torch.cuda.get_device_name(0)
print("CUDA visible devices:", n_visible)
print("Using GPU:", active_name)

CUDA visible devices: 1
Using GPU: Tesla V100-PCIE-32GB


In [2]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T

# ==== CONFIG ====
# Metadata CSV; the dataset class below reads `image_path` and `encoded_label` from it.
# NOTE(review): no cell in this notebook writes metadata_normalized.csv (Cell 2
# writes metadata_cleaned.csv, Cell 3 metadata_filtered.csv) — confirm this file
# is current before relying on it.
CSV_PATH = "/bhome/fwag/ele670_project/data/processed/metadata_normalized.csv"
# Root folder that the relative `image_path` values are joined onto.
IMG_ROOT = "/bhome/fwag/ele670_project/data/raw/labelled_images"
BATCH_SIZE = 8     # samples per DataLoader batch
NUM_WORKERS = 4    # DataLoader worker processes

# ==== DATASET ====
class KvasirDataset(Dataset):
    """Image/label dataset backed by a metadata CSV.

    The CSV is read with pandas and must provide an ``image_path`` column
    (joined onto ``img_root``) and an ``encoded_label`` column convertible to
    ``int``. ``transform`` is optional and applied to the RGB PIL image.
    """

    def __init__(self, csv_path, img_root, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_root = img_root
        self.transform = transform

    def __len__(self):
        # one sample per CSV row
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        record = self.df.iloc[idx]
        full_path = os.path.join(self.img_root, record["image_path"])
        picture = Image.open(full_path).convert("RGB")
        target = int(record["encoded_label"])

        if not self.transform:
            return picture, target
        return self.transform(picture), target

# Minimal preprocessing for the smoke test: fixed-size resize + tensor conversion
transform = T.Compose([T.Resize((224,224)), T.ToTensor()])

dataset = KvasirDataset(CSV_PATH, IMG_ROOT, transform)
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

# ==== MODEL ====
# Randomly initialised ResNet-50 (no pretrained weights); the final layer is
# resized to the number of distinct encoded labels found in the CSV.
num_classes = dataset.df["encoded_label"].nunique()
model = models.resnet50(weights=None)
model.fc = nn.Linear(model.fc.in_features, num_classes)

# ==== SANITY TRAIN LOOP (1 batch only) ====
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Pull a single batch and move it to the chosen device
images, labels = next(iter(loader))
images = images.to(device)
labels = labels.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# One full optimisation step: forward, loss, backward, parameter update
outputs = model(images)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Sanity check done ✅ | Batch size: {images.size(0)}, Loss: {loss.item():.4f}")

Sanity check done ✅ | Batch size: 8, Loss: 3.3002


In [None]:
# ==== Tiny ResNet50 sanity trainer for Kvasir-Capsule ====
# - Beginner-friendly, step-by-step
# - Prints debug info along the way
# - Optimized for Tesla V100 (mixed precision, pinned memory, etc.)
# =========================================================

import os, time
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models as models

from sklearn.model_selection import GroupShuffleSplit
from collections import Counter

# ---------------- CONFIG ----------------
CSV_PATH = "/bhome/fwag/ele670_project/data/processed/metadata_normalized.csv"
IMG_ROOT   = "/bhome/fwag/ele670_project/data/raw/labelled_images"
IMG_SIZE   = 224    # square side length images are resized to
BATCH_SIZE = 32     # safe for V100 (can go higher if memory allows)
NUM_WORKERS = 4     # adjust depending on system
EPOCHS     = 2      # keep small for quick sanity check
LR         = 1e-3   # Adam learning rate
SEED       = 42     # seeds torch and the train/val split

# ----------------------------------------
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(torch.cuda.get_device_name(0))
    allocated_gib = torch.cuda.memory_allocated(0)/1024**3
    print("Memory allocated:", round(allocated_gib, 2), "GB")

# --------------- DATASET ----------------
class KvasirDataset(Dataset):
    """Serve ``(image, label)`` pairs described by a metadata DataFrame.

    ``df`` must carry an ``image_path`` column (relative to ``img_root``) and
    an ``encoded_label`` column; its index is reset so positional lookups are
    contiguous. ``transform`` is applied to every image.
    """

    def __init__(self, df, img_root, transform):
        self.df = df.reset_index(drop=True)
        self.img_root = img_root
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def _load_rgb(self, img_path):
        """Open ``img_path`` as RGB; on any failure warn and return a black image."""
        try:
            return Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"[WARNING] Failed to open {img_path}: {e}")
            # black placeholder keeps the batch shape intact
            return Image.new("RGB", (IMG_SIZE, IMG_SIZE), (0,0,0))

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        # absolute location = dataset root + relative path stored in the CSV
        picture = self._load_rgb(os.path.join(self.img_root, record["image_path"]))
        tensor = self.transform(picture)
        return tensor, int(record["encoded_label"])

# --------------- LOAD CSV ----------------
df = pd.read_csv(CSV_PATH, sep=",")
# Name the file that was actually loaded; the previous hard-coded
# "metadata_processed.csv" did not match CSV_PATH.
print(f"First rows of {os.path.basename(CSV_PATH)}:")
print(df.head())

# How many unique classes?
num_classes = df["encoded_label"].nunique()
print(f"Number of classes: {num_classes}")

# --------------- TRAIN/VAL SPLIT ----------------
# Split by video_id so that all frames of a video land on the same side
# (no leakage between train/val).
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=SEED)
train_idx, val_idx = next(gss.split(df, groups=df["video_id"]))
df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]

# Inline invariant: the split must be video-disjoint and cover every row.
assert set(df_train["video_id"]).isdisjoint(df_val["video_id"]), "video leakage between train and val"
assert len(df_train) + len(df_val) == len(df)

print(f"Train samples: {len(df_train)} | Val samples: {len(df_val)}")
print(f"Train videos: {df_train['video_id'].nunique()} | Val videos: {df_val['video_id'].nunique()}")

# --------------- TRANSFORMS ----------------
# Per-channel statistics used to normalise the image tensors
mean = (0.485, 0.456, 0.406)
std  = (0.229, 0.224, 0.225)

# Training pipeline: resize, random horizontal flip, tensor conversion, normalisation
train_steps = [
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean, std),
]
train_tf = T.Compose(train_steps)

# Validation pipeline: same as training but without the random flip
val_steps = [
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean, std),
]
val_tf = T.Compose(val_steps)

# --------------- DATA LOADERS ----------------
train_ds = KvasirDataset(df_train, IMG_ROOT, train_tf)
val_ds   = KvasirDataset(df_val,   IMG_ROOT, val_tf)

# Only the training loader shuffles; the remaining settings are shared.
shared_loader_args = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **shared_loader_args)
val_loader   = DataLoader(val_ds, shuffle=False, **shared_loader_args)

# Pull one batch to confirm tensor shapes before training starts
print("One training batch (images, labels):")
sample_imgs, sample_labels = next(iter(train_loader))
print("Image batch shape:", sample_imgs.shape)
print("Label batch shape:", sample_labels.shape)

# --------------- MODEL ----------------
# Load ResNet50 with ImageNet weights (helps it train faster)
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
model.fc = nn.Linear(model.fc.in_features, num_classes)  # replace classifier head with one sized for our classes

model = model.to(device)

# Loss + optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Mixed precision (faster on V100).
# `torch.cuda.amp.GradScaler()` is deprecated and emits a FutureWarning on this
# PyTorch version; use the device-agnostic `torch.amp.GradScaler` and disable
# loss scaling when running on CPU, where it is not needed.
scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda"))

# --------------- TRAINING LOOP ----------------
def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return ``(avg_loss, avg_acc)``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``,
    ``scaler`` and ``device``.

    Args:
        loader: iterable of ``(imgs, labels)`` batches that supports ``len()``.
        train: when True the model is put in training mode and one optimizer
            step is taken per batch; when False the model is put in eval mode
            and the forward pass runs without building autograd graphs.

    Returns:
        Tuple of sample-weighted mean loss and accuracy over the whole pass.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.train(train)  # train(True) == train(), train(False) == eval()

    # Autocast only makes sense on the GPU; on CPU run in full precision.
    use_amp = str(device).startswith("cuda")
    amp_device_type = "cuda" if use_amp else "cpu"

    total_loss, total_correct, total_samples = 0.0, 0, 0

    for step, (imgs, labels) in enumerate(loader, start=1):
        imgs = imgs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Gradients are only tracked while training; validation previously
        # built (and kept alive) autograd graphs for every batch.
        with torch.set_grad_enabled(train):
            with torch.autocast(device_type=amp_device_type, enabled=use_amp):
                outputs = model(imgs)
                loss = criterion(outputs, labels)

        if train:
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Accuracy calculation
        preds = outputs.argmax(dim=1)
        correct = (preds == labels).sum().item()
        total_correct += correct
        total_loss += loss.item() * labels.size(0)  # weight by batch size for a true per-sample mean
        total_samples += labels.size(0)

        # Print debug info every 10 steps
        if step % 10 == 0 or step == 1:
            print(f"  Step {step}/{len(loader)} | Loss: {loss.item():.4f} | "
                  f"Batch acc: {correct/labels.size(0):.2f}")

    if total_samples == 0:
        raise ValueError("run_epoch received an empty loader; nothing to average.")

    avg_loss = total_loss / total_samples
    avg_acc = total_correct / total_samples
    return avg_loss, avg_acc

# --------------- TRAIN FOR A FEW EPOCHS ----------------
print("\n==== START TRAINING ====\n")
best_val_acc = 0.0
start_time = time.time()

epoch = 0
while epoch < EPOCHS:
    epoch += 1
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # one optimisation pass over the training set
    train_loss, train_acc = run_epoch(train_loader, train=True)
    print(f"Train: loss={train_loss:.4f}, acc={train_acc:.3f}")

    # one evaluation pass over the validation set
    val_loss, val_acc = run_epoch(val_loader, train=False)
    print(f"Val:   loss={val_loss:.4f}, acc={val_acc:.3f}")

    # checkpoint only when validation accuracy strictly improves
    if not val_acc > best_val_acc:
        continue
    best_val_acc = val_acc
    torch.save(model.state_dict(), "tiny_resnet50_best.pt")
    print(f"  ✅ New best model saved (acc={best_val_acc:.3f})")

print("\nTraining finished!")
print("Best validation accuracy:", round(best_val_acc, 3))
elapsed_sec = time.time() - start_time
print("Elapsed time: %.1f sec" % elapsed_sec)

Using device: cuda
Tesla V100-PCIE-32GB
Memory allocated: 0.38 GB
First rows of metadata.csv:
                                       image_path        finding_class  \
0  Normal clean mucosa/0728084c8da942d9_22803.jpg  Normal clean mucosa   
1  Normal clean mucosa/0728084c8da942d9_22804.jpg  Normal clean mucosa   
2  Normal clean mucosa/0728084c8da942d9_22805.jpg  Normal clean mucosa   
3  Normal clean mucosa/0728084c8da942d9_22806.jpg  Normal clean mucosa   
4  Normal clean mucosa/0728084c8da942d9_22807.jpg  Normal clean mucosa   

           video_id  encoded_label  
0  0728084c8da942d9              9  
1  0728084c8da942d9              9  
2  0728084c8da942d9              9  
3  0728084c8da942d9              9  
4  0728084c8da942d9              9  
Number of classes: 14
Train samples: 37430 | Val samples: 9818
Train videos: 34 | Val videos: 9
One training batch (images, labels):
Image batch shape: torch.Size([32, 3, 224, 224])
Label batch shape: torch.Size([32])

==== START TRAINING 

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(enabled=True):


  Step 1/1170 | Loss: 2.7401 | Batch acc: 0.00
  Step 10/1170 | Loss: 0.9073 | Batch acc: 0.75
  Step 20/1170 | Loss: 0.5375 | Batch acc: 0.84
  Step 30/1170 | Loss: 1.3675 | Batch acc: 0.69
  Step 40/1170 | Loss: 1.3706 | Batch acc: 0.62
  Step 50/1170 | Loss: 0.9294 | Batch acc: 0.75
  Step 60/1170 | Loss: 0.6208 | Batch acc: 0.78
  Step 70/1170 | Loss: 1.0341 | Batch acc: 0.59
  Step 80/1170 | Loss: 0.5050 | Batch acc: 0.88
  Step 90/1170 | Loss: 0.6382 | Batch acc: 0.84
  Step 100/1170 | Loss: 0.4670 | Batch acc: 0.84
  Step 110/1170 | Loss: 0.3199 | Batch acc: 0.94
  Step 120/1170 | Loss: 0.6109 | Batch acc: 0.78
  Step 130/1170 | Loss: 0.3193 | Batch acc: 0.94
  Step 140/1170 | Loss: 0.5663 | Batch acc: 0.81
  Step 150/1170 | Loss: 0.4441 | Batch acc: 0.84
  Step 160/1170 | Loss: 0.6226 | Batch acc: 0.81
  Step 170/1170 | Loss: 0.3706 | Batch acc: 0.94
  Step 180/1170 | Loss: 0.4049 | Batch acc: 0.97
  Step 190/1170 | Loss: 0.4931 | Batch acc: 0.84
  Step 200/1170 | Loss: 0.1632 

In [6]:
# %% [markdown]
# # Tiny ResNet50 sanity trainer for Kvasir-Capsule (debug-friendly)
# 
# * Beginner-friendly, heavily commented
# * Safe, small training run (2 epochs) to check end-to-end wiring
# * Works well on Tesla V100 (uses mixed precision)
# * Video-independent split (by `video_id`) to avoid leakage
# * Extra debug prints (device, memory, batch shapes, step logs)
# * Optional: freeze backbone for very fast convergence of the classifier head
# * Validation summary includes confusion matrix + per-class metrics
# 
# ➕ Changes vs earlier cell:
# - Uses **new AMP API**: `from torch import amp; amp.autocast('cuda')` to avoid deprecation warnings
# - Adds **label smoothing** + optional **weight decay** for stability
# - Adds **GPU-2 pinning** (can be disabled by setting GPU_INDEX=None)
# - Adds **confusion matrix** + **classification report** after validation
# 
# ---

# %%
import os, time, math, warnings
from collections import Counter

import pandas as pd
import numpy as np
from PIL import Image, UnidentifiedImageError

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models as models
from torch import amp

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report

warnings.simplefilter("ignore", UserWarning)

# ---------------- CONFIG ----------------
# All tunable settings for this cell live here. The paths are absolute and
# machine-specific; edit them when running on another host.
CSV_PATH      = "/bhome/fwag/ele670_project/data/processed/metadata_normalized.csv"  # metadata table (image_path, encoded_label, video_id)
IMG_ROOT      = "/bhome/fwag/ele670_project/data/raw/labelled_images"  # image_path values are joined onto this directory
RESULTS_DIR   = "/bhome/fwag/ele670_project/results"  # created further down if it does not exist
SAVE_PATH     = os.path.join(RESULTS_DIR, "tiny_resnet50_best.pt")  # best-validation state_dict is written here

IMG_SIZE      = 224         # images are resized to IMG_SIZE x IMG_SIZE; also the size of the placeholder image
BATCH_SIZE    = 32          # Try 64 or 96 if you want to use more VRAM
NUM_WORKERS   = 6           # DataLoader worker processes; tune for your server (4-12 typically fine)
EPOCHS        = 2           # Keep it short for sanity checks
LR            = 1e-3        # learning rate passed to Adam
WEIGHT_DECAY  = 1e-4        # weight_decay passed to Adam (small regularization)
LABEL_SMOOTH  = 0.05        # label_smoothing passed to CrossEntropyLoss
FREEZE_BACKBONE = True     # True: only the replaced fc head is trained; False: every layer is trained
USE_PRETRAINED = True       # True: start from the IMAGENET1K_V2 ResNet50 weights; False: random init
SEED          = 42          # seeds torch, numpy and the train/val group split

# If you want to lock to a specific GPU (e.g., GPU 2), set this to an int.
# Set to None to use the default CUDA device.
GPU_INDEX     = 2

# Log every N steps for train/val loops (step 1 is always logged)
LOG_EVERY     = 10

# ----------------------------------------
# (1) Reproducibility: seed the torch and numpy global RNGs.
torch.manual_seed(SEED)
np.random.seed(SEED)

# (2) Pin to a specific GPU before creating tensors/models.
# CUDA_VISIBLE_DEVICES is read once, when the CUDA driver is initialised, and by
# default torch.cuda.is_available() already triggers that initialisation. Writing
# the variable at this point therefore cannot hide GPUs from the running kernel,
# so the previous masking approach left training on device 0. Select the GPU
# explicitly instead; the plain "cuda" device string then refers to the selected
# (current) device. To truly mask GPUs, export CUDA_VISIBLE_DEVICES before the
# kernel makes its first CUDA call.
if torch.cuda.is_available() and GPU_INDEX is not None:
    visible_gpus = torch.cuda.device_count()
    if 0 <= GPU_INDEX < visible_gpus:
        torch.cuda.set_device(GPU_INDEX)
    else:
        # Typical when the kernel was launched with CUDA_VISIBLE_DEVICES already
        # restricting the process to fewer devices: keep the default device.
        print(f"[WARNING] GPU_INDEX={GPU_INDEX} is not among the {visible_gpus} visible "
              f"CUDA device(s); staying on the default device.")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
if DEVICE == "cuda":
    # Report the device that will actually be used, not unconditionally index 0.
    current_gpu = torch.cuda.current_device()
    print(torch.cuda.get_device_name(current_gpu), f"(cuda:{current_gpu})")
    print("Allocated:", round(torch.cuda.memory_allocated()/1024**3, 2), "GB")
    print("Reserved: ", round(torch.cuda.memory_reserved()/1024**3, 2), "GB")

# Create results directory if missing
os.makedirs(RESULTS_DIR, exist_ok=True)

# Enable cuDNN heuristics (faster for fixed shapes)
torch.backends.cudnn.benchmark = True
# Best-effort speed-up: the setter does not exist on older PyTorch builds, so
# check for it instead of swallowing every exception.
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

# --------------- DATASET ----------------
class KvasirDataset(Dataset):
    """Map-style dataset over a metadata dataframe.

    Each row must provide ``image_path`` (relative to ``img_root``) and
    ``encoded_label`` (an integer-convertible class id).

    ``__getitem__`` returns ``(transform(image), label_int)``. If the image
    cannot be opened or decoded (missing, unreadable, truncated or of an
    unrecognised format) a warning is printed and a black
    ``IMG_SIZE`` x ``IMG_SIZE`` RGB placeholder is used instead, so one bad
    file does not abort the epoch. The row's label is returned unchanged.
    """
    def __init__(self, df: pd.DataFrame, img_root: str, transform):
        # Re-index to 0..N-1 so positional lookups match DataLoader indices.
        self.df = df.reset_index(drop=True)
        self.img_root = img_root
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = os.path.join(self.img_root, row["image_path"])
        try:
            # The context manager releases the file handle as soon as the
            # pixels have been decoded into the converted copy.
            with Image.open(path) as src:
                img = src.convert("RGB")
        except OSError as e:
            # FileNotFoundError and PIL's UnidentifiedImageError are both OSError
            # subclasses; catching the parent also covers truncated/unreadable
            # files, which previously escaped and crashed the loader.
            print(f"[WARNING] Failed to open {path}: {e}")
            img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (0, 0, 0))
        x = self.transform(img)
        y = int(row["encoded_label"])  # already numeric
        return x, y

# --------------- LOAD CSV ----------------
df = pd.read_csv(CSV_PATH)
print("First rows of metadata_normalized.csv:")
print(df.head())

# Basic validations: fail fast if the columns used later are absent
required_cols = {"image_path", "encoded_label", "video_id"}
missing = required_cols - set(df.columns)
assert not missing, f"CSV is missing required columns: {missing}"

num_classes = df["encoded_label"].nunique()
print(f"Number of classes: {num_classes}")

# The classifier head gets `num_classes` outputs and the metrics enumerate
# range(num_classes), so the labels must be exactly 0..num_classes-1. Check it
# here instead of hitting an opaque out-of-range error inside the loss later.
label_values = set(df["encoded_label"].unique().tolist())
assert label_values == set(range(num_classes)), (
    f"encoded_label must cover exactly 0..{num_classes - 1}, got {sorted(label_values, key=str)}"
)

# Human-readable class names are resolved after the split (from the TRAIN rows),
# so only a placeholder is set here. Both branches of the old if/else assigned
# None, so a single assignment is equivalent.
label_to_names = None

# --------------- TRAIN/VAL SPLIT (video-independent) ----------------
# Split by video_id so that all frames of one video end up on the same side.
# train_size=0.8 is applied to the groups (videos), so the resulting *sample*
# ratio is not necessarily 80/20. random_state=SEED keeps the split repeatable.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=SEED)
train_idx, val_idx = next(gss.split(df, groups=df["video_id"]))
# .copy() gives independent frames rather than views onto df
df_train, df_val = df.iloc[train_idx].copy(), df.iloc[val_idx].copy()

print(f"Train samples: {len(df_train)} | Val samples: {len(df_val)}")
print(f"Train videos: {df_train['video_id'].nunique()} | Val videos: {df_val['video_id'].nunique()}")

# Now finalize readable class names if available
if "finding_class" in df.columns:
    # Map each encoded_label to its most common finding_class in the TRAIN set:
    # count (label, name) pairs, order by label then descending count, and keep
    # the first row per label. Labels that never occur in TRAIN get no entry here;
    # the reporting code falls back to "class_<i>" for those.
    names = (
        df_train.groupby(["encoded_label", "finding_class"]).size()
        .reset_index(name="count")
        .sort_values(["encoded_label", "count"], ascending=[True, False])
        .drop_duplicates(subset=["encoded_label"])
    )
    label_to_names = dict(zip(names["encoded_label"].astype(int), names["finding_class"].astype(str)))
else:
    # No name column: use generic placeholders for every class id
    label_to_names = {i: f"class_{i}" for i in range(num_classes)}

# --------------- TRANSFORMS ----------------
# Per-channel normalization statistics (the standard ImageNet values).
mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

# Pieces shared by both pipelines; these transforms hold no per-sample state.
_resize_step = T.Resize((IMG_SIZE, IMG_SIZE))
_tensor_steps = [T.ToTensor(), T.Normalize(mean, std)]

# Training: resize -> random horizontal flip -> tensor -> normalize
train_tf = T.Compose([_resize_step, T.RandomHorizontalFlip(p=0.5), *_tensor_steps])

# Validation: same pipeline without the random flip
val_tf = T.Compose([_resize_step, *_tensor_steps])

# --------------- DATA LOADERS ----------------
train_ds = KvasirDataset(df_train, IMG_ROOT, train_tf)
val_ds   = KvasirDataset(df_val,   IMG_ROOT, val_tf)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
if NUM_WORKERS > 0:
    # These two options are only accepted together with worker processes;
    # DataLoader raises ValueError for them when num_workers=0, which used to
    # make the single-process debugging setting (NUM_WORKERS = 0) unusable.
    loader_kwargs.update(persistent_workers=True, prefetch_factor=4)

# Training: reshuffled every epoch; a trailing smaller batch is dropped.
train_loader = DataLoader(train_ds, shuffle=True, drop_last=True, **loader_kwargs)

# Validation: fixed order, every sample kept.
val_loader = DataLoader(val_ds, shuffle=False, drop_last=False, **loader_kwargs)

# Pull one batch as an end-to-end check of paths, transforms and collation
print("One training batch (images, labels):")
sample_imgs, sample_labels = next(iter(train_loader))
print("Image batch shape:", sample_imgs.shape)   # [B, 3, H, W]
print("Label batch shape:", sample_labels.shape) # [B]

# --------------- MODEL ----------------
if not USE_PRETRAINED:
    # Randomly initialised ResNet50
    model = models.resnet50(weights=None)
else:
    # Prefer the weights enum; fall back to its string name when the enum
    # (or the member) is not exposed by the installed torchvision.
    _weights_enum = getattr(models, "ResNet50_Weights", None)
    weights = getattr(_weights_enum, "IMAGENET1K_V2", "IMAGENET1K_V2")
    model = models.resnet50(weights=weights)

# Swap the classifier head for one with `num_classes` outputs
head_inputs = model.fc.in_features
model.fc = nn.Linear(head_inputs, num_classes)

# Head-only training: switch off gradients for everything except `fc.*`
if FREEZE_BACKBONE:
    for name, p in model.named_parameters():
        if name.startswith("fc."):
            continue
        p.requires_grad_(False)

model = model.to(DEVICE)

# --------------- LOSS & OPTIMIZER ---------------
# Cross-entropy with label smoothing (amount set by LABEL_SMOOTH)
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)

# Hand Adam only the parameters that still require gradients
params = list(filter(lambda q: q.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params, lr=LR, weight_decay=WEIGHT_DECAY)

# Gradient scaler for mixed precision; stays None when running on CPU
scaler = None
if DEVICE == 'cuda':
    scaler = amp.GradScaler('cuda')

# --------------- TRAIN / EVAL HELPERS ---------------
@torch.no_grad()
def _accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    return (logits.argmax(dim=1) == labels).float().mean().item()


def run_epoch(loader, train: bool = True, desc: str = "train"):
    """Run one full pass over ``loader`` and return ``(avg_loss, avg_acc)``.

    Relies on the module-level ``model``, ``criterion``, ``optimizer``,
    ``scaler``, ``DEVICE`` and ``LOG_EVERY``.

    Args:
        loader: iterable of ``(images, labels)`` batches that supports ``len()``.
        train:  True -> training mode with an optimizer step per batch;
                False -> eval mode, no weight updates and no autograd graph.
        desc:   label for the pass; currently unused, kept for call compatibility.

    Returns:
        Sample-weighted mean loss and accuracy over all batches seen
        (both 0.0 if the loader yields nothing).
    """
    from contextlib import nullcontext  # local import: hoisted out of the batch loop

    # NOTE(review): train(True) switches every sub-module to training mode, so a
    # frozen backbone's normalisation layers presumably still update their running
    # statistics when FREEZE_BACKBONE is set - confirm this is intended.
    model.train(train)
    use_amp = DEVICE == 'cuda'
    n_steps = len(loader)

    total_loss, total_correct, total_seen = 0.0, 0, 0

    for step, (images, labels) in enumerate(loader, 1):
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        # Validation must not record an autograd graph (it previously did, wasting
        # memory and time); mixed precision is used only on CUDA.
        grad_ctx = nullcontext() if train else torch.no_grad()
        amp_ctx = amp.autocast('cuda', enabled=True) if use_amp else nullcontext()
        with grad_ctx, amp_ctx:
            logits = model(images)
            loss = criterion(logits, labels)

        if train:
            optimizer.zero_grad(set_to_none=True)
            if scaler is not None:
                # Scale the loss before backward; scaler.step() skips the update
                # if the gradients contain inf/NaN.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

        # Running statistics, weighted by batch size so a smaller batch counts less
        with torch.no_grad():
            preds = logits.argmax(dim=1)
            correct = (preds == labels).sum().item()
            batch_sz = labels.size(0)
            loss_value = float(loss.item())
            total_correct += correct
            total_seen += batch_sz
            total_loss += loss_value * batch_sz

        # Debug print: first step and every LOG_EVERY-th step
        if step % LOG_EVERY == 0 or step == 1:
            batch_acc = correct / batch_sz
            print(f"  Step {step}/{n_steps} | Loss: {loss_value:.4f} | Batch acc: {batch_acc:.2f}")

    avg_loss = total_loss / max(1, total_seen)
    avg_acc  = total_correct / max(1, total_seen)
    return avg_loss, avg_acc


def collect_predictions(loader):
    """Predict every batch of ``loader`` with the module-level ``model`` in eval mode.

    Returns ``(y_true, y_pred)``: two 1-D numpy arrays holding the ground-truth
    labels and the arg-max class predictions, concatenated in loader order.
    """
    def _forward(batch):
        # Mixed precision only on CUDA; plain forward pass otherwise
        if DEVICE == 'cuda':
            with amp.autocast('cuda', enabled=True):
                return model(batch)
        return model(batch)

    model.eval()
    true_chunks, pred_chunks = [], []

    for images, labels in loader:
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)
        with torch.no_grad():
            batch_preds = _forward(images).argmax(dim=1)
        true_chunks.append(labels.cpu().numpy())
        pred_chunks.append(batch_preds.cpu().numpy())

    return np.concatenate(true_chunks), np.concatenate(pred_chunks)

# --------------- TRAIN FOR A FEW EPOCHS ---------------
print("\n==== START TRAINING ====\n")
start_time = time.time()
best_val_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # Optimise on the training split, then score the validation split
    tr_loss, tr_acc = run_epoch(train_loader, train=True, desc="train")
    print(f"Train: loss={tr_loss:.4f}, acc={tr_acc:.3f}")
    vl_loss, vl_acc = run_epoch(val_loader, train=False, desc="val")
    print(f"Val:   loss={vl_loss:.4f}, acc={vl_acc:.3f}")

    # Checkpoint only when validation accuracy strictly improves
    if vl_acc > best_val_acc:
        best_val_acc = vl_acc
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"  ✅ New best model saved → {SAVE_PATH} (acc={best_val_acc:.3f})")

    # Diagnostics: a second pass over the validation split gathers predictions
    y_true, y_pred = collect_predictions(val_loader)
    class_ids = list(range(num_classes))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print("\nValidation Confusion Matrix (rows=true, cols=pred):")
    # Compact printing for a small matrix; save to file if it gets large
    with np.printoptions(threshold=200, linewidth=200):
        print(cm)

    # Readable names, with a generic fallback for labels lacking one
    target_names = []
    for class_id in class_ids:
        target_names.append(label_to_names.get(class_id, f"class_{class_id}"))
    print("\nPer-class metrics (validation):")
    report = classification_report(y_true, y_pred, labels=class_ids, target_names=target_names, digits=3)
    print(report)

elapsed = time.time() - start_time
print("\nTraining finished!")
print("Best validation accuracy:", round(best_val_acc, 3))
print("Elapsed time: %.1f sec" % elapsed)


Using device: cuda
Tesla V100-PCIE-32GB
Allocated: 0.56 GB
Reserved:  3.44 GB
First rows of metadata_normalized.csv:
                                       image_path        finding_class  \
0  Normal clean mucosa/0728084c8da942d9_22803.jpg  Normal clean mucosa   
1  Normal clean mucosa/0728084c8da942d9_22804.jpg  Normal clean mucosa   
2  Normal clean mucosa/0728084c8da942d9_22805.jpg  Normal clean mucosa   
3  Normal clean mucosa/0728084c8da942d9_22806.jpg  Normal clean mucosa   
4  Normal clean mucosa/0728084c8da942d9_22807.jpg  Normal clean mucosa   

           video_id  encoded_label  
0  0728084c8da942d9              9  
1  0728084c8da942d9              9  
2  0728084c8da942d9              9  
3  0728084c8da942d9              9  
4  0728084c8da942d9              9  
Number of classes: 14
Train samples: 37430 | Val samples: 9818
Train videos: 34 | Val videos: 9
One training batch (images, labels):
Image batch shape: torch.Size([32, 3, 224, 224])
Label batch shape: torch.Size([32]

In [19]:
# %% [markdown]
# # Tiny ResNet50 sanity trainer for Kvasir-Capsule (debug-friendly)
# 
# * Beginner-friendly, **now extremely commented**
# * Safe, small training run (2 epochs) to check end-to-end wiring
# * Works well on Tesla V100 (uses mixed precision)
# * Video-independent split (by `video_id`) to avoid leakage
# * Extra debug prints (device, memory, batch shapes, step logs)
# * Optional: freeze backbone for very fast convergence of the classifier head
# * Validation summary includes confusion matrix + per-class metrics
# 
# ➕ Changes vs earlier cell:
# - Uses **new AMP API**: `from torch import amp; amp.autocast('cuda')` to avoid deprecation warnings
# - Adds **label smoothing** + optional **weight decay** for stability
# - Adds **GPU-2 pinning** (can be disabled by setting GPU_INDEX=None)
# - Adds **confusion matrix** + **classification report** after validation
# 
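# NOTE: The training logic below is unchanged from the earlier cell. What differs: the data paths
# now point to the *filtered* metadata (metadata_filtered.csv), and class names are loaded from a
# saved label-classes .npy file when available. Everything else is added comments.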

# %%
# -------------------------
# Standard library imports
# -------------------------
# os       : filesystem paths, environment variables (e.g., CUDA_VISIBLE_DEVICES)
# time     : simple wall-clock timing of the whole train run
# math     : not strictly needed here, but often handy for schedulers, etc.
# warnings : to silence benign warnings for a cleaner notebook output
import os, time, math, warnings
from collections import Counter  # (not used below, but often useful for label histograms)

# -------------------------
# Third-party imports
# -------------------------
# pandas      : reading the metadata CSV with image paths / labels / groups
# numpy       : CPU-side numeric ops, array concatenation for metrics
# PIL.Image   : robust image loading; handles various formats
import pandas as pd
import numpy as np
from PIL import Image, UnidentifiedImageError

# -------------------------
# PyTorch / TorchVision
# -------------------------
# torch             : tensors, autograd, CUDA utils
# torch.nn          : neural network layers + loss functions
# DataLoader        : mini-batching, shuffling, prefetching with workers
# torchvision       : common transforms and pretrained CNN backbones
# amp               : Automatic Mixed Precision for speed + memory savings
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models as models
from torch import amp

# -------------------------
# Scikit-learn utilities
# -------------------------
# GroupShuffleSplit : ensures train/val split is **video-independent** (group = video_id)
# Metrics           : confusion matrix + per-class precision/recall/F1 report
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report

# Silence overly chatty user warnings (e.g., PIL/torchvision), keeps output readable.
warnings.simplefilter("ignore", UserWarning)

# ========================
# CONFIGURATION CONSTANTS
# ========================
# Paths to data and outputs. They are absolute and machine-specific; adjust them to your environment.
CSV_PATH      = "/home/stud/fwag/bhome/ele670_project/data/processed/metadata_filtered.csv"  # metadata table (image_path, encoded_label, video_id)
IMG_ROOT      = "/home/stud/fwag/bhome/ele670_project/data/raw/labelled_images"  # image_path values are joined onto this directory
RESULTS_DIR   = "/home/stud/fwag/bhome/ele670_project/results"  # created further down if missing
SAVE_PATH     = os.path.join(RESULTS_DIR, "tiny_resnet50_best.pt")  # best-validation state_dict is written here

# Optional: array of class names, indexed by encoded label, loaded with np.load below.
# Presumably written from LabelEncoder.classes_ during preprocessing - TODO confirm.
LABELS_NPY    = "/home/stud/fwag/bhome/ele670_project/data/processed/label_classes_filtered.npy"

# Core training hyperparameters
IMG_SIZE      = 224           # images are resized to IMG_SIZE x IMG_SIZE before entering the network
BATCH_SIZE    = 32            # try 64/96 if VRAM allows; affects throughput + stability
NUM_WORKERS   = 6             # dataloader subprocesses; tune for server CPU
EPOCHS        = 2             # keep short for sanity runs; increase when confident
LR            = 1e-3          # Adam learning rate
WEIGHT_DECAY  = 1e-4          # weight_decay passed to Adam (small L2-style regularization)
LABEL_SMOOTH  = 0.05          # label_smoothing passed to CrossEntropyLoss; reduces overconfidence
FREEZE_BACKBONE = True        # train only final FC layer for a *very* fast sanity pass
USE_PRETRAINED = True         # start from ImageNet weights for quick convergence
SEED          = 42            # seeds torch, numpy and the train/val group split

# Optional: requested GPU index (e.g., GPU 2 on a multi-GPU server); consumed by the
# device-selection step below. If None, the default CUDA device is used.
GPU_INDEX     = 2

# How often to print batch-level logs during training/validation loops (step 1 is always printed)
LOG_EVERY     = 10

# ----------------------------
# (1) Reproducibility seeding
# ----------------------------
torch.manual_seed(SEED)  # torch global RNG
np.random.seed(SEED)     # numpy global RNG

# -----------------------------------------------------------
# (2) Pin to a specific GPU *before* constructing CUDA objects
# -----------------------------------------------------------
# CUDA_VISIBLE_DEVICES is read once, when the CUDA driver is initialised, and by
# default torch.cuda.is_available() already triggers that initialisation. Writing
# the variable at this point therefore cannot hide GPUs from the running kernel,
# so the previous masking approach left training on device 0. Select the GPU
# explicitly instead; the plain "cuda" device string then refers to the selected
# (current) device. To truly mask GPUs, export CUDA_VISIBLE_DEVICES before the
# kernel makes its first CUDA call.
if torch.cuda.is_available() and GPU_INDEX is not None:
    visible_gpus = torch.cuda.device_count()
    if 0 <= GPU_INDEX < visible_gpus:
        torch.cuda.set_device(GPU_INDEX)
    else:
        # Typical when the kernel was launched with CUDA_VISIBLE_DEVICES already
        # restricting the process to fewer devices: keep the default device.
        print(f"[WARNING] GPU_INDEX={GPU_INDEX} is not among the {visible_gpus} visible "
              f"CUDA device(s); staying on the default device.")

# Decide the device string used by every .to(DEVICE) call below
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
if DEVICE == "cuda":
    # Report the GPU that will actually be used (not unconditionally index 0)
    # together with this process's current memory state.
    current_gpu = torch.cuda.current_device()
    print(torch.cuda.get_device_name(current_gpu), f"(cuda:{current_gpu})")
    print("Allocated:", round(torch.cuda.memory_allocated()/1024**3, 2), "GB")
    print("Reserved: ", round(torch.cuda.memory_reserved()/1024**3, 2), "GB")

# Ensure output directory exists
os.makedirs(RESULTS_DIR, exist_ok=True)

# Enable cuDNN autotuner: faster convs when input shapes are static (typical in CV)
torch.backends.cudnn.benchmark = True
# Best-effort speed-up: the setter does not exist on older PyTorch builds, so
# check for it instead of swallowing every exception.
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

# ========================
# DATASET IMPLEMENTATION
# ========================
class KvasirDataset(Dataset):
    """Thin PyTorch Dataset wrapper around a DataFrame of image rows.

    Each row must contain:
    - image_path     : relative path to the image file on disk (under ``img_root``)
    - encoded_label  : integer class id (0..num_classes-1)

    Returns (image_tensor, label_int) for each index. If an image cannot be
    opened or decoded (missing, unreadable, truncated, unrecognised format) we
    emit a warning and substitute a black IMG_SIZE x IMG_SIZE placeholder so
    the batch shapes stay valid. The row's label is returned unchanged.
    """
    def __init__(self, df: pd.DataFrame, img_root: str, transform):
        # reset_index returns a new frame with consecutive 0..N-1 indices
        self.df = df.reset_index(drop=True)
        self.img_root = img_root
        self.transform = transform

    def __len__(self):
        # Required for PyTorch to know how many items are in the dataset
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup of the sample's metadata row
        row = self.df.iloc[idx]
        path = os.path.join(self.img_root, row["image_path"])
        try:
            # Decode to 3-channel RGB; the context manager releases the file
            # handle as soon as the converted copy exists.
            with Image.open(path) as src:
                img = src.convert("RGB")
        except OSError as e:
            # FileNotFoundError and PIL's UnidentifiedImageError are both OSError
            # subclasses; catching the parent also covers truncated/unreadable
            # files, which previously escaped and crashed the loader.
            print(f"[WARNING] Failed to open {path}: {e}")
            img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (0, 0, 0))
        # Apply the torchvision pipeline supplied at construction time
        x = self.transform(img)
        # Labels are expected to already be numeric in the CSV
        y = int(row["encoded_label"])  # integer class id
        return x, y

# ==================
# LOAD & VALIDATE CSV
# ==================
# The CSV is expected to contain at least: image_path, encoded_label, video_id
# (finding_class is optional, used only for pretty printing class names later.)
df = pd.read_csv(CSV_PATH)
# Name the file that was actually read; the message used to hard-code
# "metadata_normalized.csv" although CSV_PATH points at a different file.
print(f"First rows of {os.path.basename(CSV_PATH)}:")
print(df.head())

# Basic schema checks to fail fast if the CSV is malformed
required_cols = {"image_path", "encoded_label", "video_id"}
missing = required_cols - set(df.columns)
assert not missing, f"CSV is missing required columns: {missing}"

# Determine number of unique classes from the encoded label column
num_classes = df["encoded_label"].nunique()
print(f"Number of classes: {num_classes}")

# The classifier head gets `num_classes` outputs and the metrics enumerate
# range(num_classes), so the labels must be exactly 0..num_classes-1. A filtered
# CSV that kept its original encoding would violate this; check it here instead
# of hitting an opaque out-of-range error inside the loss later.
label_values = set(df["encoded_label"].unique().tolist())
assert label_values == set(range(num_classes)), (
    f"encoded_label must cover exactly 0..{num_classes - 1}, got {sorted(label_values, key=str)}"
)

# Human-readable class names are resolved after the split; only a placeholder is
# set here. Both branches of the old if/else assigned None, so a single
# assignment is equivalent.
label_to_names = None

# ==============================================
# TRAIN/VAL SPLIT — **VIDEO-INDEPENDENT** SPLIT
# ==============================================
# GroupShuffleSplit ensures all frames from the same video_id are kept together
# (i.e., no leakage where the model sees frames of the same video in both splits.)
# train_size=0.8 is applied to the groups (videos), so the resulting *sample*
# ratio is not necessarily 80/20. random_state=SEED keeps the split repeatable.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=SEED)
train_idx, val_idx = next(gss.split(df, groups=df["video_id"]))
# .copy() gives independent frames rather than views onto df
df_train, df_val = df.iloc[train_idx].copy(), df.iloc[val_idx].copy()

print(f"Train samples: {len(df_train)} | Val samples: {len(df_val)}")
print(f"Train videos: {df_train['video_id'].nunique()} | Val videos: {df_val['video_id'].nunique()}")

# --------------- LABEL NAMES (preferred: from saved class array) ---------------
# Try to load the class-name array from LABELS_NPY; position i is used as the name
# of encoded label i.
# NOTE(review): this only lines up with `encoded_label` if the file was written by
# the same encoder that produced this CSV, and its length is not compared with
# num_classes here - TODO confirm both.
# NOTE(review): allow_pickle=True unpickles the file's contents; only point
# LABELS_NPY at files you created yourself.
try:
    classes = np.load(LABELS_NPY, allow_pickle=True)
    label_to_names = {i: str(name) for i, name in enumerate(classes)}
    print(f"Loaded {len(label_to_names)} class names from {LABELS_NPY}")
except Exception as e:
    # Deliberately broad best-effort: any load failure falls back to inferring
    # names from TRAIN frequencies (previous behavior).
    print(f"[WARN] Could not load {LABELS_NPY} ({e}). Falling back to inferring from TRAIN.")
    if "finding_class" in df.columns:
        # Count (label, name) pairs, order by label then descending count, and keep
        # the first row per label. Labels absent from TRAIN get no entry here.
        names = (
            df_train.groupby(["encoded_label", "finding_class"]).size()
            .reset_index(name="count")
            .sort_values(["encoded_label", "count"], ascending=[True, False])
            .drop_duplicates(subset=["encoded_label"])  # keep top name per label
        )
        label_to_names = dict(zip(names["encoded_label"].astype(int), names["finding_class"].astype(str)))
    else:
        # No name column: use generic placeholders for every class id
        label_to_names = {i: f"class_{i}" for i in range(num_classes)}

# =====================
# IMAGE TRANSFORMATIONS
# =====================
# Standard ImageNet per-channel normalization statistics
mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

# Building blocks shared by both pipelines; these transforms hold no per-sample state.
_resize_step = T.Resize((IMG_SIZE, IMG_SIZE))
_tensor_steps = [T.ToTensor(), T.Normalize(mean, std)]

# TRAIN pipeline: fixed-size resize → random horizontal flip → tensorize → normalize
train_tf = T.Compose([_resize_step, T.RandomHorizontalFlip(p=0.5), *_tensor_steps])

# VAL pipeline: identical, minus the random flip (no randomness at evaluation time)
val_tf = T.Compose([_resize_step, *_tensor_steps])

# ==================
# DATA LOADERS (PyTorch)
# ==================
train_ds = KvasirDataset(df_train, IMG_ROOT, train_tf)
val_ds   = KvasirDataset(df_val,   IMG_ROOT, val_tf)

# Settings shared by both loaders.
# pin_memory=True: page-locked host buffers speed up host→GPU transfers
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
if NUM_WORKERS > 0:
    # persistent_workers=True: avoids worker restart overhead across epochs
    # prefetch_factor: how many batches each worker preloads; tune with NUM_WORKERS
    # Both options are only accepted together with worker processes; DataLoader
    # raises ValueError for them when num_workers=0, which used to make the
    # single-process debugging setting (NUM_WORKERS = 0) unusable.
    loader_kwargs.update(persistent_workers=True, prefetch_factor=4)

# Training: reshuffled every epoch; drop_last=True keeps every batch at exactly BATCH_SIZE
train_loader = DataLoader(train_ds, shuffle=True, drop_last=True, **loader_kwargs)

# Validation: fixed order, every sample kept
val_loader = DataLoader(val_ds, shuffle=False, drop_last=False, **loader_kwargs)

# Quick sanity print to confirm tensor shapes (also exercises paths/transforms end to end)
print("One training batch (images, labels):")
sample_imgs, sample_labels = next(iter(train_loader))
print("Image batch shape:", sample_imgs.shape)   # [B, 3, H, W]
print("Label batch shape:", sample_labels.shape) # [B]

# ==============
# MODEL DEFINITION
# ==============
# Build a ResNet50; with USE_PRETRAINED it starts from the IMAGENET1K_V2 weights.
if not USE_PRETRAINED:
    # Randomly initialised network
    model = models.resnet50(weights=None)
else:
    # Prefer the weights enum; fall back to its string name when the enum
    # (or the member) is not exposed by the installed torchvision.
    _weights_enum = getattr(models, "ResNet50_Weights", None)
    weights = getattr(_weights_enum, "IMAGENET1K_V2", "IMAGENET1K_V2")
    model = models.resnet50(weights=weights)

# Swap the final classification layer for one with `num_classes` outputs
head_inputs = model.fc.in_features
model.fc = nn.Linear(head_inputs, num_classes)

# Head-only training: switch off gradients for every parameter outside `fc.*`.
# Handy for quick sanity checks; set FREEZE_BACKBONE=False for full fine-tuning.
if FREEZE_BACKBONE:
    for name, p in model.named_parameters():
        if name.startswith("fc."):
            continue
        p.requires_grad_(False)

# Move the model to the chosen device (GPU or CPU)
model = model.to(DEVICE)

# ======================
# LOSS FUNCTION & OPTIMIZER
# ======================
# Cross-entropy with label smoothing (amount set by LABEL_SMOOTH), which tempers
# over-confident predictions.
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)

# Hand Adam only the parameters that still require gradients (respects FREEZE_BACKBONE)
params = list(filter(lambda q: q.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params, lr=LR, weight_decay=WEIGHT_DECAY)

# Gradient scaler for mixed-precision training; stays None when running on CPU
scaler = None
if DEVICE == 'cuda':
    scaler = amp.GradScaler('cuda')

# ======================
# TRAIN / EVAL UTILITIES
# ======================
@torch.no_grad()
def _accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """Compute top-1 accuracy for a batch (utility; not used in final printouts)."""
    return (logits.argmax(dim=1) == labels).float().mean().item()


def run_epoch(loader, train: bool = True, desc: str = "train"):
    """Runs one pass over a DataLoader.

    Relies on the module-level ``model``, ``criterion``, ``optimizer``,
    ``scaler``, ``DEVICE`` and ``LOG_EVERY``.

    Args:
      loader: iterable yielding (images, labels) batches; must support len()
      train : if True, updates model weights; if False, evaluation-only
              (eval mode, no optimizer step, no autograd graph)
      desc  : label for the pass ("train"/"val"); currently unused, kept for
              call compatibility
    Returns:
      (avg_loss, avg_accuracy), both weighted by batch size; (0.0, 0.0) if the
      loader yields nothing
    """
    from contextlib import nullcontext  # local import: hoisted out of the batch loop

    # Toggles dropout/batchnorm behavior for the whole network.
    # NOTE(review): train(True) also affects the frozen backbone, whose
    # normalisation layers presumably keep updating their running statistics
    # when FREEZE_BACKBONE is set - confirm this is intended.
    model.train(train)
    use_amp = DEVICE == 'cuda'
    n_steps = len(loader)

    total_loss, total_correct, total_seen = 0.0, 0, 0

    for step, (images, labels) in enumerate(loader, 1):
        # non_blocking=True lets the copy run asynchronously when the source is pinned memory
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        # Validation must not record an autograd graph (it previously did, wasting
        # memory and time); mixed precision is used only on CUDA.
        grad_ctx = nullcontext() if train else torch.no_grad()
        amp_ctx = amp.autocast('cuda', enabled=True) if use_amp else nullcontext()

        # Forward pass
        with grad_ctx, amp_ctx:
            logits = model(images)
            loss = criterion(logits, labels)

        if train:
            optimizer.zero_grad(set_to_none=True)
            if scaler is not None:
                # AMP path: scale the loss before backward to avoid gradient
                # underflow; scaler.step() skips the update on inf/NaN gradients.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

        # --- Metrics bookkeeping (plain Python numbers) ---
        with torch.no_grad():
            preds = logits.argmax(dim=1)
            correct = (preds == labels).sum().item()
            batch_sz = labels.size(0)
            loss_value = float(loss.item())
            total_correct += correct
            total_seen += batch_sz
            total_loss += loss_value * batch_sz  # mean batch loss re-weighted by batch size

        # Periodic progress print: first step and every LOG_EVERY-th step
        if step % LOG_EVERY == 0 or step == 1:
            batch_acc = correct / batch_sz
            print(f"  Step {step}/{n_steps} | Loss: {loss_value:.4f} | Batch acc: {batch_acc:.2f}")

    # Compute dataset-wide averages (max() guards against an empty loader)
    avg_loss = total_loss / max(1, total_seen)
    avg_acc  = total_correct / max(1, total_seen)
    return avg_loss, avg_acc


def collect_predictions(loader):
    """Run the model in eval mode and collect all predictions + labels.

    Uses the notebook-level globals ``model``, ``DEVICE`` and ``amp``. The
    model is switched to eval mode and is left in that mode on return.

    Parameters
    ----------
    loader : iterable of (images, labels)
        Yields batches of tensors. ``images`` is moved to ``DEVICE`` and fed
        to ``model``; ``labels`` is only copied to host memory and collected.

    Returns
    -------
    y_true : np.ndarray of shape (N,)
        Ground-truth integer labels for the entire split.
    y_pred : np.ndarray of shape (N,)
        Predicted class indices (argmax over logits) for the entire split.

    Both arrays have shape ``(0,)`` when ``loader`` yields no batches.
    """
    model.eval()
    all_labels = []
    all_preds  = []

    for images, labels in loader:
        images = images.to(DEVICE, non_blocking=True)
        with torch.no_grad():
            # Mixed precision only on CUDA; any other device runs the plain forward pass
            if DEVICE == 'cuda':
                with amp.autocast('cuda', enabled=True):
                    logits = model(images)
            else:
                logits = model(images)
            preds = logits.argmax(dim=1)

        # Labels are never used on the device here, so they are not copied to
        # DEVICE first; .cpu() still covers loaders that yield device tensors.
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

    # np.concatenate raises on an empty list, so return empty vectors instead
    if not all_labels:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    # Concatenate list of arrays into one long vector per side
    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)
    return y_true, y_pred

# ============================
# MAIN TRAINING CONTROL-LOOP
# ============================
# Per epoch: train, validate, checkpoint when validation accuracy improves,
# then print a confusion matrix and a per-class report for the validation split.
print("\n==== START TRAINING ====\n")
best_val_acc = 0.0  # track best validation accuracy to decide checkpointing
start_time = time.time()

# Loop-invariant reporting inputs: built once instead of on every epoch
class_ids = list(range(num_classes))
target_names = [label_to_names.get(i, f"class_{i}") for i in class_ids]

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # One training epoch
    tr_loss, tr_acc = run_epoch(train_loader, train=True, desc="train")
    print(f"Train: loss={tr_loss:.4f}, acc={tr_acc:.3f}")

    # One validation epoch (no weight updates)
    vl_loss, vl_acc = run_epoch(val_loader, train=False, desc="val")
    print(f"Val:   loss={vl_loss:.4f}, acc={vl_acc:.3f}")

    # Save a checkpoint if validation accuracy improved.
    # Strict ">" means a tie keeps the earlier checkpoint; only the state_dict is written.
    if vl_acc > best_val_acc:
        best_val_acc = vl_acc
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"  ✅ New best model saved → {SAVE_PATH} (acc={best_val_acc:.3f})")

    # --- Post-epoch diagnostics on the validation split ---
    # NOTE: this iterates val_loader a second time this epoch (after run_epoch above)
    y_true, y_pred = collect_predictions(val_loader)

    # 1) Confusion matrix: rows = actual class, columns = predicted class
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print("\nValidation Confusion Matrix (rows=true, cols=pred):")
    # Pretty-print small matrices to console; for large K consider saving to file
    with np.printoptions(threshold=200, linewidth=200):
        print(cm)

    # 2) Per-class precision/recall/F1 (macro/weighted) with human-friendly names.
    # zero_division=0 reports 0 for metrics that are undefined (a class with no
    # predicted or no true samples) -- the same value the default produces, but
    # without emitting an UndefinedMetricWarning on every epoch.
    print("\nPer-class metrics (validation):")
    print(classification_report(y_true, y_pred,
                                labels=class_ids,
                                target_names=target_names,
                                digits=3,
                                zero_division=0))

print("\nTraining finished!")
print("Best validation accuracy:", round(best_val_acc, 3))
print("Elapsed time: %.1f sec" % (time.time() - start_time))


Using device: cuda
Tesla V100-PCIE-32GB
Allocated: 0.1 GB
Reserved:  0.41 GB
First rows of metadata_normalized.csv:
                                       image_path        finding_class  \
0  Normal clean mucosa/0728084c8da942d9_22803.jpg  Normal clean mucosa   
1  Normal clean mucosa/0728084c8da942d9_22804.jpg  Normal clean mucosa   
2  Normal clean mucosa/0728084c8da942d9_22805.jpg  Normal clean mucosa   
3  Normal clean mucosa/0728084c8da942d9_22806.jpg  Normal clean mucosa   
4  Normal clean mucosa/0728084c8da942d9_22807.jpg  Normal clean mucosa   

           video_id  encoded_label  
0  0728084c8da942d9              7  
1  0728084c8da942d9              7  
2  0728084c8da942d9              7  
3  0728084c8da942d9              7  
4  0728084c8da942d9              7  
Number of classes: 11
Train samples: 37408 | Val samples: 9763
Train videos: 34 | Val videos: 9
Loaded 11 class names from /home/stud/fwag/bhome/ele670_project/data/processed/label_classes_filtered.npy
One training ba