<a href="https://colab.research.google.com/github/Dimavl2025/skin_cancer_clasification/blob/main/dinoV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<big>The first step is to train the dinov2 classifier with labeled data</big>

*Prerequisites:*

*pip install torch torchvision timm scikit-learn*


In [None]:
# Make the user's Google Drive visible inside the Colab VM
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# BEST SUBMISSION so far: LR = 5e-4, 50 epochs, weight_decay = 1e-4, validation split = int(0.15 * n) per class (val_pos / val_neg)

# =========================
# FULL COLAB SCRIPT (Option 1)
# ResNet18 (NO pretrained weights) + Oversampling + Best F1 model
# + Choose BEST threshold on VAL (maximize F1) + Kaggle submission (NO argmax)
# =========================

!pip -q install scikit-learn pandas tqdm

import os, re, random, shutil
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, WeightedRandomSampler, Subset
from torchvision import datasets, transforms, models
from sklearn.metrics import f1_score, precision_score, recall_score

# -------------------------
# Mount Drive (safe)
# -------------------------
from google.colab import drive
# Guard on the "MyDrive" folder that ROOT lives under rather than on the bare
# mount point: an empty /content/drive directory may exist without Drive being
# mounted, and in that case the mount must not be skipped.
if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

# -------------------------
# Config
# -------------------------
# Optimisation knobs
SEED = 42
LR = 5e-4
EPOCHS = 150

# Data-loading knobs
BATCH_SIZE = 64
NUM_WORKERS = 2
IMG_SIZE = 224

# Dataset location on the mounted Drive.
# Expected layout: ROOT/train/0 , ROOT/train/1 , ROOT/test
ROOT = "/content/drive/MyDrive/Colab Notebooks/Course/Skin_Cancer_Classification"
TRAIN_ROOT, TEST_DIR = (os.path.join(ROOT, part) for part in ("train", "test"))

# Everything this notebook writes (best checkpoint + submission CSV) goes under OUT_DIR.
OUT_DIR = os.path.join(ROOT, "models_resnet18_scratch")

BEST_MODEL_PATH, SUB_VALTH_PATH = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("best_resnet18_f1.pt", "submission_resnet18_bestValThreshold.csv")
)

# Create the output folder up front so later saves cannot fail on a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------
# Reproducibility
# -------------------------
def seed_everything(seed=SEED, deterministic=False):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN.

    Args:
        seed: Integer seed applied to `random`, NumPy, and torch (CPU and all
            CUDA devices).
        deterministic: When False (the default, identical to the previous
            behaviour) cuDNN autotuning is enabled and deterministic kernels
            are not requested, so GPU runs may still differ between executions
            despite the fixed seed. When True, deterministic cuDNN kernels are
            requested and autotuning is disabled (usually slower).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = bool(deterministic)
    torch.backends.cudnn.benchmark = not deterministic

seed_everything()

# Prefer the GPU when the runtime provides one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Mounted at /content/drive
Device: cpu


In [None]:
# -------------------------
# Step 1: Normalise the folder layout.
# If a class folder still sits directly under ROOT (ROOT/0, ROOT/1) it is moved
# to ROOT/train/0 or ROOT/train/1; a folder that is already in place is left alone.
# -------------------------
os.makedirs(TRAIN_ROOT, exist_ok=True)

for cls in ("0", "1"):
    src_root = os.path.join(ROOT, cls)
    dst_train = os.path.join(TRAIN_ROOT, cls)

    if os.path.exists(dst_train):
        print(f"[OK] {dst_train} exists")
    elif os.path.exists(src_root):
        print(f"[MOVE] {src_root}  -->  {dst_train}")
        shutil.move(src_root, dst_train)
    else:
        print(f"[WARN] Missing: {src_root} (and {dst_train} not found)")

# Fail fast if the layout is still incomplete after the optional move.
for required_dir, problem in (
    (os.path.join(TRAIN_ROOT, "0"), "train/0 not found"),
    (os.path.join(TRAIN_ROOT, "1"), "train/1 not found"),
    (TEST_DIR, "test folder not found"),
):
    assert os.path.isdir(required_dir), problem

# -------------------------
# Transforms
# -------------------------
def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps (the tail shared by both pipelines)."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ]

# Training pipeline: mild random crop / flips / colour jitter, then tensor + normalise.
train_tfms = transforms.Compose(
    [
        transforms.RandomResizedCrop(IMG_SIZE, scale=(0.85, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.10, hue=0.02),
    ]
    + _tensor_and_normalize()
)

# Validation pipeline: plain resize to IMG_SIZE x IMG_SIZE with no random augmentation.
val_tfms = transforms.Compose(
    [transforms.Resize((IMG_SIZE, IMG_SIZE))] + _tensor_and_normalize()
)


[OK] /content/drive/MyDrive/Colab Notebooks/Course/Skin_Cancer_Classification/train/0 exists
[OK] /content/drive/MyDrive/Colab Notebooks/Course/Skin_Cancer_Classification/train/1 exists


In [None]:
# -------------------------
# Step 2: Dataset + stratified train/val split
# -------------------------
# Training view of the image folder (random augmentation applied on access).
full_ds_train = datasets.ImageFolder(root=TRAIN_ROOT, transform=train_tfms)
print("Classes:", full_ds_train.classes, "class_to_idx:", full_ds_train.class_to_idx)

# Per-sample class index, in dataset order.
targets = np.array([label for _path, label in full_ds_train.samples])
idx_all = np.arange(len(full_ds_train))
pos_idx, neg_idx = idx_all[targets == 1], idx_all[targets == 0]

# Shuffle each class separately with a seeded generator so the split is repeatable.
rng = np.random.default_rng(SEED)
for class_indices in (pos_idx, neg_idx):
    rng.shuffle(class_indices)

# Hold out 15% of every class for validation (rounded down).
val_pos, val_neg = (int(0.15 * len(class_indices)) for class_indices in (pos_idx, neg_idx))

val_idx = np.concatenate((pos_idx[:val_pos], neg_idx[:val_neg]))
train_idx = np.concatenate((pos_idx[val_pos:], neg_idx[val_neg:]))
for split_indices in (train_idx, val_idx):
    rng.shuffle(split_indices)

train_ds = Subset(full_ds_train, train_idx)

# Second view over the same folder, using the deterministic validation transform.
# NOTE(review): val_idx is only valid here if both ImageFolder scans list the
# files in the same order - i.e. the folder must not change between the scans.
full_ds_val = datasets.ImageFolder(root=TRAIN_ROOT, transform=val_tfms)
val_ds = Subset(full_ds_val, val_idx)

train_targets = targets[train_idx]
n0 = int(np.count_nonzero(train_targets == 0))
n1 = int(np.count_nonzero(train_targets == 1))
print(f"Train count: 0={n0}, 1={n1} | ratio0/1={n0/max(n1,1):.2f}")


Classes: ['0', '1'] class_to_idx: {'0': 0, '1': 1}
Train count: 0=4250, 1=250 | ratio0/1=17.00


In [None]:
# -------------------------
# Step 3: Oversampling via WeightedRandomSampler (balance sampling probability)
# -------------------------
# NOTE: LR is deliberately not re-assigned here. The Config section is the
# single place that defines it, so editing it there actually takes effect.

# Weight every training sample by the inverse frequency of its class, so both
# classes are drawn with (roughly) equal probability.
class_counts = np.bincount(train_targets, minlength=2).astype(np.float64)
class_weights = 1.0 / np.maximum(class_counts, 1.0)  # max(., 1) guards an empty class
sample_weights = class_weights[train_targets]

sampler = WeightedRandomSampler(
    weights=torch.from_numpy(sample_weights),  # already float64
    num_samples=len(train_targets),            # one "epoch" = as many draws as training samples
    replacement=True,
)

# Pinned host memory only helps host->GPU copies; skip it when running on CPU.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler,
                          num_workers=NUM_WORKERS, pin_memory=(device.type == "cuda"))
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=NUM_WORKERS, pin_memory=(device.type == "cuda"))

# -------------------------
# Step 4: Model (ResNet18 from scratch)
# -------------------------
# No pretrained weights; the final layer is replaced by a 2-way classifier head.
model = models.resnet18(weights=None)
model.fc = nn.Linear(model.fc.in_features, 2)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
# torch.cuda.amp.GradScaler(...) is deprecated in favour of torch.amp.GradScaler("cuda", ...).
# Gradient scaling is only enabled when training on a CUDA device.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

@torch.no_grad()
def eval_f1_argmax(model, loader):
    """Return the F1 score of the positive class (label 1) using argmax predictions.

    The model is switched to eval mode and left in it; callers that continue
    training must call `model.train()` again.

    Args:
        model: Network that maps an image batch to per-class logits.
        loader: Iterable yielding `(images, labels)` batches.

    Returns:
        F1 score for `pos_label=1`; 0.0 (without a warning) when the score is
        undefined, e.g. when no sample is predicted positive.

    Raises:
        ValueError: If `loader` yields no batches (nothing to concatenate).
    """
    model.eval()
    ys, preds = [], []
    for x, y in loader:
        x = x.to(device, non_blocking=True)
        # Mixed precision only on CUDA; this is a no-op context on CPU.
        with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
            logits = model(x)
        pred = torch.argmax(logits, dim=1)
        # Labels are only needed as NumPy arrays, so they are never moved to the device.
        ys.append(y.detach().cpu().numpy())
        preds.append(pred.detach().cpu().numpy())
    ys = np.concatenate(ys)
    preds = np.concatenate(preds)
    # zero_division=0 matches the precision/recall calls used for threshold selection.
    return f1_score(ys, preds, pos_label=1, zero_division=0)


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))


In [None]:





# -------------------------
# Step 5: Train (save best by val F1 using argmax internally for selection)
# NOTE: This is only for model selection; final submission uses VAL-chosen threshold.
# -------------------------
best_f1 = -1.0
best_epoch = -1

for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False)
    for x, y in pbar:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        # Mixed precision only on CUDA; this is a no-op context on CPU.
        with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
            logits = model(x)
            loss = criterion(logits, y)

        # Scaled backward/step; the scaler passes through unchanged when disabled.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_loss = loss.item()  # read the scalar once per batch
        running_loss += batch_loss * x.size(0)
        pbar.set_postfix(loss=batch_loss)

    avg_loss = running_loss / len(train_ds)
    # Cast to a plain Python float: depending on the scikit-learn version the
    # metric may be a NumPy scalar, and pickling that into the checkpoint can
    # make it unloadable when torch.load runs in weights-only mode.
    val_f1 = float(eval_f1_argmax(model, val_loader))
    print(f"Epoch {epoch:02d} | train_loss={avg_loss:.4f} | val_f1(argmax)={val_f1:.4f}")

    # Strict ">" keeps the earliest epoch when several epochs tie on F1.
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch
        torch.save(
            {"model_state": model.state_dict(), "best_f1": best_f1, "epoch": best_epoch},
            BEST_MODEL_PATH
        )
        print(f"  -> saved BEST model (val_f1={best_f1:.4f})")

print("Training done. Best val F1(argmax):", best_f1, "at epoch", best_epoch)
print("Best model saved to:", BEST_MODEL_PATH)

# -------------------------
# Step 6: Load best model
# -------------------------
# Restore the weights of the best-scoring epoch and switch to inference mode.
ckpt = torch.load(BEST_MODEL_PATH, map_location=device)
best_state = ckpt["model_state"]
model.load_state_dict(best_state)
model.eval()

loaded_epoch, loaded_f1 = ckpt.get("epoch"), ckpt.get("best_f1")
print("Loaded best model. epoch:", loaded_epoch, "best_f1:", loaded_f1)

# -------------------------
# Step 7: Choose BEST threshold on VAL to maximize F1 (this is the key improvement)
# -------------------------
@torch.no_grad()
def collect_val_probs(model, loader):
    """Run the model over a labelled loader and gather class-1 probabilities.

    The model is switched to eval mode and left in it.

    Args:
        model: Network that maps an image batch to per-class logits.
        loader: Iterable yielding `(images, labels)` batches.

    Returns:
        Tuple `(probs, labels)` of NumPy arrays concatenated over all batches,
        where `probs` is the softmax probability of class index 1.

    Raises:
        ValueError: If `loader` yields no batches (nothing to concatenate).
    """
    model.eval()
    probs, ys = [], []
    for x, y in loader:
        x = x.to(device, non_blocking=True)
        # torch.cuda.amp.autocast(...) is deprecated; this is the replacement spelling.
        with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
            logits = model(x)
            p1 = torch.softmax(logits, dim=1)[:, 1]
        probs.append(p1.detach().cpu().numpy())
        ys.append(y.detach().cpu().numpy())
    return np.concatenate(probs), np.concatenate(ys)

p_val, y_val = collect_val_probs(model, val_loader)

# Candidate cut-offs: 95 evenly spaced values from 0.05 to 0.99.
ths = np.linspace(0.05, 0.99, 95)

# Score every candidate, then keep the first one that reaches the top F1
# (np.argmax returns the first maximum, i.e. ties go to the lowest threshold).
f1_per_th = [f1_score(y_val, (p_val > th).astype(int), pos_label=1) for th in ths]
best_k = int(np.argmax(f1_per_th))
best_pred = (p_val > ths[best_k]).astype(int)

best = {
    "th": float(ths[best_k]),
    "f1": float(f1_per_th[best_k]),
    "prec": float(precision_score(y_val, best_pred, pos_label=1, zero_division=0)),
    "rec": float(recall_score(y_val, best_pred, pos_label=1, zero_division=0)),
}

print("Best threshold on VAL (maximize F1):")
print(best)

P_THRESH = best["th"]
print("Using threshold:", P_THRESH)

# -------------------------
# Step 8: Test loader (NO labels; custom dataset)
# Kaggle ID format: "test/000000.jpg"
# Only files ending in .jpg / .jpeg / .png (case-insensitive) are picked up;
# the last run of digits in a file name becomes its ID, e.g. "000045.jpg" -> "test/000045.jpg"
# -------------------------
test_files = list(
    filter(lambda name: name.lower().endswith((".jpg", ".jpeg", ".png")), os.listdir(TEST_DIR))
)
assert len(test_files) > 0, "No images found in test folder"

def extract_number(fn: str) -> int:
    """Return the last run of digits in `fn` as an int, or -1 if there is none."""
    digit_runs = re.findall(r"\d+", fn)
    if not digit_runs:
        return -1
    return int(digit_runs[-1])

# Order test images by the number embedded in their file name.
test_files = sorted(test_files, key=extract_number)

def filename_to_kaggle_id(fn: str) -> str:
    """Map a test file name to its Kaggle ID, `test/<6-digit number>.jpg`.

    Args:
        fn: File name whose last run of digits identifies the image.

    Returns:
        The ID string, e.g. "000045.jpg" -> "test/000045.jpg". The ".jpg"
        suffix is always used, whatever the extension of `fn` is.

    Raises:
        ValueError: If `fn` contains no digits. Previously the -1 sentinel was
            formatted into the bogus ID "test/-00001.jpg".
    """
    num = extract_number(fn)
    if num < 0:
        raise ValueError(f"Cannot derive a Kaggle ID: no digits in file name {fn!r}")
    return f"test/{num:06d}.jpg"

class TestDataset(torch.utils.data.Dataset):
    """Unlabelled image dataset over an explicit list of file names.

    Each item is `(transformed_image, file_name)`, so predictions can be
    matched back to the file they came from.
    """

    def __init__(self, folder, files, transform):
        """Store the image folder, the file names to serve, and the transform.

        Args:
            folder: Directory that contains the images.
            files: Sequence of file names inside `folder`; defines the item order.
            transform: Callable applied to each RGB PIL image.
        """
        self.folder = folder
        self.files = files
        self.transform = transform

    def __len__(self):
        """Return the number of files served by this dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load, RGB-convert and transform the image at position `idx`.

        Returns:
            Tuple `(image, file_name)`; `image` is whatever `self.transform` returns.
        """
        from PIL import Image  # kept local, as in the original
        fn = self.files[idx]
        path = os.path.join(self.folder, fn)
        # The context manager closes the file handle once the pixels are
        # converted, instead of leaving it open until garbage collection.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        img = self.transform(img)
        return img, fn

# Test images get the same deterministic preprocessing as the validation set.
test_tfms = val_tfms
test_ds = TestDataset(TEST_DIR, test_files, test_tfms)
# shuffle=False keeps predictions in test_files order; pinned memory is only
# requested when batches will actually be copied to a CUDA device.
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False,
                         num_workers=NUM_WORKERS, pin_memory=(device.type == "cuda"))

# -------------------------
# Step 9: Predict and create single submission using VAL-chosen threshold
# -------------------------
all_ids, all_p = [], []

# Make sure dropout/batch-norm are in inference mode even if this section is
# re-run on its own after further training.
model.eval()

with torch.no_grad():
    for x, fns in tqdm(test_loader, desc="Predict test"):
        x = x.to(device, non_blocking=True)
        # Mixed precision only on CUDA; this is a no-op context on CPU.
        with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
            logits = model(x)
            p1 = torch.softmax(logits, dim=1)[:, 1]

        # One ID and one class-1 probability (plain Python float) per image in the batch.
        all_ids.extend(filename_to_kaggle_id(fn) for fn in fns)
        all_p.extend(p1.detach().cpu().tolist())

# Label 1 only when the class-1 probability exceeds the threshold chosen on validation.
labels = (np.array(all_p) > P_THRESH).astype(int)

sub = pd.DataFrame({"ID": all_ids, "label": labels})
sub.to_csv(SUB_VALTH_PATH, index=False)

print("Saved submission:", SUB_VALTH_PATH)
print(sub.head())
print("Predicted positives:", int(sub["label"].sum()), "/", len(sub))


SyntaxError: invalid decimal literal (ipython-input-2251890809.py, line 170)