In [1]:
# ============================
# Cell 1 - Imports and setup
# ============================
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
%matplotlib inline
# One seed shared by every random-number generator used in this notebook.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# Request deterministic cuDNN behaviour and turn its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [2]:
# =========================================
# Cell 2 - Paths and labels dataframe
# =========================================
# Everything is resolved relative to ROOT_DIR: two image folders and the
# CSV that holds the training labels.
ROOT_DIR = "."
TRAIN_IMG_DIR, TEST_IMG_DIR, LABELS_PATH = (
    os.path.join(ROOT_DIR, entry)
    for entry in ("train_data", "test_data", "train_labels.csv")
)

# Load the labels table, preview its first rows and show how many samples
# each class has.
labels_df = pd.read_csv(LABELS_PATH)
print(labels_df.head())
label_counts = labels_df["label"].value_counts()
print("\nClass distribution:")
print(label_counts)

   sample_index            label
0  img_0000.png  Triple negative
1  img_0001.png        Luminal A
2  img_0002.png        Luminal A
3  img_0003.png        Luminal B
4  img_0004.png          HER2(+)

Class distribution:
label
Luminal B          445
Luminal A          414
HER2(+)            397
Triple negative    156
Name: count, dtype: int64


In [3]:
# ==================================================
# Cell 3 - Label encoding (string <-> integer)
# ==================================================
# Class names in sorted order; a name's position in this list is its integer id.
unique_labels = sorted(labels_df["label"].unique())
label_to_idx = dict(zip(unique_labels, range(len(unique_labels))))
idx_to_label = dict(enumerate(unique_labels))

# Integer-encoded copy of the label column.
labels_df["label_idx"] = labels_df["label"].map(label_to_idx)


def make_img_path(fname):
    """Return the path of training image `fname` inside TRAIN_IMG_DIR."""
    return os.path.join(TRAIN_IMG_DIR, fname)


labels_df["img_path"] = labels_df["sample_index"].apply(make_img_path)

# Sanity check: do the first few resolved paths exist on disk?
for sample_path in labels_df["img_path"].head():
    print(sample_path, "->", os.path.exists(sample_path))

./train_data/img_0000.png -> True
./train_data/img_0001.png -> True
./train_data/img_0002.png -> True
./train_data/img_0003.png -> True
./train_data/img_0004.png -> True


In [4]:
# ==========================================
# Cell 4 - Train/Validation split
# ==========================================
# Hold out 20% of the labelled images for validation. Stratifying on the
# encoded label keeps the class proportions the same in both subsets.
# The split is driven by the notebook-wide SEED (instead of a second,
# hard-coded 42) so that changing SEED changes every source of randomness
# consistently.
train_df, val_df = train_test_split(
    labels_df,
    test_size=0.2,
    random_state=SEED,
    stratify=labels_df["label_idx"],
)
print("Train size:", len(train_df))
print("Val size:", len(val_df))

Train size: 1129
Val size: 283


In [5]:
# ==========================================
# Cell 5 - Custom PyTorch Dataset
# ==========================================
class DoctogresDataset(Dataset):
    """Labelled image dataset backed by a DataFrame.

    Each row of ``df`` must provide an ``img_path`` entry (the file to open)
    and a ``label_idx`` entry (the integer class id). Items are returned as
    ``(image, label)`` pairs, with ``transform`` applied to the image when
    one is given.
    """

    def __init__(self, df, transform=None):
        self.transform = transform
        # Work on a copy whose rows are renumbered 0..n-1.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        # Force three channels whatever mode the file was stored in.
        image = Image.open(record["img_path"]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        label = int(record["label_idx"])
        return image, label

In [6]:
# ==================================================
# Cell 6 - Transforms and Dataloaders
# ==================================================

# ImageNet statistics (for pretrained models)
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

# Bigger images to capture more tissue detail
IMG_SIZE = 384
BATCH_SIZE = 16  # can be lowered to 8 if the machine struggles

# Pinned (page-locked) host memory only speeds up host-to-GPU copies. On a
# CPU-only run it brings no benefit and PyTorch emits a warning, so enable it
# only when the selected device is CUDA.
PIN_MEMORY = device.type == "cuda"

# Training pipeline: random crop/flip/colour augmentation, then tensor
# conversion and ImageNet normalisation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.6, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.15),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation pipeline: no random augmentation, just resize, centre crop to
# IMG_SIZE, tensor conversion and the same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(416),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

train_dataset = DoctogresDataset(train_df, transform=train_transform)
val_dataset   = DoctogresDataset(val_df,   transform=val_transform)

# Only the training loader shuffles; validation keeps the DataFrame order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=PIN_MEMORY
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=PIN_MEMORY
)

print("Train batches:", len(train_loader))
print("Val batches:", len(val_loader))


Train batches: 71
Val batches: 18


In [7]:
# ==========================================
# Cell 7 - Model definition (ResNet50)
# ==========================================

# One output unit per distinct label.
num_classes = len(unique_labels)

# Start from a ResNet50 carrying the IMAGENET1K_V1 pretrained weights.
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Swap the stock classification head for a fresh linear layer sized to our
# label set; its input width is read from the head being replaced.
in_features = backbone.fc.in_features
backbone.fc = nn.Linear(in_features, num_classes)

# Move the network to the selected device and show its architecture.
model = backbone.to(device)
print(model)


6.6%

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/federico/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100.0%


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [10]:
# =======================================================
# Cell 8 - Loss, optimizer and scheduler
# =======================================================

# Per-class sample counts in the training split, ordered by class index.
class_counts = (
    train_df["label_idx"]
    .value_counts()
    .sort_index()
    .values
    .astype(float)
)
print("Train class counts:", class_counts)

# Inverse-frequency weights, rescaled so they sum to the number of classes;
# rarer classes therefore weigh more in the loss.
inverse_freq = 1.0 / class_counts
class_weights = inverse_freq / inverse_freq.sum() * len(class_counts)
print("Class weights:", class_weights)

# The loss expects float32 weights living on the same device as the model.
class_weights_tensor = torch.tensor(
    class_weights,
    dtype=torch.float32,
    device=device,
)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# AdamW usually works a bit better than plain Adam for vision models.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Scheduler: lower the learning rate when validation accuracy stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",   # the monitored quantity is an accuracy, so higher is better
    factor=0.5,   # halve the learning rate
    patience=2,   # after 2 epochs without improvement
)


Train class counts: [317. 331. 356. 125.]
Class weights: [0.74292089 0.71149825 0.66153349 1.88404737]


In [11]:
# ==========================================
# Cell 9 - Train and validation loops
# ==========================================
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over every batch yielded by ``loader``.

    The model is switched to training mode, and for each ``(inputs, targets)``
    batch the gradients are reset, the loss is back-propagated and the
    optimizer takes one step.

    Returns:
        tuple: ``(loss, accuracy)`` where ``loss`` is the sum of each batch
        loss times its batch size divided by the number of samples seen, and
        ``accuracy`` is the fraction of samples whose highest-scoring class
        equals the target.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Index of the largest score along the class dimension.
        predicted = logits.max(1)[1]

        loss_sum += batch_loss.item() * inputs.size(0)
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    return loss_sum / n_seen, n_correct / n_seen

def evaluate(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` without updating it.

    The model is switched to evaluation mode and all forward passes run with
    gradient tracking disabled.

    Returns:
        tuple: ``(loss, accuracy, targets, predictions)``. ``loss`` is the sum
        of each batch loss times its batch size divided by the number of
        samples seen; ``accuracy`` is the fraction of correct top-1
        predictions; ``targets`` and ``predictions`` are NumPy arrays holding
        the true and predicted class ids in loader order.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    targets_seen = []
    preds_seen = []

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            # Index of the largest score along the class dimension.
            predicted = logits.max(1)[1]

            loss_sum += batch_loss.item() * inputs.size(0)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

            targets_seen.extend(targets.cpu().numpy().tolist())
            preds_seen.extend(predicted.cpu().numpy().tolist())

    mean_loss = loss_sum / n_seen
    accuracy = n_correct / n_seen
    return mean_loss, accuracy, np.array(targets_seen), np.array(preds_seen)

In [None]:
# ==========================================
# Cell 10 - Main training loop
# ==========================================

EPOCHS = 1
# Start below any reachable accuracy so that the first evaluated epoch is
# always recorded, even if its validation accuracy is exactly 0.0.
best_acc = float("-inf")
best_w = None

for e in range(1, EPOCHS + 1):
    print(f"\nEpoch {e}/{EPOCHS}")

    tr_l, tr_a = train_one_epoch(model, train_loader, criterion, optimizer, device)
    print("Train:", tr_l, tr_a)

    vl_l, vl_a, vt, vp = evaluate(model, val_loader, criterion, device)
    print("Val:", vl_l, vl_a)

    # Update the scheduler with the validation accuracy it monitors.
    scheduler.step(vl_a)

    if vl_a > best_acc:
        best_acc = vl_a
        # state_dict() hands back tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so a plain copy would be
        # silently overwritten by later optimizer steps. Clone every tensor
        # to take a real snapshot of the best weights.
        best_w = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

print("Best val acc:", best_acc)
# best_w is still None when no epoch ran (EPOCHS == 0); in that case keep the
# current weights instead of crashing in load_state_dict.
if best_w is not None:
    model.load_state_dict(best_w)



Epoch 1/1




In [None]:
# ==========================================
# Cell 11 - Validation report
# ==========================================
# Re-score the validation split with the weights currently loaded in `model`.
vl_l, vl_a, vt, vp = evaluate(model, val_loader, criterion, device)
print("Val acc:", vl_a)

# Pass the full list of class ids explicitly. Without `labels`, the report
# pairs `target_names` with whichever ids occur in vt/vp, so a class missing
# from both would mislabel rows or raise because the lengths no longer match.
print(classification_report(
    vt,
    vp,
    labels=list(range(len(unique_labels))),
    target_names=unique_labels,
))

Val acc: 0.17314487632508835
                 precision    recall  f1-score   support

        HER2(+)       0.18      0.03      0.04        80
      Luminal A       0.41      0.30      0.35        83
      Luminal B       0.00      0.00      0.00        89
Triple negative       0.11      0.71      0.19        31

       accuracy                           0.17       283
      macro avg       0.18      0.26      0.14       283
   weighted avg       0.18      0.17      0.13       283



In [None]:
# ==========================================
# Cell 12 - Test Dataset and DataLoader
# (fixed: use only img_*.png, ignore masks)
# ==========================================

# Keep only the image files: names must start with "img_" and carry a .png
# extension (case-insensitive), which leaves out the mask_*.png files.
test_files = sorted(
    fname
    for fname in os.listdir(TEST_IMG_DIR)
    if fname.startswith("img_") and fname.lower().endswith(".png")
)

print("Number of test images:", len(test_files))  # should be 954
print("First 5 test files:", test_files[:5])

# One row per test image: the bare file name and its path under TEST_IMG_DIR.
test_paths = [os.path.join(TEST_IMG_DIR, fname) for fname in test_files]
test_df = pd.DataFrame({
    "sample_index": test_files,
    "img_path": test_paths,
})

class DoctogresTestDataset(Dataset):
    """Unlabelled image dataset used at inference time.

    Each row of ``df`` must provide ``img_path`` (the file to open) and
    ``sample_index`` (the identifier returned alongside the image). Items are
    ``(image, sample_index)`` pairs, with ``transform`` applied to the image
    when one is given.
    """

    def __init__(self, df, transform=None):
        self.transform = transform
        # Work on a copy whose rows are renumbered 0..n-1.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        sample_id = record["sample_index"]
        # Force three channels whatever mode the file was stored in.
        image = Image.open(record["img_path"]).convert("RGB")

        if self.transform is None:
            return image, sample_id
        return self.transform(image), sample_id

# Test images go through the same deterministic preprocessing as validation.
test_dataset = DoctogresTestDataset(test_df, transform=val_transform)

# No shuffling at inference time. Pinned host memory is requested only when
# the selected device is CUDA: on a CPU-only run it brings no benefit and
# PyTorch emits a warning.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=(device.type == "cuda")
)


Number of test images: 954
First 5 test files: ['img_0000.png', 'img_0001.png', 'img_0002.png', 'img_0003.png', 'img_0004.png']


In [None]:
# ==========================================
# Cell 13 - Submission
# ==========================================
# Predict a class for every test image, batch by batch, without gradients.
model.eval()
ids = []
preds = []
with torch.no_grad():
    for batch, batch_names in test_loader:
        scores = model(batch.to(device))
        # Index of the largest score along the class dimension.
        top_class = scores.max(1)[1]
        ids.extend(list(batch_names))
        preds.extend(top_class.cpu().numpy().tolist())

# Turn class ids back into label strings and write one row per image,
# ordered by file name.
pred_labels = [idx_to_label[class_id] for class_id in preds]
sub = pd.DataFrame({"sample_index": ids, "label": pred_labels})
sub = sub.sort_values("sample_index")
sub.to_csv("submission.csv", index=False)
print("Submission saved.")



Submission saved.
