## This notebook provides a general overview of the data and trains a baseline EfficientNet-B0 model. You can use the provided code to enhance the baseline model by addressing class imbalance, adding augmentations, tuning hyperparameters, and more.

In [None]:
import io
import os

import cv2
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm
import tqdm.notebook
from PIL import Image
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision.models import efficientnet_b0

plt.style.use('bmh')
plt.rcParams['figure.figsize'] = (15, 5)

In [None]:
class CFG:
    """Single place for every tunable used by the notebook."""

    # --- reproducibility / data split ---
    seed = 42
    test_size = 0.2  # passed to train_test_split as the held-out fraction
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- dataset parameters ---
    img_size = 312     # side length images are resized or padded to
    img_resize = True  # True: cv2.resize; False: zero-pad up to img_size

    # --- model parameters ---
    pretrained = False             # alternative value noted by the author: 'DEFAULT'
    last_layer_hidden_dim = 1280   # input width of the linear head
    out_dim = 1                    # output width of the linear head

    # --- train parameters ---
    train_batch_size = 64
    val_batch_size = 128
    inference_batch_size = 128
    lr = 1e-3
    weight_decay = 1e-5
    train_epochs = 3
    model_save_folder = "/kaggle/working/"
    model_name = 'baseline_eff0.pt'

    # --- score parameters ---
    tpr_threshold = 0.8      # forwarded to compute_pauc
    score_normalize = False  # not referenced by the visible code
    

def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick convolution algorithms by timing them,
    # which can differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
seed_everything(CFG.seed)

class PATHS:
    """Locations of the competition input files."""

    # HDF5 archives holding the encoded images
    train_images_h5_path = "/kaggle/input/isic-2024-challenge/train-image.hdf5"
    test_images_h5_path = "/kaggle/input/isic-2024-challenge/test-image.hdf5"

    # per-image metadata tables
    train_metadata_path = "/kaggle/input/isic-2024-challenge/train-metadata.csv"
    test_metadata_path = "/kaggle/input/isic-2024-challenge/test-metadata.csv"

    # sample submission file that is read in and later overwritten with predictions
    submission_path = "/kaggle/input/isic-2024-challenge/sample_submission.csv"
    
# Load the train/test metadata tables and the sample submission file.
metadata = pd.read_csv(PATHS.train_metadata_path)
metadata_test = pd.read_csv(PATHS.test_metadata_path)
submission = pd.read_csv(PATHS.submission_path)



# Master switch: cells guarded by `if TRAIN:` are skipped when this is False.
TRAIN = True  # set to False for an inference-only run

# EDA

### Class imbalance is very pronounced: only about 0.09% of samples belong to the positive class.

In [None]:
metadata.target.value_counts()

### Most iddx_* columns have a lot of null values. These columns hold different levels of the lesion diagnosis.

In [None]:
metadata.isnull().sum(0)

In [None]:
metadata['iddx_full'].value_counts()

### Patient sex, age and anatomical site distributions

In [None]:
metadata['sex'].hist()

In [None]:
metadata['age_approx'].hist()

In [None]:
metadata['anatom_site_general'].hist()

### We have about 1k unique patients

In [None]:
metadata.patient_id.value_counts()

### Most labels are quite confident

In [None]:
metadata.tbp_lv_dnn_lesion_confidence.hist()

### If we consider only malignant labels, we get a different distribution with more density at low confidence

In [None]:
metadata[metadata.target == 1].tbp_lv_dnn_lesion_confidence.hist()

### Read h5 images. Train images are already provided in the train-image folder. Here, to demonstrate reading, I extract them from h5.

In [None]:
def read_images_from_hdf5(file_path):
    """Decode every entry of an HDF5 file into an in-memory image array.

    Each top-level key is treated as an image id whose stored value is an
    encoded image byte string readable by PIL.

    Args:
        file_path: Path of the HDF5 file to read.

    Returns:
        dict mapping each key of the file to the decoded image as a NumPy array.
    """
    def _decode(raw_bytes):
        # PIL reads from a file-like object, so wrap the raw bytes first.
        return np.array(Image.open(io.BytesIO(raw_bytes)))

    with h5py.File(file_path, 'r') as h5_file:
        # Materialise the keys up front so the progress bar knows the total.
        keys = list(h5_file.keys())
        return {key: _decode(h5_file[key][()]) for key in tqdm.tqdm(keys)}

In [None]:
%%time
# Training images are only decoded when a training run is requested.
if TRAIN:
    images_train = read_images_from_hdf5(PATHS.train_images_h5_path)
# Test images are decoded in both modes; the test dataset is always built from them.
images_test = read_images_from_hdf5(PATHS.test_images_h5_path)

### Let's check image size distributions.

In [None]:
if TRAIN:
    imgs_sizes = [img.shape for img in images_train.values()]

#### height

In [None]:
if TRAIN:
    plt.hist([i[0] for i in imgs_sizes]);

#### width


In [None]:
if TRAIN:
    plt.hist([i[1] for i in imgs_sizes]);

#### You can see that the min/max image height and width are 41/269, and all images are square (height/width ratio = 1).

In [None]:
if TRAIN:
    print(f"Min height within train images = {np.min([i[0] for i in imgs_sizes])}, Max height within train images = {np.max([i[0] for i in imgs_sizes])}")
    print(f"Min width within train images = {np.min([i[1] for i in imgs_sizes])}, Max width within train images = {np.max([i[1] for i in imgs_sizes])}")
    print(f"Height and width among images are equal = {[i[1] for i in imgs_sizes] == [i[0] for i in imgs_sizes]}")

## Baseline efficientnet b0 training

In [None]:
class ISIC2024Dataset(Dataset):
    """Map-style dataset that pairs metadata rows with preloaded images.

    Every image is brought to ``CFG.img_size`` x ``CFG.img_size`` either by
    resizing or by zero-padding. In test mode only the image is returned,
    otherwise an ``(image, target)`` pair.
    """

    def __init__(self, 
                 metadata: pd.DataFrame, 
                 ids_images: dict,
                 img_resize: bool=True,
                 test: bool=False):
        """Store the inputs without copying.

        Args:
            metadata: Table with an ``isic_id`` column (and ``target`` unless
                ``test`` is True).
            ids_images: Mapping from ``isic_id`` to the image array.
            img_resize: Resize with cv2 when True, zero-pad when False.
            test: When True, ``__getitem__`` returns the image only.
        """
        self.metadata, self.ids_images = metadata, ids_images
        self.img_resize, self.test = img_resize, test

    def __len__(self):
        """Number of metadata rows."""
        return len(self.metadata)

    def pad_img(self, img):
        """Zero-pad ``img`` to ``CFG.img_size`` on both spatial axes.

        The image is centred; when the padding is odd, the extra pixel goes
        after the image (bottom/right). Assumes a 3-axis (H, W, C) array since
        three pad widths are passed.
        """
        top, extra_row = divmod(CFG.img_size - img.shape[0], 2)
        left, extra_col = divmod(CFG.img_size - img.shape[1], 2)
        pad_widths = (
            (top, top + extra_row),
            (left, left + extra_col),
            (0, 0),  # channel axis is left untouched
        )
        return np.pad(img, pad_widths, mode='constant', constant_values=0)

    def __getitem__(self, item):
        """Return the sized image for row ``item`` (plus its target unless in test mode)."""
        row = self.metadata.iloc[item]
        raw_image = self.ids_images[row.isic_id]
        if self.img_resize:
            image = cv2.resize(raw_image, dsize=(CFG.img_size, CFG.img_size))
        else:
            image = self.pad_img(raw_image)
        # The target column is only touched when labels are expected.
        return image if self.test else (image, row.target)

In [None]:
if TRAIN:
    # Stratify on the target so both parts keep the same class ratio;
    # CFG.seed makes the split reproducible.
    train, val = train_test_split(metadata, 
                                 stratify=metadata.target,
                                 test_size=CFG.test_size,
                                 random_state=CFG.seed)
    # Print the per-class counts of both parts.
    print(train.target.value_counts(), val.target.value_counts())

In [None]:
if TRAIN:
    # Both training-time datasets index into the same in-memory image dict;
    # they differ only in which metadata rows they expose.
    dataset_train = ISIC2024Dataset(train, images_train, img_resize=CFG.img_resize)
    dataset_val = ISIC2024Dataset(val, images_train, img_resize=CFG.img_resize)
    # Only the training loader shuffles.
    train_dataloader = torch.utils.data.DataLoader(
            dataset_train,
            batch_size=CFG.train_batch_size,
            num_workers=0,
            shuffle=True,
            pin_memory=True
    )
    # Validation order is kept fixed so predictions line up with `val` rows.
    val_dataloader = torch.utils.data.DataLoader(
            dataset_val,
            batch_size=CFG.val_batch_size,
            num_workers=0,
            shuffle=False,
            pin_memory=True,
        )


# The test dataset/loader is built in both modes; test=True makes the dataset
# return images without labels.
dataset_test = ISIC2024Dataset(metadata_test, images_test, img_resize=CFG.img_resize, test=True)
test_dataloader = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=CFG.inference_batch_size,
        num_workers=0,
        shuffle=False,
        pin_memory=True,
    )


#### Let's check dataset 

In [None]:
dataset_test[0].shape == (CFG.img_size, CFG.img_size, 3)

In [None]:
plt.imshow(dataset_test[0])

In [None]:
def train_epoch(train_loader, model, optimizer, criterion, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    On CUDA devices the forward pass runs under autocast with gradient
    scaling; on any other device both are disabled and the loop runs in
    plain fp32.

    Args:
        train_loader: Iterable yielding ``(images, labels)`` batches.
        model: Network to optimise; switched to train mode here.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, targets)``.
        scheduler: Optional LR scheduler stepped once per batch, or ``None``.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    # A GradScaler is only useful together with autocast, and both only on CUDA.
    use_amp = torch.device(device).type == 'cuda'
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    losses = []
    with tqdm.notebook.tqdm(train_loader, unit="train_batch", desc='Train') as tqdm_train_loader:
        for batch in tqdm_train_loader:

            X = batch[0].to(device).float()
            # Labels become a float column vector to match the (batch, 1) logits.
            y = batch[1].to(device).view(-1, 1).float()

            # Clear gradients before the backward pass so nothing stale from
            # outside this function leaks into the first step.
            optimizer.zero_grad()
            with torch.autocast(device_type='cuda', enabled=use_amp):
                y_preds = model(X)
                loss = criterion(y_preds, y)

            losses.append(loss.item())
            scaler.scale(loss).backward()

            scaler.step(optimizer)
            scaler.update()
            if scheduler is not None:
                scheduler.step()

    return np.mean(losses)

def validation(valid_loader, model, criterion, device):
    """Evaluate the model on a labelled loader without tracking gradients.

    Args:
        valid_loader: Iterable yielding ``(images, labels)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, preds)`` where ``preds`` is the NumPy array of
        sigmoid outputs of all batches concatenated along axis 0.
    """
    model.eval()
    batch_losses, batch_probs = [], []
    progress = tqdm.notebook.tqdm(valid_loader, unit="valid_batch", desc='Validation')
    with progress, torch.no_grad():
        for batch in progress:
            images = batch[0].to(device).float()
            targets = batch[1].to(device).view(-1, 1).float()
            logits = model(images)
            batch_value = criterion(logits, targets)
            batch_probs.append(torch.sigmoid(logits).cpu().numpy())
            batch_losses.append(batch_value.item())
    all_probs = np.concatenate(batch_probs, 0)
    return np.mean(batch_losses), all_probs

def inference(valid_loader, model, device):
    """Predict sigmoid probabilities for a loader that yields images only.

    Args:
        valid_loader: Iterable yielding image batches (no labels).
        model: Network to run; switched to eval mode here.
        device: Device the batches are moved to.

    Returns:
        NumPy array of sigmoid outputs of all batches concatenated along axis 0.
    """
    model.eval()
    batch_probs = []
    progress = tqdm.notebook.tqdm(valid_loader, unit="inference_batch", desc='Inference')
    with progress, torch.no_grad():
        for images in progress:
            logits = model(images.to(device).float())
            batch_probs.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(batch_probs, 0)

In [None]:
class ISIC2024Model(nn.Module):
    """EfficientNet-B0 trunk followed by a single linear head."""

    def __init__(self, 
                 config):
        """Build the trunk and the head.

        Args:
            config: Object exposing ``pretrained``, ``last_layer_hidden_dim``
                and ``out_dim``.
        """
        super().__init__()
        effnet = efficientnet_b0(pretrained=config.pretrained)
        # Keep every top-level child but the last one.
        # NOTE(review): in torchvision's EfficientNet the last child is
        # presumably the classifier block; confirm against the installed version.
        trunk_layers = list(effnet.children())[:-1]
        self.backbone = nn.Sequential(*trunk_layers)
        self.head = nn.Sequential(
            nn.Linear(config.last_layer_hidden_dim, config.out_dim),
        )

    def forward(self, 
                input):
        """Return the head output for a 4-D batch.

        Axes 1-3 of ``input`` are rotated to ``(0, 3, 1, 2)`` before the
        trunk; this assumes channels-last input (batch, H, W, C), TODO confirm.
        """
        channels_first = input.permute(0, 3, 1, 2)
        features = self.backbone(channels_first)
        # Collapse everything after the batch axis into one feature vector.
        return self.head(features.flatten(1))

#### The partial AUC score implementation

In [None]:
import numpy as np
from sklearn.metrics import roc_curve, auc

def compute_pauc(y_true, y_scores, tpr_threshold=0.8):
    """
    Compute the partial AUC above a given TPR threshold.

    The score is the area between the ROC curve and the horizontal line
    ``TPR = tpr_threshold``, so it lies in ``[0, 1 - tpr_threshold]``.
    The point where the ROC curve crosses the threshold is linearly
    interpolated, which matters when tied scores produce a diagonal segment.

    Parameters:
    y_true (np.array): True binary labels.
    y_scores (np.array): Target scores.
    tpr_threshold (float): TPR threshold above which to compute the pAUC.
    Returns:
    float: The partial AUC above the given TPR threshold (0.0 when the ROC
    curve never reaches the threshold or no area lies above it).
    """
    # Compute ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_scores)

    # Find the indices where the TPR is at or above the threshold
    tpr_above_threshold_indices = np.flatnonzero(tpr >= tpr_threshold)

    if tpr_above_threshold_indices.size == 0:
        return 0.0

    # Extract the ROC segment above the threshold, shifted so the threshold is y = 0
    start_index = tpr_above_threshold_indices[0]
    fpr_above_threshold = fpr[start_index:]
    tpr_above_threshold = tpr[start_index:] - tpr_threshold

    if start_index > 0 and tpr[start_index] > tpr_threshold:
        # The curve crosses the threshold between the previous point and this
        # one; prepend the interpolated crossing so that area is not dropped.
        # tpr[start_index - 1] < tpr_threshold here, so the denominator is > 0.
        prev_fpr = fpr[start_index - 1]
        prev_tpr = tpr[start_index - 1]
        fraction = (tpr_threshold - prev_tpr) / (tpr[start_index] - prev_tpr)
        crossing_fpr = prev_fpr + fraction * (fpr[start_index] - prev_fpr)
        fpr_above_threshold = np.concatenate(([crossing_fpr], fpr_above_threshold))
        tpr_above_threshold = np.concatenate(([0.0], tpr_above_threshold))

    # `auc` needs at least two points; a single point encloses no area.
    if fpr_above_threshold.size < 2:
        return 0.0

    # Compute partial AUC
    partial_auc = auc(fpr_above_threshold, tpr_above_threshold)

    return partial_auc

# Example usage
y_true = np.array([0, 1, 0])
y_scores = np.array([0.1, 0.4, 0.35])

pauc = compute_pauc(y_true, y_scores)
print(f'Partial AUC above 80% TPR: {pauc:.4f}')

In [None]:
# The checkpoint file name is defined on CFG; a bare `model_name` does not exist.
save_path = os.path.join(CFG.model_save_folder, CFG.model_name)
print(save_path)

#### I trained the baseline model locally on RTX 3090. The average epoch time was ~20 minutes.

In [None]:
# The model is built in both modes: it is trained below when TRAIN is set and
# otherwise only serves as the container the saved weights are loaded into.
model = ISIC2024Model(CFG)
model = model.to(CFG.device)

if TRAIN:
    optimizer = torch.optim.AdamW(model.parameters(), 
                                  lr=CFG.lr, 
                                  weight_decay=CFG.weight_decay)
    criterion = nn.BCEWithLogitsLoss()
    # Start below any reachable score so the first scored epoch always writes
    # a checkpoint; with 0 and a strict `>` a run whose pAUC stays at 0 would
    # save nothing and the later checkpoint load would fail.
    score_best = float('-inf')
    for e in range(CFG.train_epochs):
        # No LR scheduler is used, hence the explicit None.
        loss_mean = train_epoch(train_dataloader, model, optimizer, criterion, None, CFG.device)
        loss_val, preds = validation(val_dataloader, model, criterion, CFG.device)
        score = compute_pauc(val.target.values, 
                             preds, 
                             tpr_threshold=CFG.tpr_threshold)
        # Keep only the best-scoring weights, together with their validation predictions.
        if score > score_best:
            score_best = score
            torch.save({
                'model': model.state_dict(), 
                'predictions': preds
            },
            save_path)

        print(f"Train loss = {loss_mean}. Val loss = {loss_val}. Val scor = {score}")

## Inference

In [None]:
# map_location remaps stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
# NOTE(review): the checkpoint also stores a NumPy array under 'predictions';
# recent PyTorch versions may need weights_only=False to unpickle it — confirm
# against the installed version.
checkpoint = torch.load(save_path, map_location=CFG.device)
model.load_state_dict(checkpoint['model'])

In [None]:
preds = inference(test_dataloader, model, CFG.device)

In [None]:
# Overwrite the sample submission with the test ids and the predicted values.
# The test loader does not shuffle, so predictions follow metadata_test row order.
submission['isic_id'] = metadata_test['isic_id'].to_list()
submission['target'] = preds
# Written to the current working directory without the DataFrame index.
submission.to_csv('submission.csv', index=False)