<a href="https://colab.research.google.com/github/9-coding/Kaggle/blob/main/image_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Requirements

In [None]:
import time
from tqdm import tqdm
import os
import h5py
from PIL import Image
from io import BytesIO

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, random_split
from torch.cuda import amp

import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

## Configuration and Set Seed

In [None]:
# Notebook-wide settings: RNG seed, nominal image size, loader batch size, and the
# torch device (first CUDA device when one is available, otherwise the CPU).
CONFIG = dict(
    seed=42,
    img_size=256,
    batch_size=1024,
    device=torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"),
)

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Local import: the notebook's import cell does not import `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. This only influences Python
    # processes started after this point, not the hashing of the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the configured seed before any data splitting or model initialisation happens.
set_seed(seed=CONFIG["seed"])

In [None]:
# Directory holding the ISIC 2024 challenge files (Kaggle-style input path).
ROOT_DIR = "/kaggle/input/isic-2024-challenge"

# Test split: image archive and its metadata table.
TEST_HDF = ROOT_DIR + '/test-image.hdf5'
TEST_CSV = ROOT_DIR + '/test-metadata.csv'
# Training split: image archive and the metadata table that carries the labels.
IMAGE_HDF = ROOT_DIR + '/train-image.hdf5'
TARGET_CSV = ROOT_DIR + '/train-metadata.csv'
# Submission template.
SAMPLE = ROOT_DIR + '/sample_submission.csv'

## Data Configuration

In [None]:
# Load the training metadata (one row per image, including `isic_id` and `target`).
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk, which avoids mixed-type DtypeWarnings on wide CSVs like this one.
train_df = pd.read_csv(TARGET_CSV, low_memory=False)
print(len(train_df))
train_df.head()

  df = pd.read_csv(TARGET_CSV)


401059


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


## Dataset and DataLoader

In [None]:
class ISIC(Dataset):
    """Dataset that serves images from an HDF5 archive together with their labels.

    Each item is looked up by the ``isic_id`` stored in the metadata frame: the
    bytes stored under that key in the HDF5 file are decoded with PIL, converted
    to RGB, optionally transformed, and returned with the matching ``target``.

    Args:
        file_hdf: Path to the HDF5 file holding one encoded image per ``isic_id``.
        df: DataFrame with at least the columns ``isic_id`` and ``target``.
        transforms: Optional callable applied to the decoded PIL image.
    """

    def __init__(self, file_hdf, df, transforms=None):
        # Keep the path so the archive can be reopened after close().
        self.file_hdf = file_hdf
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.df = df
        self.isic_ids = df['isic_id'].values
        self.targets = df['target'].values
        self.transforms = transforms

    def __len__(self):
        """Return the number of samples (one per ``isic_id``)."""
        return len(self.isic_ids)

    def __getitem__(self, index):
        """Return ``{'image': ..., 'target': ...}`` for the sample at ``index``."""
        # Reopen lazily if the handle was released through close().
        if self.fp_hdf is None:
            self.fp_hdf = h5py.File(self.file_hdf, mode="r")

        isic_id = self.isic_ids[index]
        # convert("RGB") guarantees three channels regardless of how the image
        # was encoded, so every sample has the same layout after the transform.
        img = Image.open(BytesIO(self.fp_hdf[isic_id][()])).convert("RGB")
        target = self.targets[index]

        if self.transforms is not None:
            img = self.transforms(img)

        return {
            'image': img,
            'target': target,
        }

    def close(self):
        """Release the HDF5 file handle; safe to call more than once."""
        if self.fp_hdf is not None:
            self.fp_hdf.close()
            self.fp_hdf = None

In [None]:
# Resize every image to 128x128 and convert it to a tensor.
transforms_data = transforms.Compose([transforms.Resize((128,128)), transforms.ToTensor()])
# Build the dataset from `train_df`, the metadata frame loaded earlier; no visible
# cell defines a variable called `df`, so referencing it fails on a fresh kernel.
dataset = ISIC(IMAGE_HDF, train_df, transforms=transforms_data)
dataset_size = len(dataset)

train_size = int(dataset_size * 0.8)                     # 80%
validation_size = int(dataset_size * 0.1)                # 10%
test_size = dataset_size - train_size - validation_size  # 10% (absorbs rounding remainder)

# random_split draws from torch's global RNG, so the split depends on the seed set earlier.
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, validation_size, test_size])
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == dataset_size

print(f"Training Data Size : {len(train_dataset)}")
print(f"Validation Data Size : {len(val_dataset)}")
print(f"Testing Data Size : {len(test_dataset)}")

# Only the training loader drops its final partial batch. The validation and test
# loaders keep every sample so the reported metrics cover the whole split.
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, pin_memory=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, pin_memory=True, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False, pin_memory=True, drop_last=False)

Training Data Size : 320847
Validation Data Size : 40105
Testing Data Size : 40107


In [None]:
### GPU Setting ###
# Pick the default CUDA device when one is visible, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

cuda


In [None]:
# Training hyper-parameters: number of passes over the training loader and SGD step size.
EPOCH = 1
lr = 0.01

# ResNet-18 backbone initialised from the pretrained torchvision checkpoint.
model = models.resnet18(pretrained=True)

### Transfer Learning ###
# Swap the final fully connected layer for a single-output head (one logit per image).
num_features = model.fc.in_features
model.fc = nn.Linear(in_features=num_features, out_features=1)
model.to(DEVICE)

# Plain SGD over all parameters (backbone and new head).
optimizer = optim.SGD(params=model.parameters(), lr=lr)
print("Created a learning model and optimizer")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 134MB/s] 


Created a learning model and optimizer


In [None]:
def compute_pauc(y_true, y_scores, min_tpr=0.8):
    """Partial AUC: area between the ROC curve and the line ``TPR = min_tpr``.

    Only the part of the ROC curve lying at or above ``min_tpr`` contributes, and
    the height is measured from ``min_tpr`` rather than from zero, so the result
    lies in ``[0, 1 - min_tpr]`` (0.2 at most for the default ``min_tpr=0.8``).
    The point where the curve crosses ``min_tpr`` is found by linear
    interpolation between the two neighbouring ROC points.

    Args:
        y_true: Binary ground-truth labels.
        y_scores: Predicted scores; higher means more likely positive.
        min_tpr: True-positive-rate floor above which area is accumulated.

    Returns:
        The partial AUC as a float. Returns 0.0 when it cannot be computed
        (empty input or only one class present in ``y_true``).
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # A ROC curve is undefined unless both classes occur; bail out before
    # roc_curve produces NaN rates.
    if np.unique(y_true).size < 2:
        return 0.0

    fpr, tpr, _ = roc_curve(y_true, y_scores)

    # tpr is non-decreasing, so everything from the first qualifying point onward qualifies.
    above = tpr >= min_tpr
    if not above.any():
        return 0.0
    start = int(np.argmax(above))

    fpr_seg = fpr[start:]
    height = tpr[start:] - min_tpr

    if start > 0:
        # Interpolate the FPR at which the curve reaches exactly min_tpr, so the
        # sliver between the last point below and the first point above is counted.
        fpr_lo, fpr_hi = fpr[start - 1], fpr[start]
        tpr_lo, tpr_hi = tpr[start - 1], tpr[start]
        cross_fpr = fpr_lo + (min_tpr - tpr_lo) / (tpr_hi - tpr_lo) * (fpr_hi - fpr_lo)
        fpr_seg = np.concatenate(([cross_fpr], fpr_seg))
        height = np.concatenate(([0.0], height))

    # auc() needs at least two points to integrate.
    if fpr_seg.size < 2:
        return 0.0

    return float(auc(fpr_seg, height))

In [None]:
### Train/Evaluation ###
def train(model, train_loader, optimizer, epoch):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Network producing one logit per sample (assumed from the
            single-output head configured in this notebook).
        train_loader: Iterable of batches shaped as dicts with the keys
            ``'image'`` and ``'target'``.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch index, used only for the progress-bar label.

    Returns:
        Sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch}", leave=False)
    total_loss = 0.0

    for batch in train_loader_tqdm:
        image = batch['image'].to(DEVICE)
        target = batch['target'].to(DEVICE).float()

        # Reset gradients from the previous step; backward() adds to existing
        # .grad buffers, so without this every step would use accumulated gradients.
        optimizer.zero_grad()

        output = model(image)
        # reshape(-1) flattens the logits to match the target's 1-D shape and,
        # unlike a bare squeeze(), still yields a 1-D tensor for a batch of one.
        train_loss = F.binary_cross_entropy_with_logits(output.reshape(-1), target)

        train_loss.backward()
        optimizer.step()

        batch_loss = train_loss.item()
        total_loss += batch_loss
        train_loader_tqdm.set_postfix(loss=batch_loss)

    avg_loss = total_loss / len(train_loader)
    return avg_loss

In [None]:
def evaluate(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        model: Network producing one logit per sample (assumed from the
            single-output head configured in this notebook).
        data_loader: Iterable of batches shaped as dicts with the keys
            ``'image'`` and ``'target'``.

    Returns:
        A 2-tuple ``(avg_loss, pauc)``: the mean per-batch binary cross-entropy
        and the value of ``compute_pauc`` (``min_tpr=0.8``) over all samples seen.
    """
    model.eval()
    y_true = []
    y_scores = []
    total_loss = 0.0

    with torch.no_grad():
        for batch in data_loader:
            images, targets = batch['image'].to(DEVICE), batch['target'].to(DEVICE)
            # reshape(-1) keeps the logits 1-D even for a batch of one; a bare
            # squeeze() would produce a 0-d tensor that cannot be iterated below.
            logits = model(images).reshape(-1)
            probs = torch.sigmoid(logits)

            loss = F.binary_cross_entropy_with_logits(logits, targets.float())
            total_loss += loss.item()

            y_true.extend(targets.cpu().numpy())
            y_scores.extend(probs.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    pauc = compute_pauc(y_true, y_scores, min_tpr=0.8)

    return avg_loss, pauc

In [None]:
# Train for EPOCH epochs, checkpoint whenever the validation pAUC improves,
# then report test metrics and the wall-clock time.
start = time.time()
best = 0
best_path = "./best_model.pth"
best_saved = False  # True once a checkpoint has been written in this run

for epoch in range(EPOCH):
    train_loss = train(model, train_loader, optimizer, epoch)
    val_loss, val_pauc = evaluate(model, val_loader)

    if val_pauc > best:
        best = val_pauc
        torch.save(model.state_dict(), best_path)
        best_saved = True

    print(f'\n[Epoch {epoch}] Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, pAUC(above 80% TPR): {val_pauc:.4f}\n')


# Score the checkpoint that did best on validation, not whatever the last epoch left behind.
if best_saved:
    model.load_state_dict(torch.load(best_path, map_location=DEVICE))

# evaluate() returns exactly (avg_loss, pauc); unpacking a third value raises ValueError.
test_loss, test_pauc = evaluate(model, test_loader)
print(f'[FINAL] Test Loss {test_loss:.4f}, pAUC: {test_pauc:.4f}')

end = time.time()
elapsed_time = end - start

# Split into h/m/s so the minutes field is minutes within the hour, not total minutes.
hours, remainder = divmod(int(elapsed_time), 3600)
minutes, seconds = divmod(remainder, 60)

print("Best pAUC: ", best)
print(f"Elapsed Time: {hours}h, {minutes}m, {seconds}s")
print(f"time: {hours}h, {minutes}m, {seconds}s")


Epoch 0:  89%|████████▉ | 280/313 [12:40<01:04,  1.95s/it, loss=1.82e-6] 

[0] Training Loss: 0.1928, Validation Loss: 0.0408, Accuracy: 99.9274%, pAUC(above 80% TPR): 0.1565


                                                                         

[1] Training Loss: 0.1157, Validation Loss: 0.0099, Accuracy: 99.9274%, pAUC(above 80% TPR): 0.1816


                                                                         

[2] Training Loss: 0.0817, Validation Loss: 0.1361, Accuracy: 99.9274%, pAUC(above 80% TPR): 0.1143


                                                                         

[3] Training Loss: 0.0551, Validation Loss: 0.0157, Accuracy: 99.9274%, pAUC(above 80% TPR): 0.2719


                                                                         

[4] Training Loss: 0.0266, Validation Loss: 0.0134, Accuracy: 99.9274%, pAUC(above 80% TPR): 0.1779


Epoch 5:  58%|█████▊    | 181/313 [05:58<04:19,  1.97s/it, loss=0.000198]