## Import Requirements

In [1]:
import time
from tqdm import tqdm
import os
import h5py
from PIL import Image
from io import BytesIO

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, random_split
from torch.cuda import amp

import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

## Configuration and Set Seed

In [2]:
CONFIG = {
    "seed": 42,
    "img_size": 256,
    "batch_size": 1024,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "valid_batch_size": 32,
}

In [3]:
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [4]:
ROOT_DIR = "/kaggle/input/isic-2024-challenge"

TEST_HDF  = f'{ROOT_DIR}/test-image.hdf5'
TEST_CSV  = f'{ROOT_DIR}/test-metadata.csv'
IMAGE_HDF = f'{ROOT_DIR}/train-image.hdf5'
TARGET_CSV = f'{ROOT_DIR}/train-metadata.csv'
SAMPLE    = f'{ROOT_DIR}/sample_submission.csv'

## Data Configuration

In [5]:
train_df = pd.read_csv(TARGET_CSV)
print(len(train_df))
train_df.head()

  train_df = pd.read_csv(TARGET_CSV)


401059


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [6]:
test_df = pd.read_csv(TEST_CSV)
test_df['target'] = 0 # dummy
print(len(test_df))
test_df.head()

3


Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license,target
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,1.281532,2.299935,0.479339,20,-155.0651,1511.222,113.9801,Memorial Sloan Kettering Cancer Center,CC-BY,0
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,1.27194,2.011223,0.42623,25,-112.36924,629.535889,-15.019287,"Frazer Institute, The University of Queensland...",CC-BY,0
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,1.080308,2.705857,0.366071,110,-84.29282,1303.978,-28.57605,FNQH Cairns,CC-BY,0


In [7]:
df_sub = pd.read_csv(SAMPLE)
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.3
1,ISIC_0015729,0.3
2,ISIC_0015740,0.3


## Dataset and DataLoader

In [8]:
class ISIC(Dataset):
    def __init__(self, df, file_hdf, transforms=None):
        self.df = df
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.isic_ids = df['isic_id'].values
        self.targets = df['target'].values
        self.transforms = transforms
        
    def __len__(self):
        return len(self.isic_ids)
    
    def __getitem__(self, index):
        isic_id = self.isic_ids[index]
        img = Image.open(BytesIO(self.fp_hdf[isic_id][()]))
        target = self.targets[index]
        
        if self.transforms:
            img = self.transforms(img)
            
        return {
            'image': img,
            'target': target,
        }

In [9]:
transforms_data = transforms.Compose([transforms.Resize((128,128)), transforms.ToTensor()])
dataset = ISIC(train_df,IMAGE_HDF, transforms=transforms_data)
test_dataset = ISIC(test_df, TEST_HDF, transforms=transforms_data)
dataset_size = len(dataset)

train_size = int(dataset_size * 0.8)               # 80%
val_size = int(dataset_size * 0.1)                 # 10%
test_size = dataset_size - train_size - val_size   # 10%

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"Training Data Size : {len(train_dataset)}")
print(f"Validation Data Size : {len(val_dataset)}")
print(f"Test Data Size : {len(test_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, pin_memory=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, pin_memory=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False, pin_memory=True)

Training Data Size : 320847
Validation Data Size : 40105
Test Data Size : 40107


In [10]:
### GPU Setting ###
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print(DEVICE)

cuda


In [11]:
test_dataset = ISIC(test_df, TEST_HDF, transforms=transforms_data)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], 
                          num_workers=2, shuffle=False, pin_memory=True)

In [12]:
EPOCH = 1
lr = 0.01
model = models.resnet18(pretrained=True)

### Transfer Learning ###
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 1)
model.load_state_dict(torch.load('/kaggle/input/resnet18/pytorch/default/1/best_model.pth'))
model.to(DEVICE)
optimizer = optim.SGD(model.parameters(), lr=lr)
print("Created a learning model and optimizer")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 79.1MB/s]


Created a learning model and optimizer


In [13]:
def compute_pauc(y_true, y_scores, min_tpr=0.8):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    
    # Filter TPR >= min_tpr
    indices = np.where(tpr >= min_tpr)[0]
    
    # Check if we have at least two points to compute pAUC
    if len(indices) < 2:
        return 0.0  # or any other value that signifies inability to compute pAUC
    
    fpr_min_tpr = fpr[indices]
    tpr_min_tpr = tpr[indices]
    
    # Calculate AUC for the area above the min_tpr threshold
    p_auc = auc(fpr_min_tpr, tpr_min_tpr)
    
    return p_auc

In [14]:
### Train/Evaluation ###
def train(model, train_loader, optimizer, epoch):
    model.train()
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch}", leave=False)
    total_loss = 0.0

    for i, batch in enumerate(train_loader_tqdm):
        image, target = batch['image'].to(DEVICE), batch['target'].to(DEVICE).float()
        
        output = model(image)
        train_loss = F.binary_cross_entropy_with_logits(output.squeeze(), target).to(DEVICE)

        train_loss.backward()
        optimizer.step()

        total_loss += train_loss.item()
        train_loader_tqdm.set_postfix(loss=train_loss.item())

    avg_loss = total_loss / len(train_loader)
    return avg_loss

In [15]:
def evaluate(model, data_loader):
    model.eval()
    y_true = []
    y_scores = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            images, targets = batch['image'].to(DEVICE), batch['target'].to(DEVICE)
            outputs = model(images)
            probs = torch.sigmoid(outputs).squeeze()

            loss = F.binary_cross_entropy_with_logits(outputs.squeeze(), targets.float())
            total_loss += loss.item()

            y_true.extend(targets.cpu().numpy())
            y_scores.extend(probs.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    pauc = compute_pauc(y_true, y_scores, min_tpr=0.8)

    return avg_loss, pauc

In [16]:
start = time.time()
best = 0

'''for epoch in range(EPOCH):
    train_loss = train(model, train_loader, optimizer, epoch)
    val_loss, val_pauc = evaluate(model, val_loader)

    if val_pauc > best:
        best = val_pauc
        torch.save(model.state_dict(), "./best_model.pth")
    print(f'\n\n[Epoch {epoch}] Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, pAUC(above 80% TPR): {val_pauc:.4f}\n')'''
    

test_loss, test_pauc = evaluate(model, test_loader)
print(f'[FINAL] Test Loss {test_loss:.4f}, pAUC: {test_pauc:.4f}, pAUC(above 80% TPR): {test_pauc:.4f}\n')

end = time.time()
elapsed_time = end - start

print(f"Elapsed Time: {int(elapsed_time/3600)}h, {int(elapsed_time/60)}m, {int(elapsed_time%60)}s")
print(f"time: {int(elapsed_time/3600)}h, {int(elapsed_time/60)}m, {int(elapsed_time%60)}s")

100%|██████████| 1/1 [00:00<00:00,  1.10it/s]

[FINAL] Test Loss 0.0000, pAUC: 0.0000, pAUC(above 80% TPR): 0.0000

Elapsed Time: 0h, 0m, 0s
time: 0h, 0m, 0s





## Making Submission

In [17]:
test_dataset = ISIC(test_df, TEST_HDF, transforms=transforms_data)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], 
                          num_workers=2, shuffle=False, pin_memory=True)

In [18]:
preds = []
with torch.no_grad():
    bar = tqdm(enumerate(test_loader), total=len(test_loader))
    for step, data in bar:        
        images = data['image'].to(CONFIG["device"], dtype=torch.float)        
        batch_size = images.size(0)
        outputs = torch.sigmoid(model(images)).squeeze()
        
        preds.append( outputs.detach().cpu().numpy() )
preds = np.concatenate(preds).flatten()

100%|██████████| 1/1 [00:00<00:00, 20.10it/s]


In [19]:
df_sub["target"] = preds
df_sub.to_csv("submission.csv", index=False)

In [20]:
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.0
1,ISIC_0015729,0.0
2,ISIC_0015740,1.911362e-14
