# Import Libraries

In [None]:
# !pip install torcheval

# `binary_auroc` is the functional AUROC metric used by the training and
# validation loops below.
from torcheval.metrics.functional import binary_auroc

In [None]:
import os
import gc
import math
import copy
import time
import random
import glob
import timm
import cv2
import h5py

from matplotlib import pyplot as plt
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from PIL import Image
from io import BytesIO

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

# import lightgbm as lgb
# import catboost as cb
# import xgboost as xgb

from sklearn.utils import resample

import joblib
from tqdm import tqdm
from collections import defaultdict

import albumentations as A
from albumentations.pytorch import ToTensorV2

import optuna

## Setting Seed & Other Setup

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA) and switches
    cuDNN to deterministic mode so repeated runs produce the same results.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # Python's own RNG drives the positive/negative sampling in the training
    # dataset's __getitem__, so it must be seeded along with NumPy and PyTorch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only affects
    # interpreters started afterwards (e.g. worker subprocesses) - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

In [None]:
# For colored terminal text
from colorama import Fore, Back, Style
# Short aliases used when printing the "Validation AUROC Improved" message
# in run_training: b_ starts blue text, sr_ resets the styling.
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# For descriptive error messages
# NOTE(review): presumably makes CUDA kernel launches synchronous so errors
# are reported at the failing call; this usually slows training - confirm
# it is still wanted outside of debugging.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load Train dataset

In [None]:
root_dir = "data/isic-skin-cancer"
train_image_dir = f'{root_dir}/train-image/image'

In [None]:
def get_train_file_path(image_id):
    """Return the JPEG path inside ``train_image_dir`` for the given image ID."""
    filename = f"{image_id}.jpg"
    return f"{train_image_dir}/{filename}"

In [None]:
train_images = sorted(glob.glob(f"{train_image_dir}/*.jpg"))

## Load Extra Train Dataset

In [None]:
train_2018_image_dir = "data/isic-2018/train-image/image"
train_2019_image_dir = "data/isic-2019/train-image/image"
train_2020_image_dir = "data/isic-2020/train-image/image"

In [None]:
def get_2018_train_file_path(image_id):
    """Return the JPEG path inside ``train_2018_image_dir`` for the given image ID."""
    filename = f"{image_id}.jpg"
    return f"{train_2018_image_dir}/{filename}"

def get_2019_train_file_path(image_id):
    """Return the JPEG path inside ``train_2019_image_dir`` for the given image ID."""
    filename = f"{image_id}.jpg"
    return f"{train_2019_image_dir}/{filename}"

def get_2020_train_file_path(image_id):
    """Return the JPEG path inside ``train_2020_image_dir`` for the given image ID."""
    filename = f"{image_id}.jpg"
    return f"{train_2020_image_dir}/{filename}"


In [None]:
train_2018_images = sorted(glob.glob(f"{train_2018_image_dir}/*.jpg"))
train_2019_images = sorted(glob.glob(f"{train_2019_image_dir}/*.jpg"))
train_2020_images = sorted(glob.glob(f"{train_2020_image_dir}/*.jpg"))

### Reduce Data Imbalance

In [None]:
# Load the main competition metadata and subsample the negatives so the
# class ratio is at most 1 positive : 20 negatives.
image_df = pd.read_csv(f"{root_dir}/train-metadata.csv")

print("df.shape, # of positive cases, # of patients")
print("original>", image_df.shape, image_df.target.sum(), image_df["patient_id"].unique().shape)

image_df_postive = image_df[image_df["target"] == 1].reset_index(drop=True)
image_df_negative = image_df[image_df["target"] == 0].reset_index(drop=True)

# Keep every positive row plus the FIRST 20x-as-many negative rows in file
# order (this is a head slice, not a random sample).
image_df = pd.concat([image_df_postive, image_df_negative.iloc[:image_df_postive.shape[0]*20, :]])
print("filtered>", image_df.shape, image_df.target.sum(), image_df["patient_id"].unique().shape)

# Attach the image path and drop rows whose file is not in the glob listing.
image_df['file_path'] = image_df['isic_id'].apply(get_train_file_path)
image_df = image_df[ image_df["file_path"].isin(train_images) ].reset_index(drop=True)
# Only these four columns are kept for the main dataset.
image_df = image_df[['isic_id', 'target', 'patient_id', 'file_path']]
print(image_df.isnull().sum())
image_df.head()

### Reduce Data Imbalance - Extra Dataset (2018)

In [None]:
# Load the ISIC 2018 metadata and subsample the negatives so the class ratio
# is at most 1 positive : 20 negatives.
image_2018_df = pd.read_csv("data/isic-2018/train-metadata.csv")

print("df.shape, # of positive cases, # of patients")
print("original>", image_2018_df.shape, image_2018_df.target.sum(), image_2018_df["patient_id"].unique().shape)

image_2018_df_postive = image_2018_df[image_2018_df["target"] == 1].reset_index(drop=True)
image_2018_df_negative = image_2018_df[image_2018_df["target"] == 0].reset_index(drop=True)

print(image_2018_df_postive.shape, image_2018_df_negative.shape)

# Keep every positive row plus the first 20x-as-many negative rows in file
# order (a head slice, not a random sample).
image_2018_df = pd.concat([image_2018_df_postive, image_2018_df_negative.iloc[:image_2018_df_postive.shape[0]*20, :]])
print("filtered>", image_2018_df.shape, image_2018_df.target.sum(), image_2018_df["patient_id"].unique().shape)

# Attach the image path and drop rows whose file is not in the glob listing.
image_2018_df['file_path'] = image_2018_df['isic_id'].apply(get_2018_train_file_path)
image_2018_df = image_2018_df[ image_2018_df["file_path"].isin(train_2018_images) ].reset_index(drop=True)
# NOTE(review): assumes the CSV carries an 'Unnamed: 0' column (presumably a
# saved index); drop() raises KeyError if it is absent - confirm.
image_2018_df = image_2018_df.drop('Unnamed: 0', axis=1)
print(image_2018_df.columns)
print(image_2018_df.isnull().sum())
image_2018_df.head()

### Reduce Data Imbalance - Extra Dataset (2019)

In [None]:
# Load the ISIC 2019 metadata and subsample the negatives so the class ratio
# is at most 1 positive : 20 negatives.
image_2019_df = pd.read_csv("data/isic-2019/train-metadata.csv")

print("df.shape, # of positive cases, # of patients")
print("original>", image_2019_df.shape, image_2019_df.target.sum(), image_2019_df["patient_id"].unique().shape)

image_2019_df_postive = image_2019_df[image_2019_df["target"] == 1].reset_index(drop=True)
image_2019_df_negative = image_2019_df[image_2019_df["target"] == 0].reset_index(drop=True)

print(image_2019_df_postive.shape, image_2019_df_negative.shape)

# Keep every positive row plus the first 20x-as-many negative rows in file
# order (a head slice, not a random sample).
image_2019_df = pd.concat([image_2019_df_postive, image_2019_df_negative.iloc[:image_2019_df_postive.shape[0]*20, :]])
print("filtered>", image_2019_df.shape, image_2019_df.target.sum(), image_2019_df["patient_id"].unique().shape)

# Attach the image path and drop rows whose file is not in the glob listing.
image_2019_df['file_path'] = image_2019_df['isic_id'].apply(get_2019_train_file_path)
image_2019_df = image_2019_df[ image_2019_df["file_path"].isin(train_2019_images) ].reset_index(drop=True)
# NOTE(review): assumes the CSV carries an 'Unnamed: 0' column (presumably a
# saved index); drop() raises KeyError if it is absent - confirm.
image_2019_df = image_2019_df.drop('Unnamed: 0', axis=1)
print(image_2019_df.columns)
print(image_2019_df.isnull().sum())
image_2019_df.head()

### Reduce Data Imbalance - Extra Dataset (2020)

In [None]:
# Load the ISIC 2020 metadata and subsample the negatives so the class ratio
# is at most 1 positive : 20 negatives.
image_2020_df = pd.read_csv("data/isic-2020/train-metadata.csv")

print("df.shape, # of positive cases, # of patients")
print("original>", image_2020_df.shape, image_2020_df.target.sum(), image_2020_df["patient_id"].unique().shape)

image_2020_df_postive = image_2020_df[image_2020_df["target"] == 1].reset_index(drop=True)
image_2020_df_negative = image_2020_df[image_2020_df["target"] == 0].reset_index(drop=True)

print(image_2020_df_postive.shape, image_2020_df_negative.shape)

# Keep every positive row plus the first 20x-as-many negative rows in file
# order (a head slice, not a random sample).
image_2020_df = pd.concat([image_2020_df_postive, image_2020_df_negative.iloc[:image_2020_df_postive.shape[0]*20, :]])
print("filtered>", image_2020_df.shape, image_2020_df.target.sum(), image_2020_df["patient_id"].unique().shape)

# Attach the image path and drop rows whose file is not in the glob listing.
image_2020_df['file_path'] = image_2020_df['isic_id'].apply(get_2020_train_file_path)
image_2020_df = image_2020_df[ image_2020_df["file_path"].isin(train_2020_images) ].reset_index(drop=True)
# NOTE(review): assumes the CSV carries an 'Unnamed: 0' column (presumably a
# saved index); drop() raises KeyError if it is absent - confirm.
image_2020_df = image_2020_df.drop('Unnamed: 0', axis=1)
print(image_2020_df.columns)
print(image_2020_df.isnull().sum())
image_2020_df.head()

In [None]:
combined_df = pd.concat([image_df, image_2018_df, image_2019_df, image_2020_df], axis=0, ignore_index=True)
print(combined_df.shape)
combined_df

In [None]:
print(combined_df.isnull().sum())

In [None]:
# T_max (in scheduler steps) for the CosineAnnealingLR branch of
# fetch_scheduler.
# NOTE(review): the factors are undocumented - presumably 4/5 of the rows
# (training folds), 10 epochs and batch size 32. The training loop steps the
# scheduler once per batch for 50 epochs over a resampled dataset, so this
# may not match the real step count - confirm.
t_max_value = combined_df.shape[0] * (4) * 10 // 32 // 5
t_max_value

### K-Fold

In [None]:
# Tag every row with the fold in which it is used for validation. Splits are
# stratified on `target` and grouped by `patient_id`, so no patient appears
# in more than one validation fold.
skf = StratifiedGroupKFold(n_splits=5)
fold_splits = skf.split(combined_df, combined_df.target, combined_df.patient_id)

for fold, (_, val_) in enumerate(fold_splits):
    combined_df.loc[val_, "kfold"] = int(fold)

# Make DataLoader

In [None]:
class ISICDataset_for_Train(Dataset):
    """Training dataset that re-balances classes by random sampling.

    Rows are split into a positive and a negative pool. Each ``__getitem__``
    call draws from the positive pool with probability ~0.24 and from the
    negative pool otherwise, so the requested ``index`` only selects a row
    *within* the randomly chosen pool.

    Args:
        df: DataFrame with at least ``file_path`` and ``target`` columns.
        transforms: Optional callable invoked as ``transforms(image=img)``
            that returns a dict with an ``"image"`` entry.
    """

    def __init__(self, df, transforms=None):
        self.df_positive = df[df["target"] == 1].reset_index()
        self.df_negative = df[df["target"] == 0].reset_index()
        self.file_names_positive = self.df_positive['file_path'].values
        self.file_names_negative = self.df_negative['file_path'].values
        self.targets_positive = self.df_positive['target'].values
        self.targets_negative = self.df_negative['target'].values
        self.transforms = transforms

    def __len__(self):
        # One "epoch" is defined as three times the number of positive rows.
        return len(self.df_positive) * 3

    def __getitem__(self, index):
        """Return ``{'image': ..., 'target': ...}`` for a randomly chosen class.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        if random.random() >= 0.76:
            file_names = self.file_names_positive
            targets = self.targets_positive
        else:
            file_names = self.file_names_negative
            targets = self.targets_negative
        # Wrap the index so it is valid for whichever pool was chosen.
        index = index % len(file_names)

        img_path = file_names[index]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        target = targets[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'target': target
        }

    
class ISICDataset(Dataset):
    """Plain image dataset that yields every row of ``df`` exactly once.

    Args:
        df: DataFrame with at least ``file_path`` and ``target`` columns.
        transforms: Optional callable invoked as ``transforms(image=img)``
            that returns a dict with an ``"image"`` entry.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.file_names = df['file_path'].values
        self.targets = df['target'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': ..., 'target': ...}`` for row ``index``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        target = self.targets[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'target': target
        }

### Augmentation

In [None]:
# Albumentations pipelines keyed by split. Both resize to 224x224, normalize
# with the mean/std listed below and convert to a tensor; only "train" adds
# random augmentation.
data_transforms = {
    "train": A.Compose([
        A.Resize(224, 224),
        A.RandomRotate90(p=0.5),
        A.Flip(p=0.5),
        A.Downscale(p=0.25),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        # NOTE(review): these limits look like fractions, but albumentations
        # presumably interprets them in absolute hue/saturation/value units
        # (library defaults are on the order of 20-30), which would make this
        # step close to a no-op - confirm against the installed version.
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.),
    
    # Deterministic pipeline for validation: no random augmentation.
    "valid": A.Compose([
        A.Resize(224, 224),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.)
}

## GeM Pooling Layer

In [None]:
class GeM(nn.Module):
    """Generalized-mean pooling over the last axis with a learnable exponent.

    Computes ``mean(clamp(x, eps) ** p) ** (1 / p)`` across the whole last
    dimension, where ``p`` is a trainable single-element parameter.

    Args:
        p: Initial value of the pooling exponent.
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1) * p)

    def forward(self, x):
        """Pool ``x`` using the module's current ``p`` and ``eps``."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Apply generalized-mean pooling with an explicit exponent and clamp."""
        # The pooling window spans the entire last axis, so that axis
        # collapses to length 1.
        window = x.size(-1)
        powered = x.clamp(min=eps).pow(p)
        averaged = F.avg_pool1d(powered, window)
        return averaged.pow(1. / p)

# ISIC Model (ViT)

In [None]:
class ISICViTModel(nn.Module):
    """ImageNet-pretrained ViT-B/16 backbone with a small MLP head.

    The stock torchvision classification head is replaced by an identity, and
    the backbone output is passed through GeM pooling, dropout, a 512-unit
    linear layer with batch norm and ReLU, a second dropout and a final
    linear layer.

    Args:
        num_classes: Width of the final linear layer (1 for a single logit).
    """

    def __init__(self, num_classes=1):
        super().__init__()
        pretrained = torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1
        self.model = torchvision.models.vit_b_16(weights=pretrained)

        # Record the embedding width before the stock head is swapped out.
        self.in_features = self.model.heads.head.in_features
        self.model.heads = nn.Identity()

        self.gem_pooling = GeM()
        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(self.in_features, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return raw logits; the trailing axis is squeezed when it has size 1."""
        embedding = self.model(x)  # [batch_size, in_features]
        # NOTE(review): GeM pools over a trailing axis of length 1 here, so it
        # reduces to clamping the embedding at `eps`; negative features are
        # discarded - confirm this is intended.
        pooled = self.gem_pooling(embedding.unsqueeze(-1)).squeeze(-1)
        hidden = self.fc1(self.dropout1(pooled))
        hidden = self.dropout2(F.relu(self.bn1(hidden)))
        logits = self.fc2(hidden)
        return logits.squeeze(-1)  # [batch_size] when num_classes == 1


model = ISICViTModel()
model.to(DEVICE)

In [None]:
# Smoke test: push one random batch through the model. The batch is moved to
# DEVICE (the device the model lives on) instead of unconditionally calling
# .cuda(), which fails on CPU-only machines.
test_input = torch.randn(32, 3, 224, 224).to(DEVICE)
output = model(test_input)
print(f"Output shape: {output.shape}")
print(output)

### Loss

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    Each element's BCE-with-logits loss is scaled by
    ``alpha * (1 - pt) ** gamma`` with ``pt = exp(-BCE)``, and the mean over
    all elements is returned. ``alpha`` is applied uniformly to both classes.

    Args:
        alpha: Constant scale factor applied to every element.
        gamma: Focusing exponent applied to ``(1 - pt)``.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma

    def forward(self, inputs, targets):
        """Return the scalar focal loss for ``inputs`` (logits) vs ``targets``."""
        per_sample_bce = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        # exp(-BCE) recovers the probability assigned to the true class.
        prob_true = torch.exp(-per_sample_bce)
        modulating = (1 - prob_true) ** self.gamma
        return (self.alpha * modulating * per_sample_bce).mean()


criterion = FocalLoss()

In [None]:
# def criterion(outputs, targets):
#     return nn.BCELoss()(outputs, targets)

# Training & Validation Code

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch,
                    n_accumulate=1):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the module-level ``criterion`` as the loss and ``binary_auroc`` as
    the monitoring metric.

    Args:
        model: Network returning one logit per sample.
        optimizer: Optimizer stepped every ``n_accumulate`` batches.
        scheduler: LR scheduler stepped together with the optimizer, or None.
        dataloader: Yields dicts with ``'image'`` and ``'target'`` entries.
        device: Device the batches are moved to.
        epoch: Epoch number, only shown in the progress bar.
        n_accumulate: Number of batches whose gradients are accumulated
            before each optimizer step (1 = step on every batch).

    Returns:
        ``(epoch_loss, epoch_auroc)``: sample-weighted means of the per-batch
        loss and the per-batch AUROC. The AUROC is therefore an approximate
        monitoring signal, not the AUROC of the pooled epoch. Both are 0.0
        when the dataloader yields no batches.

    Raises:
        ValueError: If ``n_accumulate`` is smaller than 1.
    """
    if n_accumulate < 1:
        raise ValueError("n_accumulate must be >= 1")

    model.train()

    dataset_size = 0
    running_loss = 0.0
    running_auroc = 0.0
    # Defined up front so an empty dataloader returns zeros instead of
    # raising UnboundLocalError.
    epoch_loss = 0.0
    epoch_auroc = 0.0
    num_steps = len(dataloader)

    bar = tqdm(enumerate(dataloader), total=num_steps)
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = images.size(0)
        # reshape(-1) keeps the batch axis even for a batch of one; a bare
        # squeeze() would collapse it to 0-dim and break the loss.
        outputs = model(images).reshape(-1)
        loss = criterion(outputs, targets)

        # Scale only the gradient; the reported loss stays unscaled.
        (loss / n_accumulate).backward()

        # Step on every n-th batch and on the last batch so no accumulated
        # gradient leaks into the next epoch.
        if (step + 1) % n_accumulate == 0 or (step + 1) == num_steps:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # Detach so the metric does not operate on the autograd graph.
        auroc = binary_auroc(input=outputs.detach(), target=targets).item()

        running_loss += (loss.item() * batch_size)
        running_auroc += (auroc * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        epoch_auroc = running_auroc / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, Train_Auroc=epoch_auroc,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss, epoch_auroc

In [None]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Uses the module-level ``criterion`` as the loss and ``binary_auroc`` as
    the metric.

    Args:
        model: Network returning one logit per sample.
        dataloader: Yields dicts with ``'image'`` and ``'target'`` entries.
        device: Device the batches are moved to.
        epoch: Epoch number, only used for display.

    Returns:
        ``(epoch_loss, epoch_auroc)``: the sample-weighted mean loss and the
        AUROC computed once over all predictions of the epoch. Both are 0.0
        when the dataloader yields no batches.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns zeros instead of
    # raising UnboundLocalError.
    epoch_loss = 0.0
    epoch_auroc = 0.0
    seen_outputs = []
    seen_targets = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = images.size(0)

        # reshape(-1) keeps the batch axis even when the final batch holds a
        # single sample; a bare squeeze() would collapse it to 0-dim and make
        # the loss fail on a shape mismatch.
        outputs = model(images).reshape(-1)
        loss = criterion(outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        seen_outputs.append(outputs.cpu())
        seen_targets.append(targets.cpu())

        # No LR here: this function is not given the optimizer, and reading
        # it from module scope made the function depend on a hidden global.
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)

    if seen_outputs:
        # AUROC is a ranking metric; averaging it over small, imbalanced
        # batches is noisy, so compute it once over the pooled predictions.
        epoch_auroc = binary_auroc(input=torch.cat(seen_outputs),
                                   target=torch.cat(seen_targets)).item()
        print(f"Epoch {epoch} | Valid_Loss={epoch_loss:.4f} | Valid_Auroc={epoch_auroc:.4f}")

    gc.collect()

    return epoch_loss, epoch_auroc

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and keep the best-validation weights.

    Reads the module-level ``train_loader`` and ``valid_loader``. After each
    epoch the validation AUROC is compared with the best one so far; on an
    improvement (or a tie) the weights are copied in memory and written to
    ``best_weight.bin``.

    Args:
        model: Network to train; it is modified in place.
        optimizer: Optimizer passed on to ``train_one_epoch``.
        scheduler: LR scheduler passed on to ``train_one_epoch``, or None.
        device: Device the batches are moved to.
        num_epochs: Number of epochs to run.

    Returns:
        ``(model, history)``: the model loaded with the best weights, and a
        dict of per-epoch lists keyed ``'Train Loss'``, ``'Valid Loss'``,
        ``'Train AUROC'``, ``'Valid AUROC'`` and ``'lr'``.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_auroc = -np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Honour the `device` argument instead of the global DEVICE.
        train_epoch_loss, train_epoch_auroc = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss, val_epoch_auroc = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train AUROC'].append(train_epoch_auroc)
        history['Valid AUROC'].append(val_epoch_auroc)
        # Read the LR from the optimizer: this also works when `scheduler`
        # is None, and avoids calling scheduler.get_lr() outside of a step.
        history['lr'].append(optimizer.param_groups[0]['lr'])

        # deep copy the model
        if best_epoch_auroc <= val_epoch_auroc:
            print(f"{b_}Validation AUROC Improved ({best_epoch_auroc} ---> {val_epoch_auroc})")
            best_epoch_auroc = val_epoch_auroc
            best_model_wts = copy.deepcopy(model.state_dict())
            # Single checkpoint file, overwritten on every improvement.
            PATH = "best_weight.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best AUROC: {:.4f}".format(best_epoch_auroc))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

## Scheduler

In [None]:
def fetch_scheduler(optimizer, name):
    """Build the learning-rate scheduler selected by ``name``.

    Args:
        optimizer: Optimizer the scheduler will drive.
        name: ``'CosineAnnealingLR'`` (uses the module-level ``t_max_value``
            as ``T_max``), ``'CosineAnnealingWarmRestarts'``, or ``None`` for
            no scheduler.

    Returns:
        The scheduler instance, or ``None`` when ``name`` is ``None``.

    Raises:
        ValueError: If ``name`` is not one of the supported values.
    """
    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max_value,
                                              eta_min=1e-6)
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=100,
                                                        eta_min=1e-6)
    # An unknown name used to fall through to an unbound local variable.
    raise ValueError(f"Unknown scheduler name: {name!r}")

In [None]:
def prepare_loaders(df, fold):
    """Build the train and validation DataLoaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows form the training set. Training uses the resampling dataset with
    train-time augmentation, batch size 32, shuffling and ``drop_last``;
    validation uses the plain dataset, batch size 64 and no shuffling.

    Returns:
        ``(train_loader, valid_loader)``
    """
    in_valid_fold = df.kfold == fold
    df_train = df[~in_valid_fold].reset_index(drop=True)
    df_valid = df[in_valid_fold].reset_index(drop=True)

    # Settings shared by both loaders.
    shared_kwargs = dict(num_workers=2, pin_memory=True)

    train_loader = DataLoader(
        ISICDataset_for_Train(df_train, transforms=data_transforms["train"]),
        batch_size=32, shuffle=True, drop_last=True, **shared_kwargs,
    )
    valid_loader = DataLoader(
        ISICDataset(df_valid, transforms=data_transforms["valid"]),
        batch_size=64, shuffle=False, **shared_kwargs,
    )

    return train_loader, valid_loader

In [None]:
train_loader, valid_loader = prepare_loaders(combined_df, fold=0)

In [None]:
# Parameter groups follow the attributes ISICViTModel actually defines: the
# ViT backbone is `model.model`, and the head consists of gem_pooling, fc1,
# bn1 and fc2. The previous groups referenced `model.swin` and `model.fc3`,
# which do not exist on this model and raised AttributeError.
# The backbone gets a smaller learning rate than the freshly initialised head.
# NOTE(review): the removed `fc3` group used lr=1e-3; raise fc2's rate if the
# final layer was meant to train faster - confirm.
optimizer = optim.AdamW([
    {'params': model.model.parameters(), 'lr': 1e-5},
    # GeM and BatchNorm parameters are included so every trainable part of
    # the head is actually updated.
    {'params': model.gem_pooling.parameters(), 'lr': 1e-4},
    {'params': model.fc1.parameters(), 'lr': 1e-4},
    {'params': model.bn1.parameters(), 'lr': 1e-4},
    {'params': model.fc2.parameters(), 'lr': 1e-4},
], weight_decay=0.01)

scheduler = fetch_scheduler(optimizer, 'CosineAnnealingLR')

In [None]:
model, history = run_training(model, optimizer, scheduler,
                              device=DEVICE,
                              num_epochs=50)

## Logging

In [None]:
history = pd.DataFrame.from_dict(history)
history.to_csv("history.csv", index=False)

# Submission

In [None]:
# df = pd.read_csv(test_csv)
# df['target'] = 0
# df

In [None]:
# df_sub = pd.read_csv(sample)
# df_sub

In [None]:
# class ISICTestDataset(Dataset):
#     def __init__(self, df, file_hdf, transforms=None):
#         self.df = df
#         self.fp_hdf = h5py.File(file_hdf, mode="r")
#         self.isic_ids = df['isic_id'].values
#         self.targets = df['target'].values
#         self.transforms = transforms
        
#     def __len__(self):
#         return len(self.isic_ids)
    
#     def __getitem__(self, index):
#         isic_id = self.isic_ids[index]
#         img = np.array( Image.open(BytesIO(self.fp_hdf[isic_id][()])) )
#         target = self.targets[index]
        
#         if self.transforms:
#             img = self.transforms(image=img)["image"]
            
#         return {
#             'image': img,
#             'target': target,
#         }

In [None]:
# data_transforms = {
#     "valid": A.Compose([
#         A.Resize(224, 224),
#         A.Normalize(
#                 mean=[0.485, 0.456, 0.406], 
#                 std=[0.229, 0.224, 0.225], 
#                 max_pixel_value=255.0, 
#                 p=1.0
#             ),
#         ToTensorV2()], p=1.)
# }

In [None]:
# model = ISICViTModel()
# model.load_state_dict( torch.load(best_weight) )
# model.to('cuda')

In [None]:
# preds = []
# with torch.no_grad():
#     bar = tqdm(enumerate(test_loader), total=len(test_loader))
#     for step, data in bar:        
#         images = data['image'].to('cuda', dtype=torch.float)        
#         batch_size = images.size(0)
#         outputs = model(images)
#         preds.append( outputs.detach().cpu().numpy() )
# preds = np.concatenate(preds).flatten()

In [None]:
# df_sub["target"] = preds
# df_sub.to_csv("submission.csv", index=False)

In [None]:
# df_sub