## Import Libs

In [29]:
import os
import random
import gc
import time
import copy
import sys
import numpy as np
import pandas as pd
from io import BytesIO
import shutil

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from torch import nn
import timm
from sklearn.metrics import roc_curve, auc

from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
from collections import defaultdict

import warnings # 避免一些可以忽略的报错
warnings.filterwarnings('ignore')
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

## CONFIG

In [2]:
is_debug = False

class CONFIG:
    seed = 308
    
    # 256 tiny_vit ---------- 3h+
    # 512 efficientnet convnext -------------- 17mins+
    #     efficientvit_b0 --------------- 7mins+
    test_batch_size = 64
    img_size = [128, 128]
    n_classes = 1
    n_folds = 5
    formatted_time = None
    ckpt_save_path = None
    
    n_accumulate = 1.0
    n_workers = os.cpu_count()

    DataParallel = True
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    """
    tf_efficientnet_b0_ns
    tiny_vit_21m_512.dist_in22k_ft_in1k
    eva02_base_patch14_224.mim_in22k
    convnext_tiny.fb_in22k_ft_in1k_384
    tf_efficientnetv2_s.in21k_ft_in1k
    efficientvit_b0.r224_in1k
    """
    model_names = ["tiny_vit_21m_512.dist_in22k_ft_in1k",
                   "tf_efficientnetv2_s.in21k_ft_in1k",
                   "tiny_vit_21m_512.dist_in22k_ft_in1k",
                   "tf_efficientnetv2_s.in21k_ft_in1k",
                   "tiny_vit_21m_512.dist_in22k_ft_in1k"]
    
    ckpt_paths = ["/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-30_23:38:34_tiny_vit_21m_512.dist_in22k_ft_in1k_output/1_CV_0.1685_Loss0.3688_epoch21.bin",
                  "/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-31_14:39:22_tf_efficientnetv2_s.in21k_ft_in1k_output/2_CV_0.1733_Loss0.1937_epoch2.bin",
                  "/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-08-02_23:51:47_tiny_vit_21m_512.dist_in22k_ft_in1k_output/3_CV_0.1832_Loss0.1030_epoch14.bin",
                  "/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-30_12:59:10_tf_efficientnetv2_s.in21k_ft_in1k_output/4_CV_0.1751_Loss0.3172_epoch22.bin",
                  "/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-08-02_13:08:32_tiny_vit_21m_512.dist_in22k_ft_in1k_output/5_CV_0.1814_Loss0.3872_epoch18.bin"]

    test_csv = "/data1/hjs/pythonProject/pythonProject/ISIC2024/my_train.csv"
    train_img_dir = "/data1/hjs/pythonProject/pythonProject/ISIC2024/image"
    
    # debug
#     test_csv = "/kaggle/input/isic-2024-challenge/train-metadata.csv"
#     test_img_hdf5 = "/kaggle/input/isic-2024-challenge/train-image.hdf5"

if CONFIG.DataParallel:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    print("IN DataParallel!")
else:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    print("NO IN DataParallel!")

IN DataParallel!


## Set Random Seed

In [3]:
def set_seed(seed=308):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
set_seed(CONFIG.seed)

## Data Progress

In [4]:
test = pd.read_csv(CONFIG.test_csv)
test

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,kfold
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,Benign,,,,,,,97.517282,2
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,Benign,,,,,,,3.141455,2
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,Benign,,,,,,,99.804040,4
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,Benign,,,,,,,99.989998,4
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,Benign,,,,,,,70.442510,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,Benign,,,,,,,99.999988,2
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,Benign,,,,,,,99.999820,2
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,Benign,,,,,,,99.999416,1
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,Benign,,,,,,,100.000000,4


## Dataset and DataLoader

In [5]:
def transform(img):
    composition = A.Compose([
        A.Resize(CONFIG.img_size[0], CONFIG.img_size[1]),
        ToTensorV2(),
    ])
    return composition(image=img)["image"]

In [6]:
class MyDataset(Dataset):
    def __init__(self, df, transform=None):
        super().__init__()
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx, :]
        img_id = row.isic_id + ".jpg"
        label = torch.tensor(row.target, dtype=torch.float32)
        
        img_path = os.path.join(CONFIG.train_img_dir, img_id)
        img = Image.open(img_path)
        img = np.array(img)

        if self.transform != None:
            img = self.transform(img)

        return img, label

In [7]:
def prepare_loaders(df, fold):
    df_test = df[df["kfold"] == fold]
    test_datasets = MyDataset(df=df_test, transform=transform)
    
    test_loader = DataLoader(test_datasets, batch_size=CONFIG.test_batch_size, num_workers=CONFIG.n_workers, shuffle=False, pin_memory=True)
    
    return test_loader

In [8]:
# test_loader = prepare_loaders(test)
# x, y = next(iter(test_loader))
# x.shape

## Evaluation

In [9]:
def compute_pAUC(y_true, y_scores, min_tpr=0.8):
    """
    计算pAUC, 特别是高于min_tpr (例如0.8) 部分的面积。

    参数:
    y_true (array-like): 真实标签 (0或1) 。
    y_scores (array-like): 预测概率或得分。
    min_tpr (float): 最小TPR阈值, 默认值为0.8。

    返回:
    pAUC (float): ROC曲线在min_tpr以上部分的面积。
    """
    # 计算fpr, tpr, 和阈值
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    
    # 找到高于min_tpr的部分
    idx = np.where(tpr >= min_tpr)[0]
    
    if len(idx) == 0:
        return 0.0
    
    # 截取高于min_tpr的部分
    fpr = fpr[idx]
    tpr = tpr[idx]
    # 确保有足够的点进行AUC计算
    if len(fpr) < 2 or len(tpr) < 2:
        return 0.0
    
    # 计算高于min_tpr部分的AUC
    pAUC = auc(fpr, tpr)
    # 归一化pAUC使其在[0, 0.2]范围内
    max_possible_auc = (1 - min_tpr)
    pAUC_normalized = pAUC * max_possible_auc
    
    return pAUC_normalized

## DL Model

In [10]:
def updata_req_grad(models, requires_grad=True):
    for model in models:
        for param in model.parameters():
            param.requires_grad = requires_grad

In [11]:
class GeMPool(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeMPool, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
    
    def gem(self, x, p=3, eps=1e-6):
        return torch.mean(x.clamp(min=eps).pow(p), dim=(-2, -1)).pow(1./p)
    
    def __repr__(self):
        return self.__class__.__name__ + f'(p={self.p.data.tolist()[0]:.4f}, eps={self.eps})'

In [12]:
class ISIC2024Model(nn.Module):
    def __init__(self, model_name=CONFIG.model_names[0]):
        super(ISIC2024Model, self).__init__()
        self.backbone = timm.create_model(model_name=model_name, 
                                          pretrained=False)
        
        if "efficientnet" in model_name:
            in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif "convnext" in model_name or "tiny_vit" in model_name:
            in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        elif "eva" in model_name:
            in_features = 768
        elif "efficientvit" in model_name:
            in_features = self.backbone.head.classifier[4].in_features
            self.backbone.head.classifier[4] = nn.Identity()

        self.head = nn.Sequential(
            nn.Linear(in_features, CONFIG.n_classes)
        )
        
    def forward(self, x):
        _tmp = self.backbone(x)
        output = self.head(_tmp)
        return output

## Load Models

In [13]:
model_names = CONFIG.model_names

paths = CONFIG.ckpt_paths

models = []

if CONFIG.DataParallel:
    device_ids = [0, 1]
    for i in range(CONFIG.n_folds):
        model = ISIC2024Model(model_name=model_names[i])
        model = torch.nn.DataParallel(model, device_ids=device_ids)
        model = model.cuda()
        model.load_state_dict(torch.load(paths[i]))
        model.eval()
        models.append(model)
else:
    for i in range(CONFIG.n_folds):
        model = ISIC2024Model(model_name=model_names[i])
        model = model.cuda()
        model.load_state_dict(torch.load(paths[i]))
        model.eval()
        models.append(model)

## Infer Function

In [14]:
def Infer(models, kfolds=CONFIG.n_folds):
    oof = []
    true = []

    for fold in range(kfolds):
        model = models[fold]
        test_loader = prepare_loaders(test, fold)
        bar = tqdm(enumerate(test_loader), total=len(test_loader))
        with torch.no_grad():
            for step, (images, labels) in bar:
                
                if CONFIG.DataParallel:
                    images = images.cuda().float()
                else:
                    images = images.to(CONFIG.device, dtype=torch.float)
                    
                output = model(images)
                output = F.sigmoid(output)

                oof.append(output.flatten().detach().cpu().numpy())
                true.append(labels.detach().cpu().numpy())
            
    oof = np.concatenate(oof)
    true = np.concatenate(true)

    return oof, true

## Start Infer

In [15]:
oof, true = Infer(models)

100%|██████████| 1254/1254 [03:28<00:00,  6.01it/s]
100%|██████████| 1254/1254 [01:58<00:00, 10.57it/s]
100%|██████████| 1254/1254 [03:24<00:00,  6.12it/s]
100%|██████████| 1254/1254 [01:54<00:00, 11.00it/s]
100%|██████████| 1254/1254 [03:25<00:00,  6.11it/s]


## Local CV

In [18]:
local_cv = compute_pAUC(true, oof)
print("Local CV : ", local_cv)

Local CV :  0.17552580170301432


In [19]:
# 获取当前时间的时间戳
current_time = time.time()
print("当前时间戳:", current_time)

# 将时间戳转换为本地时间结构
local_time = time.localtime(current_time)

# 格式化本地时间
CONFIG.formatted_time = time.strftime('%Y-%m-%d_%H:%M:%S', local_time)
print("当前时间:", CONFIG.formatted_time)

CONFIG.ckpt_save_path = f"output/{CONFIG.formatted_time}_kfold_ensemble_output"
if os.path.exists(CONFIG.ckpt_save_path) is False:
    os.makedirs(CONFIG.ckpt_save_path)

当前时间戳: 1722665747.6723561
当前时间: 2024-08-03_14:15:47


In [27]:
df = pd.DataFrame()
df["model_name"] = CONFIG.model_names
df["ckpt_path"] = CONFIG.ckpt_paths
df["CV"] = local_cv
df

Unnamed: 0,model_name,ckpt_path,CV
0,tiny_vit_21m_512.dist_in22k_ft_in1k,/data1/hjs/pythonProject/pythonProject/ISIC202...,0.175526
1,tf_efficientnetv2_s.in21k_ft_in1k,/data1/hjs/pythonProject/pythonProject/ISIC202...,0.175526
2,tiny_vit_21m_512.dist_in22k_ft_in1k,/data1/hjs/pythonProject/pythonProject/ISIC202...,0.175526
3,tf_efficientnetv2_s.in21k_ft_in1k,/data1/hjs/pythonProject/pythonProject/ISIC202...,0.175526
4,tiny_vit_21m_512.dist_in22k_ft_in1k,/data1/hjs/pythonProject/pythonProject/ISIC202...,0.175526


In [28]:
np.save(f"result_analysis/{CONFIG.formatted_time}_kfold_ensemble.npy", oof)
df.to_csv(CONFIG.ckpt_save_path+"/ensemble_kfold.csv", index=False)

In [33]:
source_files = CONFIG.ckpt_paths
for source_file in source_files:
    # 目标文件路径
    file_name = source_file.split("/")[-1]
    destination_file = os.path.join(CONFIG.ckpt_save_path, file_name)
    # 复制文件
    shutil.copyfile(source_file, destination_file)
    print(f'文件已从 {source_file} 复制到 {destination_file}')

文件已从 /data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-30_23:38:34_tiny_vit_21m_512.dist_in22k_ft_in1k_output/1_CV_0.1685_Loss0.3688_epoch21.bin 复制到 output/2024-08-03_14:15:47_kfold_ensemble_output/1_CV_0.1685_Loss0.3688_epoch21.bin
文件已从 /data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-31_14:39:22_tf_efficientnetv2_s.in21k_ft_in1k_output/2_CV_0.1733_Loss0.1937_epoch2.bin 复制到 output/2024-08-03_14:15:47_kfold_ensemble_output/2_CV_0.1733_Loss0.1937_epoch2.bin
文件已从 /data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-08-02_23:51:47_tiny_vit_21m_512.dist_in22k_ft_in1k_output/3_CV_0.1832_Loss0.1030_epoch14.bin 复制到 output/2024-08-03_14:15:47_kfold_ensemble_output/3_CV_0.1832_Loss0.1030_epoch14.bin
文件已从 /data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-07-30_12:59:10_tf_efficientnetv2_s.in21k_ft_in1k_output/4_CV_0.1751_Loss0.3172_epoch22.bin 复制到 output/2024-08-03_14:15:47_kfold_ensemble_output/4_CV_0.1751_Loss0.3172_epoch22.bin
文件已从 /data1/hjs/python