# DL Part

## Import Libs

In [1]:
import os
import random
import gc
import time
import copy
import sys
import numpy as np
import pandas as pd
# import h5py
# from io import BytesIO

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from torch import nn
import timm

from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
from collections import defaultdict

import warnings # 避免一些可以忽略的报错
warnings.filterwarnings('ignore')
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

  from .autonotebook import tqdm as notebook_tqdm


## CONFIG

In [2]:
# Debug switch. NOTE(review): not read anywhere in the visible part of this notebook — confirm it is still needed.
is_debug = False

class CONFIG:
    # Static settings for the DL inference part; every value is read as a class attribute (CONFIG.<name>).
    seed = 308  # passed to set_seed() right after this cell
    
    # Timing notes kept from earlier experiments:
    # 256 tiny_vit ---------- 3h+
    # 512 efficientnet convnext -------------- 17mins+
    #     efficientvit_b0 --------------- 7mins+
    test_batch_size = 512  # batch size of the inference DataLoader
    img_size = [160, 160]  # the two arguments given to A.Resize in transform()
    n_classes = 1  # output width of the linear head in ISIC2024Model
    n_folds = 5  # number of checkpoints loaded and of kfold values iterated
    
    n_accumulate = 1.0  # NOTE(review): unused in the visible cells; presumably left over from training
    n_workers = os.cpu_count()  # DataLoader worker processes (one per CPU reported by the OS)

    DataParallel = True  # wrap each model in torch.nn.DataParallel over GPUs 0 and 1
    # NOTE(review): torch.cuda.is_available() runs here, i.e. before CUDA_VISIBLE_DEVICES is set
    # further down in this cell — confirm the environment variable still takes effect.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # The bare string below is a scratch list of candidate timm model names; it has no runtime effect.
    """
    tf_efficientnet_b0_ns
    tiny_vit_21m_512.dist_in22k_ft_in1k
    eva02_base_patch14_224.mim_in22k
    convnext_tiny.fb_in22k_ft_in1k_384
    tf_efficientnetv2_s.in21k_ft_in1k
    efficientvit_b0.r224_in1k

    edgenext_base.in21k_ft_in1k
    """
    model_name = "tf_efficientnetv2_s.in21k_ft_in1k"  # timm architecture built by ISIC2024Model
    ckpt_path = "/data1/hjs/pythonProject/pythonProject/ISIC2024/GBDT_DL_CKPT"  # directory listed for per-fold checkpoints
    use_gempool = False  # when True, efficientnet backbones get GeMPool as global_pool
    dl_pred_name = "cv15496"  # name of the prediction column added to `train`
    
    train_csv = "/data1/hjs/pythonProject/pythonProject/ISIC2024/train-metadata.csv"  # NOTE(review): unused in the visible DL cells
    train_img_dir = "/data1/hjs/pythonProject/pythonProject/ISIC2024/image"  # directory holding <isic_id>.jpg files
    
    my_train_csv = "/data1/hjs/pythonProject/pythonProject/ISIC2024/my_train_with_sgkfold.csv"  # metadata CSV read into `train`

# Set the CUDA environment variable that matches the selected execution mode and report it.
if not CONFIG.DataParallel:
    # Single-device mode: make CUDA kernel launches synchronous.
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    print("NO IN DataParallel!")
else:
    # Multi-GPU mode: expose GPUs 0 and 1 to this process.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    print("IN DataParallel!")

IN DataParallel!


## Set Random Seed

In [3]:
def set_seed(seed=308):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only inherited by processes started afterwards; the running interpreter's
    # hash randomisation was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed() on torch.cuda seeds only the current device; the notebook can
    # run on two GPUs, so seed all of them. This is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
set_seed(CONFIG.seed)

## Data Processing

In [4]:
# Load the metadata table; its `kfold` column is what prepare_loaders() filters on to pick one fold's rows.
train = pd.read_csv(CONFIG.my_train_csv)
train

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,kfold
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,Benign,,,,,,,97.517282,4
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,Benign,,,,,,,3.141455,0
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,Benign,,,,,,,99.804040,0
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,Benign,,,,,,,99.989998,2
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,Benign,,,,,,,70.442510,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,Benign,,,,,,,99.999988,4
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,Benign,,,,,,,99.999820,1
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,Benign,,,,,,,99.999416,2
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,Benign,,,,,,,100.000000,0


## Dataset and DataLoader

In [5]:
# Albumentations pipelines keyed by (size[0], size[1]); filled lazily by transform().
_TRANSFORM_PIPELINES = {}


def transform(img):
    """Apply the inference-time preprocessing to one image array.

    The pipeline resizes to ``CONFIG.img_size``, applies ``A.Normalize()`` with
    its default arguments and converts the result with ``ToTensorV2``.

    The pipeline object is built once per image size and reused, instead of
    being re-created for every sample the Dataset serves.

    Args:
        img: Image as a NumPy array, passed to albumentations as ``image=``.

    Returns:
        The ``"image"`` entry produced by the pipeline.
    """
    size = (CONFIG.img_size[0], CONFIG.img_size[1])
    pipeline = _TRANSFORM_PIPELINES.get(size)
    if pipeline is None:
        pipeline = A.Compose([
            A.Resize(size[0], size[1]),
            A.Normalize(),
            ToTensorV2(),
        ])
        _TRANSFORM_PIPELINES[size] = pipeline
    return pipeline(image=img)["image"]

In [6]:
# class MyDataset(Dataset):
#     def __init__(self, df, img_hdf5, transform=None):
#         super().__init__()
#         self.df = df
#         self.fp_hdf = h5py.File(img_hdf5, mode="r")
#         self.transform = transform

#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         row = self.df.iloc[idx, :]
#         img_id = row.isic_id
#         label = str(img_id)
        
#         img = np.array( Image.open(BytesIO(self.fp_hdf[img_id][()])) )
#         img = np.array(img).astype(np.float32)

#         if self.transform != None:
#             img = self.transform(img)

#         return img, label
    
class MyDataset(Dataset):
    """Dataset serving ``(image, target)`` pairs for the rows of a metadata frame.

    Each row's image is read from ``CONFIG.train_img_dir/<isic_id>.jpg``.

    Args:
        df: DataFrame providing at least the ``isic_id`` and ``target`` columns.
        transform: Optional callable applied to the image array before it is
            returned.
    """

    def __init__(self, df, transform=None):
        super().__init__()
        self.df = df
        self.transform = transform

    def __len__(self):
        """Number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(img, label)`` for the row at positional index ``idx``."""
        row = self.df.iloc[idx]
        label = torch.tensor(row.target, dtype=torch.float32)

        img_path = os.path.join(CONFIG.train_img_dir, row.isic_id + ".jpg")
        # The context manager closes the file handle right after decoding instead
        # of leaving it to the garbage collector (matters with many workers).
        # convert("RGB") guarantees a 3-channel array for the normalisation step
        # even if a file is stored as grayscale/CMYK/RGBA.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert("RGB"))

        if self.transform is not None:
            img = self.transform(img)

        return img, label

In [7]:
def prepare_loaders(df, fold):
    """Build the inference DataLoader for the rows of ``df`` whose ``kfold`` equals ``fold``.

    Rows are served in frame order (no shuffling) so predictions line up with
    the filtered rows.
    """
    fold_rows = df.loc[df["kfold"] == fold]
    return DataLoader(
        MyDataset(df=fold_rows, transform=transform),
        batch_size=CONFIG.test_batch_size,
        shuffle=False,
        num_workers=CONFIG.n_workers,
        pin_memory=True,
    )

In [8]:
# test_loader = prepare_loaders(test)
# x, y = next(iter(test_loader))
# x.shape

## DL Model

In [9]:
def updata_req_grad(models, requires_grad=True):
    """Set ``requires_grad`` on every parameter of every model in ``models``."""
    every_param = (weight for net in models for weight in net.parameters())
    for weight in every_param:
        weight.requires_grad = requires_grad

In [10]:
class GeMPool(nn.Module):
    """Generalised-mean pooling over the last two dimensions with a learnable exponent ``p``.

    Args:
        p: Initial value of the exponent (stored as a one-element parameter).
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(p * torch.ones(1))

    def gem(self, x, p=3, eps=1e-6):
        """Clamp ``x`` at ``eps``, average ``x**p`` over the last two dims, take the ``p``-th root."""
        clamped = x.clamp(min=eps)
        mean_of_powers = clamped.pow(p).mean(dim=(-2, -1))
        return mean_of_powers.pow(1. / p)

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f'{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})'

In [11]:
class ISIC2024Model(nn.Module):
    """timm backbone chosen by ``CONFIG.model_name`` followed by one linear head.

    The backbone's own classifier is replaced by ``nn.Identity`` (where the
    family exposes one here) so that it emits features, and ``self.head`` maps
    those features to ``CONFIG.n_classes`` outputs.

    Raises:
        ValueError: If ``CONFIG.model_name`` belongs to none of the supported
            families, instead of failing later on an unbound ``in_features``.
    """

    def __init__(self):
        super().__init__()
        name = CONFIG.model_name
        # pretrained=False: the weights are filled from the fold checkpoints loaded afterwards.
        self.backbone = timm.create_model(model_name=name, pretrained=False)

        if "efficientnet" in name:
            in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            if CONFIG.use_gempool:
                self.backbone.global_pool = GeMPool()
        elif "convnext" in name or "tiny_vit" in name:
            in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        elif "eva" in name:
            # NOTE(review): hard-coded width and no head is stripped — assumes the
            # backbone already outputs 768-dim features; confirm for the chosen variant.
            in_features = 768
        elif "efficientvit" in name:
            in_features = self.backbone.head.classifier[4].in_features
            self.backbone.head.classifier[4] = nn.Identity()
        elif "edgenext" in name:
            in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        else:
            raise ValueError(
                f"Unsupported CONFIG.model_name {name!r}: expected a name containing one of "
                "'efficientnet', 'convnext', 'tiny_vit', 'eva', 'efficientvit' or 'edgenext'."
            )

        self.head = nn.Sequential(
            nn.Linear(in_features, CONFIG.n_classes)
        )

    def forward(self, x):
        """Return the head's raw outputs (no activation is applied) for input batch ``x``."""
        features = self.backbone(x)
        return self.head(features)

In [12]:
# Build one instance and display its module tree as a sanity check of the architecture.
# NOTE(review): `model` is rebound inside the checkpoint-loading loop of the next cell.
model = ISIC2024Model()
model

ISIC2024Model(
  (backbone): EfficientNet(
    (conv_stem): Conv2dSame(3, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNormAct2d(
      24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): ConvBnAct(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNormAct2d(
            24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
          (drop_path): Identity()
        )
        (1): ConvBnAct(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNormAct2d(
            24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
     

## Load Models

In [13]:
models = []

# Checkpoints are matched to folds purely by sorted filename order, so the names
# must sort as fold 0 .. n_folds-1 (note: lexicographic order puts "10_" before "2_").
paths = sorted(os.listdir(CONFIG.ckpt_path))
# paths = ["1_CV0.1547_Loss0.6784_epoch2.bin",
#          "2_CV0.1708_Loss0.5088_epoch5.bin",
#          "3_CV0.1791_Loss0.3739_epoch7.bin",
#          "4_CV0.1654_Loss0.5382_epoch3.bin",
#          "5_CV0.1812_Loss0.4236_epoch8.bin"]

# Fail early with a clear message rather than with an IndexError halfway through loading.
if len(paths) < CONFIG.n_folds:
    raise FileNotFoundError(
        f"Expected at least {CONFIG.n_folds} checkpoints in {CONFIG.ckpt_path}, found {len(paths)}"
    )

if CONFIG.DataParallel:
    device_ids = [0, 1]

for i in range(CONFIG.n_folds):
    model = ISIC2024Model()
    if CONFIG.DataParallel:
        # Wrap before loading: the state-dict keys must match the wrapped module's keys.
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    model = model.cuda()
    # map_location="cpu" deserialises onto the host regardless of the device the
    # checkpoint was saved from; load_state_dict then copies into the GPU parameters.
    # NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
    state_dict = torch.load(os.path.join(CONFIG.ckpt_path, paths[i]), map_location="cpu")
    model.load_state_dict(state_dict)
    print(f"fold : {i} --------- path : {paths[i]}")
    model.eval()
    models.append(model)

fold : 0 --------- path : 1_CV_0.1514_Loss0.2055_epoch15.bin
fold : 1 --------- path : 2_CV_0.1445_Loss0.2757_epoch17.bin
fold : 2 --------- path : 3_CV_0.1704_Loss0.1593_epoch25.bin
fold : 3 --------- path : 4_CV_0.1492_Loss0.2547_epoch20.bin
fold : 4 --------- path : 5_CV_0.1627_Loss0.2155_epoch19.bin


## Infer Function

In [14]:
def Infer(model, test_loader):
    """Run ``model`` over ``test_loader`` without gradients and collect its outputs.

    Only the first element of each batch (the images) is used. Images are moved
    with ``.cuda()`` when ``CONFIG.DataParallel`` is set, otherwise to
    ``CONFIG.device``. No sigmoid is applied here; the raw model outputs are
    returned.

    Returns:
        One flat NumPy array with the outputs of all batches concatenated in
        loader order.
    """
    collected = []
    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    with torch.no_grad():
        for _, (images, _unused) in progress:
            images = (
                images.cuda().float()
                if CONFIG.DataParallel
                else images.to(CONFIG.device, dtype=torch.float)
            )
            raw_out = model(images)
            collected.append(raw_out.detach().flatten().cpu().numpy())

    return np.concatenate(collected)

## Start Infer

In [15]:
# Per-fold predictions on the training table: the k-th loaded checkpoint scores only
# the rows with kfold == k (presumably its held-out rows — confirm against the training code).
total_train_pred = []

for fold in range(CONFIG.n_folds):
    train_loader = prepare_loaders(train, fold)

    raw_scores = Infer(models[fold], train_loader)
    # Infer() returns raw outputs; squash them to probabilities here.
    preds = torch.sigmoid(torch.from_numpy(raw_scores)).numpy()
    total_train_pred.append(preds)

100%|██████████| 148/148 [00:33<00:00,  4.41it/s]
100%|██████████| 174/174 [00:31<00:00,  5.55it/s]
100%|██████████| 160/160 [00:28<00:00,  5.54it/s]
100%|██████████| 132/132 [00:24<00:00,  5.41it/s]
100%|██████████| 172/172 [00:31<00:00,  5.48it/s]


In [16]:
# Float sentinel: initialising with the integer -1 creates an int64 column, and writing
# float predictions into it is an incompatible-dtype assignment (deprecated in recent pandas).
train[CONFIG.dl_pred_name] = -1.0

for fold in range(CONFIG.n_folds):
    # total_train_pred[fold] was produced from these same rows in the same (unshuffled) order.
    fold_mask = train["kfold"] == fold
    train.loc[fold_mask, CONFIG.dl_pred_name] = total_train_pred[fold]
train

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,kfold,cv15496
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,,,,,,,97.517282,4,0.055244
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,,,,,,,3.141455,0,0.160246
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,,,,,,,99.804040,0,0.019354
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,,,,,,,99.989998,2,0.045941
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,,,,,,,70.442510,4,0.092378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,,,,,,,99.999988,4,0.615235
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,,,,,,,99.999820,1,0.102108
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,,,,,,,99.999416,2,0.051599
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,,,,,,,100.000000,0,0.076979


# ML Part

## Import libs

In [17]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import optuna
import random
from tqdm import tqdm
import gc
import warnings # 避免一些可以忽略的报错
warnings.filterwarnings('ignore')

## CONFIG

In [18]:
# Base directory of the competition data on the training machine.
root = Path('/data1/hjs/pythonProject/pythonProject/ISIC2024')

train_path = root / 'train-metadata.csv'
# /kaggle/input/isic2024-my-train-csv/my_train.csv
# test_path = root / 'test-metadata.csv'
# subm_path = root / 'sample_submission.csv'

# Names of the key metadata columns.
id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5  # small constant added to a denominator in read_data() (color_uniformity)
sampling_ratio = 0.01  # NOTE(review): not used in the visible cells; presumably a resampling ratio — confirm
seed = 308  # passed to set_seed() below

# Raw numeric metadata columns; they form the first part of feature_cols.
# The trailing "+" / "++" marks in the descriptions are carried over verbatim
# (presumably footnote markers from the original data dictionary).
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    # NOTE(review): no description recorded for this column.
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Engineered numeric features. Each comment gives the formula as implemented in read_data().
new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2 / tbp_lv_perimeterMM ** 2
    'hue_contrast',                      # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',                # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',           # sqrt(tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2)
    'border_complexity',                 # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',              # sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',              # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',         # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',             # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',            # border_complexity + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',            # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',               # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',                  # sqrt((tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2) / 3)
    'color_shape_composite_index',       # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',             # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',          # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',        # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',         # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm ** 2 + age_approx ** 2) -- despite the name, nevi confidence is not involved
    'color_asymmetry_index',             # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',           # tbp_lv_areaMM2 * sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',           # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
    
    # Disabled experimental features (their computations are commented out in read_data() as well).
#     'size_age_interaction2',             # tbp_lv_perimeterMM      * age_approx             / (sum)
#     'hue_color_std_interaction2',        # tbp_lv_H                * tbp_lv_color_std_mean  / (sum)
#     'symmetry_perim_interaction2',       # tbp_lv_symm_2axis       * clin_size_long_diam_mm / (sum)
    
#     'tbp_lv_ratio_A',
#     'tbp_lv_ratio_B',
#     'tbp_lv_ratio_C',
#     'tbp_lv_ratio_H',
#     'tbp_lv_ratio_L',
    
#     'tbp_lv_contrast_A',
#     'tbp_lv_contrast_B',
#     'tbp_lv_contrast_C',
#     'tbp_lv_contrast_H',
#     'tbp_lv_contrast_L',
    
#     'tbp_lv_patient_ratio_A',
#     'tbp_lv_patient_ratio_B',
#     'tbp_lv_patient_ratio_C',
#     'tbp_lv_patient_ratio_H',
#     'tbp_lv_patient_ratio_L',
    
#     'tbp_lv_patient_contrast_A',
#     'tbp_lv_patient_contrast_B',
#     'tbp_lv_patient_contrast_C',
#     'tbp_lv_patient_contrast_H',
#     'tbp_lv_patient_contrast_L',
    
#     'tbp_lv_age_ratio_A',
#     'tbp_lv_age_ratio_B',
#     'tbp_lv_age_ratio_C',
#     'tbp_lv_age_ratio_H',
#     'tbp_lv_age_ratio_L',
    
#     'tbp_lv_age_contrast_A',
#     'tbp_lv_age_contrast_B',
#     'tbp_lv_age_contrast_C',
#     'tbp_lv_age_contrast_H',
#     'tbp_lv_age_contrast_L',
]

# Categorical metadata columns.
cat_cols = [
    'sex',
    'anatom_site_general',
    'tbp_tile_type',
    'tbp_lv_location',
    'tbp_lv_location_simple',
    'attribute'[:0] + 'attribution',
]
# One "<name>_patient_norm" and one "<name>_sex_norm" column name per numeric feature
# (raw and engineered); the columns themselves are presumably created later — confirm.
norm_cols = [f'{name}_patient_norm' for name in [*num_cols, *new_num_cols]]
norm_cols2 = [f'{name}_sex_norm' for name in [*num_cols, *new_num_cols]]
special_cols = ['count_per_patient']
# Final feature order: raw numeric, engineered numeric, categorical, both normalised sets, specials.
feature_cols = [*num_cols, *new_num_cols, *cat_cols, *norm_cols, *norm_cols2, *special_cols]

## Set Seed

In [19]:
def set_seed(seed=308):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and configures cuDNN for deterministic
    behaviour.

    NOTE(review): this re-defines the ``set_seed`` already declared in the DL
    part of the notebook; keep the two copies in sync or drop one.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only inherited by processes started afterwards; the running interpreter's
    # hash randomisation was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
set_seed(seed)

## Feature Engineering

In [20]:
def read_data(path):
    df = pd.read_csv(path)
    
    df['sex'] = df['sex'].fillna('Unknown')
    df['sex'] = df['sex'].apply(lambda x: 'male' if x!='male' or x!='female' else x)
       
    return (
        pl.from_pandas(df)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
#         .with_columns(
#             size_age_interaction2          = pl.col('tbp_lv_perimeterMM') * pl.col('age_approx') / (pl.col('tbp_lv_perimeterMM') + pl.col('age_approx')),
#             hue_color_std_interaction2     = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_H') + pl.col('tbp_lv_color_std_mean')),
#             symmetry_perim_interaction2    = pl.col('tbp_lv_symm_2axis') * pl.col('clin_size_long_diam_mm') / (pl.col('tbp_lv_symm_2axis') + pl.col('clin_size_long_diam_mm')),
#         )
#         .with_columns(
#             # Ratio of tbp_lv_* to the product of tbp_lv_*ext and age minimum tbp_lv_*
#             tbp_lv_ratio_A = pl.col('tbp_lv_A').truediv(pl.col('tbp_lv_Aext').mul(pl.col('tbp_lv_A').min())).over('age_approx').cast(pl.Float32),         
#             tbp_lv_ratio_B = pl.col('tbp_lv_B').truediv(pl.col('tbp_lv_Bext').mul(pl.col('tbp_lv_B').min())).over('age_approx').cast(pl.Float32),
#             tbp_lv_ratio_C = pl.col('tbp_lv_C').truediv(pl.col('tbp_lv_Cext').mul(pl.col('tbp_lv_C').min())).over('age_approx').cast(pl.Float32),
#             tbp_lv_ratio_H = pl.col('tbp_lv_H').truediv(pl.col('tbp_lv_Hext').mul(pl.col('tbp_lv_H').min())).over('age_approx').cast(pl.Float32),
#             tbp_lv_ratio_L = pl.col('tbp_lv_L').truediv(pl.col('tbp_lv_Lext').mul(pl.col('tbp_lv_L').min())).over('age_approx').cast(pl.Float32),
#         )
#         .with_columns(
#             # Contrast between tbp_lv_* and tbp_lv_*ext
#             tbp_lv_contrast_A = pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext')).cast(pl.Float32),
#             tbp_lv_contrast_B = pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext')).cast(pl.Float32),
#             tbp_lv_contrast_C = pl.col('tbp_lv_C').sub(pl.col('tbp_lv_Cext')).cast(pl.Float32),
#             tbp_lv_contrast_H = pl.col('tbp_lv_H').sub(pl.col('tbp_lv_Hext')).cast(pl.Float32),
#             tbp_lv_contrast_L = pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext')).cast(pl.Float32),
#         )
#         .with_columns(
#             # Ratio ratio to patient average
#             tbp_lv_patient_ratio_A = pl.col('tbp_lv_ratio_A').truediv(pl.col('tbp_lv_ratio_A').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_ratio_B = pl.col('tbp_lv_ratio_B').truediv(pl.col('tbp_lv_ratio_B').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_ratio_C = pl.col('tbp_lv_ratio_C').truediv(pl.col('tbp_lv_ratio_C').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_ratio_H = pl.col('tbp_lv_ratio_H').truediv(pl.col('tbp_lv_ratio_H').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_ratio_L = pl.col('tbp_lv_ratio_L').truediv(pl.col('tbp_lv_ratio_L').mean()).over('patient_id').cast(pl.Float32),
#         )
#         .with_columns(
#             # Ratio contrast to patient average
#             tbp_lv_patient_contrast_A = pl.col('tbp_lv_contrast_A').truediv(pl.col('tbp_lv_contrast_A').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_contrast_B = pl.col('tbp_lv_contrast_B').truediv(pl.col('tbp_lv_contrast_B').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_contrast_C = pl.col('tbp_lv_contrast_C').truediv(pl.col('tbp_lv_contrast_C').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_contrast_H = pl.col('tbp_lv_contrast_H').truediv(pl.col('tbp_lv_contrast_H').mean()).over('patient_id').cast(pl.Float32),
#             tbp_lv_patient_contrast_L = pl.col('tbp_lv_contrast_L').truediv(pl.col('tbp_lv_contrast_L').mean()).over('patient_id').cast(pl.Float32),
#         )
#         .with_columns(
#             # Ratio to age average
#             tbp_lv_age_ratio_A = pl.col('tbp_lv_ratio_A').truediv(pl.col('tbp_lv_ratio_A').mean()).over('age_approx').cast(pl.Float32), 
#             tbp_lv_age_ratio_B = pl.col('tbp_lv_ratio_B').truediv(pl.col('tbp_lv_ratio_B').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_ratio_C = pl.col('tbp_lv_ratio_C').truediv(pl.col('tbp_lv_ratio_C').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_ratio_H = pl.col('tbp_lv_ratio_H').truediv(pl.col('tbp_lv_ratio_H').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_ratio_L = pl.col('tbp_lv_ratio_L').truediv(pl.col('tbp_lv_ratio_L').mean()).over('age_approx').cast(pl.Float32),
#         )
#         .with_columns(
#             # contrast to age average
#             tbp_lv_age_contrast_A = pl.col('tbp_lv_contrast_A').truediv(pl.col('tbp_lv_contrast_A').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_contrast_B = pl.col('tbp_lv_contrast_B').truediv(pl.col('tbp_lv_contrast_B').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_contrast_C = pl.col('tbp_lv_contrast_C').truediv(pl.col('tbp_lv_contrast_C').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_contrast_H = pl.col('tbp_lv_contrast_H').truediv(pl.col('tbp_lv_contrast_H').mean()).over('age_approx').cast(pl.Float32),
#             tbp_lv_age_contrast_L = pl.col('tbp_lv_contrast_L').truediv(pl.col('tbp_lv_contrast_L').mean()).over('age_approx').cast(pl.Float32),
#         )
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('sex')) / (pl.col(col).std().over('sex') + err)).alias(f'{col}_sex_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
        .set_index(id_col)
    )

In [21]:
def preprocess(df_train, df_test):
    """One-hot encode the categorical columns of both frames in place.

    The encoder is fitted on ``df_train`` only and then applied to both
    frames. Each indicator is stored as a ``category``-dtype column named
    ``onehot_<i>``, where ``i`` is the position in the encoder's output.

    Side effects:
        * the module-level ``feature_cols`` list loses the original
          categorical column names and gains the indicator names;
        * the global ``cat_cols`` is rebound to the indicator names.

    Returns:
        The same two (mutated) frames as ``(df_train, df_test)``.
    """
    global cat_cols

    onehot = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    onehot.fit(df_train[cat_cols])

    n_indicators = len(onehot.get_feature_names_out())
    indicator_cols = [f'onehot_{idx}' for idx in range(n_indicators)]

    # Same two-step write for each frame: raw int indicators first, then
    # the cast to 'category'.
    for frame in (df_train, df_test):
        frame[indicator_cols] = onehot.transform(frame[cat_cols])
        frame[indicator_cols] = frame[indicator_cols].astype('category')

    # Swap the raw categorical names for the indicator names in the shared
    # feature list (mutated in place, not rebound).
    for original_col in cat_cols:
        feature_cols.remove(original_col)
    feature_cols.extend(indicator_cols)

    cat_cols = indicator_cols

    return df_train, df_test

## Evaluation

In [22]:
def custom_metric(estimator, X, y_true):
    """Scorer-style wrapper: partial AUC of ``estimator`` on ``(X, y_true)``.

    Uses column 1 of ``estimator.predict_proba(X)`` as the positive-class
    score and delegates to ``cal_cv`` so the metric formula is written in
    exactly one place.

    Args:
        estimator: fitted classifier exposing ``predict_proba``.
        X: feature matrix passed to ``predict_proba``.
        y_true: binary ground-truth labels (0/1).

    Returns:
        The partial AUC as computed by ``cal_cv``.
    """
    return cal_cv(estimator.predict_proba(X)[:, 1], y_true)

def cal_cv(y_pred, y_true, min_tpr=0.80):
    """Partial AUC of the ROC curve above a minimum true-positive rate.

    Args:
        y_pred: positive-class scores, one per sample (any array-like).
        y_true: binary ground-truth labels (0/1), same length as ``y_pred``.
        min_tpr: lower TPR bound of the region that is integrated.
            Defaults to 0.80, the value previously hard-coded here.

    Returns:
        The un-normalised partial AUC as a float.

    Raises:
        Whatever ``roc_auc_score`` raises, e.g. when only one class is
        present in ``y_true``.
    """
    max_fpr = abs(1 - min_tpr)

    # Flip labels and scores: the region "TPR >= min_tpr" of the original
    # problem is the region "FPR <= max_fpr" of the flipped one, which is
    # what roc_auc_score's max_fpr argument can integrate.
    v_gt = np.abs(np.asarray(y_true) - 1)
    # Vectorised instead of a per-element Python loop; float64 keeps the
    # subtraction at full precision whatever dtype the scores arrive in.
    v_pred = 1.0 - np.asarray(y_pred, dtype=np.float64)

    # With max_fpr set, roc_auc_score returns the standardised partial AUC
    # (0.5 = chance, 1.0 = perfect); the line below undoes that scaling.
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

## Read Data

In [23]:
# Both frames are built from train_path; the reads below are disabled.
# df_test = read_data(test_path)
# df_subm = pd.read_csv(subm_path, index_col=id_col)
df_train, df_test = preprocess(
    read_data(train_path),
    read_data(train_path),
)

## ML Models: LightGBM, CatBoost, XGBoost

In [24]:
# Keyword arguments unpacked into lgb.LGBMClassifier below.
# `seed` is defined outside this cell.
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           250,
    'boosting_type':    'gbdt',
    'random_state':     seed,
    'lambda_l1':        0.08758718919397321, 
    'lambda_l2':        0.0039689175176025465, 
    'learning_rate':    0.03231007103195577, 
    'max_depth':        4, 
    'num_leaves':       103, 
    'colsample_bytree': 0.8329551585827726, 
    'colsample_bynode': 0.4025961355653304, 
    'bagging_fraction': 0.7738954452473223, 
    'bagging_freq':     4, 
    'min_data_in_leaf': 85, 
    'scale_pos_weight': 2.7984184778875543,
}

# Three-step pipeline, run in this order: over-sampler, under-sampler,
# LightGBM classifier. `sampling_ratio` is defined outside this cell.
# NOTE(review): Pipeline is presumably imblearn's (samplers as steps) —
# confirm against the import cell.
lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

In [25]:
# Keyword arguments unpacked into cb.CatBoostClassifier below.
# `seed` is defined outside this cell.
cb_params = {
    'loss_function':     'Logloss',
    'iterations':        250,
    'verbose':           False,
    'random_state':      seed,
    'max_depth':         7, 
    'learning_rate':     0.06936242010150652, 
    'scale_pos_weight':  2.6149345838209532, 
    'l2_leaf_reg':       6.216113851699493, 
    'subsample':         0.6249261779711819, 
    'min_data_in_leaf':  24,
    # cat_cols is read when this cell runs; preprocess() has already
    # rebound it to the onehot_* column names by then.
    'cat_features':      cat_cols,
}

# Three-step pipeline, run in this order: over-sampler, under-sampler,
# CatBoost classifier. `sampling_ratio` is defined outside this cell.
# NOTE(review): Pipeline is presumably imblearn's (samplers as steps) —
# confirm against the import cell.
cb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

In [26]:
# Keyword arguments unpacked into xgb.XGBClassifier below.
# `seed` is defined outside this cell.
xgb_params = {
    'enable_categorical': True,
    'tree_method':        'hist',
    'random_state':       seed,
    'learning_rate':      0.08501257473292347, 
    'lambda':             8.879624125465703, 
    'alpha':              0.6779926606782505, 
    'max_depth':          6, 
    'subsample':          0.6012681388711075, 
    'colsample_bytree':   0.8437772277074493, 
    'colsample_bylevel':  0.5476090898823716, 
    'colsample_bynode':   0.9928601203635129, 
    'scale_pos_weight':   3.29440313334688,
}

# Three-step pipeline, run in this order: over-sampler, under-sampler,
# XGBoost classifier. `sampling_ratio` is defined outside this cell.
# NOTE(review): Pipeline is presumably imblearn's (samplers as steps) —
# confirm against the import cell.
xgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

In [27]:
# Soft-voting ensemble over the three boosted-tree pipelines defined above.
estimator = VotingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('cb', cb_model),
        ('xgb', xgb_model),
    ],
    voting='soft',
)

## Cross_Val_Score

In [28]:
# X = df_train[feature_cols]
# y = df_train[target_col]
# groups = df_train[group_col]
# cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

# val_score = cross_val_score(
#     estimator=estimator, 
#     X=X, y=y, 
#     cv=cv, 
#     groups=groups,
#     scoring=custom_metric,
# )

# np.mean(val_score), val_score

# Recorded output of the disabled cross-validation above (mean score, per-fold scores):
# (0.17107799677766736, array([0.16475346, 0.17189171, 0.18386501, 0.16892588, 0.16595392]))

## Make Kfold

In [29]:
# df = pd.read_csv(CONFIG.my_train_csv)
df = train

# Copy the fold assignment and the column named by CONFIG.dl_pred_name onto
# the tabular frame.
# NOTE(review): `.values` assigns by row position, not by isic_id — this
# assumes `train` and `df_train` share the same row order; confirm.
for col_name in ("kfold", CONFIG.dl_pred_name):
    df_train[col_name] = df[col_name].values
df_train

Unnamed: 0_level_0,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,onehot_37,onehot_38,onehot_39,onehot_40,onehot_41,onehot_42,onehot_43,onehot_44,kfold,cv15496
isic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,16.261975,...,0,0,0,0,0,1,0,0,4,0.055244
ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,25.364740,...,0,0,0,0,0,1,0,0,0,0.160246
ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,17.128170,...,0,0,0,0,0,1,0,0,0,0.019354
ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,12.164757,...,0,1,0,0,0,0,0,0,2,0.045941
ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,20.057470,...,0,0,0,0,0,1,0,0,4,0.092378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,14.944666,...,0,0,1,0,0,0,0,0,4,0.615235
ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,16.026870,...,0,0,0,0,0,1,0,0,1,0.102108
ISIC_9999960,0,IP_0076153,65.0,male,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,12.364397,...,0,0,0,0,1,0,0,0,2,0.051599
ISIC_9999964,0,IP_5231513,30.0,male,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,9.564721,...,0,0,0,0,0,0,1,0,0,0.076979


## Kfold Train

In [30]:
# Register the DL prediction column as a model feature. The membership check
# keeps this cell idempotent: re-running it must not append the name again,
# which would duplicate the column when df_train[feature_cols] is selected.
if CONFIG.dl_pred_name not in feature_cols:
    feature_cols.append(CONFIG.dl_pred_name)
len(feature_cols)

275

In [31]:
# Out-of-fold training: for each fold, fit a fresh voting ensemble on the
# other folds, score it on the held-out fold, and keep both the fitted model
# and the held-out predictions.
models = []  # fitted ensembles, index == fold

oof = []     # per-fold predict_proba outputs on the held-out rows
true = []    # per-fold held-out targets, same order as `oof`
# Fold count comes from CONFIG so it agrees with the saving cell, which
# also iterates over CONFIG.n_folds.
for fold in tqdm(range(CONFIG.n_folds)):
    train_ml = df_train[df_train["kfold"] != fold]
    valid_ml = df_train[df_train["kfold"] == fold]

    X = train_ml[feature_cols]
    y = train_ml[target_col]

    X_val = valid_ml[feature_cols]
    y_val = valid_ml[target_col]

    estimator = VotingClassifier([
        ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
    ], voting='soft')

    estimator.fit(X, y)

    models.append(estimator)

    # Predict the held-out fold once and reuse the result for both the fold
    # score and the OOF buffer (custom_metric would predict a second time).
    y_val_pred = estimator.predict_proba(X_val)
    cv = cal_cv(y_val_pred[:, 1], y_val)
    print(cv)

    # Only the loop-local name is dropped; the fitted model stays in `models`.
    del estimator
    gc.collect()

    oof.append(y_val_pred)
    true.append(y_val)

  0%|          | 0/5 [00:00<?, ?it/s]

0.18558028416113975


 20%|██        | 1/5 [00:16<01:05, 16.39s/it]

0.1770393995167252


 40%|████      | 2/5 [00:31<00:47, 15.72s/it]

0.18683538708472128


 60%|██████    | 3/5 [00:46<00:31, 15.55s/it]

0.16430096971359315


 80%|████████  | 4/5 [01:02<00:15, 15.58s/it]

0.17996424230585573


100%|██████████| 5/5 [01:17<00:00, 15.56s/it]


## Calculate CV Score

In [32]:
# Stack the per-fold buffers into single arrays. The type checks make the
# cell safe to re-run: concatenating an already-stacked 2-D array along
# axis 0 would flatten it and break the `oof[:, 1]` column selection below.
if isinstance(oof, list):
    oof = np.concatenate(oof, axis=0)
if isinstance(true, list):
    true = np.concatenate(true, axis=0)

# Column 1 of predict_proba is the positive-class score.
cal_cv(oof[:, 1], true)
# 0.1781605408165887

0.1781605408165887

In [33]:
from joblib import dump

ml_save_path = "/data1/hjs/pythonProject/pythonProject/ISIC2024/ML_output"

# exist_ok=True replaces the separate "exists? then create" check, so the
# directory is created when missing and left alone when already present.
os.makedirs(ml_save_path, exist_ok=True)

for fold in range(CONFIG.n_folds):
    # Save the fitted fold model to disk; the file suffix is the fold index.
    estimator = models[fold]
    dump(estimator, f'{ml_save_path}/voting_classifier_model_{fold}.joblib')
    
# from joblib import load
# models = []
# # Load the model from file
# loaded_estimator = load('/kaggle/working/voting_classifier_model_0.joblib')
# for fold in range(CONFIG.n_folds):
#     # Load the model from file
#     loaded_estimator = load(f'/kaggle/working/voting_classifier_model_{fold}.joblib')
#     models.append(loaded_estimator)