# DL Part

## Import Libs

In [30]:
import os
import random
import gc
import time
import copy
import sys
import numpy as np
import pandas as pd
# import h5py
# from io import BytesIO

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from torch import nn
import timm

from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
from collections import defaultdict

import warnings # Avoid some errors that can be ignored
warnings.filterwarnings('ignore')
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

## CONFIG

In [31]:
# Debug switch; nothing in the visible cells reads it — TODO confirm whether later cells do.
is_debug = False

class CONFIG:
    # Notebook-wide settings for the DL inference part. The visible cells only read
    # these as class attributes (CONFIG.<name>); the class is never instantiated there.

    # Seed passed to set_seed() and reused as `seed` in the ML part.
    seed = 308
    # Candidate seeds; not referenced in the visible cells — presumably for multi-seed runs (TODO confirm).
    n_seed = [7, 42, 308, 7777, 9216]
    # n_seed = [7, 42, 308, 666, 7777, 9216]
    # n_seed = [7, 42, 308, 666, 1414, 7777, 9216, 10267]
    # n_seed = [301, 302, 303, 304, 305, 306, 307, 308, 309, 310]
    
    # Author's timing notes per batch size / backbone:
    # 256 tiny_vit ---------- 3h+
    # 512 efficientnet convnext -------------- 17mins+
    #     efficientvit_b0 --------------- 7mins+
    # Batch size of the DataLoader built in prepare_loaders().
    test_batch_size = 512
    # Passed to A.Resize as (img_size[0], img_size[1]) in transform().
    img_size = [160, 160]
    # Output width of the linear head in ISIC2024Model.
    n_classes = 1
    # Number of folds; fold f (0-based) is matched to checkpoint files starting with str(f + 1).
    n_folds = 5
    
    # Not referenced in the visible cells — presumably a leftover from training (TODO confirm).
    n_accumulate = 1.0
    # DataLoader worker count. NOTE(review): os.cpu_count() may return None on some platforms.
    n_workers = os.cpu_count()

    # When True, models are wrapped in torch.nn.DataParallel on devices [0, 1].
    DataParallel = False
    # Device used by Infer() for the input batches when DataParallel is off.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # The bare string below is a scratch list of timm model names tried by the author;
    # it is an expression statement with no effect.
    """
    tf_efficientnet_b0_ns
    tiny_vit_21m_512.dist_in22k_ft_in1k
    eva02_base_patch14_224.mim_in22k
    convnext_tiny.fb_in22k_ft_in1k_384
    tf_efficientnetv2_s.in21k_ft_in1k
    efficientvit_b0.r224_in1k

    edgenext_base.in21k_ft_in1k
    """
    # Previous (local-machine) ensemble configuration, kept for reference:
    # model_name = "ensemble_linear"
    # model_names = ['tf_efficientnetv2_s.in21k_ft_in1k', 
    #                'edgenext_base.in21k_ft_in1k', 
    #                'efficientvit_b3.r256_in1k',
    #                'tf_efficientnet_b3.ns_jft_in1k']
    # ensemble_path = "/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-09-04_15:09:02_ensemble_linear_output"
    # ckpt_path = ['/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-09-03_12:17:57_tf_efficientnetv2_s.in21k_ft_in1k_output', 
    #              '/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-09-03_20:22:16_edgenext_base.in21k_ft_in1k_output', 
    #              '/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-09-03_23:18:27_efficientvit_b3.r256_in1k_output',
    #              '/data1/hjs/pythonProject/pythonProject/ISIC2024/output/2024-08-29_01:58:41_tf_efficientnet_b3.ns_jft_in1k_output']
    # use_gempool = False
    # dl_pred_name = "cv1606"

    # Label of the blending model; only printed after the ensemble checkpoints are loaded.
    model_name = "ensemble_linear"
    # timm architectures of the base models. model_names[k] is loaded from ckpt_path[k],
    # so both lists must stay in the same order and have the same length.
    model_names = ["tf_efficientnetv2_s.in21k_ft_in1k",
                   "edgenext_base.in21k_ft_in1k",
                   "convnext_atto_ols.a2_in1k",
                   "tf_efficientnet_b3.ns_jft_in1k"]
    # Directory holding the per-fold checkpoints of the `ensemblelinear` blender.
    ensemble_path = "/kaggle/input/20240829-ensemblel-train1954-cv159"
    # Directories holding the per-fold checkpoints of each base model (parallel to model_names).
    ckpt_path = ["/kaggle/input/20240827-efficientnetv2s-train1954-cv15496",
                 "/kaggle/input/20240828-edgenext-train1954-cv0-1519",
                 "/kaggle/input/20240829-convnext-atto-train1954-cv149",
                 "/kaggle/input/20240829-efficientnet-b3-train1954-cv1504"]
    # Replace the efficientnet global pooling with GeMPool when True.
    use_gempool = False
    # Name of the column that receives the DL predictions in the `train` frame.
    dl_pred_name = "cv159"
    
    # Not referenced in the visible cells; currently identical to my_train_csv.
    old_my_train_csv = "/kaggle/input/my-train-with-sgkfold/my_train_with_sgkfold.csv"
    # Training metadata read into `train`; the displayed frame shows a precomputed `kfold` column.
    my_train_csv = "/kaggle/input/my-train-with-sgkfold/my_train_with_sgkfold.csv"
    # Image directory tried first by MyDataset.
    train_img_dir = "/kaggle/input/isic-2024-train-1954-imgs/train_1954_img"
    # Fallback image directory used when the file is missing from train_img_dir.
    img_dir = "/kaggle/input/isic-2024-challenge/train-image/image"
    
    # Not referenced in the visible cells (TODO confirm whether later cells use it).
    train_1954_csv = "/kaggle/input/isic-2024-train-1954/train_1954.csv"

# Choose the CUDA environment flags matching the selected execution mode.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, which is
# normally a debugging aid — confirm it is intended for the single-device path.
if not CONFIG.DataParallel:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    print("NO IN DataParallel!")
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    print("IN DataParallel!")

NO IN DataParallel!


## Set Random Seed

In [32]:
def set_seed(seed=308):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and puts cuDNN into a deterministic,
    non-autotuning configuration.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only interpreters started after this point (e.g. subprocesses) pick this
    # up; the hash seed of the running process was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (safe without a GPU).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs, so switch it off
    # explicitly instead of relying on whatever an earlier cell left behind.
    torch.backends.cudnn.benchmark = False
# Seed all generators once with the notebook-wide seed before any data or model code runs.
set_seed(CONFIG.seed)

## Data Processing

In [33]:
# Training metadata with a precomputed `kfold` column (see the displayed frame below).
# The 'Unnamed: 0' column in the output is the index that was written into the CSV.
train = pd.read_csv(CONFIG.my_train_csv)
train

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,kfold
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,Benign,,,,,,,97.517282,4
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,Benign,,,,,,,3.141455,0
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,Benign,,,,,,,99.804040,0
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,Benign,,,,,,,99.989998,2
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,Benign,,,,,,,70.442510,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,Benign,,,,,,,99.999988,4
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,Benign,,,,,,,99.999820,1
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,Benign,,,,,,,99.999416,2
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,Benign,,,,,,,100.000000,0


## Dataset and DataLoader

In [34]:
# Pipelines already built by transform(), keyed by (CONFIG.img_size[0], CONFIG.img_size[1]).
_TRANSFORM_CACHE = {}


def transform(img):
    """Resize, normalize and tensorize one image for inference.

    The albumentations pipeline is built once per image size and reused, rather
    than being reconstructed for every sample; CONFIG.img_size is still read on
    each call, so changing it later yields a new pipeline.

    Args:
        img: Image as a NumPy array, as accepted by ``A.Compose(...)(image=...)``.

    Returns:
        The "image" entry of the pipeline output (a tensor produced by ToTensorV2).
    """
    size = (CONFIG.img_size[0], CONFIG.img_size[1])
    composition = _TRANSFORM_CACHE.get(size)
    if composition is None:
        composition = A.Compose([
            A.Resize(size[0], size[1]),
            A.Normalize(),  # albumentations' default normalization statistics
            ToTensorV2(),
        ])
        _TRANSFORM_CACHE[size] = composition
    return composition(image=img)["image"]

In [35]:
# class MyDataset(Dataset):
#     def __init__(self, df, img_hdf5, transform=None):
#         super().__init__()
#         self.df = df
#         self.fp_hdf = h5py.File(img_hdf5, mode="r")
#         self.transform = transform

#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         row = self.df.iloc[idx, :]
#         img_id = row.isic_id
#         label = str(img_id)
        
#         img = np.array( Image.open(BytesIO(self.fp_hdf[img_id][()])) )
#         img = np.array(img).astype(np.float32)

#         if self.transform != None:
#             img = self.transform(img)

#         return img, label
    
class MyDataset(Dataset):
    """Map-style dataset yielding ``(image, target)`` pairs for rows of a metadata frame.

    Each row's image is looked up as ``<isic_id>.jpg``, first in
    ``CONFIG.train_img_dir`` and, if it is not there, in ``CONFIG.img_dir``.

    Args:
        df: DataFrame with at least ``isic_id`` and ``target`` columns.
        transform: Optional callable applied to the decoded image array.
    """

    def __init__(self, df, transform=None):
        super().__init__()
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            Tuple ``(img, label)``: the (optionally transformed) image and the
            row's ``target`` as a float32 scalar tensor.
        """
        row = self.df.iloc[idx]
        img_id = row.isic_id + ".jpg"
        label = torch.tensor(row.target, dtype=torch.float32)

        img_path = os.path.join(CONFIG.train_img_dir, img_id)
        if not os.path.exists(img_path):
            img_path = os.path.join(CONFIG.img_dir, img_id)

        # The context manager releases the file handle deterministically (important
        # with many DataLoader workers); convert("RGB") guarantees three channels
        # even for grayscale/RGBA files and is a plain copy for RGB ones.
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert("RGB"))

        if self.transform is not None:
            img = self.transform(img)

        return img, label

In [36]:
def prepare_loaders(df, fold):
    """Create the inference DataLoader for the rows whose ``kfold`` equals ``fold``.

    The loader keeps the frame's row order (``shuffle=False``), which later cells
    rely on when writing predictions back with the same ``kfold`` mask.

    Args:
        df: Metadata frame containing a ``kfold`` column.
        fold: Fold value to select.

    Returns:
        A DataLoader over ``MyDataset`` for the selected rows.
    """
    fold_rows = df.loc[df["kfold"] == fold]
    loader_options = dict(
        batch_size=CONFIG.test_batch_size,
        num_workers=CONFIG.n_workers,
        shuffle=False,
        pin_memory=True,
    )
    return DataLoader(MyDataset(df=fold_rows, transform=transform), **loader_options)

In [37]:
# test_loader = prepare_loaders(test)
# x, y = next(iter(test_loader))
# x.shape

## DL Model

In [38]:
def updata_req_grad(models, requires_grad=True):
    """Set ``requires_grad`` on every parameter of every model in ``models``.

    Args:
        models: Iterable of ``nn.Module`` objects.
        requires_grad: Flag assigned to each parameter (default True).
    """
    every_param = (param for model in models for param in model.parameters())
    for param in every_param:
        param.requires_grad = requires_grad

In [39]:
class GeMPool(nn.Module):
    """Generalized-mean pooling over the last two dimensions with a learnable exponent.

    Args:
        p: Initial value of the exponent, stored as a one-element ``nn.Parameter``.
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1) * p)

    def gem(self, x, p=3, eps=1e-6):
        """Return ``mean(clamp(x, eps) ** p) ** (1 / p)`` reduced over dims (-2, -1)."""
        powered = x.clamp(min=eps).pow(p)
        pooled = powered.mean(dim=(-2, -1))
        return pooled.pow(1. / p)

    def forward(self, x):
        """Pool ``x`` with the module's current exponent and epsilon."""
        return self.gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f'{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})'

In [40]:
class ensemblelinear(nn.Module):
    """Learned blending of base-model predictions.

    A bias-free linear layer maps the ``in_features`` base predictions to
    ``in_features`` scores, a softmax over the feature dimension turns those
    scores into per-sample weights, and the output is the weighted sum of the
    original predictions.

    Args:
        in_features: Number of base predictions per sample.
        out_features: Stored on the instance only; the forward pass always
            returns a single column.
    """

    def __init__(self, in_features, out_features=1) -> None:
        super().__init__()
        self.model = nn.Linear(in_features, in_features, bias=False)
        # Explicit dim: the implicit choice is deprecated and warns on every call.
        # dim=1 is what the implicit rule picked for the 2-D input used here and
        # matches the sum(dim=1) reduction in forward().
        self.softmax = nn.Softmax(dim=1)
        self.out_features = out_features

    def forward(self, x):
        """Blend ``x`` of shape ``(batch, in_features)`` into ``(batch, 1)``."""
        weights = self.softmax(self.model(x))
        return (x * weights).sum(dim=1, keepdim=True)

In [41]:
class ISIC2024Model(nn.Module):
    """timm backbone followed by a fresh linear head with ``CONFIG.n_classes`` outputs.

    The backbone is created without pretrained weights (they come from the
    checkpoints loaded afterwards). For the supported families the backbone's
    own classifier is swapped for ``nn.Identity`` so it emits features.

    Args:
        model_name: timm model name; the family is detected by substring.

    Raises:
        ValueError: If ``model_name`` matches none of the supported families.
    """

    def __init__(self, model_name=CONFIG.model_names[0]):
        super(ISIC2024Model, self).__init__()
        self.backbone = timm.create_model(model_name=model_name,
                                          pretrained=False)

        if "efficientnet" in model_name:
            in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            if CONFIG.use_gempool:
                self.backbone.global_pool = GeMPool()
        elif "convnext" in model_name or "tiny_vit" in model_name:
            in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        elif "eva" in model_name:
            # NOTE(review): unlike the other branches the backbone's own classifier
            # is left in place here — confirm the backbone really outputs 768
            # features before relying on an eva model.
            in_features = 768
        elif "efficientvit" in model_name:
            in_features = self.backbone.head.classifier[4].in_features
            self.backbone.head.classifier[4] = nn.Identity()
        elif "edgenext" in model_name:
            in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        else:
            # Previously this fell through and failed later on an unbound
            # `in_features`; fail here with an actionable message instead.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected a name containing one of "
                "'efficientnet', 'convnext', 'tiny_vit', 'eva', 'efficientvit', 'edgenext'."
            )

        self.head = nn.Sequential(
            nn.Linear(in_features, CONFIG.n_classes)
        )

    def forward(self, x):
        """Run the backbone on ``x`` and map its features through the linear head."""
        features = self.backbone(x)
        return self.head(features)

## Load Models

In [42]:
from collections import OrderedDict

def remove_module(path):
    """Load a checkpoint and strip the leading ``module.`` that DataParallel adds to keys.

    Only a leading ``module.`` is removed. The previous ``str.replace`` also
    rewrote keys that merely contain that text (for example ``...submodule.weight``),
    which silently corrupted them.

    Args:
        path: Path of a state dict saved with ``torch.save``.

    Returns:
        OrderedDict with the same tensors and order, keyed without the prefix.
        Tensors are loaded onto the CPU; ``load_state_dict`` copies them to the
        model's device, so this also works on machines without the GPU the
        checkpoint was saved from.
    """
    # NOTE: torch.load unpickles the file — only use it on trusted checkpoints.
    state_dict = torch.load(path, map_location="cpu")

    prefix = 'module.'
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        if key.startswith(prefix):
            key = key[len(prefix):]
        new_state_dict[key] = value
    return new_state_dict

In [43]:
# Load DL Models
# Load DL Models
# more_models[m][f] is the fold-f network of CONFIG.model_names[m], loaded from
# CONFIG.ckpt_path[m] and switched to eval mode.
more_models = []

for n_models in range(len(CONFIG.model_names)):
    models = []
    ckpt_dir = CONFIG.ckpt_path[n_models]
    all_paths = sorted(os.listdir(ckpt_dir))
    paths = []
    for i in range(CONFIG.n_folds):
        # Checkpoint files start with the 1-based fold number; when several match,
        # the lexicographically last one is used.
        # NOTE(review): only the first character is compared, so this supports
        # single-digit fold numbers only.
        _tmp_paths = [path for path in all_paths if path[0] == str(i+1)]
        if not _tmp_paths:
            raise FileNotFoundError(
                f"No checkpoint file for fold {i + 1} found in {ckpt_dir}"
            )
        paths.append(_tmp_paths[-1])
    # paths = ["1_CV0.1547_Loss0.6784_epoch2.bin",
    #          "2_CV0.1708_Loss0.5088_epoch5.bin",
    #          "3_CV0.1791_Loss0.3739_epoch7.bin",
    #          "4_CV0.1654_Loss0.5382_epoch3.bin",
    #          "5_CV0.1812_Loss0.4236_epoch8.bin"]

    for i in range(CONFIG.n_folds):
        model = ISIC2024Model(model_name=CONFIG.model_names[n_models])
        ckpt_file = os.path.join(ckpt_dir, paths[i])
        if CONFIG.DataParallel:
            # The wrapped model expects the 'module.'-prefixed keys as saved.
            device_ids = [0, 1]
            model = torch.nn.DataParallel(model, device_ids=device_ids)
            model = model.cuda()
            model.load_state_dict(torch.load(ckpt_file))
        else:
            # Use the same device Infer() sends the batches to (cuda:0 when
            # available, otherwise CPU) instead of an unconditional .cuda().
            model = model.to(CONFIG.device)
            model.load_state_dict(remove_module(ckpt_file))
        print(f"fold : {i} --------- path : {paths[i]}")
        model.eval()
        models.append(model)
    more_models.append(models)
    print(f"{CONFIG.model_names[n_models]} load success.")

fold : 0 --------- path : 1_CV_0.1514_Loss0.2055_epoch15.bin
fold : 1 --------- path : 2_CV_0.1445_Loss0.2757_epoch17.bin
fold : 2 --------- path : 3_CV_0.1704_Loss0.1593_epoch25.bin
fold : 3 --------- path : 4_CV_0.1492_Loss0.2547_epoch20.bin
fold : 4 --------- path : 5_CV_0.1627_Loss0.2155_epoch19.bin
tf_efficientnetv2_s.in21k_ft_in1k load success.
fold : 0 --------- path : 1_CV_0.1485_Loss0.2624_epoch25.bin
fold : 1 --------- path : 2_CV_0.1441_Loss0.3111_epoch23.bin
fold : 2 --------- path : 3_CV_0.1690_Loss0.2146_epoch27.bin
fold : 3 --------- path : 4_CV_0.1470_Loss0.2021_epoch8.bin
fold : 4 --------- path : 5_CV_0.1628_Loss0.1775_epoch28.bin
edgenext_base.in21k_ft_in1k load success.
fold : 0 --------- path : 1_CV_0.1443_Loss0.2453_epoch30.bin
fold : 1 --------- path : 2_CV_0.1450_Loss0.3462_epoch19.bin
fold : 2 --------- path : 3_CV_0.1583_Loss0.2147_epoch28.bin
fold : 3 --------- path : 4_CV_0.1528_Loss0.2709_epoch23.bin
fold : 4 --------- path : 5_CV_0.1552_Loss0.2250_epoch19.

In [45]:
# Load ensemble Models
# Load ensemble Models
# ensemble_models[f] is the fold-f `ensemblelinear` blender, in eval mode.
ensemble_models = []

all_paths = sorted(os.listdir(CONFIG.ensemble_path))
paths = []
for i in range(CONFIG.n_folds):
    # Checkpoint files start with the 1-based fold number; when several match,
    # the lexicographically last one is used.
    # NOTE(review): only the first character is compared, so this supports
    # single-digit fold numbers only.
    _tmp_paths = [path for path in all_paths if path[0] == str(i+1)]
    if not _tmp_paths:
        raise FileNotFoundError(
            f"No checkpoint file for fold {i + 1} found in {CONFIG.ensemble_path}"
        )
    paths.append(_tmp_paths[-1])

for i in range(CONFIG.n_folds):
    model = ensemblelinear(in_features=len(CONFIG.model_names))
    ckpt_file = os.path.join(CONFIG.ensemble_path, paths[i])
    if CONFIG.DataParallel:
        # The wrapped model expects the 'module.'-prefixed keys as saved.
        device_ids = [0, 1]
        model = torch.nn.DataParallel(model, device_ids=device_ids)
        model = model.cuda()
        model.load_state_dict(torch.load(ckpt_file))
    else:
        # Use the same device Infer() sends the batches to (cuda:0 when
        # available, otherwise CPU) instead of an unconditional .cuda().
        model = model.to(CONFIG.device)
        model.load_state_dict(remove_module(ckpt_file))
    print(f"fold : {i} --------- path : {paths[i]}")
    model.eval()
    ensemble_models.append(model)
print(f"{CONFIG.model_name} load success.")

fold : 0 --------- path : 1_CV_0.1538_Loss0.2314_epoch8.bin
fold : 1 --------- path : 2_CV_0.1509_Loss0.2566_epoch1.bin
fold : 2 --------- path : 3_CV_0.1719_Loss0.1896_epoch1.bin
fold : 3 --------- path : 4_CV_0.1554_Loss0.2305_epoch1.bin
fold : 4 --------- path : 5_CV_0.1654_Loss0.1897_epoch2.bin
ensemble_linear load success.


## Infer Function

In [46]:
def Infer(ensemble_model, dl_models, test_loader):
    """Run the base models and the blender over a loader and collect the predictions.

    For each batch, every model in ``dl_models`` is applied to the images, its
    output is passed through a sigmoid, the results are concatenated along
    dim 1 and fed to ``ensemble_model``. Gradients are disabled throughout.

    Args:
        ensemble_model: Module combining the concatenated base predictions.
        dl_models: Base models, in the column order the blender expects.
        test_loader: Loader yielding ``(images, labels)``; the labels are ignored.

    Returns:
        1-D NumPy array with the flattened blender outputs of all batches,
        in loader order.
    """
    batch_preds = []
    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    with torch.no_grad():
        for _step, (images, _labels) in progress:
            if not CONFIG.DataParallel:
                images = images.to(CONFIG.device, dtype=torch.float)
            else:
                images = images.cuda().float()

            member_probs = [F.sigmoid(member(images)) for member in dl_models]
            blended = ensemble_model(torch.cat(member_probs, axis=1))
            batch_preds.append(blended.detach().flatten().cpu().numpy())

    return np.concatenate(batch_preds)

## Start Infer

In [47]:
# train preds
# train preds
# total_train_pred[fold] holds the predictions for the rows with kfold == fold,
# produced by the models trained for that fold index (presumably the held-out
# rows of each fold — confirm against the training notebook).
total_train_pred = []

for fold in range(CONFIG.n_folds):
    train_loader = prepare_loaders(train, fold)

    # One base model per architecture, all taken from the same fold.
    dl_model = [dl_model_per[fold] for dl_model_per in more_models]
    preds = Infer(ensemble_models[fold], dl_model, train_loader)
    total_train_pred.append(preds)

100%|██████████| 148/148 [03:50<00:00,  1.56s/it]
100%|██████████| 174/174 [04:27<00:00,  1.54s/it]
100%|██████████| 160/160 [04:07<00:00,  1.55s/it]
100%|██████████| 132/132 [03:24<00:00,  1.55s/it]
100%|██████████| 172/172 [04:25<00:00,  1.54s/it]


In [48]:
# Write the per-fold predictions back into `train` under CONFIG.dl_pred_name.
# The sentinel is a float so the column is float from the start: filling an
# integer column with float predictions relies on silent upcasting, which
# pandas has deprecated. Rows left at -1.0 were not covered by any fold.
train[CONFIG.dl_pred_name] = -1.0

for fold in range(CONFIG.n_folds):
    # Same mask and row order as prepare_loaders (shuffle=False), so the
    # predictions line up positionally with the selected rows.
    train.loc[train["kfold"] == fold, CONFIG.dl_pred_name] = total_train_pred[fold]
train

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,kfold,cv159
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,,,,,,,97.517282,4,0.139400
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,,,,,,,3.141455,0,0.315115
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,,,,,,,99.804040,0,0.060258
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,,,,,,,99.989998,2,0.063040
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,,,,,,,70.442510,4,0.093386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,,,,,,,99.999988,4,0.335972
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,,,,,,,99.999820,1,0.071880
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,,,,,,,99.999416,2,0.058406
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,,,,,,,100.000000,0,0.074318


# ML Part

## Import libs

In [49]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

from joblib import dump
import optuna
import random
from tqdm import tqdm
import gc
import warnings # 避免一些可以忽略的报错
warnings.filterwarnings('ignore')

## CONFIG

In [55]:
# Competition data directory and the metadata files read by the ML part.
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
# /kaggle/input/isic2024-my-train-csv/my_train.csv
test_path = root / 'test-metadata.csv'
# subm_path = root / 'sample_submission.csv'

# Names of the id, label and grouping columns in the metadata.
id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

# Small constant added to a denominator in the engineered ratio features
# (see color_uniformity in read_data).
err = 1e-5
# Not used in the visible cells — presumably the ratio for the imblearn samplers (TODO confirm).
sampling_ratio = 0.01
# Reuse the DL part's seed, so this cell requires CONFIG to be defined already.
seed = CONFIG.seed

# Raw numeric metadata columns used as features. The trailing comments carry the
# authors' field descriptions; the '+' / '++' marks are kept as found.
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    # (no description given here; presumably the un-normalized counterpart of tbp_lv_deltaLBnorm — TODO confirm)
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Engineered numeric features. Each trailing comment is the authors' shorthand for the
# formula; the first entries are computed in read_data(), the rest presumably further
# down in the same function (not all of it is visible here — verify against the code).
new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                      # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',                # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',           # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',                 # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean   / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',              # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',              # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',         # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',             # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',            # border_complexity       + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',            # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',               # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',                  # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',       # tbp_lv_color_std_mean   + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',             # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',          # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',        # (formula not noted here — TODO document from read_data)
    'size_color_contrast_ratio',         # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',  # (formula not noted here — TODO document from read_data)
    'color_asymmetry_index',             # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',           # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',           # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
    
    # Disabled feature candidates, kept for reference:
#     'size_age_interaction2',             # tbp_lv_perimeterMM      * age_approx             / (sum)
#     'hue_color_std_interaction2',        # tbp_lv_H                * tbp_lv_color_std_mean  / (sum)
#     'symmetry_perim_interaction2',       # tbp_lv_symm_2axis       * clin_size_long_diam_mm / (sum)
    
#     'tbp_lv_ratio_A',
#     'tbp_lv_ratio_B',
#     'tbp_lv_ratio_C',
#     'tbp_lv_ratio_H',
#     'tbp_lv_ratio_L',
    
#     'tbp_lv_contrast_A',
#     'tbp_lv_contrast_B',
#     'tbp_lv_contrast_C',
#     'tbp_lv_contrast_H',
#     'tbp_lv_contrast_L',
    
#     'tbp_lv_patient_ratio_A',
#     'tbp_lv_patient_ratio_B',
#     'tbp_lv_patient_ratio_C',
#     'tbp_lv_patient_ratio_H',
#     'tbp_lv_patient_ratio_L',
    
#     'tbp_lv_patient_contrast_A',
#     'tbp_lv_patient_contrast_B',
#     'tbp_lv_patient_contrast_C',
#     'tbp_lv_patient_contrast_H',
#     'tbp_lv_patient_contrast_L',
    
#     'tbp_lv_age_ratio_A',
#     'tbp_lv_age_ratio_B',
#     'tbp_lv_age_ratio_C',
#     'tbp_lv_age_ratio_H',
#     'tbp_lv_age_ratio_L',
    
#     'tbp_lv_age_contrast_A',
#     'tbp_lv_age_contrast_B',
#     'tbp_lv_age_contrast_C',
#     'tbp_lv_age_contrast_H',
#     'tbp_lv_age_contrast_L',
]

# Categorical metadata columns.
cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']


def _affixed_num_cols(prefix='', suffix=''):
    """Return every raw and engineered numeric column name wrapped in prefix/suffix."""
    return [f'{prefix}{col}{suffix}' for col in num_cols + new_num_cols]


# Names of the normalized variants of each numeric column.
norm_cols = _affixed_num_cols(suffix='_patient_norm')
# norm_cols2 = [f'{col}_sex_norm' for col in num_cols + new_num_cols]
loc_norm_cols = _affixed_num_cols(suffix='_patient_location_norm')
anatom_norm_cols = _affixed_num_cols(suffix='_patient_anatom_norm')
type_norm_cols = _affixed_num_cols(suffix='_patient_type_norm')

# Names of the aggregate variants of each numeric column.
max_col = _affixed_num_cols(prefix='max_')
min_col = _affixed_num_cols(prefix='min_')
std_col = _affixed_num_cols(prefix='std_')
sum_col = _affixed_num_cols(prefix='sum_')

special_cols = ['count_per_patient']

# Final feature list; the anatom/type norms and the max/min/std/sum names are defined
# above but not included here.
feature_cols = (
    num_cols
    + new_num_cols
    + cat_cols
    + norm_cols
    + special_cols
    + loc_norm_cols
)

## Set Seed

In [51]:
def set_seed(seed=308):
    """Seed every random number generator this notebook relies on.

    NOTE: this re-defines (and shadows) the identically named helper from the
    DL part; it still depends on ``torch`` imported there.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and puts cuDNN into a deterministic,
    non-autotuning configuration.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only interpreters started after this point (e.g. subprocesses) pick this
    # up; the hash seed of the running process was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (safe without a GPU).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs, so switch it off
    # explicitly instead of relying on whatever an earlier cell left behind.
    torch.backends.cudnn.benchmark = False
# Re-seed before the ML part; `seed` mirrors CONFIG.seed.
set_seed(seed)

## Feature Engineering

In [52]:
def read_data(path):
    """Load one metadata CSV and return the engineered feature table.

    Steps, in order:
      1. Read ``path`` with pandas and replace missing ``sex`` with 'Unknown'.
      2. Convert to polars and coerce ``age_approx`` to Float64 (via a string
         cast in which the literal 'NA' is replaced).
      3. ``fill_nan`` every Float64 column with that column's median, computed
         on this same file.
      4. Add hand-crafted lesion features (ratios, contrasts, interactions).
      5. For every column in ``num_cols + new_num_cols`` add
         ``(value - group mean) / (group std + err)`` per ``patient_id`` and
         per ``(patient_id, tbp_lv_location)``.
      6. Add ``count_per_patient`` and cast ``cat_cols`` to Categorical.

    Relies on module-level names defined outside this function: ``pl``,
    ``err``, ``num_cols``, ``new_num_cols``, ``cat_cols`` and ``id_col``.

    Args:
        path: CSV location, passed straight to ``pd.read_csv``.

    Returns:
        A pandas DataFrame indexed by ``id_col``.
    """
    # df = pd.read_csv(path)
    df = pd.read_csv(path, low_memory=False)
    
    df['sex'] = df['sex'].fillna('Unknown')
    # df['sex'] = df['sex'].apply(lambda x: 'male' if x!='male' or x!='female' else x)
    
    return (
        pl.from_pandas(df)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        # NOTE(review): fill_nan only touches NaN, not null. Whether missing
        # values reach this point as NaN or as null depends on how
        # pl.from_pandas converted them -- confirm the imputation really fires.
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        # NOTE(review): most ratios below divide without adding `err`
        # (only color_uniformity does), so a zero denominator is not guarded.
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        # This group reads border_complexity and lesion_shape_index, which were
        # created by the first feature group above, hence the separate call.
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            # Despite the name, this is the mean of the two hue values, not a difference.
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # Despite the name, this is sqrt(size^2 + age^2); nevi confidence is not involved.
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # Per-patient standardisation: (value - patient mean) / (patient std + err).
        .with_columns(
            # ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols)
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        # Same standardisation within each (patient, body location) group.
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over(['patient_id', 'tbp_lv_location'])) / (pl.col(col).std().over(['patient_id', 'tbp_lv_location']) + err)).alias(f'{col}_patient_location_norm') for col in (num_cols + new_num_cols)
        )
        # .with_columns(
        #     ((pl.col(col) - pl.col(col).mean().over(['patient_id', 'anatom_site_general'])) / (pl.col(col).std().over(['patient_id', 'anatom_site_general']) + err)).alias(f'{col}_patient_anatom_norm') for col in (num_cols + new_num_cols)
        # )
        # .with_columns(
        #     ((pl.col(col) - pl.col(col).mean().over(['patient_id', 'tbp_tile_type'])) / (pl.col(col).std().over(['patient_id', 'tbp_tile_type']) + err)).alias(f'{col}_patient_type_norm') for col in (num_cols + new_num_cols)
        # )
        # .with_columns(
        #     ((pl.col(col) - pl.col(col).mean().over('sex')) / (pl.col(col).std().over('sex') + err)).alias(f'{col}_sex_norm') for col in (num_cols + new_num_cols)
        # )
        # Count of isic_id values per patient within this file.
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        # NOTE(review): the disabled min_/std_/sum_ blocks below all call
        # .max(); fix the aggregation before re-enabling any of them.
        # .with_columns(
        #     pl.col(col).max().over(['patient_id', 'tbp_lv_location']).alias(f'max_{col}') for col in (num_cols + new_num_cols)
        # )
        # .with_columns(
        #     pl.col(col).max().over(['patient_id', 'tbp_lv_location']).alias(f'min_{col}') for col in (num_cols + new_num_cols)
        # )
        # .with_columns(
        #     pl.col(col).max().over(['patient_id', 'tbp_lv_location']).alias(f'std_{col}') for col in (num_cols + new_num_cols)
        # )
        # .with_columns(
        #     pl.col(col).max().over(['patient_id', 'tbp_lv_location']).alias(f'sum_{col}') for col in (num_cols + new_num_cols)
        # )
        
        .to_pandas()
        .set_index(id_col)
    )

In [53]:
def preprocess(df_train, df_test):
    """One-hot encode the categorical columns of both frames.

    The encoder is fitted on ``df_train`` only; categories that appear only in
    ``df_test`` are ignored (all-zero row). The indicator columns are named
    ``onehot_0 .. onehot_{k-1}`` and stored with ``category`` dtype.

    Side effects:
        * both frames are modified in place (and also returned);
        * the module-level ``feature_cols`` list loses the raw categorical
          names and gains the ``onehot_*`` names;
        * the module-level ``cat_cols`` is rebound to the ``onehot_*`` names.

    Args:
        df_train: Training frame containing the columns listed in ``cat_cols``.
        df_test: Test frame containing the same columns.

    Returns:
        Tuple ``(df_train, df_test)``.
    """
    global cat_cols

    raw_cat_cols = cat_cols
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[raw_cat_cols])

    n_indicators = len(encoder.get_feature_names_out())
    onehot_cols = [f'onehot_{idx}' for idx in range(n_indicators)]

    # Train first, then test: encode, then convert the indicators to category.
    for frame in (df_train, df_test):
        frame[onehot_cols] = encoder.transform(frame[raw_cat_cols])
        frame[onehot_cols] = frame[onehot_cols].astype('category')

    # Swap the raw categorical names for the indicator names in the feature list.
    for raw_col in raw_cat_cols:
        feature_cols.remove(raw_col)
    feature_cols.extend(onehot_cols)

    cat_cols = onehot_cols

    return df_train, df_test

## Evaluation

In [54]:
def custom_metric(estimator, X, y_true):
    """Scorer-style wrapper around `cal_cv`: ``scorer(estimator, X, y)``.

    Args:
        estimator: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y_true: Binary ground-truth labels (0/1) for ``X``.

    Returns:
        The partial AUC of the positive-class probabilities (see `cal_cv`).
    """
    # Column 1 of predict_proba is the probability of the positive class.
    return cal_cv(estimator.predict_proba(X)[:, 1], y_true)


def cal_cv(y_pred, y_true, min_tpr=0.80):
    """Partial area under the ROC curve above a minimum true-positive rate.

    Args:
        y_pred: 1-D array-like of positive-class scores.
        y_true: 1-D array-like of binary labels (0/1), same length as ``y_pred``.
        min_tpr: Lower TPR bound of the region that is scored. Defaults to 0.80.

    Returns:
        The un-normalised partial AUC; its maximum value is ``1 - min_tpr``.
    """
    max_fpr = abs(1 - min_tpr)

    # Flip labels and scores: "TPR >= min_tpr" on the original problem becomes
    # "FPR <= max_fpr" on the flipped one, which roc_auc_score can restrict to.
    v_gt = np.abs(np.asarray(y_true) - 1)
    v_pred = 1.0 - np.asarray(y_pred, dtype=np.float64)

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # With max_fpr set, roc_auc_score returns a standardised value in [0.5, 1];
    # map it back to the raw area under the restricted part of the curve.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

## Read Data

In [56]:
# Build the engineered feature tables; each file is processed independently,
# so medians and per-patient statistics come from that file alone.
df_train = read_data(train_path)
df_test = read_data(test_path)
# df_test = read_data(train_path)
# df_subm = pd.read_csv(subm_path, index_col=id_col)

# One-hot encode the categorical columns (encoder fitted on train only).
# NOTE: preprocess() rewrites the module-level `cat_cols` / `feature_cols`,
# so re-run the cell that defines those lists before re-running this cell.
df_train, df_test = preprocess(df_train, df_test)

## ML-Model lgb-xgb-catgb

In [57]:
# Hyper-parameters for lgb.LGBMClassifier.
lgb_params = dict(
    # task / bookkeeping
    objective='binary',
    verbosity=-1,
    n_iter=250,
    boosting_type='gbdt',
    random_state=None,  # placeholder; assigned per seed before the model is built
    # regularisation and learning rate
    lambda_l1=0.08758718919397321,
    lambda_l2=0.0039689175176025465,
    learning_rate=0.03231007103195577,
    # tree shape
    max_depth=4,
    num_leaves=103,
    # column / row sub-sampling
    colsample_bytree=0.8329551585827726,
    colsample_bynode=0.4025961355653304,
    bagging_fraction=0.7738954452473223,
    bagging_freq=4,
    min_data_in_leaf=85,
    # class imbalance
    scale_pos_weight=2.7984184778875543,
)

# lgb_model = Pipeline([
#     ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
#     ('classifier', lgb.LGBMClassifier(**lgb_params)),
# ])

In [58]:
# Hyper-parameters for cb.CatBoostClassifier.
cb_params = dict(
    # task / bookkeeping
    loss_function='Logloss',
    iterations=250,
    verbose=False,
    random_state=None,  # placeholder; assigned per seed before the model is built
    # tree shape, learning rate and regularisation
    max_depth=7,
    learning_rate=0.06936242010150652,
    scale_pos_weight=2.6149345838209532,
    l2_leaf_reg=6.216113851699493,
    subsample=0.6249261779711819,
    min_data_in_leaf=24,
    # categorical handling: uses whatever `cat_cols` holds when this cell runs
    cat_features=cat_cols,
)

# cb_model = Pipeline([
#     ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
#     ('classifier', cb.CatBoostClassifier(**cb_params)),
# ])

In [59]:
# Hyper-parameters for xgb.XGBClassifier, assembled in the original key order.
xgb_params = dict(
    enable_categorical=True,
    tree_method='hist',
    random_state=None,  # placeholder; assigned per seed before the model is built
    learning_rate=0.08501257473292347,
)
# 'lambda' is a Python keyword, so the L2/L1 terms are set by subscript.
xgb_params['lambda'] = 8.879624125465703
xgb_params['alpha'] = 0.6779926606782505
xgb_params.update(
    max_depth=6,
    subsample=0.6012681388711075,
    colsample_bytree=0.8437772277074493,
    colsample_bylevel=0.5476090898823716,
    colsample_bynode=0.9928601203635129,
    scale_pos_weight=3.29440313334688,
)

# xgb_model = Pipeline([
#     ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
#     ('classifier', xgb.XGBClassifier(**xgb_params)),
# ])

In [60]:
# estimator = VotingClassifier([
#     ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
# ], voting='soft')

## Cross_Val_Score

In [61]:
# X = df_train[feature_cols]
# y = df_train[target_col]
# groups = df_train[group_col]
# cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

# val_score = cross_val_score(
#     estimator=estimator, 
#     X=X, y=y, 
#     cv=cv, 
#     groups=groups,
#     scoring=custom_metric,
# )

# np.mean(val_score), val_score

# (0.17107799677766736, array([0.16475346, 0.17189171, 0.18386501, 0.16892588, 0.16595392]))

## Make Kfold

In [62]:
# Copy the fold assignment and the prediction column named by
# CONFIG.dl_pred_name from `train` (defined outside this cell) onto df_train.
# df = pd.read_csv(CONFIG.my_train_csv)
df = train

# `.values` drops the index, so both assignments are purely positional.
# NOTE(review): assumes `train` has exactly the same row order as df_train
# (which is indexed by isic_id) -- TODO confirm, a mismatch would silently
# attach folds/predictions to the wrong lesions.
df_train["kfold"] = df["kfold"].values
df_train[CONFIG.dl_pred_name] = df[CONFIG.dl_pred_name].values
df_train

Unnamed: 0_level_0,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,onehot_39,onehot_40,onehot_41,onehot_42,onehot_43,onehot_44,onehot_45,onehot_46,kfold,cv159
isic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,16.261975,...,0,0,0,0,0,1,0,0,4,0.139400
ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,25.364740,...,0,0,0,0,0,1,0,0,0,0.315115
ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,17.128170,...,0,0,0,0,0,1,0,0,0,0.060258
ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,12.164757,...,0,1,0,0,0,0,0,0,2,0.063040
ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,20.057470,...,0,0,0,0,0,1,0,0,4,0.093386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,14.944666,...,0,0,1,0,0,0,0,0,4,0.335972
ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,16.026870,...,0,0,0,0,0,1,0,0,1,0.071880
ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,12.364397,...,0,0,0,0,1,0,0,0,2,0.058406
ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,9.564721,...,0,0,0,0,0,0,1,0,0,0.074318


## Kfold Train

In [63]:
# Expose the column named by CONFIG.dl_pred_name (presumably the image model's
# prediction -- confirm) to the tree models as one more input feature.
# Guarded so that re-running this cell cannot add the same column twice.
if CONFIG.dl_pred_name not in feature_cols:
    feature_cols.append(CONFIG.dl_pred_name)
len(feature_cols)

277

In [64]:
def _make_sampled_pipeline(classifier, sampler_seed):
    """Wrap `classifier` in the over-/under-sampling steps shared by all models."""
    return Pipeline([
        ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=sampler_seed)),
        ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=sampler_seed)),
        ('classifier', classifier),
    ])


# Create the output directory up front so that a bad path fails before the
# (long) training starts rather than after it.
ml_save_path = "/data1/hjs/pythonProject/pythonProject/ISIC2024/ML_output"
os.makedirs(ml_save_path, exist_ok=True)

# One out-of-fold probability vector per seed, all in the same row order
# (validation rows of fold 0, then fold 1, ...).
n_seed_oof = []

for now_seed in CONFIG.n_seed:
    # Only the classifiers' seed varies between runs; the samplers keep a
    # fixed seed so every run is trained on the same resampled rows.
    lgb_params['random_state'] = now_seed
    cb_params['random_state'] = now_seed
    xgb_params['random_state'] = now_seed

    sampler_seed = 308
    lgb_model = _make_sampled_pipeline(lgb.LGBMClassifier(**lgb_params), sampler_seed)
    cb_model = _make_sampled_pipeline(cb.CatBoostClassifier(**cb_params), sampler_seed)
    xgb_model = _make_sampled_pipeline(xgb.XGBClassifier(**xgb_params), sampler_seed)

    models = []
    oof = []
    true = []
    for fold in tqdm(range(CONFIG.n_folds)):
        train_ml = df_train[df_train["kfold"] != fold]
        valid_ml = df_train[df_train["kfold"] == fold]

        X = train_ml[feature_cols]
        y = train_ml[target_col]

        X_val = valid_ml[feature_cols]
        y_val = valid_ml[target_col]

        # A fresh soft-voting ensemble per fold.
        estimator = VotingClassifier([
            ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
        ], voting='soft')

        estimator.fit(X, y)
        models.append(estimator)

        # Predict once and reuse the result for both the fold score and the OOF.
        y_val_pred = estimator.predict_proba(X_val)
        cv = cal_cv(y_val_pred[:, 1], y_val)
        print(cv)

        # Drops only the loop-local name; the fitted model stays in `models`.
        del estimator
        gc.collect()

        oof.append(y_val_pred)
        true.append(y_val)

    oof = np.concatenate(oof, axis=0)
    true = np.concatenate(true, axis=0)

    cv = cal_cv(oof[:, 1], true)
    print(f"seed : {now_seed} ----> cv : {cv}")

    n_seed_oof.append(oof[:, 1])

    # Persist one fitted ensemble per fold for this seed.
    for fold, fitted_model in enumerate(models):
        dump(fitted_model, f'{ml_save_path}/seed_{now_seed}_voting_classifier_model_{fold}.joblib')

    print(f"#################### seed : {now_seed} complete! ####################")
    print()

    del lgb_model, cb_model, xgb_model
    gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]

0.18308693334278464


 20%|██        | 1/5 [01:07<04:28, 67.25s/it]

0.18275539697410645


 40%|████      | 2/5 [02:13<03:19, 66.53s/it]

0.18927393649398655


 60%|██████    | 3/5 [03:20<02:14, 67.02s/it]

0.17263528551252216


 80%|████████  | 4/5 [04:30<01:07, 67.87s/it]

0.18256562811998087


100%|██████████| 5/5 [05:35<00:00, 67.17s/it]


seed : 7 ----> cv : 0.18182360847560308
#################### seed : 7 complete! ####################



  0%|          | 0/5 [00:00<?, ?it/s]

0.18259668079940805


 20%|██        | 1/5 [01:06<04:26, 66.58s/it]

0.18104960542897827


 40%|████      | 2/5 [02:10<03:15, 65.29s/it]

0.18967581540525127


 60%|██████    | 3/5 [03:18<02:12, 66.10s/it]

0.17461950511002408


 80%|████████  | 4/5 [04:28<01:07, 67.71s/it]

0.1828817500108521


100%|██████████| 5/5 [05:36<00:00, 67.21s/it]


seed : 42 ----> cv : 0.18201336377984087
#################### seed : 42 complete! ####################



  0%|          | 0/5 [00:00<?, ?it/s]

0.18253229338803298


 20%|██        | 1/5 [01:08<04:32, 68.17s/it]

0.1799254490003388


 40%|████      | 2/5 [02:13<03:20, 66.71s/it]

0.1893806736716365


 60%|██████    | 3/5 [03:20<02:13, 66.82s/it]

0.17294706796175682


 80%|████████  | 4/5 [04:30<01:07, 67.89s/it]

0.18377748296219118


100%|██████████| 5/5 [05:39<00:00, 67.81s/it]


seed : 308 ----> cv : 0.18164954841283407
#################### seed : 308 complete! ####################



  0%|          | 0/5 [00:00<?, ?it/s]

0.1830740853960195


 20%|██        | 1/5 [01:07<04:31, 68.00s/it]

0.18160282434446723


 40%|████      | 2/5 [02:13<03:20, 66.74s/it]

0.19078646768332386


 60%|██████    | 3/5 [03:20<02:13, 66.83s/it]

0.17440420479234683


 80%|████████  | 4/5 [04:30<01:07, 67.79s/it]

0.1832875917003081


100%|██████████| 5/5 [05:37<00:00, 67.57s/it]


seed : 7777 ----> cv : 0.18250045988950023
#################### seed : 7777 complete! ####################



  0%|          | 0/5 [00:00<?, ?it/s]

0.18405806090517474


 20%|██        | 1/5 [01:08<04:34, 68.68s/it]

0.1817780834235874


 40%|████      | 2/5 [02:15<03:22, 67.65s/it]

0.18983108771441443


 60%|██████    | 3/5 [03:23<02:15, 67.87s/it]

0.17140347415874874


 80%|████████  | 4/5 [04:32<01:08, 68.09s/it]

0.18329201393410596


100%|██████████| 5/5 [05:38<00:00, 67.69s/it]


seed : 9216 ----> cv : 0.18209888296800075
#################### seed : 9216 complete! ####################



## Cal CV

In [None]:
# (0.18143600910284405 + 0.17887079377102694 + 0.1897565602149504 + 0.171365298352746 + 0.1823258236749576) / 5

In [65]:
# oof = np.concatenate(oof, axis=0)
# true = np.concatenate(true, axis=0)

# cal_cv(oof[:, 1], true)
# 0.1781605408165887
# 0.1795317424986125
# 0.1800544167752041

# Average the per-seed out-of-fold probabilities and score the seed ensemble.
# NOTE(review): `true` is the label array left behind by the last iteration of
# the training loop. Every seed concatenates folds 0..4 in the same order, so
# the rows line up, but this cell only works right after that loop has run.
oof_mean = np.mean(n_seed_oof, axis=0)
cal_cv(oof_mean, true)
# 0.18159598492428677  cv159  0.18185883608118178 0.18187747934040963 0.1818398092367048
# 0.18116504721928064  cv1606 0.18116360432907197 
# 0.18081447443441784  cv1604
# 0.18249626204430686 0.18249626204430686
# 0.18136309279146912

# 0.17304478628325565
# 0.173683454452916
# sample_seed 42  0.18244281286924444
# sample_seed 308 0.18249626204430686

0.18239644986009226

In [None]:
# ml_save_path = "/data1/hjs/pythonProject/pythonProject/ISIC2024/ML_output"

# if os.path.exists(ml_save_path) is False:
#     os.makedirs(ml_save_path)
# for fold in range(CONFIG.n_folds):
#     # Save the model to a file
#     estimator = models[fold]
#     dump(estimator, f'{ml_save_path}/voting_classifier_model_{fold}.joblib')
    
# from joblib import load
# models = []
# # Load the model from a file
# loaded_estimator = load('/kaggle/working/voting_classifier_model_0.joblib')
# for fold in range(CONFIG.n_folds):
#     # Load the model from a file
#     loaded_estimator = load(f'/kaggle/working/voting_classifier_model_{fold}.joblib')
#     models.append(loaded_estimator)