In [1]:
import os

from pathlib import Path

from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedGroupKFold

import numpy as np, pandas as pd, polars as pl

import optuna, lightgbm as lgb, catboost as cb, xgboost as xgb

In [2]:
# Competition input directory and the three CSV files read from it.
root = Path('/kaggle/input/isic-2024-challenge')
train_path, test_path, subm_path = (
    root / file_name
    for file_name in ('train-metadata.csv', 'test-metadata.csv', 'sample_submission.csv')
)

# Key columns: row identifier, grouping key (patient) and binary label.
id_col, group_col, target_col = 'isic_id', 'patient_id', 'target'

seed = 42                # initial random seed (rebound later by the model cell)
sampling_ratio = 0.01    # sampling ratio constant
err = 1e-5               # small epsilon added to some denominators in read_data

In [3]:
# Raw numeric metadata columns used as features. Every entry here also gets a
# per-patient normalised twin (`<col>_patient_norm`) built in read_data.
num_cols = [
    'age_approx',
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    # No description recorded here; presumably a combined L/B contrast like its neighbours -- TODO confirm against the data dictionary.
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Engineered numeric features. Each name must match a column created in
# read_data; the trailing comment gives the formula as implemented there.
new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2 / tbp_lv_perimeterMM**2
    'hue_contrast',                      # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',                # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',           # sqrt(tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2)
    'border_complexity',                 # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',              # sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',              # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',         # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',             # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',            # border_complexity + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',            # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',               # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',                  # sqrt((tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2) / 3)
    'color_shape_composite_index',       # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',             # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',          # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',        # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',         # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm**2 + age_approx**2) -- note: despite the name, nevi confidence is not used
    'color_asymmetry_index',             # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',           # tbp_lv_areaMM2 * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',           # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
    'sin_over_cos_tbp_lv_symm_2axis_angle',  # sin(2*pi*angle/180) / (cos(2*pi*angle/180) + err), angle = tbp_lv_symm_2axis_angle
]
# Categorical inputs; read_data casts these to Categorical and preprocess later
# replaces them with one-hot columns.
cat_cols = [
    'sex',
    'anatom_site_general',
    'tbp_tile_type',
    'tbp_lv_location',
    'tbp_lv_location_simple',
    'age_special',
    'age_count',
    'clin_size_gt10',
    'attribution',
]
# cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']

# One per-patient normalised column for every raw and engineered numeric feature.
norm_cols = list(map('{}_patient_norm'.format, num_cols + new_num_cols))
special_cols = ['count_per_patient']

# Full model input, in a fixed order: raw numeric, engineered, categorical, normalised, special.
feature_cols = [*num_cols, *new_num_cols, *cat_cols, *norm_cols, *special_cols]

In [4]:
def read_data(path):
    """Load one ISIC metadata CSV and return a pandas frame with engineered features.

    Steps, in order:
      1. Read the CSV with pandas and fill missing ``age_approx`` with 55 and
         missing ``sex`` / ``anatom_site_general`` with the column mode of *this* file.
      2. Convert to polars and add every column named in ``new_num_cols``.
      3. Add a per-patient z-score (``<col>_patient_norm``) for each column in
         ``num_cols + new_num_cols``.
      4. Add the patient-level helper columns ``age_special``, ``age_count``,
         ``count_per_patient`` and ``clin_size_gt10``.
      5. Cast every column in the module-level ``cat_cols`` to Categorical and
         convert back to pandas.

    Reads the module-level names ``err``, ``num_cols``, ``new_num_cols`` and
    ``cat_cols`` at call time, so it must be called before ``preprocess``
    rebinds ``cat_cols`` to the one-hot column names.

    Parameters
    ----------
    path : str or path-like
        Location of a metadata CSV containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The original columns plus all engineered ones.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole file
    # instead of chunk by chunk, which avoids mixed-type object columns and the
    # DtypeWarning that pandas emits for them.
    df = pd.read_csv(path, low_memory=False)

    col_age = ['age_approx']
    df[col_age] = df[col_age].fillna(55)
    ccols = ['sex', 'anatom_site_general']
    for col in ccols:
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)
    return (
        pl.from_pandas(df)
        .with_columns(
            # NOTE(review): pl.from_pandas turns pandas NaN into polars null by default,
            # so fill_nan is probably a no-op here and remaining missing floats reach the
            # models as missing values. If median imputation is really wanted, switch to
            # fill_null -- and consider imputing test data with train statistics. Confirm
            # before changing, since it alters the feature values.
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()),
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            # Created here but not listed in cat_cols / feature_cols.
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            # Depends on border_complexity / lesion_shape_index from the first with_columns call.
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # Despite its name this is the Euclidean norm of (lesion size, age).
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            # The angle is scaled by 2*pi/180 (not pi/180); err keeps the denominator away from 0.
            sin_over_cos_tbp_lv_symm_2axis_angle = np.sin(2 * np.pi * pl.col('tbp_lv_symm_2axis_angle') / 180) / (np.cos(2 * np.pi * pl.col('tbp_lv_symm_2axis_angle') / 180) + err),
        )
        .with_columns(
            # Per-patient z-score of every numeric feature; err guards against a zero std.
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            # True for rows carrying the patient's minimum age when that patient has more than one distinct age.
            age_special = (pl.col('age_approx').n_unique().over('patient_id')>1) & (pl.col('age_approx') == pl.col('age_approx').min().over('patient_id')),
        )
        .with_columns(
            # True when the patient has more than one distinct age_approx value.
            age_count = (pl.col('age_approx').n_unique().over('patient_id')>1),
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            clin_size_gt10 = (pl.col('clin_size_long_diam_mm')>10),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
    )

In [5]:
def preprocess(df_train, df_test):
    """One-hot encode the categorical columns of both frames.

    The encoder is fitted on ``df_train`` only; categories unseen at fit time
    are ignored (all-zero row) when transforming. The encoded columns are named
    ``onehot_0 .. onehot_{k-1}`` and stored with ``category`` dtype.

    Side effects on module state: the original categorical names are removed
    from ``feature_cols``, the one-hot names are appended to it, and the global
    ``cat_cols`` is rebound to the one-hot names. Both frames are modified in
    place and also returned.
    """
    global cat_cols

    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore').fit(df_train[cat_cols])
    n_encoded = len(encoder.get_feature_names_out())
    new_cat_cols = [f'onehot_{idx}' for idx in range(n_encoded)]

    # Train first, then test -- same order as the encoder was fitted/applied originally.
    for frame in (df_train, df_test):
        frame[new_cat_cols] = encoder.transform(frame[cat_cols])
        frame[new_cat_cols] = frame[new_cat_cols].astype('category')

    # Swap the raw categorical names for their one-hot replacements in the feature list.
    for original_col in cat_cols:
        feature_cols.remove(original_col)
    feature_cols.extend(new_cat_cols)

    cat_cols = new_cat_cols

    return df_train, df_test

In [6]:
def custom_metric(estimator, X, y_true, min_tpr=0.80):
    """Partial AUC above a minimum true-positive rate, usable as a scikit-learn scorer.

    Labels and scores are both flipped, so the region "TPR >= min_tpr" of the
    original problem becomes "FPR <= 1 - min_tpr" of the flipped one, which is
    what ``roc_auc_score(..., max_fpr=...)`` can evaluate. The standardised
    value it returns is then mapped back to an unscaled partial area.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : features passed to ``estimator.predict_proba``.
    y_true : array-like of 0/1 labels.
    min_tpr : float, default 0.80
        Lower TPR bound of the region that is scored.

    Returns
    -------
    float
        Unscaled partial AUC; its maximum is ``1 - min_tpr``.
    """
    y_hat = estimator.predict_proba(X)[:, 1]
    max_fpr = abs(1 - min_tpr)

    # Flip the problem: positives become negatives and scores are reversed.
    v_gt = abs(y_true - 1)
    # Vectorised instead of a Python-level loop over every prediction.
    v_pred = 1.0 - np.asarray(y_hat)

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # Undo the standardisation applied by roc_auc_score when max_fpr is given.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

### Data Read & Feature Engineering

In [7]:
# Build the engineered train and test frames. Both read_data calls must happen
# before preprocess, because read_data casts the columns named in the global
# cat_cols and preprocess rebinds that global to the one-hot column names.
df_train = read_data(train_path)
df_test = read_data(test_path)
# One-hot encode the categoricals (encoder fitted on train only); this also
# updates feature_cols and cat_cols as a side effect.
df_train, df_test = preprocess(df_train, df_test)
# nan_cols = df_train[feature_cols].isna().sum()[df_train[feature_cols].isna().sum() > 0].keys().tolist()
# print(nan_cols)
# for c in nan_cols:
#     df_train[c] = df_train[c].fillna(df_train[c].median())
# assert df_train[feature_cols].isna().sum().sum() == 0

# nan_cols = df_test[feature_cols].isna().sum()[df_test[feature_cols].isna().sum() > 0].keys().tolist()
# print(nan_cols)
# for c in nan_cols:
#     df_test[c] = df_test[c].fillna(df_test[c].median())
# assert df_test[feature_cols].isna().sum().sum() == 0

  df = pd.read_csv(path)


### Connecting the parallel image-model pipeline

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from PIL import Image
import albumentations as A
from io import BytesIO
import h5py
import timm
from functools import partial

class ISICDataset(Dataset):
    """Images looked up by ``isic_id`` in an HDF5 file, in the row order of a metadata CSV.

    Each item is ``(image, isic_id)`` where ``image`` is a channel-first tensor
    produced by resizing and normalising the decoded image.

    Parameters
    ----------
    test_csv : path to a CSV with an ``isic_id`` column.
    file_hdf : path to an HDF5 file whose keys are ``isic_id`` values and whose
        values are encoded image bytes.
    img_size : int, default 224
        Side length the image is resized to.
    mode : str, default 'test'
        ``'train'`` enables the augmentation pipeline; any other value uses
        resize + normalise only.
    """

    def __init__(self, test_csv, file_hdf, img_size=224, mode='test'):
        self.mode = mode
        self.img_size = img_size
        self.df = pd.read_csv(test_csv)
        # The HDF5 file is opened lazily (see _hdf) so that an open handle is
        # never shared with, or pickled into, DataLoader worker processes.
        self.file_hdf = file_hdf
        self.fp_hdf = None
        self.isic_ids = self.df['isic_id'].values
        self.transforms = self.get_transforms()

    def __len__(self):
        return len(self.isic_ids)

    def _hdf(self):
        """Return this process's HDF5 handle, opening it on first use."""
        if self.fp_hdf is None:
            self.fp_hdf = h5py.File(self.file_hdf, mode="r")
        return self.fp_hdf

    def close(self):
        """Close the HDF5 handle if it is open; safe to call more than once."""
        if self.fp_hdf is not None:
            self.fp_hdf.close()
            self.fp_hdf = None

    def get_transforms(self,):
        """Build the albumentations pipeline for the current ``mode``."""
        if self.mode == 'train':
            transforms=(A.Compose([
                A.Resize(self.img_size, self.img_size),
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.5),
                A.RandomRotate90(p=0.5),
                A.RandomBrightnessContrast(brightness_limit=0.10, contrast_limit=0.10, p=0.5),
                A.ShiftScaleRotate(shift_limit=0.0, scale_limit=0.2, rotate_limit=180, border_mode=0, p=0.7),
                A.Normalize (mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0)
            ]))
        else:
            transforms=(A.Compose([A.Resize(self.img_size, self.img_size),
                                   A.Normalize (mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0)]))
        return transforms

    def __getitem__(self, index):
        isic_id = self.isic_ids[index]
        # The stored value is the encoded image; decode it to an H x W x C array.
        image = np.array(Image.open(BytesIO(self._hdf()[isic_id][()])))
        image = self.transforms(image=image)['image']
        # Channel-last -> channel-first for PyTorch.
        image = torch.from_numpy(image.transpose((2,0,1)))
        return image, isic_id

class MyModel(nn.Module):
    """timm backbone followed by dropout and a linear head, with an optional metadata branch.

    ``output_dim`` must equal the feature width produced by the backbone when it
    is created with ``num_classes=0``. With ``use_meta=True`` a 9-feature
    metadata vector is projected to 128 dimensions and concatenated to the
    image features before the head.
    """

    def __init__(self, model_name, output_dim=384, num_classes=1, use_meta=False):
        super().__init__()
        self.use_meta = use_meta
        self.backbone = timm.create_model(model_name, pretrained=False, num_classes=0)
        self.drop_out = nn.Dropout(0.5)

        head_in_features = output_dim
        if use_meta:
            # Metadata branch: 9 -> 512 -> 128, each linear layer followed by LayerNorm.
            self.meta_l1 = nn.Linear(9, 512)
            self.meta_drop_out = nn.Dropout(0.3)
            self.meta_norm1 = nn.LayerNorm(512, eps=1e-6)
            self.meta_l2 = nn.Linear(512, 128)
            self.meta_norm2 = nn.LayerNorm(128, eps=1e-6)
            head_in_features = output_dim + 128
        self.head = nn.Linear(head_in_features, num_classes)

    def forward(self, x, m=None):
        """Return raw logits; ``m`` is required only when ``use_meta`` is True."""
        features = self.backbone(x)
        if self.use_meta:
            meta = self.meta_norm1(self.meta_l1(m))
            meta = self.meta_norm2(self.meta_l2(self.meta_drop_out(meta)))
            features = torch.cat([features, meta], dim=1)
        return self.head(self.drop_out(features))
    
def load_dicts(model, weight_path):
    """Load a checkpoint into ``model`` in place, move it to the GPU and set eval mode.

    The checkpoint file must hold a mapping with a ``'state_dict'`` entry whose
    keys match the model exactly (``strict=True``). Returns ``None``.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
    checkpoint = torch.load(weight_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'], strict=True)
    model.cuda()
    model.eval()
    
def inference(model, test_loader, column):
    """Run a single-output model over ``test_loader`` and collect sigmoid scores.

    Returns a DataFrame with columns ``['isic_id', column]`` where ``column``
    holds the sigmoid of the model's first output for each image, in loader order.
    """
    batch_scores, isic_ids_all = [], []
    with torch.no_grad():
        progress = tqdm(enumerate(test_loader), total=len(test_loader))
        for _, (images, isic_ids) in progress:
            scores = model(images.cuda()).sigmoid()
            batch_scores.append(scores.detach().cpu().numpy())
            isic_ids_all.extend(isic_ids)

    # Only the first output column is kept.
    first_output = np.concatenate(batch_scores)[:, 0].tolist()
    rows = list(zip(isic_ids_all, first_output))
    return pd.DataFrame(rows, columns=['isic_id', column])

def inference_mt(model, test_loader, column):
    """Run a multi-class model over ``test_loader`` and collect softmax probabilities.

    Returns a DataFrame with ``isic_id`` followed by one column per class,
    named ``f'{column}_class_{i}'``. The number of class columns is taken from
    the width of the model output rather than being fixed at 5, so it stays
    consistent with whatever ``num_classes`` the model was built with.
    """
    preds_all = []
    isic_ids_all = []
    with torch.no_grad():
        bar = tqdm(enumerate(test_loader), total=len(test_loader))
        for step, (images, isic_ids) in bar:
            images = images.cuda()
            # Softmax over the class dimension.
            outputs = model(images).softmax(1)
            preds_all.append(outputs.detach().cpu().numpy())
            isic_ids_all.extend(isic_ids)
    preds_all = np.concatenate(preds_all)
    n_classes = preds_all.shape[1]
    preds_df = pd.DataFrame(preds_all, columns=[f'{column}_class_{i}' for i in range(n_classes)])
    isic_ids_df = pd.DataFrame(isic_ids_all, columns=['isic_id'])
    df = pd.concat([isic_ids_df, preds_df], axis=1)
    return df

def inference_loop(model_name, model_weight, column, output_dim=768, num_classes=1, image_size=224, mtarget=False):
    """Build a model, load its weights and predict on the test images.

    Reads the module-level ``test_csv`` and ``test_hdf`` paths, so those must be
    defined before the first call. With ``mtarget=True`` the multi-class softmax
    path (``inference_mt``) is used, otherwise the sigmoid path (``inference``).
    Returns the prediction DataFrame produced by the chosen function.
    """
    model = MyModel(model_name, output_dim=output_dim, num_classes=num_classes)
    load_dicts(model, model_weight)

    test_loader = DataLoader(
        ISICDataset(test_csv, test_hdf, image_size),
        batch_size=32,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
    )

    predict = inference_mt if mtarget else inference
    return predict(model, test_loader, column)

In [9]:
# Inputs read by inference_loop (as module-level names) and the sample submission.
test_csv = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
test_hdf = '/kaggle/input/isic-2024-challenge/test-image.hdf5'
df_sub = pd.read_csv(subm_path)

# Backbone variants noted during development:
# eva02_base_patch16_clip_224.merged2b
# eva02_base_patch14_224.mim_in22k
model_names = [
    'eva02_base_patch16_clip_224.merged2b',
    'convnext_base.clip_laion2b',
]
# Index 0-2: EVA02 checkpoints, index 3-5: ConvNeXt checkpoints.
model_weights = [
    '/kaggle/input/isic-models/eva02_base_patch16_clip_224.merged2b_data_all_bn_ep8_2024.pth',
    '/kaggle/input/isic-models/eva02_base_patch16_clip_224.merged2b_data_all_bn_ep4.pth',
    '/kaggle/input/isic-models/eva02_base_patch16_clip_224.merged2b_data_all_mtarget_ep3.pth',
    '/kaggle/input/isic-models/convnext_base.clip_laion2b_data_all_bn_ep10_2024.pth',
    '/kaggle/input/isic-models/convnext_base.clip_laion2b_data_all_bn_ep8.pth',
    '/kaggle/input/isic-models/convnext_base.clip_laion2b_data_all_mtarget_ep3.pth',
]

# One prediction frame per checkpoint, evaluated in the order listed.
(
    test_pred_df1,
    test_pred_df2,
    test_pred_df3,
    test_pred_df4,
    test_pred_df5,
    test_pred_df6,
) = (
    inference_loop(model_names[0], model_weights[0], column='eva02_2024'),
    inference_loop(model_names[0], model_weights[1], column='eva02'),
    inference_loop(model_names[0], model_weights[2], column='eva02', num_classes=5, mtarget=True),
    inference_loop(model_names[1], model_weights[3], column='convnext_2024', output_dim=1024, image_size=256),
    inference_loop(model_names[1], model_weights[4], column='convnext', output_dim=1024, image_size=256),
    inference_loop(model_names[1], model_weights[5], column='convnext', output_dim=1024, num_classes=5, image_size=256, mtarget=True),
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Out-of-fold image-model predictions for the training rows.
df_train_eva = pd.read_csv("/kaggle/input/oof-csvs/isic_2024_10folds_pseudo_mtarget_oof_eva02_bn.csv")
df_train_eva_all_bn = pd.read_csv("/kaggle/input/oof-csvs/isic_2018-2024_btarget_10folds_oof_eva02_bn.csv")
df_train_eva_all_mt = pd.read_csv("/kaggle/input/oof-csvs/isic_2018-2024_mtarget_5folds_oof_eva02_mtarget.csv")
df_train_conv = pd.read_csv("/kaggle/input/oof-csvs/isic_2024_10folds_pseudo_mtarget_oof_convnext_bn.csv")
df_train_conv_all_bn = pd.read_csv("/kaggle/input/oof-csvs/isic_2018-2024_btarget_10folds_oof_convnext_bn.csv")
df_train_conv_all_mt = pd.read_csv("/kaggle/input/oof-csvs/isic_2018-2024_mtarget_5folds_oof_convnext_mtarget.csv")

eva02_class_cols = [f'eva02_class_{i}' for i in range(5)]
convnext_class_cols = [f'convnext_class_{i}' for i in range(5)]

# (blended column, EVA02 source column, ConvNeXt source column), in creation order.
blend_spec = [
    ('new_preds_2024', 'eva02_2024', 'convnext_2024'),
    ('new_preds', 'eva02', 'convnext'),
] + [(f'new_preds_m_{i}', f'eva02_class_{i}', f'convnext_class_{i}') for i in range(5)]

# Training rows: attach the OOF predictions, then average the two backbones.
train_sources = [
    (df_train_eva, ['eva02_2024']),
    (df_train_eva_all_bn, ['eva02']),
    (df_train_eva_all_mt, eva02_class_cols),
    (df_train_conv, ['convnext_2024']),
    (df_train_conv_all_bn, ['convnext']),
    (df_train_conv_all_mt, convnext_class_cols),
]
for pred_frame, pred_cols in train_sources:
    df_train = df_train.merge(pred_frame[['isic_id'] + pred_cols], how='left', on='isic_id')
for blended, eva_col, conv_col in blend_spec:
    df_train[blended] = (df_train[eva_col] + df_train[conv_col])/2

# Test rows: same columns, taken from the inference results.
test_sources = [
    (test_pred_df1, ['eva02_2024']),
    (test_pred_df2, ['eva02']),
    (test_pred_df3, eva02_class_cols),
    (test_pred_df4, ['convnext_2024']),
    (test_pred_df5, ['convnext']),
    (test_pred_df6, convnext_class_cols),
]
for pred_frame, pred_cols in test_sources:
    df_test = df_test.merge(pred_frame[['isic_id'] + pred_cols], how='left', on='isic_id')
for blended, eva_col, conv_col in blend_spec:
    df_test[blended] = (df_test[eva_col] + df_test[conv_col])/2

# _ll_lines = ['eva02_2024', 'eva02', 'convnext_2024', 'convnext']+[f'eva02_class_{i}' for i in range(5)]+[f'convnext_class_{i}' for i in range(5)]
# feature_cols.extend(_ll_lines)


In [11]:
import copy
from sklearn.base import BaseEstimator, TransformerMixin

# Blended image-model prediction columns: two binary scores plus five class probabilities.
multi_target_cols = [f'new_preds_m_{i}' for i in range(5)]
image_cols = ['new_preds_2024', 'new_preds', *multi_target_cols]

# Snapshots of the tabular feature list taken BEFORE the image columns are appended below.
feature_cols_without_image_cols = list(feature_cols)
feature_cols_with_image_cols_a = [*feature_cols, 'new_preds_2024', 'new_preds']
feature_cols_with_image_cols_b = [*feature_cols, 'new_preds_2024', *multi_target_cols]
feature_cols_with_image_cols_c = [*feature_cols, 'new_preds', *multi_target_cols]

# In-place extension: feature_cols itself now includes every image column.
feature_cols.extend(image_cols)

class SelectColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows its input to a fixed column selection.

    Parameters
    ----------
    columns
        Selection applied in ``transform`` as ``X[columns]``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y``; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``columns``."""
        selected = X[self.columns]
        return selected

In [12]:
# Tabular-only ensemble: LightGBM + CatBoost + XGBoost, soft-voted with equal weights.
# Each pipeline under-samples the rows, keeps `feature_cols_without_image_cols` and
# then fits its classifier.
#
# Seeds are written as literals instead of being read from the notebook-level `seed`.
# This cell used to reassign `seed` three times while also reading it, so executing the
# cell again silently changed every classifier's random_state. The literals reproduce a
# fresh top-to-bottom run, in which `seed` enters this cell as 42 (config cell).
# NOTE(review): confirm no cell between the config cell and this one reassigns `seed`.

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - LightGBM
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_estimators':     200,
    'boosting_type':    'gbdt',
    'random_state':     42,
    'lambda_l1':        0.08758718919397321,
    'lambda_l2':        0.0039689175176025465,
    'learning_rate':    0.03231007103195577,
    'max_depth':        4,
    'num_leaves':       103,
    'colsample_bytree': 0.8329551585827726,
    'colsample_bynode': 0.4025961355653304,
    'bagging_fraction': 0.7738954452473223,
    'bagging_freq':     4,
    'min_data_in_leaf': 85,
    'scale_pos_weight': 2.7984184778875543,
}

lgb_model = Pipeline([
    ('sampler', RandomUnderSampler(sampling_strategy=0.01, random_state=0)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - CatBoost

cb_params = {
    'loss_function':     'Logloss',
    'iterations':        200,
    'verbose':           False,
    'random_state':      0,
    'max_depth':         7,
    'learning_rate':     0.06936242010150652,
    'scale_pos_weight':  2.6149345838209532,
    'l2_leaf_reg':       6.216113851699493,
    'subsample':         0.6249261779711819,
    'min_data_in_leaf':  24,
    'cat_features':      cat_cols,
}

cb_model = Pipeline([
    ('sampler', RandomUnderSampler(sampling_strategy=0.01, random_state=1)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - XGB
xgb_params = {
    'enable_categorical': True,
    'tree_method':        'hist',
    'random_state':       1,
    'n_estimators':       200,
    'learning_rate':      0.08501257473292347,
    'lambda':             8.879624125465703,
    'alpha':              0.6779926606782505,
    'max_depth':          6,
    'subsample':          0.6012681388711075,
    'colsample_bytree':   0.8437772277074493,
    'colsample_bylevel':  0.5476090898823716,
    'colsample_bynode':   0.9928601203635129,
    'scale_pos_weight':   3.29440313334688,
}

xgb_model = Pipeline([
    ('sampler', RandomUnderSampler(sampling_strategy=0.01, random_state=2)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

estimator = VotingClassifier([('lgb',lgb_model),('cb',cb_model),('xgb',xgb_model)], voting='soft', weights=[1.0, 1.0, 1.0])

# Later cells still read the notebook-level `seed`; leave it at the value this cell has
# always finished with so their behaviour does not change.
seed = 2

In [13]:
# Image-feature ensemble: LightGBM + CatBoost + XGBoost, soft-voted with default weights.
# Each pipeline first over-samples (sampling_strategy=0.003), then under-samples
# (sampling_strategy=0.02) before fitting its classifier.
#
# Seeds are written as literals instead of being read from the notebook-level `seed`.
# This cell used to reassign `seed` three times while also reading it, so executing the
# cell again silently changed every classifier's random_state. The literals reproduce a
# top-to-bottom run, in which `seed` enters this cell as 2.

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - LightGBM

# NOTE(review): LightGBM documents `colsample_bytree` as an alias of `feature_fraction`
# and `min_child_samples` as an alias of `min_data_in_leaf`. Each pair is given two
# different values below - confirm which value is actually applied and drop the other.
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_estimators':     300,
    'boosting_type':    'gbdt',
    'random_state':     2,
    'lambda_l1':        0.05895663576059849,
    'lambda_l2':        0.1695883857043064,
    'learning_rate':    0.020426983505072768,
    'max_depth':        6,
    'num_leaves':       227,
    'colsample_bytree': 0.582460116058585,
    'colsample_bynode': 0.4025961355653304,
    'bagging_fraction': 0.756793414892886,
    'feature_fraction': 0.5559920882615041,
    'bagging_freq':     2,
    'min_data_in_leaf': 92,
    'scale_pos_weight': 2.474797877400847,
    'min_child_samples': 64,
    'min_sum_hessian_in_leaf': 0.2035179197462234
}
lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=3)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=0.02,  random_state=3)),
    ('filter', SelectColumns(feature_cols_with_image_cols_b)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - CatBoost

cb_params = {
    'loss_function':     'Logloss',
    'iterations':        300,
    'verbose':           False,
    'random_state':      3,
    'max_depth':         7,
    'max_bin':           319,
    'bagging_temperature': 2,
    'learning_rate':     0.03016539007161872,
    'scale_pos_weight':  3.9634290696616663,
    'l2_leaf_reg':       1.5787849004535293,
    'subsample':         0.6753592471688482,
    'min_data_in_leaf':  69,
    'leaf_estimation_iterations': 2,
    'colsample_bylevel': 0.9612799862107829,
    'cat_features':      cat_cols,
}
cb_model = Pipeline([
    ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=4)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=0.02,  random_state=4)),
    ('filter', SelectColumns(feature_cols_with_image_cols_c)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

# - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - XGB

xgb_params = {
    'enable_categorical': True,
    'tree_method':        'hist',
    'random_state':       4,
    'n_estimators':       300,
    'learning_rate':      0.0359772292953598,
    'lambda':             0.002153456849648466,
    'alpha':              1.5838252279981035,
    'max_depth':          7,
    'subsample':          0.6788232629752643,
    'colsample_bytree':   0.5155075347438656,
    'colsample_bynode':   0.9280980805290755,
    'scale_pos_weight':   3.3485192818949487,
    'max_bin':            260,
    'max_delta_step':     8,
    'min_split_loss':     3
}
# NOTE(review): unlike the two pipelines above there is no 'filter' step, so XGBoost is
# fitted on every column it is handed. Confirm this is intended -
# `feature_cols_with_image_cols_a` is not used by any pipeline in the visible cells.
xgb_model = Pipeline([
    ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=5)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=0.02,  random_state=5)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

estimator_img = VotingClassifier([('lgb',lgb_model),('cb',cb_model),('xgb',xgb_model)], voting='soft')

# The cross-validation cell reads the notebook-level `seed`; leave it at the value this
# cell has always finished with so the fold assignment does not change.
seed = 5

In [14]:
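# NOTE: disabled experiment, kept for reference only. It repeats the image-feature
# ensemble defined in the previous cell; the only difference is that every
# RandomUnderSampler uses sampling_strategy=0.01 instead of 0.02.
# # - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - LightGBM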

# lgb_params = {
#     'objective':        'binary',
#     'verbosity':        -1,
#     'n_estimators':     300,
#     'boosting_type':    'gbdt',
#     'random_state':     seed,
#     'lambda_l1':        0.05895663576059849, 
#     'lambda_l2':        0.1695883857043064, 
#     'learning_rate':    0.020426983505072768, 
#     'max_depth':        6, 
#     'num_leaves':       227, 
#     'colsample_bytree': 0.582460116058585, 
#     'colsample_bynode': 0.4025961355653304, 
#     'bagging_fraction': 0.756793414892886, 
#     'feature_fraction': 0.5559920882615041,
#     'bagging_freq':     2, 
#     'min_data_in_leaf': 92, 
#     'scale_pos_weight': 2.474797877400847, 
#     'min_child_samples': 64,
#     'min_sum_hessian_in_leaf': 0.2035179197462234
# }
# seed = 3
# lgb_model = Pipeline([
#     ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=0.01,  random_state=seed)),
#     ('filter', SelectColumns(feature_cols_with_image_cols_b)),
#     ('classifier', lgb.LGBMClassifier(**lgb_params)),
# ])

# # - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - CatBoost

# cb_params = {
#     'loss_function':     'Logloss',
#     'iterations':        300,
#     'verbose':           False,
#     'random_state':      seed,
#     'max_depth':         7, 
#     'max_bin':           319, 
#     'bagging_temperature': 2, 
#     'learning_rate':     0.03016539007161872, 
#     'scale_pos_weight':  3.9634290696616663, 
#     'l2_leaf_reg':       1.5787849004535293, 
#     'subsample':         0.6753592471688482, 
#     'min_data_in_leaf':  69,
#     'leaf_estimation_iterations': 2,
#     'colsample_bylevel': 0.9612799862107829, 
#     'cat_features':      cat_cols,
# }
# seed = 4
# cb_model = Pipeline([
#     ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=0.01,  random_state=seed)),
#     ('filter', SelectColumns(feature_cols_with_image_cols_c)),
#     ('classifier', cb.CatBoostClassifier(**cb_params)),
# ])

# # - - - - - -- - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - XGB

# xgb_params = {
#     'enable_categorical': True,
#     'tree_method':        'hist',
#     'random_state':       seed,
#     'n_estimators':       300, 
#     'learning_rate':      0.0359772292953598, 
#     'lambda':             0.002153456849648466, 
#     'alpha':              1.5838252279981035, 
#     'max_depth':          7, 
#     'subsample':          0.6788232629752643, 
#     'colsample_bytree':   0.5155075347438656, 
#     'colsample_bynode':   0.9280980805290755, 
#     'scale_pos_weight':   3.3485192818949487,
#     'max_bin':            260, 
#     'max_delta_step':     8,
#     'min_split_loss':     3
# }
# seed = 5
# xgb_model = Pipeline([
#     ('sampler_1', RandomOverSampler (sampling_strategy=0.003, random_state=seed)),
#     ('sampler_2', RandomUnderSampler(sampling_strategy=0.01,  random_state=seed)),
#     ('classifier', xgb.XGBClassifier(**xgb_params)),
# ])

# estimator_img = VotingClassifier([('lgb',lgb_model),('cb',cb_model),('xgb',xgb_model)], voting='soft')  

### Cross Validation

In [15]:
# Patient-grouped, stratified 5-fold CV of the image-feature ensemble only.
# `seed` is whatever value the model-definition cells above left behind.
X, y, groups = df_train[feature_cols], df_train[target_col], df_train[group_col]
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=seed)

val_score = cross_val_score(estimator_img, X, y, groups=groups, scoring=custom_metric, cv=cv)

# Mean score first, then the per-fold scores.
val_score.mean(), val_score

(0.17699980259974962,
 array([0.18140226, 0.17477439, 0.17767894, 0.18509073, 0.16605269]))

### Training

In [16]:
# Fit the tabular-only ensemble on the complete training frame.
X = df_train[feature_cols]
y = df_train[target_col]
estimator.fit(X, y)

In [17]:
# Fit the image-feature ensemble on the same X / y built in the previous training cell.
estimator_img.fit(X=X, y=y)

### Prediction

In [18]:
# Score the test rows with both fitted ensembles and average the two probabilities.
X_test = df_test[feature_cols]
df_test['target1'] = estimator.predict_proba(X_test)[:, 1]      # positive-class probability, `estimator`
df_test['target2'] = estimator_img.predict_proba(X_test)[:, 1]  # positive-class probability, `estimator_img`
df_test['target'] = (df_test['target1'] + df_test['target2'] ) /2

# Replace the existing `target` column of df_sub with the blended prediction.
# validate='one_to_one' makes the join fail loudly if an isic_id is duplicated on either
# side instead of silently multiplying submission rows.
df_sub = (
    df_sub
    .drop(columns=['target'])
    .merge(df_test[['isic_id', 'target']], how='left', on='isic_id', validate='one_to_one')
)
# A left join leaves NaN for ids that have no prediction; stop before writing such a file.
assert df_sub['target'].notna().all(), 'submission contains rows without a prediction'

df_sub.to_csv('submission.csv', index=False)
df_sub.head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.521554
1,ISIC_0015729,0.340047
2,ISIC_0015740,0.540462
