In [None]:
import os
import gc
import time
import copy
import optuna

from pathlib import Path

import numpy as np, pandas as pd, polars as pl

from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

import lightgbm as lgb, catboost as cb, xgboost as xgb

from optuna.samplers import TPESampler

# IMAGE PREDICTIONS

In [None]:
#EFFNET V1B0
# Run the EfficientNet-V1-B0 inference script, passing the weights file as its only argument.
# The script is expected to leave a submission.csv in the working directory; it is renamed so
# the preprocessing functions can read it back as submission_effnetv1b0.csv.
!python /kaggle/input/isic-script-inference-effnetv1b0-f313ae/main.py /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
!mv submission.csv submission_effnetv1b0.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]


In [None]:
#TARGET 3
# Run the second image-model submission script and rename its submission.csv to
# submission_image3.csv, which the preprocessing functions read as the "target_3" column.
# NOTE(review): this script lives in the "...-and-preds1" dataset, while the training-set
# predictions are read from "...-and-preds" (no trailing 1) — confirm both come from the same model.
!python /kaggle/input/isic-2024-pl-submission-script-and-preds1/pl_submission.py
!mv submission.csv submission_image3.csv

  df_train_meta = pd.read_csv(BASE_DATA_DIR + "train-metadata.csv") # металанные в коде не используются


In [None]:
# #EVA02
# !python /kaggle/input/isic-script-inference-eva02/main.py /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
# !mv submission.csv submission_eva02.csv

In [None]:
# #EDGENEXT
# !python /kaggle/input/isic-script-inference-edgenext/main.py /kaggle/input/isic-pytorch-training-edgenext/Final_model.bin
# !mv submission.csv submission_edgenext.csv

# DATA PREPROCESS

In [None]:
# Competition data directory and the three CSV files read from it.
root = Path('/kaggle/input/isic-2024-challenge')

train_path, test_path, subm_path = (
    root / file_name
    for file_name in ('train-metadata.csv', 'test-metadata.csv', 'sample_submission.csv')
)

# Column roles: row identifier, binary label, and CV grouping key.
id_col, target_col, group_col = 'isic_id', 'target', 'patient_id'

err = 1e-5             # small constant added to some denominators to avoid division by zero
sampling_ratio = 0.01  # sampling_strategy passed to RandomUnderSampler
seed = 42              # shared random seed

# Raw numeric metadata columns used as model features. The trailing descriptions are
# carried over from the notebook's original comments.
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).
    'tbp_lv_A',                          # A inside lesion.
    'tbp_lv_Aext',                       # A outside lesion.
    'tbp_lv_B',                          # B inside lesion.
    'tbp_lv_Bext',                       # B outside lesion.
    'tbp_lv_C',                          # Chroma inside lesion.
    'tbp_lv_Cext',                       # Chroma outside lesion.
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).
    'tbp_lv_Hext',                       # Hue outside lesion.
    'tbp_lv_L',                          # L inside lesion.
    'tbp_lv_Lext',                       # L outside lesion.
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between the lesion's perimeter and area. Circular lesions will have low values; irregularly shaped lesions will have higher values. Values range 0-10.
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).
    'tbp_lv_deltaLB',                    # (no description given; presumably the un-normalized L/B contrast behind tbp_lv_deltaLBnorm — TODO confirm)
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.
    'tbp_lv_eccentricity',               # Eccentricity.
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale): a convolutional neural network classifier's estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.
    'tbp_lv_stdL',                       # Standard deviation of L inside lesion.
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.
]

# Names of the engineered numeric features created in read_data. Each trailing comment
# gives the formula used there; this list only declares the names.
new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2 / tbp_lv_perimeterMM**2
    'hue_contrast',                      # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',                # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',           # sqrt(tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2)
    'border_complexity',                 # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',              # sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',              # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',         # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',             # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',            # border_complexity + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',            # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',               # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',                  # sqrt((tbp_lv_deltaA**2 + tbp_lv_deltaB**2 + tbp_lv_deltaL**2) / 3)
    'color_shape_composite_index',       # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3 — NOTE(review): verify the middle term in read_data matches this formula
    'lesion_orientation_3d',             # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',          # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',        # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',         # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm**2 + age_approx**2) — NOTE(review): does not involve nevi confidence despite the name
    'color_asymmetry_index',             # tbp_lv_symm_2axis * tbp_lv_radial_color_std_max

    'volume_approximation_3d',           # tbp_lv_areaMM2 * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',           # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis

    # ---- Features added on top of the list above ----

    'lesion_compactness',               # tbp_lv_perimeterMM**2 / tbp_lv_areaMM2
    'ellipticity_index',                # (clin_size_long_diam_mm - tbp_lv_minorAxisMM) / clin_size_long_diam_mm
    'border_irregularity_index',        # tbp_lv_norm_border / tbp_lv_area_perim_ratio
    'lesion_color_intensity',           # (tbp_lv_L * tbp_lv_A * tbp_lv_B) / 1000
    'lesion_color_contrast_norm',       # lesion_color_difference / (tbp_lv_Lext + tbp_lv_Aext + tbp_lv_Bext)
    'lesion_circularity',               # (4 * pi * tbp_lv_areaMM2) / tbp_lv_perimeterMM**2
    'volume_to_area_ratio',             # volume_approximation_3d / tbp_lv_areaMM2
    'lesion_spread_index',              # sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2) / tbp_lv_areaMM2
    'gradient_based_color_asymmetry',   # (tbp_lv_stdL - tbp_lv_stdLExt) / tbp_lv_stdL
    'symmetry_to_compactness_ratio',    # tbp_lv_symm_2axis / lesion_compactness
    'age_normalized_severity_index',    # lesion_severity_index / age_approx
    'color_homogeneity',                # 1 / (tbp_lv_radial_color_std_max + err)
    'border_to_hue_contrast_ratio',     # tbp_lv_norm_border / hue_contrast

]

# Categorical metadata columns.
cat_cols = [
    'sex',
    'anatom_site_general',
    'tbp_tile_type',
    'tbp_lv_location',
    'tbp_lv_location_simple',
    'attribution',
]
# Per-patient z-score and per-patient mean of every raw and engineered numeric column.
norm_cols = [name + '_patient_norm' for name in [*num_cols, *new_num_cols]]
mean_cols = [name + '_patient_mean' for name in [*num_cols, *new_num_cols]]
special_cols = ['count_per_patient']
# Image-model prediction columns; attached during preprocessing but not part of the lists below.
image_cols = ["target_3", "target_effnetv1b0"]

#norm_cols += image_cols
# Two separate list objects on purpose: preprocess_with_OHE rewrites the first one in place.
feature_cols_with_OHE = [*num_cols, *new_num_cols, *cat_cols, *norm_cols, *special_cols, *mean_cols]
feature_cols_without_OHE = list(feature_cols_with_OHE)

In [None]:
def read_data(path):
    """Load a metadata CSV and build the full tabular feature frame.

    Steps, in order:
      1. Read the CSV with polars and convert ``age_approx`` to Float64,
         mapping the literal string ``'NA'`` to NaN.
      2. Replace NaN in every Float64 column with that column's median.
      3. Add the engineered features listed in ``new_num_cols`` (plus
         ``combined_anatomical_site``, which is created but is not part of
         ``cat_cols``).
      4. For every column in ``num_cols + new_num_cols`` add a per-patient
         z-score (``<col>_patient_norm``) and per-patient mean
         (``<col>_patient_mean``), then ``count_per_patient``.
      5. Cast ``cat_cols`` to polars Categorical.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the metadata CSV.

    Returns
    -------
    pandas.DataFrame
        The feature frame, indexed by ``id_col``.

    Notes
    -----
    Relies on the module-level ``num_cols``, ``new_num_cols``, ``cat_cols``,
    ``err`` and ``id_col``.
    """
    # Read the data
    df = (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()),  # Impute missing values
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        # This stage reads border_complexity / lesion_shape_index created above,
        # which is why the derived features are added in separate with_columns calls.
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            # Fixed: the middle term was tbp_lv_deltaA, which contradicted the formula documented
            # in new_num_cols (color std + area/perimeter ratio + symmetry, averaged).
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # NOTE(review): despite its name this is sqrt(size**2 + age**2) and does not use nevi confidence.
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # Second batch of engineered features; several reuse columns created above.
        .with_columns(
            lesion_compactness             = (pl.col('tbp_lv_perimeterMM')**2) / pl.col('tbp_lv_areaMM2') ,
            ellipticity_index              = (pl.col('clin_size_long_diam_mm') - pl.col('tbp_lv_minorAxisMM')) / pl.col('clin_size_long_diam_mm'),
            border_irregularity_index      = pl.col('tbp_lv_norm_border') / pl.col('tbp_lv_area_perim_ratio'),
            lesion_color_intensity         = (pl.col('tbp_lv_L') * pl.col('tbp_lv_A') * pl.col('tbp_lv_B')) / 1000,
            lesion_color_contrast_norm     = pl.col('lesion_color_difference') / (pl.col('tbp_lv_Lext') + pl.col('tbp_lv_Aext') + pl.col('tbp_lv_Bext')),
            lesion_circularity             = (4 * np.pi * pl.col('tbp_lv_areaMM2')) / (pl.col('tbp_lv_perimeterMM')**2),
            volume_to_area_ratio           = pl.col('volume_approximation_3d') / pl.col('tbp_lv_areaMM2'),
            lesion_spread_index            = (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt() / pl.col('tbp_lv_areaMM2'),
            gradient_based_color_asymmetry = (pl.col('tbp_lv_stdL') - pl.col('tbp_lv_stdLExt')) / pl.col('tbp_lv_stdL'),
            age_normalized_severity_index  = pl.col('lesion_severity_index') / pl.col('age_approx'),
            color_homogeneity              = 1 / (pl.col('tbp_lv_radial_color_std_max') + err),
            border_to_hue_contrast_ratio   = pl.col('tbp_lv_norm_border') / pl.col('hue_contrast'),
        )
        # Needs lesion_compactness from the previous stage.
        .with_columns(
            symmetry_to_compactness_ratio  = pl.col('tbp_lv_symm_2axis') / pl.col('lesion_compactness')
        )

        # Per-patient z-score; err keeps the denominator non-zero when the within-patient std is 0.
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        # Per-patient mean of every numeric feature.
        .with_columns(
            ( pl.col(col).mean().over('patient_id') ).alias(f'{col}_patient_mean') for col in (num_cols + new_num_cols)
        )

        # Number of rows (lesions) recorded for each patient.
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
    )

    return df.to_pandas().set_index(id_col)

In [None]:
def preprocess_with_OHE(df_train, df_test):
    """One-hot encode the categorical columns and attach image-model predictions.

    The encoder is fitted on ``df_train[cat_cols]`` only and applied to both
    frames; categories unseen in training encode to all zeros
    (``handle_unknown='ignore'``). The encoded columns are named
    ``onehot_0 .. onehot_{k-1}``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames produced by ``read_data``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train/test frames with a fresh 0..n-1 index (the original index is
        dropped) and the extra columns ``target_effnetv1b0`` and ``target_3``.

    Side effects
    ------------
    * The ``onehot_*`` columns are also written into the frames passed in.
    * The module-level ``feature_cols_with_OHE`` list is rewritten in place:
      the raw ``cat_cols`` are removed and the ``onehot_*`` names appended.
      This is idempotent, so re-running the calling cell no longer raises
      ``ValueError`` from ``list.remove``.
    """
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])

    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    df_train[new_cat_cols] = encoder.transform(df_train[cat_cols])
    df_test[new_cat_cols] = encoder.transform(df_test[cat_cols])

    # Image predictions are attached by row position, so both frames get a plain
    # RangeIndex to line up with the freshly read prediction files.
    # NOTE(review): this assumes every prediction file has the same row order as the
    # corresponding metadata CSV — confirm, or join on isic_id instead.
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # effnetv1b0
    eff_train = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")
    df_train["target_effnetv1b0"] = eff_train["target_effnetv1b0"]

    eff_test = pd.read_csv("submission_effnetv1b0.csv")
    df_test["target_effnetv1b0"] = eff_test["target"]

    # target 3
    image3_train = pd.read_csv("/kaggle/input/isic-2024-pl-submission-script-and-preds/train_preds.csv")
    df_train["target_3"] = image3_train["pred"]

    image3_test = pd.read_csv("submission_image3.csv")
    df_test["target_3"] = image3_test["target"]

    # Swap the raw categorical names for the one-hot names in the shared feature list,
    # tolerating a list that was already rewritten by an earlier call.
    for col in cat_cols:
        if col in feature_cols_with_OHE:
            feature_cols_with_OHE.remove(col)

    feature_cols_with_OHE.extend(
        name for name in new_cat_cols if name not in feature_cols_with_OHE
    )

    return df_train, df_test

In [None]:
def preprocess_without_OHE(df_train, df_test):
    """Attach the image-model prediction columns without touching categoricals.

    Returns new train/test frames with a fresh 0..n-1 index (the original
    index is dropped) and two added columns, ``target_effnetv1b0`` and
    ``target_3``. Predictions are matched to rows by position.
    """
    # effnetv1b0
    eff_train = pd.read_csv(
        "/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv"
    ).reset_index(drop=True)
    train_out = df_train.reset_index(drop=True)
    train_out["target_effnetv1b0"] = eff_train["target_effnetv1b0"]

    eff_test = pd.read_csv("submission_effnetv1b0.csv").reset_index(drop=True)
    test_out = df_test.reset_index(drop=True)
    test_out["target_effnetv1b0"] = eff_test["target"]

    # target 3
    image3_train = pd.read_csv(
        "/kaggle/input/isic-2024-pl-submission-script-and-preds/train_preds.csv"
    ).reset_index(drop=True)
    train_out["target_3"] = image3_train["pred"]

    image3_test = pd.read_csv("submission_image3.csv")
    test_out["target_3"] = image3_test["target"]

    return train_out, test_out

In [None]:
def custom_metric(estimator, X, y_true):
    """Partial AUC scorer usable as a scikit-learn ``scoring`` callable.

    Labels and scores are both flipped (``abs(y - 1)`` and ``1 - p``) so that
    restricting the flipped ROC curve to FPR <= 0.20 corresponds to TPR >= 0.80
    on the original problem. ``roc_auc_score(..., max_fpr=...)`` returns a
    rescaled value, which is mapped back to the raw partial area.

    Parameters: a fitted estimator exposing ``predict_proba``, the feature
    matrix ``X`` and the true binary labels ``y_true``.
    Returns the partial AUC as a float.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    positive_proba = estimator.predict_proba(X)[:, 1]

    flipped_truth = abs(y_true - 1)
    flipped_scores = np.array([1.0 - p for p in positive_proba])

    scaled_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Undo the rescaling applied by roc_auc_score when max_fpr is given.
    floor_area = 0.5 * max_fpr**2
    span = (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5)
    return floor_area + span * (scaled_auc - 0.5)

In [None]:
# Build the feature frames and load the submission template.
df_train, df_test = read_data(train_path), read_data(test_path)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# One-hot encoded frames for XGBoost/LightGBM, native categoricals for CatBoost.
df_train_XGB_LGBM, df_test_XGB_LGBM = preprocess_with_OHE(df_train, df_test)
df_train_CB, df_test_CB = preprocess_without_OHE(df_train, df_test)

  df_eff = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")
  df_eff = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")


***Drop the least important features, as identified by feature importance (the code is at the end of the notebook, just before the test predictions).***

In [None]:
# # #they are detected at the first run
# least_important_features = []
# # #they are detected after the least_important_features are removed and it has increased cv score also so I add it

# df_train.drop(columns =least_important_features,inplace = True)

# for feature in least_important_features:
#     cat_cols.remove(feature)
#     feature_cols.remove(feature)

# MODEL INITIALIZATION

In [None]:
def lgb_objective(trial):
    """Optuna objective for LightGBM: mean partial AUC over 5 grouped CV folds.

    Each trial builds an under-sampling + ``LGBMClassifier`` pipeline with the
    suggested hyper-parameters and scores it with ``custom_metric`` using
    ``StratifiedGroupKFold`` grouped by ``group_col``.

    The CV splitter and the under-sampler are seeded with ``seed`` so every
    trial is evaluated on the same folds and the study is reproducible;
    previously both were unseeded, so differences between trials were partly
    split/sampling noise.

    Parameters
    ----------
    trial : optuna.trial.Trial

    Returns
    -------
    float
        Mean of the per-fold ``custom_metric`` scores (to be maximized).
    """
    params = {
        'objective':         'binary',
        'verbosity':         -1,
        'n_iter':            200,
        'boosting_type':     'gbdt',
        # Sampled on a linear scale (no log=True), unlike the regularization terms below.
        'learning_rate':     trial.suggest_float('learning_rate', 0.001, 0.1),
        # NOTE: the Optuna parameter is recorded under the name 'auto_class_weights', so code
        # that re-uses best_trial.params must map that key back to 'class_weight'.
        'class_weight':      trial.suggest_categorical('auto_class_weights', ['balanced']),
        'num_leaves':        trial.suggest_int('num_leaves', 15, 250),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 200, 1000),
        'max_bin':           trial.suggest_int('max_bin', 50, 200),
        'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.2, 0.95),
        'bagging_freq':      trial.suggest_int('bagging_freq', 1, 7),
        # log=True samples uniformly in log space, so each order of magnitude of the
        # regularization strength is explored equally often.
        'lambda_l2':         trial.suggest_float('lambda_l2', 1e-4, 10.0, log=True),
        'lambda_l1':         trial.suggest_float('lambda_l1', 1e-4, 10.0, log=True),
        'scale_pos_weight':  trial.suggest_float('scale_pos_weight', 0.5, 6.0),
        'random_state':      seed,
    }

    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', lgb.LGBMClassifier(**params)),
    ])

    X = df_train_XGB_LGBM[feature_cols_with_OHE]
    y = df_train_XGB_LGBM[target_col]
    groups = df_train_XGB_LGBM[group_col]
    # Fixed random_state: identical folds for every trial.
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=custom_metric,
    )

    return np.mean(val_score)

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBoost: mean partial AUC over 5 grouped CV folds.

    Each trial builds an under-sampling + ``XGBClassifier`` pipeline with the
    suggested hyper-parameters and scores it with ``custom_metric`` using
    ``StratifiedGroupKFold`` grouped by ``group_col``.

    The CV splitter and the under-sampler are seeded with ``seed`` so every
    trial is evaluated on the same folds and the study is reproducible;
    previously both were unseeded.

    Parameters
    ----------
    trial : optuna.trial.Trial

    Returns
    -------
    float
        Mean of the per-fold ``custom_metric`` scores (to be maximized).
    """
    params = {
        'objective':          'binary:logistic',
        'n_estimators':       200,
        'tree_method':        'hist',
        'random_state':       seed,
        'learning_rate':      trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':          trial.suggest_int('max_depth', 4, 8),
        'lambda':             trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha':              trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'subsample':          trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode':   trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'scale_pos_weight':   trial.suggest_float('scale_pos_weight', 0.8, 4.0),
    }

    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', xgb.XGBClassifier(**params)),
    ])

    X = df_train_XGB_LGBM[feature_cols_with_OHE]
    y = df_train_XGB_LGBM[target_col]
    groups = df_train_XGB_LGBM[group_col]
    # Fixed random_state: identical folds for every trial.
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=custom_metric,
    )

    return np.mean(val_score)

In [None]:
# Master switch for the Optuna cells below. When False the studies are not created,
# so any later cell that reads study_lgb / study_cb / study_xgb needs another source of parameters.
DO_TUNING = True

In [None]:
if DO_TUNING:
    # LightGBM: 90 TPE trials maximizing the cross-validated partial AUC.
    start_time = time.time()
    study_lgb = optuna.create_study(
        sampler=TPESampler(seed=seed),
        direction='maximize',
    )
    study_lgb.optimize(lgb_objective, n_trials=90)
    end_time = time.time()
    elapsed_time_lgb = end_time - start_time
    print(f"LightGBM tuning took {elapsed_time_lgb:.2f} seconds.")

[I 2025-06-01 15:37:58,089] A new study created in memory with name: no-name-3b07054b-2ade-48be-a3e9-dcd55d206200
[I 2025-06-01 15:38:21,882] Trial 0 finished with value: 0.1602796041221647 and parameters: {'learning_rate': 0.03807947176588889, 'auto_class_weights': 'balanced', 'num_leaves': 239, 'min_data_in_leaf': 786, 'max_bin': 140, 'bagging_fraction': 0.31701398033182737, 'bagging_freq': 2, 'lambda_l2': 0.00019517224641449495, 'lambda_l1': 2.1423021757741068, 'scale_pos_weight': 3.8061325645876485}. Best is trial 0 with value: 0.1602796041221647.
[I 2025-06-01 15:38:48,690] Trial 1 finished with value: 0.15932449693322506 and parameters: {'learning_rate': 0.07109918520180851, 'auto_class_weights': 'balanced', 'num_leaves': 19, 'min_data_in_leaf': 976, 'max_bin': 175, 'bagging_fraction': 0.3592543330087071, 'bagging_freq': 2, 'lambda_l2': 0.0008260808399079611, 'lambda_l1': 0.0033205591037519565, 'scale_pos_weight': 3.386160373977308}. Best is trial 0 with value: 0.1602796041221647

LightGBM tuning took 3553.88 seconds.


In [None]:
def cb_objective(trial):
    """Optuna objective for CatBoost: mean partial AUC over 5 grouped CV folds.

    Each trial builds an under-sampling + ``CatBoostClassifier`` pipeline with
    the suggested hyper-parameters (categorical columns are passed natively via
    ``cat_features=cat_cols``) and scores it with ``custom_metric`` using
    ``StratifiedGroupKFold`` grouped by ``group_col``.

    The CV splitter and the under-sampler are seeded with ``seed`` so every
    trial is evaluated on the same folds and the study is reproducible;
    previously both were unseeded.

    Parameters
    ----------
    trial : optuna.trial.Trial

    Returns
    -------
    float
        Mean of the per-fold ``custom_metric`` scores (to be maximized).
    """
    params = {
        'loss_function':     'Logloss',
        'iterations':        200,
        'verbose':           False,
        'random_state':      seed,
        'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':         trial.suggest_int('max_depth', 4, 8),
        'l2_leaf_reg':       trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'subsample':         trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 5, 100),
        'scale_pos_weight':  trial.suggest_float('scale_pos_weight', 0.8, 4.0),
        'bootstrap_type':    'Bernoulli',
        'cat_features':      cat_cols,
    }

    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', cb.CatBoostClassifier(**params)),
    ])

    X = df_train_CB[feature_cols_without_OHE]
    y = df_train_CB[target_col]

    groups = df_train_CB[group_col]

    # Fixed random_state: identical folds for every trial.
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=custom_metric,
    )

    return np.mean(val_score)

In [None]:
if DO_TUNING:
    # CatBoost: 60 TPE trials maximizing the cross-validated partial AUC.
    start_time = time.time()
    study_cb = optuna.create_study(
        sampler=TPESampler(seed=seed),
        direction='maximize',
    )
    study_cb.optimize(cb_objective, n_trials=60)
    end_time = time.time()
    elapsed_time_cb = end_time - start_time
    print(f"CatBoost tuning took {elapsed_time_cb:.2f} seconds.")

[I 2025-06-01 16:37:12,136] A new study created in memory with name: no-name-1d1258dc-a0b3-401c-b57c-54e74c9f3133
[I 2025-06-01 16:39:57,837] Trial 0 finished with value: 0.16019108115507677 and parameters: {'learning_rate': 0.023688639503640783, 'max_depth': 8, 'l2_leaf_reg': 0.8471801418819978, 'subsample': 0.759195090518222, 'colsample_bylevel': 0.4936111842654619, 'min_data_in_leaf': 19, 'scale_pos_weight': 0.9858675589382383}. Best is trial 0 with value: 0.16019108115507677.
[I 2025-06-01 16:41:43,563] Trial 1 finished with value: 0.15622107470073282 and parameters: {'learning_rate': 0.07348118405270448, 'max_depth': 7, 'l2_leaf_reg': 0.679657809075816, 'subsample': 0.41235069657748147, 'colsample_bylevel': 0.9819459112971965, 'min_data_in_leaf': 84, 'scale_pos_weight': 1.4794851541704839}. Best is trial 0 with value: 0.16019108115507677.
[I 2025-06-01 16:42:25,744] Trial 2 finished with value: 0.15374758312515674 and parameters: {'learning_rate': 0.015199348301309814, 'max_depth'

CatBoost tuning took 8786.69 seconds.


In [None]:
if DO_TUNING:
    # XGBoost: 60 TPE trials maximizing the cross-validated partial AUC.
    start_time = time.time()
    study_xgb = optuna.create_study(
        sampler=TPESampler(seed=seed),
        direction='maximize',
    )
    study_xgb.optimize(xgb_objective, n_trials=60)
    end_time = time.time()
    elapsed_time_xgb = end_time - start_time
    print(f"XGBoost tuning took {elapsed_time_xgb:.2f} seconds.")

[I 2025-06-01 19:03:38,933] A new study created in memory with name: no-name-54b2a309-f885-4fd3-a319-d95319d0add3
[I 2025-06-01 19:04:34,458] Trial 0 finished with value: 0.1652810863241569 and parameters: {'learning_rate': 0.023688639503640783, 'max_depth': 8, 'lambda': 0.8471801418819978, 'alpha': 0.24810409748678125, 'subsample': 0.4936111842654619, 'colsample_bytree': 0.49359671220172163, 'colsample_bynode': 0.4348501673009197, 'scale_pos_weight': 3.571763666479793}. Best is trial 0 with value: 0.1652810863241569.
[I 2025-06-01 19:05:26,940] Trial 1 finished with value: 0.1669207775334132 and parameters: {'learning_rate': 0.039913058785616795, 'max_depth': 7, 'lambda': 0.0012087541473056963, 'alpha': 7.579479953348009, 'subsample': 0.899465584480253, 'colsample_bytree': 0.5274034664069657, 'colsample_bynode': 0.5090949803242604, 'scale_pos_weight': 1.3868944315309881}. Best is trial 1 with value: 0.1669207775334132.
[I 2025-06-01 19:06:12,056] Trial 2 finished with value: 0.1654408

XGBoost tuning took 3598.38 seconds.


In [None]:
if DO_TUNING:
    # Print the best trial of each study, in the order LightGBM, CatBoost, XGBoost.
    for label, study in (
        ("Best LGBM trial:", study_lgb),
        ("Best CatBoost trial:", study_cb),
        ("Best XGBoost trial:", study_xgb),
    ):
        print(label, study.best_trial)

Best LGBM trial: FrozenTrial(number=80, state=TrialState.COMPLETE, values=[0.17117780885872408], datetime_start=datetime.datetime(2025, 6, 1, 16, 29, 38, 1979), datetime_complete=datetime.datetime(2025, 6, 1, 16, 30, 25, 236649), params={'learning_rate': 0.025554664888037208, 'auto_class_weights': 'balanced', 'num_leaves': 217, 'min_data_in_leaf': 523, 'max_bin': 190, 'bagging_fraction': 0.8281147049112076, 'bagging_freq': 3, 'lambda_l2': 0.06607163051401742, 'lambda_l1': 0.00015559287678760844, 'scale_pos_weight': 1.760048194662142}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=False, low=0.001, step=None), 'auto_class_weights': CategoricalDistribution(choices=('balanced',)), 'num_leaves': IntDistribution(high=250, log=False, low=15, step=1), 'min_data_in_leaf': IntDistribution(high=1000, log=False, low=200, step=1), 'max_bin': IntDistribution(high=200, log=False, low=50, step=1), 'bagging_fraction': FloatDistr

TRAINING

LOOKING FOR FEATURE IMPORTANCE (lgb + xgb)

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline


def create_lgb_pipeline(params):
    """Build an under-sampling + LightGBM classification pipeline.

    Args:
        params: Mapping of keyword arguments (for example the ``params`` of an
            Optuna best trial) forwarded to ``lgb.LGBMClassifier``. It must
            not repeat the keywords fixed below (``objective``, ``verbosity``,
            ``n_estimators``, ``random_state``), otherwise Python raises
            ``TypeError`` for the duplicated keyword.

    Returns:
        An unfitted ``ImbPipeline`` with a ``'sampler'`` step followed by a
        ``'classifier'`` step.
    """
    return ImbPipeline([
        # Seed the sampler as well as the classifier so repeated runs draw the
        # same under-sampled training set. The ratio comes from the
        # notebook-level ``sampling_ratio``.
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
        ('classifier', lgb.LGBMClassifier(
            objective='binary',
            verbosity=-1,
            n_estimators=200,
            random_state=42,
            **params  # use the caller's parameters, not the global study
        )),
    ])

# Build the LightGBM pipeline from the best Optuna trial's parameters.
lgb_estimator = create_lgb_pipeline(study_lgb.best_trial.params)

# Fit on the one-hot-encoded feature set.
X = df_train_XGB_LGBM[feature_cols_with_OHE]
y = df_train_XGB_LGBM[target_col]
lgb_estimator.fit(X, y)


def create_xgb_pipeline(params):
    """Build an under-sampling + XGBoost classification pipeline.

    Args:
        params: Mapping of keyword arguments (for example the ``params`` of an
            Optuna best trial) forwarded to ``xgb.XGBClassifier``. It must
            not repeat the keywords fixed below (``objective``,
            ``n_estimators``, ``tree_method``, ``random_state``), otherwise
            Python raises ``TypeError`` for the duplicated keyword.

    Returns:
        An unfitted ``ImbPipeline`` with a ``'sampler'`` step followed by a
        ``'classifier'`` step.
    """
    return ImbPipeline([
        # Seed the sampler as well as the classifier so repeated runs draw the
        # same under-sampled training set. The ratio comes from the
        # notebook-level ``sampling_ratio``.
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
        ('classifier', xgb.XGBClassifier(
            objective='binary:logistic',
            n_estimators=200,
            tree_method='hist',
            random_state=42,
            **params  # use the caller's parameters, not the global study
        )),
    ])
# Build the XGBoost pipeline from the best Optuna trial's parameters.
xgb_estimator = create_xgb_pipeline(study_xgb.best_trial.params)

# Fit on the same one-hot-encoded feature set used for LightGBM.
X = df_train_XGB_LGBM[feature_cols_with_OHE]
y = df_train_XGB_LGBM[target_col]
xgb_estimator.fit(X, y)

In [None]:
# Gates the per-model feature-importance reports printed in the cells below.
DO_FEATURE_IMPORTANCE_MODELS = True
# NOTE(review): not referenced in the nearby cells; presumably gates a
# test-set feature-importance check elsewhere in the notebook. Confirm.
DO_FEATURE_IMPORTANCE_TEST = True

In [None]:
if DO_FEATURE_IMPORTANCE_MODELS:
    # Take feature names from the column list both models were fitted on
    # rather than from the notebook-global `X`: `X` is rebound by other cells,
    # so reading it here makes the result depend on execution order.
    ohe_feature_names = list(feature_cols_with_OHE)

    # LightGBM feature importance (importance_type='gain').
    lgb_model = lgb_estimator.named_steps['classifier']
    lgb_feature_importance = lgb_model.booster_.feature_importance(importance_type='gain')
    assert len(lgb_feature_importance) == len(ohe_feature_names), \
        "LightGBM importance vector does not match feature_cols_with_OHE"
    lgb_feature_importance_df = pd.DataFrame({
        'feature': ohe_feature_names,
        'importance': lgb_feature_importance
    }).sort_values(by='importance', ascending=False)

    print("LightGBM Feature Importance:")
    print(lgb_feature_importance_df)

    # XGBoost feature importance (importance_type='weight').
    xgb_model = xgb_estimator.named_steps['classifier']
    xgb_feature_importance = xgb_model.get_booster().get_score(importance_type='weight')
    # get_score() leaves out features that were never used in a split. Those
    # are the least important ones, so add them back with importance 0;
    # otherwise they can never appear in the "least important" tables below.
    unknown_xgb_features = set(xgb_feature_importance) - set(ohe_feature_names)
    assert not unknown_xgb_features, \
        f"XGBoost reported features outside feature_cols_with_OHE: {sorted(unknown_xgb_features)[:5]}"
    xgb_feature_importance_df = pd.DataFrame({
        'feature': ohe_feature_names,
        'importance': [float(xgb_feature_importance.get(name, 0.0)) for name in ohe_feature_names]
    }).sort_values(by='importance', ascending=False)

    print("XGBoost Feature Importance:")
    print(xgb_feature_importance_df)

    # Pandas display settings for the longer tables printed below.
    pd.set_option('display.max_rows', 100)  # number of rows shown
    pd.set_option('display.max_columns', None)  # show all columns

    # Least important features for LightGBM.
    least_important_lgb = lgb_feature_importance_df.sort_values(by='importance').head(100)
    print("\nLeast Important Features in LightGBM:")
    print(least_important_lgb)

    # Least important features for XGBoost (now including never-used features).
    least_important_xgb = xgb_feature_importance_df.sort_values(by='importance').head(100)
    print("\nLeast Important Features in XGBoost:")
    print(least_important_xgb)

    # Features that are among the least important for both models.
    common_least_important_features = pd.merge(
        least_important_lgb[['feature']],
        least_important_xgb[['feature']],
        on='feature'
    )

    print("\nCommon Least Important Features in Both LightGBM and XGBoost:")
    print(common_least_important_features)

LightGBM Feature Importance:
                                           feature    importance
157  age_normalized_nevi_confidence_2_patient_norm  12716.113711
8                                         tbp_lv_H   7014.563448
97                           tbp_lv_H_patient_norm   5442.551528
89                         age_approx_patient_norm   4087.237096
133           lesion_visibility_score_patient_norm   3266.709234
..                                             ...           ...
291                                      onehot_23      0.000000
293                                      onehot_25      0.000000
303                                      onehot_35      0.000000
304                                      onehot_36      0.000000
0                                       age_approx      0.000000

[315 rows x 2 columns]
XGBoost Feature Importance:
                                           feature  importance
155  age_normalized_nevi_confidence_2_patient_norm       194.0
176          

In [None]:
def create_cat_pipeline(params):
    """Build an under-sampling + CatBoost classification pipeline.

    Args:
        params: Mapping of keyword arguments (for example the ``params`` of an
            Optuna best trial) forwarded to ``cb.CatBoostClassifier``. It
            must not repeat the keywords fixed below (``loss_function``,
            ``iterations``, ``random_seed``, ``silent``, ``cat_features``),
            otherwise Python raises ``TypeError`` for the duplicated keyword.

    Returns:
        An unfitted ``ImbPipeline`` with a ``'sampler'`` step followed by a
        ``'classifier'`` step. Categorical columns are taken from the
        notebook-level ``cat_cols``.
    """
    return ImbPipeline([
        # Seed the sampler as well as the classifier so repeated runs draw the
        # same under-sampled training set. The ratio comes from the
        # notebook-level ``sampling_ratio``.
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
        ('classifier', cb.CatBoostClassifier(
            loss_function='Logloss',
            iterations=200,
            random_seed=42,
            silent=True,
            cat_features=cat_cols,
            **params  # use the parameters actually passed to the function
        )),
    ])

# Build the CatBoost pipeline from the best Optuna trial and fit it on the
# feature set without one-hot-encoded columns.
cat_estimator = create_cat_pipeline(study_cb.best_trial.params)
X = df_train_XGB_LGBM[feature_cols_without_OHE]
y = df_train_XGB_LGBM[target_col]
cat_estimator.fit(X, y)

if DO_FEATURE_IMPORTANCE_MODELS:
    # CatBoost feature importance, labelled with the columns it was fitted on.
    cat_model = cat_estimator.named_steps['classifier']
    cat_feature_importance = cat_model.get_feature_importance()
    cat_feature_importance_df = pd.DataFrame(
        {'feature': X.columns, 'importance': cat_feature_importance}
    ).sort_values(by='importance', ascending=False)

    print("CatBoost Feature Importance:")
    print(cat_feature_importance_df)

    # Pandas display settings: show up to 100 rows and every column.
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', None)

    # The 100 features with the lowest CatBoost importance.
    least_important_cat = cat_feature_importance_df.sort_values(by='importance').head(100)
    print("\nLeast Important Features in CatBoost:")
    print(least_important_cat)

    # Intersect the least-important lists of all three models
    # (the LightGBM and XGBoost tables come from the earlier importance cell).
    common_least_important_features = (
        least_important_lgb[['feature']]
        .merge(least_important_xgb[['feature']], on='feature')
        .merge(least_important_cat[['feature']], on='feature')
    )

    print("\nCommon Least Important Features in LightGBM, XGBoost, and CatBoost:")
    print(common_least_important_features)


CatBoost Feature Importance:
                                    feature  importance
103                   tbp_lv_H_patient_norm    2.911975
8                                  tbp_lv_H    2.363092
104                tbp_lv_Hext_patient_norm    2.014676
38                  lesion_color_difference    1.696822
36                             hue_contrast    1.668643
..                                      ...         ...
186     clin_size_long_diam_mm_patient_mean    0.000000
93                   tbp_lv_location_simple    0.000000
263  border_irregularity_index_patient_mean    0.000000
242           std_dev_contrast_patient_mean    0.000000
220         lesion_shape_index_patient_mean    0.000000

[274 rows x 2 columns]

Least Important Features in CatBoost:
                                         feature  importance
220              lesion_shape_index_patient_mean    0.000000
269  gradient_based_color_asymmetry_patient_mean    0.000000
242                std_dev_contrast_patient_mean    0

In [None]:
"""TEST PREDICTION"""

# Ensure df_subm exposes isic_id as a column. Guarding the reset keeps this
# cell idempotent: an unconditional reset_index() adds an extra 'index' /
# 'level_0' column on every re-run and eventually raises.
if 'isic_id' not in df_subm.columns:
    df_subm = df_subm.reset_index()  # moves isic_id from the index to a column

# Positive-class probabilities from each model.
lgb_predictions = lgb_estimator.predict_proba(df_test[feature_cols_with_OHE])[:, 1]
xgb_predictions = xgb_estimator.predict_proba(df_test[feature_cols_with_OHE])[:, 1]
cat_predictions = cat_estimator.predict_proba(df_test[feature_cols_without_OHE])[:, 1]

# NOTE(review): predictions are attached to df_subm by position, which assumes
# df_test rows are in the same order as df_subm. Confirm upstream.
assert len(lgb_predictions) == len(xgb_predictions) == len(cat_predictions) == len(df_subm), \
    "prediction length does not match the submission frame"

# Individual model submissions (optional).
df_subm_temp = df_subm.copy()
for _submission_file, _model_predictions in (
    ('submission_lgb.csv', lgb_predictions),
    ('submission_xgb.csv', xgb_predictions),
    ('submission_cat.csv', cat_predictions),
):
    df_subm_temp['target'] = _model_predictions
    df_subm_temp[['isic_id', 'target']].to_csv(_submission_file, index=False)

# Unweighted mean of the three models' probabilities.
average_predictions = (lgb_predictions + xgb_predictions + cat_predictions) / 3

# Final submission: keep only isic_id and the blended target.
df_subm['target'] = average_predictions
df_subm[['isic_id', 'target']].to_csv('submission.csv', index=False)

display(df_subm[['isic_id', 'target']].head())


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.69672
1,ISIC_0015729,0.615805
2,ISIC_0015740,0.67489
