<table style="border-collapse: collapse; width: 100%;">
  <thead>
    <tr>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">Model Name</th>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">Notebook Version</th>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">1st best CV</th>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">2nd best CV</th>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">3rd best CV</th>
      <th style="padding: 8px; text-align: center; background-color: #8E716C; color: white;">Public LB</th>
    </tr>
  </thead>
  <tbody>
    <tr style="background-color: #B08E86; color: white;">
      <td style="padding: 8px; text-align: center;">EffNetV1B0</td>
      <td style="padding: 8px; text-align: center;">27</td>
      <td style="padding: 8px; text-align: center;">0.186</td>
      <td style="padding: 8px; text-align: center;">0.1857</td>
      <td style="padding: 8px; text-align: center;">0.1852</td>
      <td style="padding: 8px; text-align: center;">0.177</td>
    </tr>
    <tr style="background-color: #C8AAA3; color: white;">
      <td style="padding: 8px; text-align: center;">Eva02</td>
      <td style="padding: 8px; text-align: center;">28</td>
      <td style="padding: 8px; text-align: center;">0.1844</td>
      <td style="padding: 8px; text-align: center;">0.1843</td>
      <td style="padding: 8px; text-align: center;">0.1837</td>
      <td style="padding: 8px; text-align: center;">0.174</td>
    </tr>
    <tr style="background-color: #B08E86; color: white;">
      <td style="padding: 8px; text-align: center;">Image3</td>
      <td style="padding: 8px; text-align: center;">29</td>
      <td style="padding: 8px; text-align: center;">0.1821</td>
      <td style="padding: 8px; text-align: center;">0.1809</td>
      <td style="padding: 8px; text-align: center;">0.1809</td>
      <td style="padding: 8px; text-align: center;">0.176</td>
    </tr>
    <tr style="background-color: #C8AAA3; color: white;">
      <td style="padding: 8px; text-align: center;">EdgeNext</td>
      <td style="padding: 8px; text-align: center;">30</td>
      <td style="padding: 8px; text-align: center;">0.1898</td>
      <td style="padding: 8px; text-align: center;">0.1897</td>
      <td style="padding: 8px; text-align: center;">0.1896</td>
      <td style="padding: 8px; text-align: center;">0.176</td>
    </tr>
    <tr style="background-color: #B08E86; color: white;">
      <td style="padding: 8px; text-align: center;">ImageNet</td>
      <td style="padding: 8px; text-align: center;">31</td>
      <td style="padding: 8px; text-align: center;">0.1748</td>
      <td style="padding: 8px; text-align: center;">0.1748</td>
      <td style="padding: 8px; text-align: center;">0.1744</td>
      <td style="padding: 8px; text-align: center;">0.178</td>
    </tr>
  </tbody>
</table>

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Imports</span></b>

In [1]:
# Handle warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data preprocessing
import numpy as np
import polars as pl
import pandas as pd
from pathlib import Path

In [3]:
# Exploratory data analysis
import plotly.express as px
import plotly.graph_objects as go

In [4]:
# Evaluation metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix

In [5]:
# Model development
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GroupKFold

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Configuration</span></b>

In [6]:
class CFG:
    
    # Paths to competition data
    train_path = Path('/kaggle/input/isic-2024-challenge/train-metadata.csv') 
    test_path = Path('/kaggle/input/isic-2024-challenge/test-metadata.csv') 
    subm_path = Path('/kaggle/input/isic-2024-challenge/sample_submission.csv') 
    
    # Feature engineering arguments
    N = 39300
    batch_size = 131072
    
    # First model
    use_cnn1 = False
    tr1_path = Path('/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv')
    te1_path = Path('submission_effnetv1b0.csv')
    
    # Second model
    use_cnn2 = False
    tr2_path = Path('/kaggle/input/isic-inference-eva02-for-training-data/train_eva02.csv')
    te2_path = Path('submission_eva02.csv')
    
    # Third model
    use_cnn3 = False
    tr3_path = Path('/kaggle/input/isic-2024-pl-submission-script-and-preds/train_preds.csv')
    te3_path = Path('submission_image3.csv')
    
    # Fourth model
    use_cnn4 = False
    tr4_path = Path('/kaggle/input/isic-inference-edgenext-for-training-data/train_edgenext.csv')
    te4_path = Path('submission_edgenext.csv')
    
    # Fifth model
    use_cnn5 = True
    tr5_path = Path('/kaggle/input/isic-2024-imagenet-lr-ramp-target-mods/v5oof_predictions.csv')
    te5_path = Path('submission_imagenet.csv')
    
    # Model development arguments
    colorscale = 'Redor'
    early_stop = 100
    top_models = 3
    
    # LightGBM weight and parameters
    lgb_w = 0.28
    lgb_p = {
        'min_child_samples': 48,
        'num_iterations': 6000,
        'learning_rate': 0.03,
        'objective': 'binary',
        'extra_trees': True,
        'metric': 'binary',
        'reg_lambda': 0.8,
        'reg_alpha': 0.1,
        'num_leaves': 64,
        'device': 'cpu',
        'max_bin': 128,
        'max_depth': 4,
        'verbose': -1,
        'seed': 42
    }
    
    # CatBoost weight and parameters
    ctb_w = 0.72
    ctb_p = {
        'grow_policy': 'Depthwise',
        'loss_function': 'Logloss',
        'min_child_samples': 48,
        'learning_rate': 0.03,
        'random_state': 42,
        'task_type': 'CPU',
        'reg_lambda': 0.8,
        'num_trees': 6000,
        'depth': 4
    }

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Infer image models</span></b>

In [7]:
# Select first model
if CFG.use_cnn1:
    
    # Execute script to generate predictions on test data
    !python /kaggle/input/isic-script-inference-effnetv1b0-f313ae/main.py /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
    !mv submission.csv submission_effnetv1b0.csv

In [8]:
# Select second  model
if CFG.use_cnn2:
    
    # Execute script to generate predictions on test data
    !python /kaggle/input/isic-script-inference-eva02/main.py /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
    !mv submission.csv submission_eva02.csv

In [9]:
# Select third model
if CFG.use_cnn3:
    
    # Execute script to generate predictions on test data
    !python /kaggle/input/isic-2024-pl-submission-script-and-preds/pl_submission.py
    !mv submission.csv submission_image3.csv

In [10]:
# Select fourth model
if CFG.use_cnn4:
    
    # Execute script to generate predictions on test data
    !python /kaggle/input/isic-script-inference-edgenext/main.py /kaggle/input/isic-pytorch-training-edgenext/Final_model.bin
    !mv submission.csv submission_edgenext.csv

In [11]:
# Select fifth model
if CFG.use_cnn5:
    
    # Execute script to generate predictions on test data
    !python /kaggle/input/script-inference-imagenet/script.py
    !mv submission.csv submission_imagenet.csv

Using device: cuda
Predicting: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.03s/it]
Predictions saved to submission.csv
        isic_id    target
0  ISIC_0015657  0.008290
1  ISIC_0015729  0.002698
2  ISIC_0015740  0.001408


In [12]:
class FE:
    
    def __init__(self, 
                 N, 
                 batch_size,
                 use_cnn1,
                 tr1_path,
                 te1_path,
                 use_cnn2,
                 tr2_path,
                 te2_path,
                 use_cnn3,
                 tr3_path,
                 te3_path,
                 use_cnn4,
                 tr4_path,
                 te4_path,
                 use_cnn5,
                 tr5_path,
                 te5_path):
        
        self.N = N
        self.batch_size = batch_size
        self.use_cnn1 = use_cnn1
        self.tr1_path = tr1_path
        self.te1_path = te1_path
        self.use_cnn2 = use_cnn2
        self.tr2_path = tr2_path
        self.te2_path = te2_path
        self.use_cnn3 = use_cnn3
        self.tr3_path = tr3_path
        self.te3_path = te3_path
        self.use_cnn4 = use_cnn4
        self.tr4_path = tr4_path
        self.te4_path = te4_path
        self.use_cnn5 = use_cnn5
        self.tr5_path = tr5_path
        self.te5_path = te5_path
        
    def filter_data(self, path):
        
        # Read dataset as polars DataFrame
        df = pl.read_csv(path, batch_size=self.batch_size)
            
        # Drop redundant columns 
        for col in ['image_type', # Only one unique value on train metadata
                    'tbp_lv_location_simple', # Similar information to 'tbp_lv_location'
                    'copyright_license', # Redundant information for lesion classification
                    
                    # Included only on train metadata
                    'lesion_id',
                    'iddx_full',
                    'iddx_1',
                    'iddx_2',
                    'iddx_3',
                    'iddx_4',
                    'iddx_5',
                    'mel_mitotic_index',
                    'mel_thick_mm',
                    'tbp_lv_dnn_lesion_confidence']:
            
            if col in df.columns:
                df = df.drop(col)            
                    
        return df 
    
    def set_datatypes(self, df):
        
        # Handle NA values in age approximation column
        if ('age_approx' in df.columns) and df.select(pl.col('age_approx').str.contains('NA').any()).item():
            
            # Replace the value with -1
            df = df.with_columns(pl.when(pl.col('age_approx') == 'NA').then(-1).otherwise(pl.col('age_approx'))
                   .alias('age_approx'))
            
        # Define numeric columns (int)
        for col in ['target',
                    'age_approx',
                    'tbp_lv_symm_2axis_angle']:
            
            # Set dtype for numeric columns (int)
            if col in df.columns:
                df = df.with_columns(pl.col(col).cast(pl.Int16))
                        
        # Define numeric columns (float)
        for col in ['clin_size_long_diam_mm', 
                    'tbp_lv_A', 
                    'tbp_lv_Aext', 
                    'tbp_lv_B', 
                    'tbp_lv_Bext', 
                    'tbp_lv_C', 
                    'tbp_lv_Cext',
                    'tbp_lv_H', 
                    'tbp_lv_Hext',
                    'tbp_lv_L', 
                    'tbp_lv_Lext',
                    'tbp_lv_areaMM2', 
                    'tbp_lv_area_perim_ratio',
                    'tbp_lv_color_std_mean',
                    'tbp_lv_deltaA', 
                    'tbp_lv_deltaB',
                    'tbp_lv_deltaL', 
                    'tbp_lv_deltaLB',
                    'tbp_lv_deltaLBnorm',
                    'tbp_lv_eccentricity', 
                    'tbp_lv_minorAxisMM',
                    'tbp_lv_nevi_confidence',
                    'tbp_lv_norm_border',
                    'tbp_lv_norm_color',
                    'tbp_lv_perimeterMM',
                    'tbp_lv_radial_color_std_max',
                    'tbp_lv_stdL',
                    'tbp_lv_stdLExt',
                    'tbp_lv_symm_2axis',
                    'tbp_lv_x',
                    'tbp_lv_y',
                    'tbp_lv_z']: 
            
            # Set dtype for numeric columns (float)
            if col in df.columns:
                df = df.with_columns(pl.col(col).cast(pl.Float32))
                
        # Define categorical columns
        for col in ['sex', 
                    'anatom_site_general', 
                    'tbp_tile_type', 
                    'tbp_lv_location',
                    'attribution']:
            
            # Set dtype for categorical columns
            if col in df.columns:
                df = df.with_columns(pl.col(col).cast(pl.Categorical))
                                            
        return df
    
    def aggregate_data(self, df):

        df = df.with_columns([

            # Ratio between the smallest and largest diameter of the lesion
            pl.col('tbp_lv_minorAxisMM')
            .truediv(pl.col('clin_size_long_diam_mm'))
            .cast(pl.Float32).alias('tbp_lv_diam_ratio'),

            # Absolute difference between the hue (color tone) inside and outside the lesion
            (pl.col('tbp_lv_H')
            .sub(pl.col('tbp_lv_Hext'))).abs()
            .cast(pl.Float32).alias('tbp_lv_H_contrast'),

            # Absolute difference in lightness (L) between the inside and outside of the lesion
            (pl.col('tbp_lv_L')
            .sub(pl.col('tbp_lv_Lext'))).abs()
            .cast(pl.Float32).alias('tbp_lv_L_contrast'),

            # Euclidean distance combining the differences in the L*A*B* color channels
            (pl.col('tbp_lv_deltaA')
            .add(pl.col('tbp_lv_deltaB'))
            .add(pl.col('tbp_lv_deltaL')))
            .sqrt()
            .cast(pl.Float32).alias('tbp_lv_vector_deltaLAB'),

            # Shape complexity index, calculated as the ratio of lesion area to the square of its perimeter
            pl.col('tbp_lv_areaMM2')
            .truediv(pl.col('tbp_lv_perimeterMM').pow(2))
            .cast(pl.Float32).alias('tbp_lv_shape_index'),

            # Ratio of lesion area to perimeter
            pl.col('tbp_lv_areaMM2')
            .truediv(pl.col('tbp_lv_perimeterMM'))
            .cast(pl.Float32).alias('tbp_lv_ratio_area_perim'),

            # Ratio of lesion perimeter to area
            pl.col('tbp_lv_perimeterMM')
            .truediv(pl.col('tbp_lv_areaMM2'))
            .cast(pl.Float32).alias('tbp_lv_ratio_perim_area'),

            # Combined score of border irregularity and asymmetry
            pl.col('tbp_lv_norm_border')
            .add(pl.col('tbp_lv_symm_2axis'))
            .cast(pl.Float32).alias('border_complexity'),

            # Measure of color distribution uniformity, calculated as the ratio of color irregularity to asymmetry
            pl.col('tbp_lv_color_std_mean')
            .truediv(pl.col('tbp_lv_radial_color_std_max')
                     .add(pl.lit(1e-6)))  # Adding a small constant to avoid division by zero
            .cast(pl.Float32).alias('color_uniformity'),

            # 3D Euclidean distance of the lesion's position, based on its (x, y, z) coordinates
            (pl.col('tbp_lv_x').pow(2)
             .add(pl.col('tbp_lv_y').pow(2))
             .add(pl.col('tbp_lv_z').pow(2)))
            .sqrt()
            .cast(pl.Float32).alias('position_distance_3d'),

            # Visibility score combining lesion contrast and normalized color metrics
            pl.col('tbp_lv_deltaLBnorm')
            .add(pl.col('tbp_lv_norm_color'))
            .cast(pl.Float32).alias('lesion_visibility_score'),

            # Concatenation of the general anatomical site and the lesion's specific location
            pl.col('anatom_site_general')
            .add('_')
            .add(pl.col('tbp_lv_location'))
            .cast(pl.Categorical).alias('combined_anatomical_site'),

            # Product of symmetry and border irregularity scores
            pl.col('tbp_lv_symm_2axis')
            .mul(pl.col('tbp_lv_norm_border'))
            .cast(pl.Float32).alias('symmetry_border_consistency'),

            # Ratio of the product of symmetry and border irregularity to their sum
            pl.col('tbp_lv_symm_2axis')
            .mul(pl.col('tbp_lv_norm_border'))
            .truediv(pl.col('tbp_lv_symm_2axis')
                     .add(pl.col('tbp_lv_norm_border')))
            .cast(pl.Float32).alias('consistency_symmetry_border'),

            # Ratio of standard deviation of lightness inside the lesion to the external lightness
            pl.col('tbp_lv_stdL')
            .truediv(pl.col('tbp_lv_Lext'))
            .cast(pl.Float32).alias('color_consistency'),

            # Combination of internal and external lightness variability, normalized to their sum
            pl.col('tbp_lv_stdL')
            .mul(pl.col('tbp_lv_Lext'))
            .truediv(pl.col('tbp_lv_stdL')
                     .add(pl.col('tbp_lv_Lext')))
            .cast(pl.Float32).alias('consistency_color'),

            # Interaction term between the lesion's size and the patient's age
            pl.col('clin_size_long_diam_mm')
            .mul(pl.col('age_approx'))
            .cast(pl.Float32).alias('size_age_interaction'),

            # Interaction between the lesion's hue and color variance
            pl.col('tbp_lv_H')
            .mul(pl.col('tbp_lv_color_std_mean'))
            .cast(pl.Float32).alias('hue_color_std_interaction'),

            # Composite index for lesion severity, averaging border irregularity, color variation, and eccentricity
            (pl.col('tbp_lv_norm_border')
             .add(pl.col('tbp_lv_norm_color'))
             .add(pl.col('tbp_lv_eccentricity')))
            .truediv(3)
            .cast(pl.Float32).alias('lesion_severity_index'),

            ])

        df = df.with_columns([

            # Combined index of shape complexity, including both border complexity and shape index
            pl.col('border_complexity')
            .add(pl.col('tbp_lv_shape_index'))
            .cast(pl.Float32).alias('shape_complexity_index'),

            # Composite color contrast index, summing the differences in L*A*B* channels and the contrast with surrounding skin
            pl.col('tbp_lv_deltaA')
            .add(pl.col('tbp_lv_deltaB'))
            .add(pl.col('tbp_lv_deltaL'))
            .add(pl.col('tbp_lv_deltaLBnorm'))
            .cast(pl.Float32).alias('color_contrast_index'),

            # Logarithmic transformation of lesion area, to reduce the skewness of area values
            pl.col('tbp_lv_areaMM2').log1p()
            .cast(pl.Float32).alias('log_lesion_area'),

            # Normalized lesion size by dividing the long diameter by the patient's approximate age
            pl.col('clin_size_long_diam_mm')
            .truediv(pl.col('age_approx'))
            .cast(pl.Float32).alias('normalized_lesion_size'),

            # Mean of the hue inside and outside the lesion.
            (pl.col('tbp_lv_H')
             .add(pl.col('tbp_lv_Hext')))
            .truediv(2)
            .cast(pl.Float32).alias('mean_hue_difference'),

            # Standard deviation of contrast across L*A*B* channels, measuring overall color variability
            ((pl.col('tbp_lv_deltaA').pow(2)
             .add(pl.col('tbp_lv_deltaB').pow(2))
             .add(pl.col('tbp_lv_deltaL').pow(2)))
            .truediv(3))
            .sqrt()
            .cast(pl.Float32).alias('std_dev_contrast'),

            # Composite index combining color irregularity, area-to-perimeter ratio, and symmetry
            (pl.col('tbp_lv_color_std_mean')
             .add(pl.col('tbp_lv_area_perim_ratio'))
             .add(pl.col('tbp_lv_symm_2axis')))
            .truediv(3)
            .cast(pl.Float32).alias('color_shape_composite_index'),

            # 3D orientation of the lesion, calculated using the arctangent of its Y and X coordinates
            pl.arctan2(pl.col('tbp_lv_y'), 
                       pl.col('tbp_lv_x'))
            .cast(pl.Float32).alias('lesion_orientation_3d'),

            # Mean color difference across the L*A*B* channels, providing an overall color difference score
            (pl.col('tbp_lv_deltaA')
             .add(pl.col('tbp_lv_deltaB'))
             .add(pl.col('tbp_lv_deltaL')))
            .truediv(3)
            .cast(pl.Float32).alias('overall_color_difference'),

            # Interaction between symmetry and perimeter.
            pl.col('tbp_lv_symm_2axis')
            .mul(pl.col('tbp_lv_perimeterMM'))
            .cast(pl.Float32).alias('symmetry_perimeter_interaction'),

            # Average of area-perimeter ratio, eccentricity, color irregularity, and symmetry
            (pl.col('tbp_lv_area_perim_ratio')
             .add(pl.col('tbp_lv_eccentricity'))
             .add(pl.col('tbp_lv_norm_color'))
             .add(pl.col('tbp_lv_symm_2axis')))
            .truediv(4)
            .cast(pl.Float32).alias('comprehensive_lesion_index'),

            # Ratio of internal color variance to external color standard deviation
            pl.col('tbp_lv_color_std_mean')
            .truediv(pl.col('tbp_lv_stdLExt'))
            .cast(pl.Float32).alias('color_variance_ratio'),

            # Interaction between border irregularity and color irregularity
            pl.col('tbp_lv_norm_border')
            .mul(pl.col('tbp_lv_norm_color'))
            .cast(pl.Float32).alias('border_color_interaction'),

            # Normalized interaction between border irregularity and color irregularity
            pl.col('tbp_lv_norm_border')
            .mul(pl.col('tbp_lv_norm_color'))
            .truediv(pl.col('tbp_lv_norm_border')
                     .add(pl.col('tbp_lv_norm_color')))
            .cast(pl.Float32).alias('border_color_interaction_2'),

            # Ratio of lesion size to contrast with the surrounding skin
            pl.col('clin_size_long_diam_mm')
            .truediv(pl.col('tbp_lv_deltaLBnorm'))
            .cast(pl.Float32).alias('size_color_contrast_ratio'),

            # Nevus confidence score normalized by patient age
            pl.col('tbp_lv_nevi_confidence')
            .truediv(pl.col('age_approx'))
            .cast(pl.Float32).alias('age_normalized_nevi_confidence'),

            # Geometric mean of lesion size and age
            (pl.col('clin_size_long_diam_mm').pow(2)
             .add(pl.col('age_approx').pow(2)))
            .sqrt()
            .cast(pl.Float32).alias('age_normalized_nevi_confidence_2'),

            # Interaction between color asymmetry and symmetry
            pl.col('tbp_lv_radial_color_std_max')
            .mul(pl.col('tbp_lv_symm_2axis'))
            .cast(pl.Float32).alias('color_asymmetry_index'),

            # Approximation of lesion volume in 3D space
            pl.col('tbp_lv_areaMM2')
            .mul((pl.col('tbp_lv_x').pow(2)
                  .add(pl.col('tbp_lv_y').pow(2))
                  .add(pl.col('tbp_lv_z').pow(2)))
                 .sqrt())
            .cast(pl.Float32).alias('volume_approximation_3d'),

            # Sum of absolute differences between the L*A*B* channels inside and outside the lesion
            (pl.col('tbp_lv_L')
             .sub(pl.col('tbp_lv_Lext'))).abs()
            .add((pl.col('tbp_lv_A')
                  .sub(pl.col('tbp_lv_Aext'))).abs())
            .add((pl.col('tbp_lv_B')
                  .sub(pl.col('tbp_lv_Bext'))).abs())
            .cast(pl.Float32).alias('color_range'),

            # Interaction between lesion eccentricity and color irregularity
            pl.col('tbp_lv_eccentricity')
            .mul(pl.col('tbp_lv_color_std_mean'))
            .cast(pl.Float32).alias('shape_color_consistency'),

            # Border length ratio, calculated as the ratio of the lesion's perimeter to a perfect circle's perimeter with the same area
            pl.col('tbp_lv_perimeterMM')
            .truediv(pl.lit(2)
                     .mul(np.pi)
                     .mul((pl.col('tbp_lv_areaMM2')
                           .truediv(np.pi))
                          .sqrt()))
            .cast(pl.Float32).alias('border_length_ratio'),

            # Composite index combining age, lesion size, and symmetry, indicating the interaction between these factors
            pl.col('age_approx')
            .mul(pl.col('clin_size_long_diam_mm'))
            .mul(pl.col('tbp_lv_symm_2axis'))
            .cast(pl.Float32).alias('age_size_symmetry_index'),

            # Alternative composite index combining age, lesion area, and symmetry
            pl.col('age_approx')
            .mul(pl.col('tbp_lv_areaMM2'))
            .mul(pl.col('tbp_lv_symm_2axis'))
            .cast(pl.Float32).alias('index_age_size_symmetry'),

            # Count of lesions per patient
            pl.col('isic_id').count()
            .over('patient_id')
            .cast(pl.Int16).alias('tbp_lv_count'),

        ])

        # Aggregate all numeric (float) columns using the z-score
        df = df.with_columns([
            pl.col(col).sub(pl.col(col).mean()).truediv(pl.col(col).std()).over('patient_id')
            .cast(pl.Float32).alias(f'{col}_zscore') for col in df.columns if df[col].dtype == pl.Float32
        ])

        return df
    
    def extract_cat_cols(self, df):
        
        # Define a list of categorical columns
        cat_cols = []
        
        # Find categorical columns
        for col in df.columns:
            if df[col].dtype == pl.Categorical:
                cat_cols.append(col)
                
        return cat_cols
    
    def add_cnn_preds(self, df):

        # Select first model
        if self.use_cnn1:

            if 'target' in df.columns:

                # Load train predictions of the first model
                tr1 = pl.read_csv(self.tr1_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(tr1.select(pl.col('target_effnetv1b0').cast(pl.Float32).alias('target_effnetv1b0')))

            else:

                # Load test predictions of the first model
                te1 = pl.read_csv(self.te1_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(te1.select(pl.col('target').cast(pl.Float32).alias('target_effnetv1b0')))

        # Select second model
        if self.use_cnn2:

            if 'target' in df.columns:

                # Load train predictions of the second model
                tr2 = pl.read_csv(self.tr2_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(tr2.select(pl.col('target_eva02').cast(pl.Float32).alias('target_eva02')))

            else:

                # Load test predictions of the second model
                te2 = pl.read_csv(self.te2_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(te2.select(pl.col('target').cast(pl.Float32).alias('target_eva02')))

        # Select third model
        if self.use_cnn3:

            if 'target' in df.columns:

                # Load train predictions of the third model
                tr3 = pl.read_csv(self.tr3_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(tr3.select(pl.col('pred').cast(pl.Float32).alias('target_3')))

            else:

                # Load test predictions of the third model
                te3 = pl.read_csv(self.te3_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(te3.select(pl.col('target').cast(pl.Float32).alias('target_3')))

        # Select fourth model
        if self.use_cnn4:

            if 'target' in df.columns:

                # Load train predictions of the fourth model
                tr4 = pl.read_csv(self.tr4_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(tr4.select(pl.col('target_edgenext').cast(pl.Float32).alias('target_edgenext')))

            else:

                # Load test predictions of the fourth model
                te4 = pl.read_csv(self.te4_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(te4.select(pl.col('target').cast(pl.Float32).alias('target_edgenext')))

        # Select fifth model
        if self.use_cnn5:

            if 'target' in df.columns:

                # Load train predictions of the fifth model
                tr5 = pl.read_csv(self.tr5_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(tr5.select(pl.col('oof_prediction').cast(pl.Float32).alias('oof_prediction')))

            else:

                # Load test predictions of the fifth model
                te5 = pl.read_csv(self.te5_path, batch_size=self.batch_size)

                # Assign predictions to the dataset
                df = df.with_columns(te5.select(pl.col('target').cast(pl.Float32).alias('oof_prediction')))

        return df
    
    def select_benign_cases(self, df):
        
        # Ensure downsampling only works for train data (which include the label column)
        if 'target' in df.columns:
    
            # Extract the counts of positive and negative cases
            p_cases = df[df['target'] == 1]
            n_cases = df[df['target'] == 0]

            # Select N negative cases, for initial dataset
            n_cases = n_cases.sample(n=self.N * 10, random_state=42)

            # Concatenate reduced negative cases with positive cases
            df = pd.concat([n_cases, p_cases])

        return df
    
    def display_info(self, df):
        
        # Display the shape of the DataFrame
        print(f'Shape: {df.shape}')
            
        # Display count of unique patients
        count = df['patient_id'].nunique()
        print(f'Unique patients: {count}')
        
        # Display the memory usage of the DataFrame
        mem = df.memory_usage().sum() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))
    
    def process_data(self, path):
        
        # Load and clean dataset
        df = self.filter_data(path)
        
        # Set proper datatypes
        df = self.set_datatypes(df)
        
        # Aggregate dataset
        df = self.aggregate_data(df)
            
        # Extract categorical columns
        cat_cols = self.extract_cat_cols(df)
        
        # Add CNN generated predictions to the DataFrame
        df = self.add_cnn_preds(df)
        
        # Convert to pandas DataFrame
        df = df.to_pandas()
            
        # Downsample benign cases
        df = self.select_benign_cases(df)

        # Reset DataFrame indices
        df = df.reset_index(drop=True)

        # Display info about DataFrame
        self.display_info(df)
        
        return df, cat_cols
    
    def downsample_data(self, df, index):

        # Separate the malignant (positive) and benign (negative) cases
        p_cases = df[df['target'] == 1] 
        n_cases = df[df['target'] == 0]
            
        # Raise error if the index is invalid
        if not (1 <= index <= 10):
            raise ValueError('Index must be an integer between 1 and 10!')
            
        # Define start and finish indices for downsampling
        start = (index - 1) * self.N
        finish = index * self.N
        
        # Select N unique benign (negative) cases for training dataset
        n_cases = n_cases.iloc[start:finish]

        # Concatenated selected negative cases with all positive cases
        df = pd.concat([p_cases, n_cases], axis=0)
        
        # Reset the indices
        df = df.reset_index(drop=True)
        
        # Display info about DataFrame
        self.display_info(df)
        
        return df

In [13]:
# Initialize class for feature engineering
fe = FE(CFG.N, 
        CFG.batch_size,
        CFG.use_cnn1,
        CFG.tr1_path,
        CFG.te1_path,
        CFG.use_cnn2,
        CFG.tr2_path,
        CFG.te2_path,
        CFG.use_cnn3,
        CFG.tr3_path,
        CFG.te3_path,
        CFG.use_cnn4,
        CFG.tr4_path,
        CFG.te4_path,
        CFG.use_cnn5,
        CFG.tr5_path,
        CFG.te5_path)

In [14]:
class Metrics:
    
    @staticmethod
    def pauc(y_true, y_scores, tpr_threshold=0.8):
        
        # Rescale labels: set 0s to 1s and 1s to 0s (because sklearn only has max_fpr, not min_tpr)
        rescaled_labels = abs(np.asarray(y_true) - 1)

        # Flip the prediction scores to their complements (to work with rescaled label)
        flipped_preds = -1.0 * np.asarray(y_scores)

        # Calculate the maximum false positive rate based on the given TPR threshold
        max_fpr = abs(1 - tpr_threshold)

        # Calculate the ROC curve
        fpr, tpr, _ = roc_curve(rescaled_labels, flipped_preds, sample_weight=None)

        # Find the index where FPR exceeds max_fpr
        interp_idx = np.searchsorted(fpr, max_fpr, 'right')

        # Define points for linear interpolation
        x_interp = [fpr[interp_idx - 1], fpr[interp_idx]]
        y_interp = [tpr[interp_idx - 1], tpr[interp_idx]]

        # Add interpolated point to TPR and FPR arrays
        tpr = np.append(tpr[:interp_idx], np.interp(max_fpr, x_interp, y_interp))
        fpr = np.append(fpr[:interp_idx], max_fpr)

        # Calculate the partial AUC
        partial_auc = auc(fpr, tpr)
        
        return partial_auc
    
    @staticmethod
    def plot_cv(fold_scores, model_name):
        
        # Round the fold scores to 4 decimal places
        fold_scores = [round(score, 4) for score in fold_scores]
        mean_score = round(np.mean(fold_scores), 4)
        std_score = round(np.std(fold_scores), 4)

        # Create a new figure for plotting
        fig = go.Figure()

        # Add scatter plot for individual fold scores
        fig.add_trace(go.Scatter(
            x = list(range(1, len(fold_scores) + 1)),
            y = fold_scores,
            mode = 'markers', 
            name = 'Fold Scores',
            marker = dict(size = 24, color='#93C572', symbol='diamond'),  # Diamond shape marker, colored Pistachio
            text = [f'{score:.4f}' for score in fold_scores],
            hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
            hoverlabel=dict(font=dict(size=16))  # Adjust the font size here
        ))

        # Add a horizontal line for the mean score
        fig.add_trace(go.Scatter(
            x = [1, len(fold_scores)],
            y = [mean_score, mean_score],
            mode = 'lines',
            name = f'Mean: {mean_score:.4f}',
            line = dict(dash = 'dash', color = '#F08000'), # Colored Tangerine
            hoverinfo = 'none'
        ))

        # Update the layout of the plot
        fig.update_layout(
            title = f'{model_name} Cross-Validation pAUC Scores | Variation of CV scores: {mean_score} ± {std_score}',
            xaxis_title = 'Fold',
            yaxis_title = 'pAUC Score',
            plot_bgcolor = 'rgba(0,0,0,0)',
            paper_bgcolor = 'rgba(0,0,0,0)',
            xaxis = dict(
                gridcolor = 'lightgray',
                tickmode = 'linear',
                tick0 = 1,
                dtick = 1,
                range = [0.5, len(fold_scores) + 0.5]
            ),
            yaxis = dict(gridcolor = 'lightgray')
        )

        # Display the plot
        fig.show() 
        
    @staticmethod
    def plot_cm(y_true, y_pred, colorscale, valid_pauc):
        
        # Get unique labels
        labels = sorted(np.unique(y_true))
        
        # Compute ROC curve to find the TPR80 threshold
        _, tpr, thresholds = roc_curve(y_true, y_pred)

        # Find the TRP80 threshold
        tpr80 = thresholds[np.where(tpr >= 0.8)[0][0]]

        # Compute confusion matrix
        cm = confusion_matrix(y_true, 
                              y_pred=(y_pred >= tpr80).astype(int), 
                              labels=labels)
        
        # Create the heatmap
        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=labels,
            y=labels,
            colorscale=colorscale,
            zmin=0,
            
            # Use the maximum value in the confusion matrix
            zmax=np.max(cm),  
            text=cm,
            texttemplate='%{text:.0f}',
            hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z:,.0f}<extra></extra>',
            
            # Create a custom hover text format
            customdata = [str(int(val)) for val in cm.flatten()]
        ))
        
        # Update layout for a transparent background and square aspect ratio
        fig.update_layout(
            title = f'Ensemble pAUC score: {round(valid_pauc, 4)} | Confusion Matrix:',
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            xaxis_title='Predicted Labels',
            yaxis_title='True Labels',
            xaxis=dict(constrain='domain'),
            yaxis=dict(constrain='domain', scaleanchor='x'),
            width=900,  
            height=900,  
            margin=dict(t=90, b=90, l=90, r=90) 
        )
        
        # Show the plot
        fig.show()

In [15]:
class MD:
    
    def __init__(self, 
                 colorscale,
                 early_stop,
                 top_models,
                 lgb_w,
                 lgb_p,
                 ctb_w,
                 ctb_p):
        
        self.colorscale = colorscale
        self.early_stop = early_stop
        self.top_models = top_models
        self.lgb_w = lgb_w
        self.lgb_p = lgb_p
        self.ctb_w = ctb_w
        self.ctb_p = ctb_p
    
    def train_lgb(self, data, cat_cols, title):
        
        # Convert categorical columns to category dtype
        for col in cat_cols:
            data[col] = data[col].astype('category')
        
        # Split features and label
        X = data.drop(['target', 'isic_id', 'patient_id'], axis=1)
        y = data['target']
        groups = data['patient_id']
        
        # Initialize cross validation strategy (GroupKFold)
        cv = GroupKFold(5)
        
        # Initialize lists to store models, cv scores, and OOF predictions
        models, scores = [], []
        oof_preds = np.zeros(len(X))
        
        # Perform cross-validation
        for fold, (train_index, valid_index) in enumerate(cv.split(X, y, groups)):
            
            # Split the data into training and validation sets for the current fold
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
            # Create LightGBM datasets
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols, reference=train_data)
            
            # Train the model
            model = lgb.train(self.lgb_p, 
                              train_data, 
                              valid_sets=[valid_data], 
                              callbacks=[lgb.early_stopping(self.early_stop, verbose=0), 
                                         lgb.log_evaluation(0)])
            
            # Append the trained model to the list
            models.append(model)
            
            # Make predictions on the validation set
            oof_preds[valid_index] = model.predict(X_valid)
            
            # Calculate and store the pAUC score for the current (valid) fold
            score = Metrics.pauc(y_valid, oof_preds[valid_index])
            scores.append(score)
        
        # Plot the cross-validation results
        Metrics.plot_cv(scores, title)
        
        return models, oof_preds
    
    def train_ctb(self, data, cat_cols, title):
        
        # Convert categorical columns to string
        for col in cat_cols:
            data[col] = data[col].astype(str)
        
        # Split features and label
        X = data.drop(['target', 'isic_id', 'patient_id'], axis=1)
        y = data['target']
        groups = data['patient_id']
        
        # Initialize cross validation strategy (GroupKFold)
        cv = GroupKFold(5)
        
        # Initialize lists to store models, cv scores, and OOF predictions
        models, scores = [], []
        oof_preds = np.zeros(len(X))
        
        # Perform cross-validation
        for fold, (train_index, valid_index) in enumerate(cv.split(X, y, groups)):
            
            # Split the data into training and validation sets for the current fold
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
            # Create CatBoost pools
            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
            
            # Initialize CatBoost
            model = CatBoostClassifier(**self.ctb_p, verbose=0)
            
            # Train the model
            model.fit(train_pool, 
                      eval_set=valid_pool, 
                      early_stopping_rounds=self.early_stop)
            
            # Append the trained model to the list
            models.append(model)
            
            # Make predictions on the validation set
            oof_preds[valid_index] = model.predict_proba(valid_pool)[:, 1]
            
            # Calculate and store the pAUC score for the current (valid) fold
            score = Metrics.pauc(y_valid, oof_preds[valid_index])
            scores.append(score)
        
        # Plot the cross-validation results
        Metrics.plot_cv(scores, title)
        
        return models, oof_preds

    def infer_lgb(self, data, cat_cols, models):

        # Convert categorical columns to category dtype
        for col in cat_cols:
            data[col] = data[col].astype('category')

        # Return the averaged predictions of LightGBM classifiers
        return np.mean([model.predict(data) for model in models], axis=0)
    
    def infer_ctb(self, data, cat_cols, models):
        
        # Convert categorical columns to string
        for col in cat_cols:
            data[col] = data[col].astype(str)
        
        # Create CatBoost pool for inference
        pool = Pool(data, cat_features=cat_cols)
        
        # Return the averaged predictions of CatBoost classifiers
        return np.mean([model.predict_proba(pool)[:, 1] for model in models], axis=0)
    
    def generate_preds(self, train_data, test_data, cat_cols):
        
        # Extract features columns and label
        X = train_data.drop(['target', 'isic_id', 'patient_id'], axis=1)
        y = train_data['target']
        
        # Train LightGBM and CatBoost
        lgb_models, oof_lgb_preds = self.train_lgb(train_data, cat_cols, 'LightGBM')
        ctb_models, oof_ctb_preds = self.train_ctb(train_data, cat_cols, 'CatBoost')
        
        # Blend the out-of-fold (OOF) predictions of LightGBM and CatBoost
        valid_preds = oof_lgb_preds * self.lgb_w + oof_ctb_preds * self.ctb_w
        
        # Calculate pAUC score and plot confusion matrix for out-of-fold (OOF) predictions
        valid_pauc = Metrics.pauc(y, valid_preds)
        Metrics.plot_cm(y, valid_preds, self.colorscale, valid_pauc)  
        
        # Prepare test data for inference
        test_data = test_data.drop(['isic_id', 'patient_id'], axis=1)

        # Infer LightGBM and CatBoost on test data
        test_lgb_preds = self.infer_lgb(test_data, cat_cols, lgb_models)
        test_ctb_preds = self.infer_ctb(test_data, cat_cols, ctb_models)
        
        # Blend LightGBM and CatBoost predictions
        test_preds = test_lgb_preds * self.lgb_w + test_ctb_preds * self.ctb_w
        
        return test_preds, valid_pauc
    
    def cherry_pick_models(self, results):
        
        # Sort the list of tuples by pAUC score in descending order (best score first)
        results.sort(key=lambda x: x[1], reverse=True)

        # Select the ensembles with highest validation pAUC score
        models = results[:self.top_models] 

        # Extract predictions and pAUC scores of the ensembles
        top_preds = [preds for preds, pauc in models]
        top_scores = [pauc for preds, pauc in models]

        # Round the values of the pAUC scores to 4 digits
        top_scores = [round(score, 4) for score in top_scores]
        print('(Ranked) pAUC scores:', *top_scores)
        
        # Return top predictions as a NumPy array
        return np.array(top_preds)

In [16]:
# Initialize class for model development
md = MD(CFG.colorscale,
        CFG.early_stop,
        CFG.top_models,
        CFG.lgb_w,
        CFG.lgb_p,
        CFG.ctb_w,
        CFG.ctb_p)

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Feature Engineering</span></b>

In [17]:
# Load train data and categorical columns
train_data, cat_cols = fe.process_data(CFG.train_path)

Shape: (393393, 161)
Unique patients: 1042
Memory usage: 234.86 MB



In [18]:
# Load test data
test_data, _ = fe.process_data(CFG.test_path)

Shape: (3, 160)
Unique patients: 3
Memory usage: 0.00 MB



In [19]:
# Initialize a list to store test predictions and pAUC scores
results = []

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Model Development</span></b>

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 1<span style="vertical-align: super; font-size: 60%;">st</span> ensemble</span></b>

In [20]:
# Create 1st training dataset
train_01 = fe.downsample_data(train_data, 1)

Shape: (39693, 161)
Unique patients: 1023
Memory usage: 23.70 MB



In [21]:
# Generate predictions on the test data using the 1st training dataset
preds_01, pauc_01 = md.generate_preds(train_01, test_data, cat_cols)

In [22]:
# Store the predictions and validation pAUC of the 1st ensemble
results.append((preds_01, pauc_01))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 2<span style="vertical-align: super; font-size: 60%;">nd</span> ensemble</span></b>

In [23]:
# Create 2nd training dataset
train_02 = fe.downsample_data(train_data, 2)

Shape: (39693, 161)
Unique patients: 1026
Memory usage: 23.70 MB



In [24]:
# Generate predictions on the test data using the 2nd training dataset
preds_02, pauc_02 = md.generate_preds(train_02, test_data, cat_cols)

In [25]:
# Store the predictions and validation pAUC of the 2nd ensemble
results.append((preds_02, pauc_02))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 3<span style="vertical-align: super; font-size: 60%;">rd</span> ensemble</span></b>

In [26]:
# Create 3rd training dataset
train_03 = fe.downsample_data(train_data, 3)

Shape: (39693, 161)
Unique patients: 1024
Memory usage: 23.70 MB



In [27]:
# Generate predictions on the test data using the 3rd training dataset
preds_03, pauc_03 = md.generate_preds(train_03, test_data, cat_cols)

In [28]:
# Store the predictions and validation pAUC of the 3rd ensemble
results.append((preds_03, pauc_03))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 4<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [29]:
# Create 4th training dataset
train_04 = fe.downsample_data(train_data, 4)

Shape: (39693, 161)
Unique patients: 1023
Memory usage: 23.70 MB



In [30]:
# Generate predictions on the test data using the 4th training dataset
preds_04, pauc_04 = md.generate_preds(train_04, test_data, cat_cols)

In [31]:
# Store the predictions and validation pAUC of the 4th ensemble
results.append((preds_04, pauc_04))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 5<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [32]:
# Create 5th training dataset
train_05 = fe.downsample_data(train_data, 5)

Shape: (39693, 161)
Unique patients: 1020
Memory usage: 23.70 MB



In [33]:
# Generate predictions on the test data using the 5th training dataset
preds_05, pauc_05 = md.generate_preds(train_05, test_data, cat_cols)

In [34]:
# Store the predictions and validation pAUC of the 5th ensemble
results.append((preds_05, pauc_05))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 6<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [35]:
# Create 6th training dataset
train_06 = fe.downsample_data(train_data, 6)

Shape: (39693, 161)
Unique patients: 1025
Memory usage: 23.70 MB



In [36]:
# Generate predictions on the test data using the 6th training dataset
preds_06, pauc_06 = md.generate_preds(train_06, test_data, cat_cols)

In [37]:
# Store the predictions and validation pAUC of the 6th ensemble
results.append((preds_06, pauc_06))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 7<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [38]:
# Create 7th training dataset
train_07 = fe.downsample_data(train_data, 7)

Shape: (39693, 161)
Unique patients: 1024
Memory usage: 23.70 MB



In [39]:
# Generate predictions on the test data using the 7th training dataset
preds_07, pauc_07 = md.generate_preds(train_07, test_data, cat_cols)

In [40]:
# Store the predictions and validation pAUC of the 7th ensemble
results.append((preds_07, pauc_07))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 8<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [41]:
# Create 8th training dataset
train_08 = fe.downsample_data(train_data, 8)

Shape: (39693, 161)
Unique patients: 1025
Memory usage: 23.70 MB



In [42]:
# Generate predictions on the test data using the 8th training dataset
preds_08, pauc_08 = md.generate_preds(train_08, test_data, cat_cols)

In [43]:
# Store the predictions and validation pAUC of the 8th ensemble
results.append((preds_08, pauc_08))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 9<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [44]:
# Create 9th training dataset
train_09 = fe.downsample_data(train_data, 9)

Shape: (39693, 161)
Unique patients: 1024
Memory usage: 23.70 MB



In [45]:
# Generate predictions on the test data using the 9th training dataset
preds_09, pauc_09 = md.generate_preds(train_09, test_data, cat_cols)

In [46]:
# Store the predictions and validation pAUC of the 9th ensemble
results.append((preds_09, pauc_09))

<b><span style='color:#E0BFB8; font-size: 30px; font-weight: bold;'>Develop 10<span style="vertical-align: super; font-size: 60%;">th</span> ensemble</span></b>

In [47]:
# Create 10th training dataset
train_10 = fe.downsample_data(train_data, 10)

Shape: (39693, 161)
Unique patients: 1023
Memory usage: 23.70 MB



In [48]:
# Generate predictions on the test data using the 10th training dataset
preds_10, pauc_10 = md.generate_preds(train_10, test_data, cat_cols)

In [49]:
# Store the predictions and validation pAUC of the 10th ensemble
results.append((preds_10, pauc_10))

<b><span style='color:#E0BFB8; font-size: 50px; font-weight: bold;'>Post processing</span></b>

In [50]:
# Cherry pick best performing ensembles
top_preds = md.cherry_pick_models(results)

(Ranked) pAUC scores: 0.1748 0.1748 0.1744


In [51]:
# Average the predictions of the best performing ensembles
preds = np.mean(top_preds, axis=0)

In [52]:
# Load submission data and assign predictions to submission DataFrame
subm_data = pd.read_csv(CFG.subm_path)
subm_data['target'] = preds

In [53]:
# Save the submission DataFrame
subm_data.to_csv('submission.csv', index=False)
display(subm_data.head())

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.411508
1,ISIC_0015729,0.521557
2,ISIC_0015740,0.537487
