### <b><span style='color:#C9A9A6'>Table of Contents</span></b> <a class='anchor' id='top'></a>

<div style="padding: 13px 13px; border-radius: 8px; color: white">
    <ul>
        <li><a href="#notebook-configuration" style="color: #C9A9A6;">Notebook Configuration</a></li>
        <li><a href="#feature-engineering" style="color: #C9A9A6;">Feature Engineering</a></li>
        <li><a href="#model-development" style="color: #C9A9A6;">Model Development</a></li>
    </ul>
</div>

<div style="background-color: #C9A9A6; padding: 10px; border-radius: 10px;">
    <a class='anchor' id='notebook-configuration'></a>
    <p style="text-align: center; font-size: 140%; font-weight: bold; margin: 0; color: black !important; font-family: 'Arial Rounded MT Bold', Arial, sans-serif; text-shadow: none;">
        <a href="#notebook-configuration" style="color: black;">Notebook Configuration</a>
    </p>
</div>

In [1]:
# System operations
import os
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [2]:
# Exploratory data analysis
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Data preprocessing
import numpy as np
import pandas as pd
import polars as pl

In [4]:
# Evaluation metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix

In [5]:
# Model development
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GroupKFold

In [6]:
class CFG:
    """Notebook-wide settings: file locations, sampling size, ensemble weights and model parameters."""

    # Roots used only to assemble the paths below; removed again at the end of the class body
    _input = Path('/kaggle/input')
    _competition = _input / 'isic-2024-challenge'

    # Competition metadata and sample submission
    train_path = _competition / 'train-metadata.csv'
    test_path = _competition / 'test-metadata.csv'
    subm_path = _competition / 'sample_submission.csv'

    # Train-set prediction files from the pretrained CNN sources
    source1_path = _input / 'isic-inference-effnetv1b0-for-training-data' / 'train_effnetv1b0.csv'
    source2_path = _input / 'nextvit' / 'train_effnetv1b0.csv'
    source3_path = _input / 'selecsls42b-in1k-drop' / 'train_effnetv1b0.csv'
    source4_path = _input / 'isic-2024-pl-submission-script-and-preds' / 'train_preds.csv'

    # Submission files of the pretrained CNN models
    # (subm1/subm4 are relative paths, i.e. resolved against the working directory)
    subm1_path = Path('submission_effnetv1b0.csv')
    subm2_path = _input / 'nextvit' / 'submission.csv'
    subm3_path = _input / 'selecsls42b-in1k-drop' / 'submission.csv'
    subm4_path = Path('submission_image3.csv')

    # Number of negative samples to retain
    N = 117900  # 1:300 = positive:negative

    # Weights applied to each model's predictions in the final blend
    lgb_weight = 0.19
    ctb_weight = 0.81

    # Early stopping steps
    early_stop = 30

    # LightGBM parameters
    lgb_params = dict(
        min_child_samples=48,
        num_iterations=3000,
        learning_rate=0.03,
        objective='binary',
        extra_trees=True,
        metric='binary',
        reg_lambda=0.8,
        reg_alpha=0.1,
        num_leaves=64,
        device='cpu',
        max_bin=128,
        max_depth=4,
        verbose=-1,
        seed=42,
    )

    # CatBoost parameters
    ctb_params = dict(
        grow_policy='Depthwise',
        loss_function='Logloss',
        min_child_samples=48,
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        reg_lambda=0.8,
        num_trees=3000,
        depth=4,
    )

    # Keep the class namespace limited to the actual settings
    del _input, _competition

<div style="background-color: #C9A9A6; padding: 10px; border-radius: 10px;">
    <a class='anchor' id='feature-engineering'></a>
    <p style="text-align: center; font-size: 140%; font-weight: bold; margin: 0; color: black !important; font-family: 'Arial Rounded MT Bold', Arial, sans-serif; text-shadow: none;">
        <a href="#feature-engineering" style="color: black;">Feature Engineering</a>
    </p>
</div>

In [7]:
# Run the effnetv1b0 inference script with the given checkpoint, then rename the
# submission.csv found in the working directory (presumably written by that script)
# to submission_effnetv1b0.csv, the file CFG.subm1_path points at.
!python /kaggle/input/isic-script-inference-effnetv1b0-f313ae/main.py /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
!mv submission.csv submission_effnetv1b0.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]


In [8]:
# Run the eva02 inference script with the given checkpoint and rename the resulting
# submission.csv to submission_eva02.csv.
# NOTE(review): none of the CFG.subm*_path entries in this notebook reference
# submission_eva02.csv - confirm whether this output is still needed.
!python /kaggle/input/isic-script-inference-eva02/main.py /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
!mv submission.csv submission_eva02.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.63it/s]


In [9]:
# Run the pl_submission.py script, then rename the submission.csv found in the working
# directory (presumably written by that script) to submission_image3.csv, the file
# CFG.subm4_path points at.
!python /kaggle/input/isic-2024-pl-submission-script-and-preds/pl_submission.py
!mv submission.csv submission_image3.csv

  df_train_meta = pd.read_csv(BASE_DATA_DIR + "train-metadata.csv")


In [10]:
class FeatureEngineering:
    """Turn a metadata CSV into a model-ready pandas DataFrame.

    ``process_data`` chains the individual steps: load with polars and drop
    unused columns, fix dtypes, derive colour ratio/contrast features, convert
    to pandas, attach the CNN prediction columns and (optionally) downsample
    the negative class.
    """

    # Columns removed right after loading (only those actually present are dropped)
    _DROP_COLS = (
        'isic_id',  # Redundant for loading train data
        'image_type',  # Only one unique value on train metadata
        'tbp_lv_location_simple',  # Similar information to 'tbp_lv_location'
        'copyright_license',  # Redundant information for lesion classification

        # Included only on train metadata
        'lesion_id',
        'iddx_full',
        'iddx_1',
        'iddx_2',
        'iddx_3',
        'iddx_4',
        'iddx_5',
        'mel_mitotic_index',
        'mel_thick_mm',
        'tbp_lv_dnn_lesion_confidence',
    )

    # Numeric columns stored as Int16
    _INT_COLS = ('target', 'age_approx', 'tbp_lv_symm_2axis_angle')

    # Numeric columns stored as Float32
    _FLOAT_COLS = (
        'clin_size_long_diam_mm',
        'tbp_lv_A', 'tbp_lv_Aext',
        'tbp_lv_B', 'tbp_lv_Bext',
        'tbp_lv_C', 'tbp_lv_Cext',
        'tbp_lv_H', 'tbp_lv_Hext',
        'tbp_lv_L', 'tbp_lv_Lext',
        'tbp_lv_areaMM2',
        'tbp_lv_area_perim_ratio',
        'tbp_lv_color_std_mean',
        'tbp_lv_deltaA',
        'tbp_lv_deltaB',
        'tbp_lv_deltaL',
        'tbp_lv_deltaLB',
        'tbp_lv_deltaLBnorm',
        'tbp_lv_eccentricity',
        'tbp_lv_minorAxisMM',
        'tbp_lv_nevi_confidence',
        'tbp_lv_norm_border',
        'tbp_lv_norm_color',
        'tbp_lv_perimeterMM',
        'tbp_lv_radial_color_std_max',
        'tbp_lv_stdL',
        'tbp_lv_stdLExt',
        'tbp_lv_symm_2axis',
        'tbp_lv_x',
        'tbp_lv_y',
        'tbp_lv_z',
    )

    # Columns stored as polars Categorical
    _CAT_COLS = ('sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'attribution')

    def __init__(self,
                 source1_path,
                 source2_path,
                 source3_path,
                 source4_path,
                 subm1_path,
                 subm2_path,
                 subm3_path,
                 subm4_path):
        """Store the locations of the CNN prediction files.

        ``source*_path`` are the CSVs read for train data, ``subm*_path`` the
        CSVs read for test data (see ``add_cnn_preds``).
        """
        self.source1_path = source1_path
        self.source2_path = source2_path
        self.source3_path = source3_path
        self.source4_path = source4_path
        self.subm1_path = subm1_path
        self.subm2_path = subm2_path
        self.subm3_path = subm3_path
        self.subm4_path = subm4_path

    def filter_data(self, path):
        """Read ``path`` as a polars DataFrame and drop the redundant columns that are present."""
        df = pl.read_csv(path, low_memory=True)

        present = [col for col in self._DROP_COLS if col in df.columns]

        return df.drop(present) if present else df

    @staticmethod
    def _cast_present(df, columns, dtype):
        """Cast those of ``columns`` that exist in ``df`` to ``dtype``; other columns are untouched."""
        exprs = [pl.col(col).cast(dtype) for col in columns if col in df.columns]
        return df.with_columns(exprs) if exprs else df

    def set_datatypes(self, df):
        """Replace 'NA' ages with -1 and cast the known columns to Int16 / Float32 / Categorical.

        Columns that are missing from ``df`` are skipped.
        """
        # The 'NA' token can only occur when the column was parsed as text, so the
        # string check is limited to that case; a column already parsed as numeric
        # goes straight to the casts below.
        if 'age_approx' in df.columns and df['age_approx'].dtype == pl.Utf8:
            has_na = df.select(pl.col('age_approx').str.contains('NA').any()).item()

            if has_na:
                # Replace the value with -1
                df = df.with_columns(pl.when(pl.col('age_approx') == 'NA').then(-1).otherwise(pl.col('age_approx'))
                       .alias('age_approx'))

        df = self._cast_present(df, self._INT_COLS, pl.Int16)
        df = self._cast_present(df, self._FLOAT_COLS, pl.Float32)
        df = self._cast_present(df, self._CAT_COLS, pl.Categorical)

        return df

    @staticmethod
    def aggregate_data(df):
        """Add ratio/contrast features for the A, B, C, H and L colour channels.

        For every channel ``X`` the following Float32 columns are appended, in
        this order:

        * ``tbp_lv_ratio_X``: ``X / (Xext * min(X))`` evaluated per ``age_approx`` group
        * ``tbp_lv_contrast_X``: ``X - Xext``
        * ``tbp_lv_patient_ratio_X`` / ``tbp_lv_patient_contrast_X``: the value
          divided by its mean within the same ``patient_id``
        * ``tbp_lv_age_ratio_X`` / ``tbp_lv_age_contrast_X``: the value divided
          by its mean within the same ``age_approx``
        """
        channels = ('A', 'B', 'C', 'H', 'L')

        # Ratio of X* to the product of X*ext and age minimum X*
        df = df.with_columns([
            pl.col(f'tbp_lv_{ch}')
            .truediv(pl.col(f'tbp_lv_{ch}ext').mul(pl.col(f'tbp_lv_{ch}').min()))
            .over('age_approx')
            .cast(pl.Float32)
            .alias(f'tbp_lv_ratio_{ch}')
            for ch in channels
        ])

        # Contrast between X* and X*ext
        df = df.with_columns([
            pl.col(f'tbp_lv_{ch}').sub(pl.col(f'tbp_lv_{ch}ext'))
            .cast(pl.Float32)
            .alias(f'tbp_lv_contrast_{ch}')
            for ch in channels
        ])

        # Ratio of each ratio/contrast feature to its group average.
        # Loop order keeps the column order: patient ratio, patient contrast,
        # age ratio, age contrast.
        for group_col, tag in (('patient_id', 'patient'), ('age_approx', 'age')):
            for kind in ('ratio', 'contrast'):
                df = df.with_columns([
                    pl.col(f'tbp_lv_{kind}_{ch}')
                    .truediv(pl.col(f'tbp_lv_{kind}_{ch}').mean())
                    .over(group_col)
                    .cast(pl.Float32)
                    .alias(f'tbp_lv_{tag}_{kind}_{ch}')
                    for ch in channels
                ])

        return df

    def extract_cat_cols(self, df):
        """Return the names of all polars Categorical columns of ``df``, in column order."""
        return [col for col in df.columns if df[col].dtype == pl.Categorical]

    def add_cnn_preds(self, df, is_train):
        """Attach the four CNN prediction columns to the pandas DataFrame ``df`` (modified in place).

        Train data reads the ``source*_path`` files, test data the ``subm*_path``
        files. The values are assigned as pandas Series, i.e. matched to ``df``
        by index label, not by an id column.
        NOTE(review): this presumably relies on every CSV listing the rows in the
        same order as the metadata file - confirm against the data sources.
        """
        # All files are read before the first assignment, so a missing file
        # leaves ``df`` untouched
        if is_train:
            # Use generated predictions for train data
            new_columns = {
                'target_effnetv1b0': pd.read_csv(self.source1_path)['target_effnetv1b0'],
                'target_nexnetv1b0': pd.read_csv(self.source2_path)['target_effnetv1b0'],
                'target_selnetv1b0': pd.read_csv(self.source3_path)['target_effnetv1b0'],
                'target_3': pd.read_csv(self.source4_path)['pred'],
            }
        else:
            # Use submission files for test data
            new_columns = {
                'target_effnetv1b0': pd.read_csv(self.subm1_path)['target'],
                'target_nexnetv1b0': pd.read_csv(self.subm2_path)['target'],
                'target_selnetv1b0': pd.read_csv(self.subm3_path)['target'],
                'target_3': pd.read_csv(self.subm4_path)['target'],
            }

        for name, values in new_columns.items():
            df[name] = values

        return df

    def downsample_data(self, df, N):
        """Keep at most ``N`` randomly chosen negatives (target == 0) plus every positive.

        The result lists the sampled negatives first, then the positives, with
        their original index labels. Sampling uses a fixed ``random_state`` of 42.
        """
        # Separate positive and negative cases
        p_cases = df[df['target'] == 1]
        n_cases = df[df['target'] == 0]

        # Cap at the number of negatives available: asking ``sample`` for more
        # rows than exist (without replacement) raises ValueError
        n_cases = n_cases.sample(n=min(N, len(n_cases)), random_state=42)

        # Concatenate reduced negative cases with positive cases
        return pd.concat([n_cases, p_cases])

    def display_info(self, df):
        """Print shape, number of unique patients and memory usage (MB) of ``df``."""
        # Display the shape of the DataFrame
        print(f'Shape: {df.shape}')

        # Display count of unique patients
        count = df['patient_id'].nunique()
        print(f'Unique patients: {count}')

        # Display the memory usage of the DataFrame
        mem = df.memory_usage().sum() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))

    def process_data(self, path, N=None, is_train=True):
        """Run the full pipeline on the CSV at ``path``.

        Args:
            path: metadata CSV to load.
            N: number of negatives to keep; ``None`` skips downsampling.
            is_train: selects which CNN prediction files are attached.

        Returns:
            ``(df, cat_cols)``: the processed pandas DataFrame and the list of
            categorical column names.
        """
        # Load and clean dataset
        df = self.filter_data(path)

        # Set proper datatypes
        df = self.set_datatypes(df)

        # Aggregate dataset
        df = self.aggregate_data(df)

        # Extract categorical columns (must happen while df is still a polars frame)
        cat_cols = self.extract_cat_cols(df)

        # Convert to pandas DataFrame
        df = df.to_pandas()

        # Add CNN-generated predictions
        df = self.add_cnn_preds(df, is_train)

        # Downsample negative cases (only for train data)
        if N is not None:
            df = self.downsample_data(df, N)

        # Display info about DataFrame
        self.display_info(df)

        return df, cat_cols

In [11]:
# Build the feature-engineering helper from the configured CNN prediction files
fe = FeatureEngineering(
    source1_path=CFG.source1_path,
    source2_path=CFG.source2_path,
    source3_path=CFG.source3_path,
    source4_path=CFG.source4_path,
    subm1_path=CFG.subm1_path,
    subm2_path=CFG.subm2_path,
    subm3_path=CFG.subm3_path,
    subm4_path=CFG.subm4_path,
)

In [12]:
# Build the train frame (negatives downsampled to CFG.N) and collect the categorical column names
train_data, cat_cols = fe.process_data(path=CFG.train_path, N=CFG.N, is_train=True)

Shape: (118293, 75)
Unique patients: 1037
Memory usage: 34.64 MB



In [13]:
# Build the test frame (no downsampling); the categorical column list from the train run is reused
test_data, _ = fe.process_data(path=CFG.test_path, N=None, is_train=False)

Shape: (3, 74)
Unique patients: 3
Memory usage: 0.00 MB



<div style="background-color: #C9A9A6; padding: 10px; border-radius: 10px;">
    <a class='anchor' id='model-development'></a>
    <p style="text-align: center; font-size: 140%; font-weight: bold; margin: 0; color: black !important; font-family: 'Arial Rounded MT Bold', Arial, sans-serif; text-shadow: none;">
        <a href="#model-development" style="color: black;">Model Development</a>
    </p>
</div>

In [14]:
class Metrics:
    """Evaluation helpers: partial AUC plus Plotly views of CV scores and the confusion matrix."""

    @staticmethod
    def calculate_pauc(y_true, y_scores, tpr_threshold=0.8):
        """Partial AUC above a true-positive-rate threshold.

        Returns the area that lies under the ROC curve and above the horizontal
        line ``TPR = tpr_threshold``. The value ranges from 0 to
        ``1 - tpr_threshold`` (0.2 for the default); a perfect ranking scores
        ``1 - tpr_threshold``.

        Args:
            y_true: binary labels; label 1 is the positive class.
            y_scores: predicted scores, higher = more likely positive.
            tpr_threshold: lower TPR bound of the region of interest.

        Raises:
            ValueError: if ``y_true`` contains fewer than two classes.
        """
        y_true = np.asarray(y_true)
        y_scores = np.asarray(y_scores, dtype=float)

        if np.unique(y_true).size < 2:
            raise ValueError('pAUC is undefined when y_true contains a single class')

        # Swap the roles of the classes and negate the scores. The ROC curve of
        # the mirrored problem has (FPR', TPR') = (1 - TPR, 1 - FPR), so the area
        # above TPR = threshold becomes the ordinary area under the mirrored
        # curve for FPR' in [0, 1 - threshold].
        max_fpr = 1 - tpr_threshold
        fpr, tpr, _ = roc_curve((y_true != 1).astype(int), -y_scores)

        if max_fpr <= 0:
            return 0.0
        if max_fpr >= 1:
            return auc(fpr, tpr)

        # Cut the curve at FPR' = max_fpr, interpolating the crossing point so
        # the segment that straddles the boundary is counted up to the boundary
        stop = np.searchsorted(fpr, max_fpr, side='right')
        tpr_at_cut = np.interp(max_fpr, fpr[stop - 1:stop + 1], tpr[stop - 1:stop + 1])
        fpr_part = np.append(fpr[:stop], max_fpr)
        tpr_part = np.append(tpr[:stop], tpr_at_cut)

        return auc(fpr_part, tpr_part)

    @staticmethod
    def plot_cv(fold_scores, model_name):
        """Plot per-fold pAUC scores with a dashed mean line; the title shows mean ± std."""
        # Summary statistics come from the raw scores; rounding is for display only
        mean_score = round(float(np.mean(fold_scores)), 4)
        std_score = round(float(np.std(fold_scores)), 4)
        fold_scores = [round(score, 4) for score in fold_scores]

        # Create a new figure for plotting
        fig = go.Figure()

        # Add scatter plot for individual fold scores
        fig.add_trace(go.Scatter(
            x = list(range(1, len(fold_scores) + 1)),
            y = fold_scores,
            mode = 'lines+markers',
            name = 'Fold Scores',
            line = dict(color = '#E30B5C', width = 2), # Raspberry
            marker = dict(size = 12, color = '#E30B5C'), # Larger markers, Raspberry
            text = [f'{score:.4f}' for score in fold_scores],
            hovertemplate = 'Fold %{x}: %{text}<extra></extra>'
        ))

        # Add a horizontal line for the mean score
        fig.add_trace(go.Scatter(
            x = [1, len(fold_scores)],
            y = [mean_score, mean_score],
            mode = 'lines',
            name = f'Mean: {mean_score:.4f}',
            line = dict(dash = 'dash', color = '#FFAC1C'), # Bright Orange
            hoverinfo = 'none'
        ))

        # Update the layout of the plot
        fig.update_layout(
            title = f'{model_name} Cross-Validation pAUC Scores | Variation of CV scores: {mean_score} ± {std_score}',
            xaxis_title = 'Fold',
            yaxis_title = 'pAUC Score',
            plot_bgcolor = 'rgba(0,0,0,0)',
            paper_bgcolor = 'rgba(0,0,0,0)',
            xaxis = dict(
                gridcolor = 'lightgray',
                tickmode = 'linear',
                tick0 = 1,
                dtick = 1,
                range = [0.5, len(fold_scores) + 0.5]
            ),
            yaxis = dict(gridcolor = 'lightgray')
        )

        # Display the plot
        fig.show()

    @staticmethod
    def plot_cm(y_true, y_pred, threshold=0.5):
        """Plot the confusion matrix of ``y_true`` against ``y_pred`` binarised at ``threshold``.

        Args:
            y_true: true labels.
            y_pred: predicted scores; values strictly above ``threshold`` count as class 1.
            threshold: decision threshold, 0.5 by default.
        """
        # Get unique labels
        labels = sorted(np.unique(y_true))

        # Compute confusion matrix
        cm = confusion_matrix(y_true,
                              y_pred=(np.asarray(y_pred) > threshold).astype(int),
                              labels=labels)

        # Create the heatmap
        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=labels,
            y=labels,
            colorscale='Redor',
            zmin=0,

            # Use the maximum value in the confusion matrix
            zmax=np.max(cm),
            text=cm,
            texttemplate='%{text:.0f}',
            hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z:,.0f}<extra></extra>',

            # Cell counts as strings; note the hovertemplate above reads %{z}, not customdata
            customdata = [str(int(val)) for val in cm.flatten()]
        ))

        # Update layout for a transparent background and square aspect ratio
        fig.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            xaxis_title='Predicted Labels',
            yaxis_title='True Labels',
            xaxis=dict(constrain='domain'),
            yaxis=dict(constrain='domain', scaleanchor='x'),
            width=800,
            height=800,
            margin=dict(t=80, b=80, l=80, r=80)
        )

        # Show the plot
        fig.show()

In [15]:
class ModelDevelopment:
    """Train LightGBM and CatBoost with patient-grouped 5-fold CV and blend their predictions.

    None of the methods modify the DataFrames passed in: dtype conversions are
    applied to a converted copy.
    """

    @staticmethod
    def train_lgb(data, cat_cols, params, early_stop):
        """Train one LightGBM model per fold of a 5-fold GroupKFold split on ``patient_id``.

        Args:
            data: frame containing the features plus ``target`` and ``patient_id``.
            cat_cols: names of the categorical feature columns.
            params: LightGBM parameter dict passed to ``lgb.train``.
            early_stop: early-stopping rounds, evaluated on the fold's validation set.

        Returns:
            List with the trained model of each fold. The per-fold pAUC scores
            are plotted as a side effect.
        """
        # Convert categorical columns to category dtype on a copy (caller's frame is left as is)
        data = data.astype({col: 'category' for col in cat_cols})

        # Split features and label
        X = data.drop(['target', 'patient_id'], axis=1)
        y = data['target']
        groups = data['patient_id']

        # Initialize cross validation strategy (GroupKFold)
        cv = GroupKFold(5)

        # Initialize lists to store models and cv scores
        models = []
        scores = []

        # Perform cross-validation
        for train_index, valid_index in cv.split(X, y, groups):

            # Split the data into training and validation sets for the current fold
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            # Create LightGBM datasets
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols, reference=train_data)

            # Train the model
            model = lgb.train(params,
                              train_data,
                              valid_sets=[valid_data],
                              callbacks=[lgb.early_stopping(early_stop, verbose=0),
                                         lgb.log_evaluation(0)])

            # Append the trained model to the list
            models.append(model)

            # Calculate and store the pAUC score for the current (valid) fold
            y_pred = model.predict(X_valid)
            scores.append(Metrics.calculate_pauc(y_valid, y_pred))

        # Plot the cross-validation results
        Metrics.plot_cv(scores, 'LightGBM')

        return models

    @staticmethod
    def train_ctb(data, cat_cols, params, early_stop):
        """Train one CatBoost classifier per fold of a 5-fold GroupKFold split on ``patient_id``.

        Args:
            data: frame containing the features plus ``target`` and ``patient_id``.
            cat_cols: names of the categorical feature columns.
            params: keyword arguments for ``CatBoostClassifier``.
            early_stop: early-stopping rounds, evaluated on the fold's validation pool.

        Returns:
            List with the trained model of each fold. The per-fold pAUC scores
            are plotted as a side effect.
        """
        # Convert categorical columns to string on a copy (caller's frame is left as is)
        data = data.astype({col: str for col in cat_cols})

        # Split features and label
        X = data.drop(['target', 'patient_id'], axis=1)
        y = data['target']
        groups = data['patient_id']

        # Initialize cross validation strategy (GroupKFold)
        cv = GroupKFold(5)

        # Initialize lists to store models and cv scores
        models = []
        scores = []

        # Perform cross-validation
        for train_index, valid_index in cv.split(X, y, groups):

            # Split the data into training and validation sets for the current fold
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            # Create CatBoost pools
            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

            # Initialize CatBoost
            model = CatBoostClassifier(**params, verbose=0)

            # Train the model
            model.fit(train_pool,
                      eval_set=valid_pool,
                      early_stopping_rounds=early_stop)

            # Append the trained model to the list
            models.append(model)

            # Calculate and store the pAUC score for the current (valid) fold
            y_pred = model.predict_proba(valid_pool)[:, 1]
            scores.append(Metrics.calculate_pauc(y_valid, y_pred))

        # Plot the cross-validation results
        Metrics.plot_cv(scores, 'CatBoost')

        return models

    @staticmethod
    def infer_lgb(data, cat_cols, models):
        """Return the mean of ``model.predict`` over ``models`` for the feature frame ``data``."""
        # Convert categorical columns to category dtype on a copy (caller's frame is left as is)
        data = data.astype({col: 'category' for col in cat_cols})

        # Average the predictions of the LightGBM classifiers
        preds = np.mean([model.predict(data) for model in models], axis=0)

        return preds

    @staticmethod
    def infer_ctb(data, cat_cols, models):
        """Return the mean positive-class probability over ``models`` for the feature frame ``data``."""
        # Convert categorical columns to string on a copy (caller's frame is left as is)
        data = data.astype({col: str for col in cat_cols})

        # Create CatBoost pool for inference
        inference_pool = Pool(data, cat_features=cat_cols)

        # Average the predictions of the CatBoost classifiers
        preds = np.mean([model.predict_proba(inference_pool)[:, 1] for model in models], axis=0)

        return preds

    def generate_preds(self, train, test, cat_cols, lgb_params, ctb_params, early_stop, lgb_weight, ctb_weight):
        """Train both model families, report the blended score on train and predict on test.

        Args:
            train: train frame with features, ``target`` and ``patient_id``.
            test: test frame with features and ``patient_id``.
            cat_cols: names of the categorical feature columns.
            lgb_params, ctb_params: model parameters for LightGBM / CatBoost.
            early_stop: early-stopping rounds used by both trainers.
            lgb_weight, ctb_weight: weights of the two models in the blend.

        Returns:
            Array with the blended prediction for every row of ``test``.
        """
        # Train LightGBM and CatBoost
        lgb_models = self.train_lgb(train, cat_cols, lgb_params, early_stop)
        ctb_models = self.train_ctb(train, cat_cols, ctb_params, early_stop)

        # Extract features and label column from train data
        X = train.drop(['target', 'patient_id'], axis=1)
        y = train['target']

        # Infer LightGBM and CatBoost on train data
        train_lgb_preds = self.infer_lgb(X, cat_cols, lgb_models)
        train_ctb_preds = self.infer_ctb(X, cat_cols, ctb_models)

        # Weight-ensemble LightGBM and CatBoost predictions
        train_preds = train_lgb_preds * lgb_weight + train_ctb_preds * ctb_weight

        # Calculate pAUC scores.
        # This is an in-sample figure: the rows scored here are the same rows
        # the fold models were fitted on, so it is not a validation estimate.
        train_pauc = Metrics.calculate_pauc(y, train_preds)
        print(f'Ensemble pAUC: {train_pauc:.3f}')

        # Plot confusion matrix for Ensemble predictions on train data
        print('Ensemble confusion matrix:')
        Metrics.plot_cm(y, train_preds)

        # Prepare test data for inference
        test = test.drop('patient_id', axis=1)

        # Infer LightGBM and CatBoost on test data
        test_lgb_preds = self.infer_lgb(test, cat_cols, lgb_models)
        test_ctb_preds = self.infer_ctb(test, cat_cols, ctb_models)

        # Weight-ensemble LightGBM and CatBoost predictions
        test_preds = test_lgb_preds * lgb_weight + test_ctb_preds * ctb_weight

        return test_preds

In [16]:
# Initialize class for model training (the constructor takes no arguments; all settings
# are passed to generate_preds)
md = ModelDevelopment()

In [17]:
# Train LightGBM and CatBoost on the train frame and get the weighted-blend predictions for the test frame
final_preds = md.generate_preds(
    train=train_data,
    test=test_data,
    cat_cols=cat_cols,
    lgb_params=CFG.lgb_params,
    ctb_params=CFG.ctb_params,
    early_stop=CFG.early_stop,
    lgb_weight=CFG.lgb_weight,
    ctb_weight=CFG.ctb_weight,
)

Ensemble pAUC: 0.198
Ensemble confusion matrix:


In [18]:
# Load the sample submission (CFG.subm_path) as the template for the final file
# and preview its first rows
subm_data = pd.read_csv(CFG.subm_path)
display(subm_data.head())

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.3
1,ISIC_0015729,0.3
2,ISIC_0015740,0.3


In [19]:
# Assign predictions to submission DataFrame.
# final_preds carries no ids, so the values are written by position.
# NOTE(review): assumes the sample submission lists the rows in the same order as
# the test metadata - confirm.
subm_data['target'] = final_preds

In [20]:
# Save the submission dataframe as submission.csv in the working directory
# (without the index column) and preview the first rows
subm_data.to_csv('submission.csv', index=False)
display(subm_data.head())

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.000271
1,ISIC_0015729,9.4e-05
2,ISIC_0015740,0.000116
