### Import necessary libraries

In [1]:
import os
import sys
import torch
import random

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

import eli5
from eli5.sklearn import PermutationImportance

from tqdm.auto import tqdm

sys.path.append('../..')
from src.utils import load_env_vars

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Device for inference

In [3]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using {DEVICE} for inference')

Using cuda for inference


### Global Configurations

In [4]:
DATA_DIR, _, _ = load_env_vars()

In [5]:
UNDER_SAMPLING_RATIO = 0.01
OVER_SAMPLING_RATIO = 0.003

N_SPLITS = 5
SEED = 42

### Reproducibility

In [6]:
def seed_everything(seed : int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(SEED)

### Paths

In [7]:
TRAIN_META_PATH = os.path.join(DATA_DIR, 'isic_2024/train-metadata.csv')
TEST_META_PATH = os.path.join(DATA_DIR, 'isic_2024/test-metadata.csv')

IMG_FEATURES = os.path.join(DATA_DIR, 'isic_2024/features.csv')
EVA_FEATURES = os.path.join(DATA_DIR, 'isic_2024/train_eva02_v2.csv')

### Data loading

In [8]:
train_metadata = pd.read_csv(TRAIN_META_PATH)
test_metadata = pd.read_csv(TEST_META_PATH)

eva_features = pd.read_csv(EVA_FEATURES)
img_features = pd.read_csv(IMG_FEATURES)

### Drop Train Only Columns

In [9]:
drop_columns = [col for col in train_metadata.columns if col not in test_metadata and col != 'target']
train_metadata.drop(columns=drop_columns, inplace=True)

### Merge Image Features

In [10]:
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
splits = list(sgkf.split(X=train_metadata, y=train_metadata.target, groups=train_metadata.patient_id))

In [11]:
oof_eva_preds = []

for i,(train_idx,val_idx) in enumerate(splits):
    a = eva_features[['isic_id',f'eva02_fold{i}']].iloc[val_idx]
    a['eva02'] = a[f'eva02_fold{i}']
    a = a.drop(columns=[f'eva02_fold{i}'])
    oof_eva_preds.append(a)

oof_eva_preds = pd.concat(oof_eva_preds)

#train_metadata = train_metadata.merge(oof_eva_preds, on='isic_id')

In [12]:
img_features = img_features.drop(columns=["resnet18","coat_lite_tiny","swin_tiny_patch4_window7_224"])
#train_metadata = train_metadata.merge(img_features, on='isic_id')

### Columns Organization

In [13]:
# Get the categorical & numerical columns
categorical_cols = train_metadata.select_dtypes(include=[object]).columns.tolist()
numerical_cols = train_metadata.select_dtypes(include=[np.number]).columns.tolist()

In [14]:
id_col, target_col, group_col = 'isic_id', 'target', 'patient_id'

irrelevant_cols = ['isic_id', 'patient_id', 'image_type', 'copyright_license', 'target']

# Filter out irrelevant columns
cat_features = [col for col in categorical_cols if col not in irrelevant_cols]
num_features = [col for col in numerical_cols if col not in irrelevant_cols]

# Train columns
train_cols = cat_features + num_features

### Impute Missing Values

In [15]:
# Fill missing values of numerical columns with median
for feature in num_features:
    median = train_metadata[feature].median()
    train_metadata[feature] = train_metadata[feature].fillna(median)
    #test_metadata[feature] = test_metadata[feature].fillna(median)

# Fill missing values of categorical columns with 'missing'
train_metadata[cat_features] = train_metadata[cat_features].fillna('missing')
#test_metadata[cat_features] = test_metadata[cat_features].fillna('missing')

### Feature Engineering

##### ABCDE criteria features

In [17]:
def generate_abcde_features(metadata: pd.DataFrame):

    epsilon = 1e-8 # To avoid division by zero
    columns = []

    pl_df = pl.DataFrame(metadata)

    # Diameter: Melanoma growths are normally larger than 6mm in diameter, which is about the diameter of a standard pencil eraser

    pl_df = pl_df.with_columns([
        # Diameter ratio: This is a measure of how elongated the mole is. A perfectly round mole has an axis ratio of 1.0
        pl.col('tbp_lv_minorAxisMM').truediv(pl.col('clin_size_long_diam_mm') + epsilon)
        .cast(pl.Float32).alias('diam_ratio'),

        # Diameter difference: Difference between the long diameter and minor axis
        pl.col('clin_size_long_diam_mm').sub(pl.col('tbp_lv_minorAxisMM'))
        .cast(pl.Float32).alias('diam_difference'),

        # Long diameter greater than 6mm
        pl.when(pl.col('clin_size_long_diam_mm') > 6).then(1.0).otherwise(0.0)
        .cast(pl.Float32).alias('long_diam_gt_6mm'),

        # Minor axis greater than 6mm
        pl.when(pl.col('tbp_lv_minorAxisMM') > 6).then(1.0).otherwise(0.0)
        .cast(pl.Float32).alias('short_diam_gt_6mm')
    ])
    columns += ['diam_ratio', 'diam_difference', 'long_diam_gt_6mm', 'short_diam_gt_6mm']


    # Evolution: Melanoma lesions often change in size, shape, color, or texture over time, while non-cancerous moles usually stay the same
    pl_df = pl_df.with_columns([
        # Position of the mole in the 3d space
        (pl.col('tbp_lv_x').pow(2) + pl.col('tbp_lv_y').pow(2) + pl.col('tbp_lv_z').pow(2)).sqrt()
        .cast(pl.Float32).alias('3d_position'),

        # Perimeter area ratio
        pl.col('tbp_lv_perimeterMM').truediv(pl.col('tbp_lv_areaMM2') + epsilon)
        .cast(pl.Float32).alias('perim_area_ratio'),

        # log area
        pl.col('tbp_lv_areaMM2').log().cast(pl.Float32).alias('log_area'),

        # log perimeter
        pl.col('tbp_lv_perimeterMM').log().cast(pl.Float32).alias('log_perimeter'),

        # Roundness: This is a measure of how round the mole is. A perfectly round mole has a roundness index of 1.0
        pl.col('tbp_lv_areaMM2').mul(4 * np.pi).truediv(pl.col('tbp_lv_perimeterMM').pow(2) + epsilon)
        .cast(pl.Float32).alias('roundness_index'),
    ])
    columns += ['3d_position', 'perim_area_ratio', 'log_area', 'log_perimeter', 'roundness_index']


    # Asymmetry: Melanoma is often asymmetrical, which means the shape isn't uniform, Non-cancerous moles are typically uniform and symmetrical
    pl_df = pl_df.with_columns([
        # Asymmetry angle sinus
        pl.col('tbp_lv_symm_2axis_angle').sin()
        .cast(pl.Float32).alias('asymmetry_angle_sin'),

        # Asymmetry angle cosinus
        pl.col('tbp_lv_symm_2axis_angle').cos()
        .cast(pl.Float32).alias('asymmetry_angle_cos'),

        # Asymmetry index
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('symmetry_index'),

        # Symmetry border interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_norm_border'))
        .cast(pl.Float32).alias('symmetry_border_interaction'),

        # Symmetry eccentricity interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('symmetry_eccentricity_interaction'),

        # Symmetry color interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_radial_color_std_max'))
        .cast(pl.Float32).alias('symmetry_color_interaction'),

        # Symmetry border jaggedness interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_area_perim_ratio'))
        .cast(pl.Float32).alias('symmetry_border_jaggedness_interaction'),
    ])
    columns += ['asymmetry_angle_sin', 'asymmetry_angle_cos', 'symmetry_index', 'symmetry_border_interaction', 'symmetry_eccentricity_interaction', 'symmetry_color_interaction', 'symmetry_border_jaggedness_interaction']


    # Color: Melanoma lesions are often more than one color or shade, Moles that are benign are usually one color
    pl_df = pl_df.with_columns([
        # Color std ratio
        pl.col('tbp_lv_color_std_mean').truediv(pl.col('tbp_lv_radial_color_std_max') + epsilon)
        .cast(pl.Float32).alias('color_std_ratio'),

        (pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext')) + pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext')) + pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext')))
        .truediv((pl.col('tbp_lv_L') + pl.col('tbp_lv_A') + pl.col('tbp_lv_B') + epsilon))
        .cast(pl.Float32).alias('color_difference_ratio'),

        (pl.col('tbp_lv_deltaA')**2 + pl.col('tbp_lv_deltaB')**2 + pl.col('tbp_lv_deltaL')**2).sqrt()
        .cast(pl.Float32).alias('color_euclidean_distance'),

        # Color eccentricity interaction
        pl.col('tbp_lv_radial_color_std_max').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('color_eccentricity_interaction'),

        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_L').mean()).truediv(pl.col('tbp_lv_stdL') + epsilon)
        .cast(pl.Float32).alias('L_normalized'),

        # H Contrast
        pl.col('tbp_lv_H').sub(pl.col('tbp_lv_Hext'))
        .cast(pl.Float32).alias('H_contrast'),

        # L Contrast
        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext'))
        .cast(pl.Float32).alias('L_contrast'),

        # A Contrast
        pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext'))
        .cast(pl.Float32).alias('A_contrast'),

        # B Contrast
        pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext'))
        .cast(pl.Float32).alias('B_contrast'),

        # C Contrast
        pl.col('tbp_lv_C').sub(pl.col('tbp_lv_Cext'))
        .cast(pl.Float32).alias('C_contrast'),

        # A/B ratio
        pl.col('tbp_lv_deltaA').truediv(pl.col('tbp_lv_deltaB') + epsilon)
        .cast(pl.Float32).alias('A_B_ratio'),

        # L uniformity
        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext')).truediv(pl.col('tbp_lv_L') + epsilon)
        .cast(pl.Float32).alias('L_uniformity'),

        # A uniformity
        pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext')).truediv(pl.col('tbp_lv_A') + epsilon)
        .cast(pl.Float32).alias('A_uniformity'),

        # B uniformity
        pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext')).truediv(pl.col('tbp_lv_B') + epsilon)
        .cast(pl.Float32).alias('B_uniformity'),

        # C uniformity
        pl.col('tbp_lv_C').sub(pl.col('tbp_lv_Cext')).truediv(pl.col('tbp_lv_C') + epsilon)
        .cast(pl.Float32).alias('C_uniformity'),

        # H uniformity ratio
        pl.col('tbp_lv_H').sub(pl.col('tbp_lv_Hext')).truediv(pl.col('tbp_lv_H') + epsilon)
        .cast(pl.Float32).alias('H_uniformity_ratio'),

        pl.col('tbp_lv_deltaA').add(pl.col('tbp_lv_deltaB')).add(pl.col('tbp_lv_deltaL'))
        .cast(pl.Float32).alias('delta_sum'),
    ])
    columns += ['color_std_ratio', 'color_difference_ratio', 'color_euclidean_distance', 'color_eccentricity_interaction', 'L_normalized', 'H_contrast', 'L_contrast', 'A_contrast', 'B_contrast', 'C_contrast', 'A_B_ratio', 'L_uniformity', 'A_uniformity', 'B_uniformity', 'C_uniformity', 'H_uniformity_ratio', 'delta_sum']

    pl_df = pl_df.with_columns([
        # lesions count by patient
        pl.col('isic_id').count().over('patient_id').alias('lesions_count_by_patient'),
        
        #(pl.col('eva02') + pl.col('eva02_tiny_patch14_224') + pl.col('coat_lite_tiny_2')).truediv(3)
        #.cast(pl.Float32).alias('image_features_ensemble')
    ])
    columns += ['lesions_count_by_patient']
    
    pd_df = pl_df.to_pandas()
    
    numericalc_cols = pd_df.select_dtypes(np.number).columns
    
    for col in numericalc_cols:
        pd_df[col] = pd_df[col].replace(np.inf, 1e8).replace(-np.inf, -1e8)

    return pd_df, columns

In [18]:
train_metadata, new_features = generate_abcde_features(train_metadata)
#test_metadata = generate_abcde_features(test_metadata)

train_cols += new_features

##### Z-Score features

In [19]:
def generate_zscore_features(metadata: pd.DataFrame):
    epsilon = 1e-8
    columns = []
    numerical_cols = metadata.select_dtypes(include=[np.number]).columns.tolist()
    df = pl.DataFrame(metadata)

    # Apply z-score aggregation to all numeric (float dtype) columns: z-score = (x - mean) / std
    z_score = [
        (pl.col(col).sub(pl.col(col).mean())).truediv(pl.col(col).std() + epsilon).over('patient_id')
        .cast(pl.Float32).alias(f'{col}_zscore') for col in numerical_cols if col != 'target'
        ]
    
    columns += [f'{col}_zscore' for col in numerical_cols if col != 'target']
    df = df.with_columns(z_score)

    df = df.to_pandas()
    
    return df, columns

In [20]:
train_metadata, new_features = generate_zscore_features(train_metadata)
test_metadata, _ = generate_zscore_features(test_metadata)

train_cols += new_features

##### Stats aggregations on img features

In [21]:
def generate_aggregated_features(metadata: pd.DataFrame, features: list):
    columns = []
    df = pl.DataFrame(metadata)

    # Apply aggregation functions to specified columns
    min_agg = [pl.col(col).min().over('patient_id').alias(f'{col}_min_by_patient') for col in features]
    max_agg = [pl.col(col).max().over('patient_id').alias(f'{col}_max_by_patient') for col in features]
    mean_agg = [pl.col(col).mean().over('patient_id').alias(f'{col}_mean_by_patient') for col in features]
    std_agg = [pl.col(col).std().over('patient_id').alias(f'{col}_std_by_patient') for col in features]
    median_agg = [pl.col(col).median().over('patient_id').alias(f'{col}_median_by_patient') for col in features]
    skew_agg = [pl.col(col).skew().over('patient_id').alias(f'{col}_skew_by_patient') for col in features]
    q25_agg = [pl.col(col).quantile(0.25).over('patient_id').alias(f'{col}_q25_by_patient') for col in features]
    q75_agg = [pl.col(col).quantile(0.75).over('patient_id').alias(f'{col}_q75_by_patient') for col in features]

    columns += [f'{col}_{agg}_by_patient' for col in features for agg in ['min', 'max', 'mean', 'std', 'median', 'skew', 'q25', 'q75']]

    df = df.with_columns(min_agg + max_agg + mean_agg + std_agg + median_agg + skew_agg + q25_agg + q75_agg)

    df = df.to_pandas()
    
    return df, columns

In [22]:
stats_features = ['tbp_lv_nevi_confidence']

train_metadata, new_features = generate_aggregated_features(train_metadata, stats_features)
#test_metadata, _ = generate_aggregated_features(test_metadata, num_features)

train_cols += new_features

In [23]:
numerical_cols = train_metadata.select_dtypes(include=[np.number]).columns.tolist()
num_features = [col for col in numerical_cols if col not in irrelevant_cols]

# Fill missing values of numerical columns with median
for feature in num_features:
    median = train_metadata[feature].median()
    train_metadata[feature] = train_metadata[feature].fillna(median)
    #test_metadata[feature] = test_metadata[feature].fillna(median)

### Encode Categorical Variables

In [24]:
def encode_categorical(df_train, df_test, cat_features):
    
    encoder = OrdinalEncoder(
        categories='auto',
        dtype=np.int32,
        handle_unknown='use_encoded_value',
        unknown_value=-2,
        encoded_missing_value=-1,
    )

    df_train[cat_features] = encoder.fit_transform(df_train[cat_features])
    df_test[cat_features] = encoder.transform(df_test[cat_features])

    return df_train, df_test

In [25]:
train_metadata, test_metadata = encode_categorical(train_metadata, test_metadata, cat_features)

### Partial AUC

In [26]:
def pauc_score(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)
    
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

In [27]:
def oof_pauc(y_true, y_hat):
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)
    
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

### Training

##### LightGBM

In [28]:
lgb_best_params = {
    'random_state': SEED,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_iter': 217,
    'learning_rate': 0.03238691082828724,
    'min_data_in_leaf': 93,
    'colsample_bytree': 0.6336220063217671,
    'colsample_bynode': 0.6449520328952557,
    'bagging_fraction': 0.4759532875970691,
    'bagging_freq': 5,        
    'lambda_l1': 0.26214445542677706,
    'lambda_l2': 0.5374677609248407,
    'num_leaves': 107,
    'max_depth': 6,
    'min_child_samples': 38,
    'scale_pos_weight': 2.454410437484347,
    'verbosity': -1,
}

##### CatBoost

In [29]:
cb_best_params = {
    'random_state': SEED,
    'loss_function': 'Logloss',
    'iterations': 240,
    'learning_rate': 0.046047767547706654,
    'scale_pos_weight': 4.670443106586375,
    'reg_lambda': 6.153764402866752,
    'subsample': 0.4712307759317672,
    'min_data_in_leaf': 26,
    'max_depth': 4,
    'cat_features': cat_features,
    'verbose': False,
}

##### XGBoost

In [30]:
xgb_best_params = {
    'random_state': SEED,
    'tree_method': 'hist',
    'objective': 'binary:logistic',
    'learning_rate': 0.09211299188601348,
    'colsample_bytree': 0.8178496272146406,
    'colsample_bynode': 0.8688187031865214,
    'colsample_bylevel': 0.3476250413253686,
    'scale_pos_weight': 2.1101053079075625,
    'max_depth': 6,
    'subsample': 0.7286594187699229,
    'lambda': 4.586386144644716,
    'alpha': 0.21956352903435347,
    'enable_categorical': True,
    'verbosity': 0,
}

##### Cross-Validation

In [31]:
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [32]:
X = train_metadata[train_cols]
y = train_metadata[target_col]
groups = train_metadata[group_col]

##### Training

In [33]:
scores = {
    'lgb': [],
    'cb': [],
    'xgb': [],
    'soft': [],
}

preds = {
    'lgb': [],
    'cb': [],
    'xgb': [],
    'soft': [],
}

models = {
    'lgb': [],
    'cb': [],
    'xgb': [],
    'soft': [],
}

y_val = []

print(f'Starting Cross-Validation on {N_SPLITS} folds\n')

for fold, (train_idx, valid_idx) in enumerate(sgkf.split(X, y, groups)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    y_val.extend(y_valid)
    
    lgb_model = Pipeline([
        ('sampler1', RandomOverSampler(sampling_strategy=OVER_SAMPLING_RATIO, random_state=SEED)),
        ('sampler2', RandomUnderSampler(sampling_strategy=UNDER_SAMPLING_RATIO, random_state=SEED)),
        ('classifier', lgb.LGBMClassifier(**lgb_best_params)),
    ])

    lgb_model.fit(X_train, y_train)
    lgb_preds = lgb_model.predict_proba(X_valid)[:,1]
    lgb_score = pauc_score(lgb_model, X_valid, y_valid)

    cb_model = Pipeline([
        ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
        ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
        ('classifier', cb.CatBoostClassifier(**cb_best_params)),
    ])

    cb_model.fit(X_train, y_train)
    cb_preds = cb_model.predict_proba(X_valid)[:,1]
    cb_score = pauc_score(cb_model, X_valid, y_valid)

    xgb_model = Pipeline([
        ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
        ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
        ('classifier', xgb.XGBClassifier(**xgb_best_params)),
    ])

    xgb_model.fit(X_train, y_train)
    xgb_preds = xgb_model.predict_proba(X_valid)[:,1]
    xgb_score = pauc_score(xgb_model, X_valid, y_valid)

    soft_preds = lgb_preds + cb_preds + xgb_preds / 3
    soft_score = oof_pauc(y_valid, soft_preds)

    scores['lgb'].append(lgb_score)
    scores['cb'].append(cb_score)
    scores['xgb'].append(xgb_score)
    scores['soft'].append(soft_score)

    models['lgb'].append(lgb_model)
    models['cb'].append(cb_model)
    models['xgb'].append(xgb_model)
    
    preds['lgb'].extend(lgb_preds)
    preds['cb'].extend(cb_preds)
    preds['xgb'].extend(xgb_preds)
    preds['soft'].extend(soft_preds)

    print(f'Fold {fold + 1} - LGB: {lgb_score:.4f} - CB: {cb_score:.4f} - XGB: {xgb_score:.4f} - Ensemble: {soft_score:.4f}\n')

print(f'Average Scores - LGB: {np.mean(scores["lgb"]):.4f} - CB: {np.mean(scores["cb"]):.4f} - XGB: {np.mean(scores["xgb"]):.4f} - Ensemble: {np.mean(scores["soft"]):.4f} \n')

Starting Cross-Validation on 5 folds

Fold 1 - LGB: 0.1596 - CB: 0.1472 - XGB: 0.1673 - Ensemble: 0.1537

Fold 2 - LGB: 0.1689 - CB: 0.1688 - XGB: 0.1688 - Ensemble: 0.1703

Fold 3 - LGB: 0.1816 - CB: 0.1800 - XGB: 0.1829 - Ensemble: 0.1824

Fold 4 - LGB: 0.1695 - CB: 0.1634 - XGB: 0.1659 - Ensemble: 0.1667

Fold 5 - LGB: 0.1694 - CB: 0.1512 - XGB: 0.1665 - Ensemble: 0.1596

Average Scores - LGB: 0.1698 - CB: 0.1621 - XGB: 0.1703 - Ensemble: 0.1665 



In [34]:
oof_scores = {
    'lgb': oof_pauc(np.array(y_val), np.array(preds['lgb'])),
    'cb': oof_pauc(np.array(y_val), np.array(preds['cb'])),
    'xgb': oof_pauc(np.array(y_val), np.array(preds['xgb'])),
    'soft': oof_pauc(np.array(y_val), np.array(preds['soft']))
}

eval_df = pd.DataFrame(scores)
eval_df.loc['mean'] = eval_df.mean(axis=0)
eval_df.loc['std'] = eval_df.std(axis=0)
eval_df.loc['oof'] = oof_scores

eval_df

Unnamed: 0,lgb,cb,xgb,soft
0,0.159583,0.147197,0.167295,0.153678
1,0.168929,0.16875,0.168767,0.170296
2,0.181629,0.180032,0.18292,0.182382
3,0.169544,0.163434,0.165863,0.166698
4,0.169381,0.151239,0.166498,0.159645
mean,0.169813,0.162131,0.170269,0.16654
std,0.007004,0.011896,0.0064,0.009778
oof,0.168786,0.159286,0.169054,0.164272


In [35]:
# save oof preds from each model as featue in new dataframe
gbdt_data = pd.DataFrame({'isic_id': train_metadata['isic_id'], 'patient_id': train_metadata['patient_id'], 'target': train_metadata['target']})

for fold, (_, val_idx) in enumerate(sgkf.split(X, y, groups)):

    gbdt_data.loc[val_idx, 'lgb'] = models['lgb'][fold].predict_proba(X.iloc[val_idx])[:,1]
    gbdt_data.loc[val_idx, 'cb'] = models['cb'][fold].predict_proba(X.iloc[val_idx])[:,1]
    gbdt_data.loc[val_idx, 'xgb'] = models['xgb'][fold].predict_proba(X.iloc[val_idx])[:,1]

gbdt_data.head()

Unnamed: 0,isic_id,patient_id,target,lgb,cb,xgb
0,ISIC_0015670,IP_1235828,0,0.000327,0.003275,0.000411
1,ISIC_0015845,IP_8170065,0,0.705232,0.936068,0.934974
2,ISIC_0015864,IP_6724798,0,0.000188,0.004573,0.000351
3,ISIC_0015902,IP_4111386,0,0.000577,0.004582,0.001001
4,ISIC_0024200,IP_8313778,0,0.001121,0.006794,0.000599


In [43]:
gbdt_data = gbdt_data.merge(oof_eva_preds, on='isic_id')
gbdt_data = gbdt_data.merge(img_features, on='isic_id')

In [44]:
gbdt_data

Unnamed: 0,isic_id,patient_id,target,lgb,cb,xgb,eva02,eva02_tiny_patch14_224,coat_lite_tiny_2
0,ISIC_0015670,IP_1235828,0,0.000327,0.003275,0.000411,0.027064,0.016618,0.006317
1,ISIC_0015845,IP_8170065,0,0.705232,0.936068,0.934974,0.012015,0.057499,0.020128
2,ISIC_0015864,IP_6724798,0,0.000188,0.004573,0.000351,0.003760,0.003339,0.000935
3,ISIC_0015902,IP_4111386,0,0.000577,0.004582,0.001001,0.009385,0.001219,0.001194
4,ISIC_0024200,IP_8313778,0,0.001121,0.006794,0.000599,0.015275,0.016641,0.000733
...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,IP_1140263,0,0.010734,0.117205,0.007470,0.517441,0.048114,0.040109
401055,ISIC_9999951,IP_5678181,0,0.000452,0.004976,0.000193,0.046718,0.001689,0.029780
401056,ISIC_9999960,IP_0076153,0,0.002193,0.024668,0.001353,0.014100,0.001682,0.001176
401057,ISIC_9999964,IP_5231513,0,0.000357,0.002312,0.000327,0.299635,0.001080,0.002088


In [75]:
train_cols = ['lgb', 'xgb', 'cb', 'eva02', 'eva02_tiny_patch14_224', 'coat_lite_tiny_2']

In [76]:
train_df, new_feats = generate_zscore_features(gbdt_data)
train_cols += new_feats

In [77]:
train_df, new_feats = generate_aggregated_features(train_df, ['lgb', 'xgb', 'cb', 'eva02', 'eva02_tiny_patch14_224', 'coat_lite_tiny_2'])
train_cols += new_feats

In [78]:
for feature in train_cols:
    median = train_df[feature].median()
    train_df[feature] = train_df[feature].fillna(median)

In [79]:
X2 = train_df[train_cols]
y2 = train_df[target_col]
groups2 = train_df[group_col]

In [80]:
# train soft on the oof preds using the same folds
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# import logistic regression
from sklearn.linear_model import LogisticRegression

scores = []

for fold, (train_idx, valid_idx) in enumerate(sgkf.split(X2, y2, groups2)):
    X_train, X_valid = X2.iloc[train_idx], X2.iloc[valid_idx]
    y_train, y_valid = y2.iloc[train_idx], y2.iloc[valid_idx]

    lr = LogisticRegression(random_state=SEED, max_iter=1000)
    lr.fit(X_train, y_train)

    preds = lr.predict_proba(X_valid)[:,1]
    score = oof_pauc(y_valid, preds)

    scores.append(score)
    print(f'Fold {fold + 1} - LR: {score:.4f}\n')

print(f'Average Scores - LR: {np.mean(scores):.4f} \n')

Fold 1 - LR: 0.1709

Fold 2 - LR: 0.1709

Fold 3 - LR: 0.1566

Fold 4 - LR: 0.1578

Fold 5 - LR: 0.1827

Average Scores - LR: 0.1678 



In [156]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

In [157]:
class MLPStacker(nn.Module):
    def __init__(self, input_dim):
        super(MLPStacker, self).__init__()
        
        self.backbone = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        return self.sigmoid(self.backbone(x))

In [158]:
# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        y_preds = []
        y_true = []
        model.train()
        for batch_X, batch_y in tqdm(train_loader):
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()

            y_preds.extend(outputs.detach().cpu().numpy())
            y_true.extend(batch_y.detach().cpu().numpy())

        # pauc score
        pauc = oof_pauc(np.array(y_true), np.array(y_preds))
        print(f'Epoch {epoch + 1} - PAUC: {pauc:.4f}')

        # Validation loop
        model.eval()
        with torch.no_grad():
            val_preds = []
            val_true = []
            for val_X, val_y in train_loader:
                outputs = model(val_X)
                val_preds.extend(outputs.detach().cpu().numpy())
                val_true.extend(val_y.detach().cpu().numpy())

            val_pauc = oof_pauc(np.array(val_true), np.array(val_preds))
            print(f'Validation PAUC: {val_pauc:.4f}')

# Prediction function
def predict(model, X):
    model.eval()
    with torch.no_grad():
        return model(X)

In [159]:
n_splits = 5
kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

cv_scores = []
oof_predictions = np.zeros(len(X))

X2 = torch.tensor(X2.values, dtype=torch.float32)
y2 = torch.tensor(y2.values, dtype=torch.float32)

for fold, (train_idx, val_idx) in enumerate(kf.split(X2, y2, groups2)):
    print(f"Fold {fold}")
    
    # Split data
    X_train, X_val = X2[train_idx], X2[val_idx]
    y_train, y_val = y2[train_idx], y2[val_idx]
    
    # Create DataLoader
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    # Initialize model, loss, and optimizer
    model = MLPStacker(input_dim=X2.shape[1])
    criterion = nn.BCELoss()  # or nn.BCEWithLogitsLoss() for binary classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Train the model
    train_model(model, train_loader, criterion, optimizer)
    
    # Validate
    val_predictions = predict(model, X_val)

    # Calculate partial AUC
    cv_score = oof_pauc(y_val, val_predictions)

    cv_scores.append(cv_score)

    oof_predictions[val_idx] = val_predictions.cpu().numpy().flatten()

    print(f"Partial AUC FOLD{fold}: {cv_score:.4f}")

print(f"Mean Partial AUC: {np.mean(cv_scores):.4f}")

Fold 0


  0%|          | 0/10310 [00:00<?, ?it/s]

KeyboardInterrupt: 