In [1]:
import numpy as np
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from functools import partial 
from sklearn.base import BaseEstimator
from tqdm.auto import trange
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [2]:
train_data = pl.read_csv('./isic-2024-challenge/train-metadata.csv')

In [3]:
def process_data(data):
    return (
        data
        .with_columns(
                pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
            )
        .with_columns(
                lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
                lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
                hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
                luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
                lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
                border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
                color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + 1e-6),
            )
        .with_columns(
                position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
                perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
                area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
                lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
                combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
                symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
                consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
            )
        .with_columns(
                color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
                consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
                size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
                hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
                lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
                shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
                color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
            )
        .with_columns(
                log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
                normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
                mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
                std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
                color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
                lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
                overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
            )
        .with_columns(
                symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
                comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
                color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
                border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
                border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
                size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
                age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
                age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
                color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
            )
        .with_columns(
                volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
                color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
                shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
                border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
                age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
                index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
            )
        .with_columns(
                count_per_patient = pl.col('isic_id').count().over('patient_id'),
            )
        .to_pandas()
    )

In [4]:
train_data = process_data(train_data)

In [5]:
def p_auc_tpr(v_gt, v_pred, min_tpr=None, sample_weight=None):
    """Computes the area under the AUC above a minumum TPR.

    Args:
        v_gt: ground truth vector (1s and 0s)
        v_p: predictions vector of scores ranging [0, 1]
        min_tpr: minimum true positive threshold (sensitivity)

    Returns:
        Float value range [0, 1]
    """
    if len(np.unique(v_gt)) != 2:
        raise ValueError(
            "Only one class present in y_true. ROC AUC score "
            "is not defined in that case."
        )
    
    # redefine the target. set 0s to 1s and 1s to 0s
    v_gt = abs(np.asarray(v_gt)-1)
    v_pred = abs(np.asarray(v_pred)-1)
    max_fpr = abs(1-min_tpr)
    
    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=sample_weight)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)
    return(partial_auc)

In [6]:
def lgb_custom_metric(preds, dataset):
    y_true = dataset.get_label()
    return 'prauc', p_auc_tpr(y_true, preds,min_tpr=0.80), True

def xgb_custom_metric(y_true, y_pred,*args, **kwargs):
    return p_auc_tpr(y_true, y_pred, min_tpr=0.80)


class PRAUCMetric:
    def is_max_optimal(self):
        return True # greater is better

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        y_pred = approx #np.rint(approx)
        y_true = np.array(target).astype(int)

        output_weight = 1 # weight is not used

        score = p_auc_tpr(y_true, y_pred,min_tpr=0.80)
 
        return score, output_weight

    def get_final_error(self, error, weight):
        return error


In [7]:


new_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',        # border_complexity       + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',         # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
    'count_per_patient',
    'combined_anatomical_site'
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution', 'copyright_license','combined_anatomical_site',]
drop_cols = [
    'isic_id',
    'patient_id',
    'image_type',
    'lesion_id',
    'iddx_full',
    'iddx_1',
    'iddx_2',
    'iddx_3',
    'iddx_4',
    'iddx_5',
    'mel_mitotic_index',
    'mel_thick_mm', 
    'tbp_lv_dnn_lesion_confidence',
]

cb_params = {
    'loss_function':     'Logloss',
    'iterations':        200,
    'verbose':           False,
    'random_state':      56,
    'max_depth':         7, 
    'learning_rate':     0.06936242010150652, 
    'scale_pos_weight':  2.6149345838209532, 
    'l2_leaf_reg':       6.216113851699493, 
    'subsample':         0.6249261779711819, 
    'min_data_in_leaf':  24,
    'eval_metric': PRAUCMetric(),
    'cat_features':      cat_cols,
}

lgb_params = {
    'enable_categorical': True,
    'objective':        'binary',
    'verbosity': -1,
    'n_iter': 200,
    'boosting_type': 'gbdt',
    'random_state': 56,
    'lambda_l1': 0.08758718919397321, 
    'lambda_l2': 0.0039689175176025465, 
    'learning_rate': 0.03231007103195577, 
    'max_depth': 4, 
    'num_leaves': 103, 
    'colsample_bytree': 0.8329551585827726, 
    'colsample_bynode': 0.4025961355653304, 
    'bagging_fraction': 0.7738954452473223, 
    'bagging_freq': 4, 
    'min_data_in_leaf': 85, 
    'scale_pos_weight': 2.7984184778875543,
}

xgb_params = {
    'enable_categorical': True,
    #'verbose':50,
    'objective':'binary:logistic',
    'tree_method': 'hist',
    #'num_iterations': 400,
    'random_state': 56,
    'learning_rate': 0.08501257473292347, 
    'lambda': 8.879624125465703, 
    'alpha': 0.6779926606782505, 
    'eval_metric': xgb_custom_metric,
    'max_depth': 6,
    'subsample': 0.6012681388711075, 
    'colsample_bytree': 0.8437772277074493, 
    'colsample_bylevel': 0.5476090898823716, 
    'colsample_bynode': 0.9928601203635129, 
    'scale_pos_weight': 3.29440313334688,
}

params = {
    'cb_params':cb_params,
    'lgb_params':lgb_params,
    'xgb_params':xgb_params,
    'weights': [1,1,1],
    'early_stopping_rounds':50
}

label_col = 'target'
n_folds = 5
n_repits = 1

In [8]:
train_data[cat_cols] = train_data[cat_cols].astype('category')

In [9]:
class CustomModelsWrapper(BaseEstimator):
    def __init__(self,cb_params,xgb_params,lgb_params,weights=[1,1,1],early_stopping_rounds=100):
        self.cb_params = cb_params
        self.xgb_params = xgb_params
        self.lgb_params = lgb_params
        self.early_stopping_rounds = early_stopping_rounds
        self.weights = weights
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=-1):
        rus = RandomUnderSampler(sampling_strategy=0.01, random_state=42)
        X, y = rus.fit_resample(X, y)
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm = CatBoostClassifier(**self.cb_params)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        
        train_ds = lgb.Dataset(X,label=y,categorical_feature=cat_features)
        val_ds = lgb.Dataset(X_val,label=y_val,categorical_feature=cat_features)
        self.lgm = lgb.train(
            self.lgb_params,
            train_ds,
            valid_sets=[val_ds],
            feval=lgb_custom_metric,
            callbacks=[
                #lgb.early_stopping(self.early_stopping_rounds),
                lgb.log_evaluation(verbose)
            ]
        )
        
        #train_ds = xgb.DMatrix(X,label=y,enable_categorical=True)
        #val_ds = xgb.DMatrix(X_val,label=y_val,enable_categorical=True)
        self.xgm = xgb.XGBClassifier(**self.xgb_params).fit(X=X,y=y,eval_set=[(X_val,y_val)],verbose=False)
        
        pred_cb = self.cbm.predict_proba(eval_pool)[:,1]
        pred_lg = self.lgm.predict(X_val)
        pred_xg = self.xgm.predict_proba(X_val)[:,1]
        return (pred_cb * self.weights[0] + pred_lg * self.weights[1] + pred_xg * self.weights[2]) / sum(self.weights)
    
    def predict_proba(self, X,cat_features=None):
        test_pool = Pool(X,cat_features=cat_features)
        
        pred_cb = self.cbm.predict_proba(eval_pool)[:,1]
        pred_lg = self.lgm.predict_proba(X)[:,1]
        pred_xg = self.xgm.predict_proba(X)[:,1]
        
        return (pred_cb * self.weights[0] + pred_lg * self.weights[1] + pred_xg * self.weights[2]) / sum(self.weights)
    

In [10]:
class CustomKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=p_auc_tpr):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col],train_data['patient_id'])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = CustomModelsWrapper(**self.params)
                preds = model.fit(
                    X = train_df.drop([label_col]+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop([label_col]+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = [x for x in cat_features if x in train_df.drop(drop_cols,axis=1).columns.tolist()],
                    verbose=verbose
                )

                score = self.score_func(test_df[label_col], preds)
                self.scores += [score]
                self.models += [model]
                
        print(f"Total Score {np.mean(self.scores)}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        test_df = test_df.drop(drop_cols,axis=1)
        preds = np.mean([model.predict_proba(test_df,cat_features=cat_features)[:,1] for model in self.models],axis=0)
        return preds

In [11]:
model = CustomKfoldWraper(
    num_folds=n_folds,
    num_repits=n_repits,
    params=params,
    random_state=5656,
    score_func=partial(p_auc_tpr,min_tpr=0.80)
)

In [None]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=1000,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  _check_train_params(params)


0:	learn: 0.0572392	test: 0.0706576	best: 0.0706576 (0)	total: 102ms	remaining: 20.3s
199:	learn: 0.1956865	test: 0.1550211	best: 0.1559761 (86)	total: 10.4s	remaining: 0us

bestTest = 0.1559760602
bestIteration = 86

Shrink model to first 87 iterations.


  _check_train_params(params)


0:	learn: 0.0940297	test: 0.0691277	best: 0.0691277 (0)	total: 57.3ms	remaining: 11.4s
199:	learn: 0.1969786	test: 0.1551493	best: 0.1555788 (181)	total: 11.1s	remaining: 0us

bestTest = 0.1555787663
bestIteration = 181

Shrink model to first 182 iterations.


  _check_train_params(params)


0:	learn: 0.1241182	test: 0.1146401	best: 0.1146401 (0)	total: 45.4ms	remaining: 9.04s
199:	learn: 0.1974952	test: 0.1686201	best: 0.1689609 (196)	total: 9.97s	remaining: 0us

bestTest = 0.1689608508
bestIteration = 196

Shrink model to first 197 iterations.


  _check_train_params(params)


0:	learn: 0.0700936	test: 0.0781584	best: 0.0781584 (0)	total: 37.7ms	remaining: 7.51s


In [13]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.metrics import fbeta_score

X, y = load_digits(n_class=2, return_X_y=True)


def error(y_true, y_pred, *args, **kwargs) -> float:
    classes = np.repeat(0, y_pred.shape[0])
    classes[y_pred > 0.5] = 1
    y_pred = classes
    return -fbeta_score(y_true, y_pred, *args, **kwargs, beta=1.0)


clf = xgb.XGBClassifier(eval_metric=xgb_custom_metric,enable_categorical=True)

clf.fit(X, y, eval_set=[(X, y)])

ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [208]:
fff

<xgboost.core.Booster at 0x7f5f8a879050>