In [1]:
import numpy as np
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from functools import partial 
from sklearn.base import BaseEstimator
from tqdm.auto import trange
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [2]:
train_data = pl.read_csv('./isic-2024-challenge/train-metadata.csv')

In [3]:
def process_data(data):
    return (
        data
        .with_columns(
                pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
            )
        .with_columns(
                lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
                lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
                hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
                luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
                lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
                border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
                color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + 1e-6),
            )
        .with_columns(
                position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
                perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
                area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
                lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
                combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
                symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
                consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
            )
        .with_columns(
                color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
                consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
                size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
                hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
                lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
                shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
                color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
            )
        .with_columns(
                log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
                normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
                mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
                std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
                color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
                lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
                overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
            )
        .with_columns(
                symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
                comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
                color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
                border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
                border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
                size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
                age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
                age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
                color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
            )
        .with_columns(
                volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
                color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
                shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
                border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
                age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
                index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
            )
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + 1e-5)).alias(f'{col}_patient_norm') for col in (num_cols + new_cols)
        )

        .with_columns(
                count_per_patient = pl.col('isic_id').count().over('patient_id'),
                age_difference = (pl.col('age_approx').max().over('patient_id') - pl.col('age_approx').min().over('patient_id')),
                is_tbp_tile_type = pl.col('tbp_tile_type').map_elements(lambda x: 1 if x == '3D: white' else 0)
        )
        .with_columns(
                tbp_tile_type_mean = pl.col('is_tbp_tile_type').mean().over('patient_id'),
                tbp_tile_type_sum = pl.col('is_tbp_tile_type').sum().over('patient_id'),
                tbp_tile_type_inv = pl.col('count_per_patient') - pl.col('is_tbp_tile_type').sum().over('patient_id'),
        )
        .to_pandas()
    )

In [4]:
def p_auc_tpr(v_gt, v_pred, min_tpr=None, sample_weight=None):
    """Computes the area under the AUC above a minumum TPR.

    Args:
        v_gt: ground truth vector (1s and 0s)
        v_p: predictions vector of scores ranging [0, 1]
        min_tpr: minimum true positive threshold (sensitivity)

    Returns:
        Float value range [0, 1]
    """
    if len(np.unique(v_gt)) != 2:
        raise ValueError(
            "Only one class present in y_true. ROC AUC score "
            "is not defined in that case."
        )
    
    # redefine the target. set 0s to 1s and 1s to 0s
    v_gt = abs(np.asarray(v_gt)-1)
    v_pred = abs(np.asarray(v_pred)-1)
    max_fpr = abs(1-min_tpr)
    
    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=sample_weight)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)
    return(partial_auc)

In [5]:
class PRAUCMetric:
    def is_max_optimal(self):
        return True # greater is better

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        y_pred = approx #np.rint(approx)
        y_true = np.array(target).astype(int)

        output_weight = 1 # weight is not used

        score = p_auc_tpr(y_true, y_pred,min_tpr=0.80)
 
        return score, output_weight

    def get_final_error(self, error, weight):
        return error

def lgb_custom_metric(preds, dataset):
    y_true = dataset#.get_label()
    try:
        return 'prauc', -p_auc_tpr(y_true, preds,min_tpr=0.80), True
    except:
        return 'prauc', -100, True

def xgb_custom_metric(y_true, y_pred,*args, **kwargs):
    return -p_auc_tpr(y_true, y_pred, min_tpr=0.80)

In [11]:
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',        # border_complexity       + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',         # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

aggr_cols = ['combined_anatomical_site'] + [f'{col}_patient_norm' for col in (num_cols + new_cols)]
cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution', 'copyright_license','combined_anatomical_site',]
drop_cols = [
    'isic_id',
    'patient_id',
    'image_type',
    'lesion_id',
    'iddx_full',
    'iddx_1',
    'iddx_2',
    'iddx_3',
    'iddx_4',
    'iddx_5',
    'mel_mitotic_index',
    'mel_thick_mm', 
    'tbp_lv_dnn_lesion_confidence',
    'is_tbp_tile_type',
]

cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'CrossEntropy',
    'max_depth': 6,
    'eval_metric': PRAUCMetric(),
    'l2_leaf_reg': 6.216113851699493, 
    #'subsample': 0.6249261779711819, 
    'min_data_in_leaf': 24,
    'task_type': 'GPU',
    'random_seed': 56,
}


lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           300,
    'boosting_type':    'gbdt',
    'random_state':     56,
    'lambda_l1':        0.08758718919397321, 
    'lambda_l2':        0.0039689175176025465, 
    'learning_rate':    0.03231007103195577, 
    'max_depth':        5, 
    'num_leaves':       103, 
    'colsample_bytree': 0.8329551585827726, 
    'colsample_bynode': 0.4025961355653304, 
    'bagging_fraction': 0.7738954452473223, 
    'extra_trees':      True,
    'bagging_freq':     4, 
    'min_data_in_leaf': 85, 
    'scale_pos_weight': 2.7984184778875543,
}


xgb_params = {
    'enable_categorical': True,
    #'verbose':50,
    'objective':'binary:logistic',
    'tree_method': 'hist',
    #'num_iterations': 400,
    'random_state': 56,
    'learning_rate': 0.08501257473292347, 
    'lambda': 8.879624125465703, 
    'early_stopping_rounds': 30,
    'alpha': 0.6779926606782505, 
    'eval_metric': xgb_custom_metric,
    'max_depth': 6,
    'subsample': 0.6012681388711075, 
    'colsample_bytree': 0.8437772277074493, 
    'colsample_bylevel': 0.5476090898823716, 
    'colsample_bynode': 0.9928601203635129, 
    'scale_pos_weight': 3.29440313334688,
    'random_state': 56
}

params = {
    'cb_params': cb_params,
    'lgb_params': lgb_params,
    'xgb_params': xgb_params
}

label_col = 'target'
n_folds = 5
n_repits = 1

In [12]:
class EnsembleClassifier(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostClassifier(**cb_params)
        self.lgbm = lgb.LGBMClassifier(**lgb_params)
        self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        self.lgbm.fit(X,y,eval_set=[(X_val,y_val)],eval_metric=lgb_custom_metric,categorical_feature=cat_features)
        self.xgbm.fit(X,y,eval_set=[(X_val,y_val)])
    
    def predict_proba(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)[:,1]
        lgb_preds = self.lgbm.predict_proba(X_test)[:,1]
        xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds, lgb_preds, xgb_preds

In [13]:
class CustomBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=p_auc_tpr):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col],train_data['patient_id'])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = EnsembleClassifier(**self.params)
                model.fit(
                    X = train_df.drop([label_col]+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop([label_col]+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = cat_features,
                    verbose = verbose
                )
                cb_preds, lgb_preds, xgb_preds = model.predict_proba(test_df.drop([label_col]+drop_cols,axis=1),cat_features)
                avg_preds = (cb_preds + lgb_preds + xgb_preds) / 3
                self.scores += [[
                    self.score_func(test_df[label_col],cb_preds),
                    self.score_func(test_df[label_col],lgb_preds),
                    self.score_func(test_df[label_col],xgb_preds),
                    self.score_func(test_df[label_col],avg_preds)
                ]]
                print(self.scores[-1])
                self.models += [model]
                
        print(f"Total Score {np.mean([x[3] for x in self.scores])}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        preds = np.mean([
            np.mean(model.predict_proba(test_data.drop(drop_cols,axis=1),cat_features=cat_features)[:,1],axis=0)
            for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [9]:
train_data = process_data(train_data)
train_data[cat_cols] = train_data[cat_cols].astype('category')



In [14]:
model = CustomBoostKfoldWraper(
    num_folds=n_folds,
    num_repits=n_repits,
    params=params,
    random_state=56,
    score_func=partial(p_auc_tpr,min_tpr=0.80)
)

In [None]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=200,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.0265520	test: 0.0392256	best: 0.0392256 (0)	total: 50.1ms	remaining: 50s
200:	learn: 0.1761450	test: 0.1759944	best: 0.1765774 (190)	total: 7.06s	remaining: 28.1s
400:	learn: 0.1842021	test: 0.1762013	best: 0.1770677 (260)	total: 14s	remaining: 20.9s
600:	learn: 0.1892525	test: 0.1758834	best: 0.1770677 (260)	total: 21s	remaining: 13.9s
800:	learn: 0.1925490	test: 0.1759283	best: 0.1770677 (260)	total: 28.2s	remaining: 7s
999:	learn: 0.1947485	test: 0.1751129	best: 0.1770677 (260)	total: 35.3s	remaining: 0us
bestTest = 0.1770676772
bestIteration = 260
Shrink model to first 261 iterations.




[0]	validation_0-logloss:0.61237	validation_0-xgb_custom_metric:-0.02782
[1]	validation_0-logloss:0.54417	validation_0-xgb_custom_metric:-0.02781
[2]	validation_0-logloss:0.48586	validation_0-xgb_custom_metric:-0.02781
[3]	validation_0-logloss:0.43544	validation_0-xgb_custom_metric:-0.02780
[4]	validation_0-logloss:0.39154	validation_0-xgb_custom_metric:-0.02780
[5]	validation_0-logloss:0.35301	validation_0-xgb_custom_metric:-0.02779
[6]	validation_0-logloss:0.31904	validation_0-xgb_custom_metric:-0.03133
[7]	validation_0-logloss:0.28891	validation_0-xgb_custom_metric:-0.03445
[8]	validation_0-logloss:0.26210	validation_0-xgb_custom_metric:-0.03441
[9]	validation_0-logloss:0.23818	validation_0-xgb_custom_metric:-0.03999
[10]	validation_0-logloss:0.21676	validation_0-xgb_custom_metric:-0.04663
[11]	validation_0-logloss:0.19752	validation_0-xgb_custom_metric:-0.04796
[12]	validation_0-logloss:0.18019	validation_0-xgb_custom_metric:-0.04794
[13]	validation_0-logloss:0.16456	validation_0-x

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.0302481	test: 0.0215900	best: 0.0215900 (0)	total: 36.4ms	remaining: 36.3s
200:	learn: 0.1774341	test: 0.1431264	best: 0.1431264 (200)	total: 7.09s	remaining: 28.2s
400:	learn: 0.1854246	test: 0.1503266	best: 0.1503266 (400)	total: 14.3s	remaining: 21.3s
600:	learn: 0.1898103	test: 0.1563356	best: 0.1563356 (600)	total: 21.4s	remaining: 14.2s
800:	learn: 0.1927889	test: 0.1585380	best: 0.1585380 (800)	total: 28.6s	remaining: 7.11s
999:	learn: 0.1946810	test: 0.1583486	best: 0.1590224 (915)	total: 35.9s	remaining: 0us
bestTest = 0.1590223593
bestIteration = 915
Shrink model to first 916 iterations.




[0]	validation_0-logloss:0.61240	validation_0-xgb_custom_metric:-0.02268
[1]	validation_0-logloss:0.54420	validation_0-xgb_custom_metric:-0.02842
[2]	validation_0-logloss:0.48585	validation_0-xgb_custom_metric:-0.02961
[3]	validation_0-logloss:0.43540	validation_0-xgb_custom_metric:-0.02961
[4]	validation_0-logloss:0.39149	validation_0-xgb_custom_metric:-0.03101
[5]	validation_0-logloss:0.35293	validation_0-xgb_custom_metric:-0.03099
[6]	validation_0-logloss:0.31894	validation_0-xgb_custom_metric:-0.03324
[7]	validation_0-logloss:0.28879	validation_0-xgb_custom_metric:-0.03488
[8]	validation_0-logloss:0.26198	validation_0-xgb_custom_metric:-0.04133
[9]	validation_0-logloss:0.23806	validation_0-xgb_custom_metric:-0.04409
[10]	validation_0-logloss:0.21663	validation_0-xgb_custom_metric:-0.04862
[11]	validation_0-logloss:0.19738	validation_0-xgb_custom_metric:-0.06724
[12]	validation_0-logloss:0.18006	validation_0-xgb_custom_metric:-0.06704
[13]	validation_0-logloss:0.16445	validation_0-x

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.0278139	test: 0.0308909	best: 0.0308909 (0)	total: 39ms	remaining: 38.9s
200:	learn: 0.1765220	test: 0.1754662	best: 0.1754662 (200)	total: 7.13s	remaining: 28.3s
400:	learn: 0.1845159	test: 0.1770832	best: 0.1771501 (395)	total: 14.3s	remaining: 21.4s
600:	learn: 0.1891888	test: 0.1762037	best: 0.1774785 (440)	total: 21.5s	remaining: 14.3s
800:	learn: 0.1923116	test: 0.1770607	best: 0.1774785 (440)	total: 28.8s	remaining: 7.17s
999:	learn: 0.1945273	test: 0.1763955	best: 0.1774785 (440)	total: 36s	remaining: 0us
bestTest = 0.1774785008
bestIteration = 440
Shrink model to first 441 iterations.




[0]	validation_0-logloss:0.61237	validation_0-xgb_custom_metric:-0.03536
[1]	validation_0-logloss:0.54414	validation_0-xgb_custom_metric:-0.03536
[2]	validation_0-logloss:0.48579	validation_0-xgb_custom_metric:-0.03536
[3]	validation_0-logloss:0.43541	validation_0-xgb_custom_metric:-0.04598
[4]	validation_0-logloss:0.39149	validation_0-xgb_custom_metric:-0.04988
[5]	validation_0-logloss:0.35296	validation_0-xgb_custom_metric:-0.04988
[6]	validation_0-logloss:0.31896	validation_0-xgb_custom_metric:-0.04988
[7]	validation_0-logloss:0.28887	validation_0-xgb_custom_metric:-0.05422
[8]	validation_0-logloss:0.26209	validation_0-xgb_custom_metric:-0.05756
[9]	validation_0-logloss:0.23814	validation_0-xgb_custom_metric:-0.05930
[10]	validation_0-logloss:0.21670	validation_0-xgb_custom_metric:-0.06346
[11]	validation_0-logloss:0.19746	validation_0-xgb_custom_metric:-0.06822
[12]	validation_0-logloss:0.18013	validation_0-xgb_custom_metric:-0.07047
[13]	validation_0-logloss:0.16451	validation_0-x

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.0286892	test: 0.0276001	best: 0.0276001 (0)	total: 36ms	remaining: 35.9s
200:	learn: 0.1757490	test: 0.1633765	best: 0.1633943 (190)	total: 6.98s	remaining: 27.8s
400:	learn: 0.1841716	test: 0.1680982	best: 0.1682363 (395)	total: 14.2s	remaining: 21.2s
600:	learn: 0.1894727	test: 0.1690273	best: 0.1693923 (575)	total: 21.3s	remaining: 14.1s
800:	learn: 0.1925122	test: 0.1697361	best: 0.1700028 (780)	total: 28.4s	remaining: 7.05s
999:	learn: 0.1946847	test: 0.1705472	best: 0.1711585 (975)	total: 35.4s	remaining: 0us
bestTest = 0.1711584834
bestIteration = 975
Shrink model to first 976 iterations.




[0]	validation_0-logloss:0.61232	validation_0-xgb_custom_metric:-0.02984
[1]	validation_0-logloss:0.54414	validation_0-xgb_custom_metric:-0.02984
[2]	validation_0-logloss:0.48584	validation_0-xgb_custom_metric:-0.03037
[3]	validation_0-logloss:0.43546	validation_0-xgb_custom_metric:-0.03604
[4]	validation_0-logloss:0.39150	validation_0-xgb_custom_metric:-0.03604
[5]	validation_0-logloss:0.35297	validation_0-xgb_custom_metric:-0.03764
[6]	validation_0-logloss:0.31898	validation_0-xgb_custom_metric:-0.03764
[7]	validation_0-logloss:0.28887	validation_0-xgb_custom_metric:-0.04314
[8]	validation_0-logloss:0.26209	validation_0-xgb_custom_metric:-0.04301
[9]	validation_0-logloss:0.23816	validation_0-xgb_custom_metric:-0.04280
[10]	validation_0-logloss:0.21671	validation_0-xgb_custom_metric:-0.04280
[11]	validation_0-logloss:0.19745	validation_0-xgb_custom_metric:-0.04269
[12]	validation_0-logloss:0.18010	validation_0-xgb_custom_metric:-0.04246
[13]	validation_0-logloss:0.16448	validation_0-x

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.0290801	test: 0.0258610	best: 0.0258610 (0)	total: 36.1ms	remaining: 36.1s
200:	learn: 0.1778185	test: 0.1460303	best: 0.1466810 (190)	total: 7s	remaining: 27.8s
400:	learn: 0.1864046	test: 0.1511416	best: 0.1512514 (390)	total: 14.4s	remaining: 21.5s
600:	learn: 0.1914800	test: 0.1556895	best: 0.1556895 (600)	total: 21.5s	remaining: 14.3s
800:	learn: 0.1948858	test: 0.1572787	best: 0.1579061 (700)	total: 28.8s	remaining: 7.15s
