### Necessary Packages

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import polars as pl
import torch
import warnings
import h5py
import timm
import io
import os
import albumentations as A
import albumentations.pytorch as AP
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch import nn,Tensor
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OneHotEncoder,FunctionTransformer,OrdinalEncoder
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from pandas.errors import DtypeWarning
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from typing import Any, Callable, Optional
from torch.utils.data import Dataset
from PIL import Image

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.14 (you have 1.4.8). Upgrade using: pip install --upgrade albumentations


In [2]:
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=UserWarning)
warnings.filterwarnings('ignore',category=DtypeWarning)

### Constants

In [3]:
TRAIN_METADATA = '/home/abdelnour/Documents/projects/skin-cancer-detection/data/isic2024/train-metadata.csv'
TEST_METADATA = '/home/abdelnour/Documents/projects/skin-cancer-detection/data/isic2024/test-metadata.csv'
TEST_IMAGES = '/home/abdelnour/Documents/projects/skin-cancer-detection/data/isic2024/test-image.hdf5'
FEATURES = '/home/abdelnour/Documents/projects/skin-cancer-detection/data/features.csv'
EVA_FETURES = '/home/abdelnour/Documents/projects/skin-cancer-detection/data/eva.csv'
SEED = 42

### Data Loading

In [4]:
train_df = pd.read_csv(TRAIN_METADATA)
test_df = pd.read_csv(TEST_METADATA)
features = pd.read_csv(FEATURES)
eva_features = pd.read_csv(EVA_FETURES)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
id_col = 'isic_id'
group_col = 'patient_id'
target_col = 'target'
train_only_columns = train_df.columns.difference(test_df.columns).tolist()
drop_columns = train_only_columns + [id_col,group_col,target_col]
images_features = ['eva02','eva02_tiny_patch14_224','coat_lite_tiny_2']

In [6]:
cv = StratifiedGroupKFold(n_splits=5,shuffle=True,random_state=SEED)
splits = list(cv.split(X=train_df,y=train_df[target_col],groups=train_df[group_col]))

In [7]:
df = train_df.merge(features,on=id_col)

### Feature engneering

In [8]:
def feature_engineering(metadata: pd.DataFrame):

    epsilon = 1e-8 # To avoid division by zero

    pl_df = pl.DataFrame(metadata)

    # Diameter: Melanoma growths are normally larger than 6mm in diameter, which is about the diameter of a standard pencil eraser

    pl_df = pl_df.with_columns([
        # Diameter ratio: This is a measure of how elongated the mole is. A perfectly round mole has an axis ratio of 1.0
        pl.col('tbp_lv_minorAxisMM').truediv(pl.col('clin_size_long_diam_mm') + epsilon)
        .cast(pl.Float32).alias('diam_ratio'),

        # Diameter difference: Difference between the long diameter and minor axis
        pl.col('clin_size_long_diam_mm').sub(pl.col('tbp_lv_minorAxisMM'))
        .cast(pl.Float32).alias('diam_difference'),

        # Long diameter greater than 6mm
        pl.when(pl.col('clin_size_long_diam_mm') > 6).then(1.0).otherwise(0.0)
        .cast(pl.Float32).alias('long_diam_gt_6mm'),

        # Minor axis greater than 6mm
        pl.when(pl.col('tbp_lv_minorAxisMM') > 6).then(1.0).otherwise(0.0)
        .cast(pl.Float32).alias('short_diam_gt_6mm')
    ])

    # Evolution: Melanoma lesions often change in size, shape, color, or texture over time, while non-cancerous moles usually stay the same
    pl_df = pl_df.with_columns([
        # Position of the mole in the 3d space
        (pl.col('tbp_lv_x').pow(2) + pl.col('tbp_lv_y').pow(2) + pl.col('tbp_lv_z').pow(2)).sqrt()
        .cast(pl.Float32).alias('3d_position'),

        # Perimeter area ratio
        pl.col('tbp_lv_perimeterMM').truediv(pl.col('tbp_lv_areaMM2') + epsilon)
        .cast(pl.Float32).alias('perim_area_ratio'),

        # log area
        pl.col('tbp_lv_areaMM2').log().cast(pl.Float32).alias('log_area'),

        # log perimeter
        pl.col('tbp_lv_perimeterMM').log().cast(pl.Float32).alias('log_perimeter'),

        # Roundness: This is a measure of how round the mole is. A perfectly round mole has a roundness index of 1.0
        pl.col('tbp_lv_areaMM2').mul(4 * np.pi).truediv(pl.col('tbp_lv_perimeterMM').pow(2) + epsilon)
        .cast(pl.Float32).alias('roundness_index'),
    ])

    # Asymmetry: Melanoma is often asymmetrical, which means the shape isn't uniform, Non-cancerous moles are typically uniform and symmetrical
    pl_df = pl_df.with_columns([
        # Asymmetry angle sinus
        pl.col('tbp_lv_symm_2axis_angle').sin()
        .cast(pl.Float32).alias('asymmetry_angle_sin'),

        # Asymmetry angle cosinus
        pl.col('tbp_lv_symm_2axis_angle').cos()
        .cast(pl.Float32).alias('asymmetry_angle_cos'),

        # Asymmetry index
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('symmetry_index'),

        # Symmetry border interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_norm_border'))
        .cast(pl.Float32).alias('symmetry_border_interaction'),

        # Symmetry eccentricity interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('symmetry_eccentricity_interaction'),

        # Symmetry color interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_radial_color_std_max'))
        .cast(pl.Float32).alias('symmetry_color_interaction'),

        # Symmetry border jaggedness interaction
        pl.col('tbp_lv_symm_2axis').mul(pl.col('tbp_lv_area_perim_ratio'))
        .cast(pl.Float32).alias('symmetry_border_jaggedness_interaction'),
    ])

    # Color: Melanoma lesions are often more than one color or shade, Moles that are benign are usually one color
    pl_df = pl_df.with_columns([
        # Color std ratio
        pl.col('tbp_lv_color_std_mean').truediv(pl.col('tbp_lv_radial_color_std_max') + epsilon)
        .cast(pl.Float32).alias('color_std_ratio'),

        (pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext')) + pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext')) + pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext')))
        .truediv((pl.col('tbp_lv_L') + pl.col('tbp_lv_A') + pl.col('tbp_lv_B') + epsilon))
        .cast(pl.Float32).alias('color_difference_ratio'),

        (pl.col('tbp_lv_deltaA')**2 + pl.col('tbp_lv_deltaB')**2 + pl.col('tbp_lv_deltaL')**2).sqrt()
        .cast(pl.Float32).alias('color_euclidean_distance'),

        # Color eccentricity interaction
        pl.col('tbp_lv_radial_color_std_max').mul(pl.col('tbp_lv_eccentricity'))
        .cast(pl.Float32).alias('color_eccentricity_interaction'),

        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_L').mean()).truediv(pl.col('tbp_lv_stdL') + epsilon)
        .cast(pl.Float32).alias('L_normalized'),

        # H Contrast
        pl.col('tbp_lv_H').sub(pl.col('tbp_lv_Hext'))
        .cast(pl.Float32).alias('H_contrast'),

        # L Contrast
        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext'))
        .cast(pl.Float32).alias('L_contrast'),

        # A Contrast
        pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext'))
        .cast(pl.Float32).alias('A_contrast'),

        # B Contrast
        pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext'))
        .cast(pl.Float32).alias('B_contrast'),

        # C Contrast
        pl.col('tbp_lv_C').sub(pl.col('tbp_lv_Cext'))
        .cast(pl.Float32).alias('C_contrast'),

        # A/B ratio
        pl.col('tbp_lv_deltaA').truediv(pl.col('tbp_lv_deltaB') + epsilon)
        .cast(pl.Float32).alias('A_B_ratio'),

        # L uniformity
        pl.col('tbp_lv_L').sub(pl.col('tbp_lv_Lext')).truediv(pl.col('tbp_lv_L') + epsilon)
        .cast(pl.Float32).alias('L_uniformity'),

        # A uniformity
        pl.col('tbp_lv_A').sub(pl.col('tbp_lv_Aext')).truediv(pl.col('tbp_lv_A') + epsilon)
        .cast(pl.Float32).alias('A_uniformity'),

        # B uniformity
        pl.col('tbp_lv_B').sub(pl.col('tbp_lv_Bext')).truediv(pl.col('tbp_lv_B') + epsilon)
        .cast(pl.Float32).alias('B_uniformity'),

        # C uniformity
        pl.col('tbp_lv_C').sub(pl.col('tbp_lv_Cext')).truediv(pl.col('tbp_lv_C') + epsilon)
        .cast(pl.Float32).alias('C_uniformity'),

        # H uniformity ratio
        pl.col('tbp_lv_H').sub(pl.col('tbp_lv_Hext')).truediv(pl.col('tbp_lv_H') + epsilon)
        .cast(pl.Float32).alias('H_uniformity_ratio'),

        pl.col('tbp_lv_deltaA').add(pl.col('tbp_lv_deltaB')).add(pl.col('tbp_lv_deltaL'))
        .cast(pl.Float32).alias('delta_sum'),
    ])

    pl_df = pl_df.with_columns([
        # lesions count by patient
        pl.col('isic_id').count().over('patient_id').alias('lesions_count_by_patient'),
        
        (pl.col('eva02') + pl.col('eva02_tiny_patch14_224') + pl.col('coat_lite_tiny_2')).truediv(3)
            .cast(pl.Float32).alias('image_features_ensemble')
    ])

    pd_df = pl_df.to_pandas()
    
    numerical_cols = pd_df.select_dtypes(np.number).columns

    pl_df = pl.DataFrame(pd_df)

    pl_df = pl_df.with_columns([
        (pl.col(col).sub(pl.col(col).mean())).truediv(pl.col(col).std() + epsilon).over('patient_id')
            .cast(pl.Float32).alias(f'{col}_zscore') for col in numerical_cols if col != 'target'
    ]) \
    .with_columns([
        pl.col(col).min().over('patient_id').alias(f'{col}_min') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).max().over('patient_id').alias(f'{col}_max') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).mean().over('patient_id').alias(f'{col}_mean') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).std().over('patient_id').alias(f'{col}_std') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).median().over('patient_id').alias(f'{col}_median') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).skew().over('patient_id').alias(f'{col}_skew') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).quantile(0.25).over('patient_id').alias(f'{col}_q25') for col in images_features + ['image_features_ensemble']
    ]) \
    .with_columns([
        pl.col(col).quantile(0.75).over('patient_id').alias(f'{col}_q75') for col in images_features + ['image_features_ensemble']
    ])

    pd_df = pl_df.to_pandas()
    
    for col in numerical_cols:
        pd_df[col] = pd_df[col].replace(np.inf, 1e8).replace(-np.inf, -1e8)

    return pd_df

### Data Preparation

In [9]:
def prepare_data(
    df : pd.DataFrame,
    features : pd.DataFrame | None = None,
    preprocessor : Pipeline | None = None,
) -> tuple[tuple[pd.DataFrame,pd.Series],Pipeline]:
    
    X = df.drop(columns=[target_col],errors='ignore')
    y = df[target_col] if target_col in df.columns else None

    if features is not None:
        X = X.merge(features,on=id_col)

    if preprocessor is None:

        transforms  = [
            ('drop-columns-1',FunctionTransformer(lambda df: df.drop(columns=train_only_columns,errors='ignore'))),
            ('feature-engineering',FunctionTransformer(feature_engineering)),
            ('drop-columns-2',FunctionTransformer(lambda df: df.drop(columns=drop_columns,errors='ignore'))),
            ('preprocessing',ColumnTransformer([
                ('num',SimpleImputer(strategy='median'),make_column_selector(dtype_include=np.number)),
                ('cat',OrdinalEncoder(
                    handle_unknown='use_encoded_value',
                    unknown_value=-1,
                    dtype=np.int32
                ),make_column_selector(dtype_include=pd.CategoricalDtype)),
            ])),
        ]

        preprocessor = Pipeline(transforms).set_output(transform="pandas")

        preprocessor = preprocessor.fit(X,y)

    X = preprocessor.transform(X)

    X.columns = [
        f'cat_col_{i}' if col.startswith('cat') else f'num_col_{i}'
        for i,col in enumerate(X.columns)
    ]

    return (X,y),preprocessor

In [10]:
oof_eva_features = []

for i,(train_idx,val_idx) in enumerate(splits):
    a = eva_features[['isic_id',f'eva02_fold{i}']].iloc[val_idx]
    a['eva02'] = a[f'eva02_fold{i}']
    a = a.drop(columns=[f'eva02_fold{i}'])
    oof_eva_features.append(a)

oof_eva_features = pd.concat(oof_eva_features)
oof_eva_features.head()

Unnamed: 0,isic_id,eva02
1,ISIC_0015845,0.012015
3,ISIC_0015902,0.009385
4,ISIC_0024200,0.015275
7,ISIC_0051665,0.007602
10,ISIC_0051812,0.016717


In [11]:
features = features.drop(columns=["resnet18","coat_lite_tiny","swin_tiny_patch4_window7_224"])
features = features.merge(oof_eva_features,on='isic_id')

In [12]:
(X_train,y_train),preprocessor = prepare_data(train_df,features)

In [13]:
X_train.shape

(401059, 176)

In [14]:
# df = train_df.merge(features,on=id_col)

In [15]:
# pauc(df['target'],df['resnet18'])

### Splitting

In [16]:
def split(
    X : pd.DataFrame,
    y : pd.Series,
    splits : list
) -> list[dict[str,tuple[pd.DataFrame,pd.Series]]]:

    folds = []
    
    for train_idx,val_idx in splits:
        
        folds.append({
            'train': (X.iloc[train_idx],y.iloc[train_idx]),
            'test': (X.iloc[val_idx],y.iloc[val_idx]),
        })

    return folds

In [17]:
folds = split(X_train,y_train, splits)

### Training

In [18]:
def pauc(solution: np.array, submission: np.array, min_tpr : float = 0.8) -> float:
    
    v_gt = abs(np.asarray(solution)-1)
    
    # flip the submissions to their compliments
    v_pred = -1.0*np.asarray(submission)

    max_fpr = abs(1-min_tpr)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)
        
    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)
    
    return(partial_auc)

In [19]:
w_lgbm = 0.4
w_xgb = 0.4

In [20]:
def train(
    folds : list[dict[str,tuple[pd.DataFrame,pd.Series]]],
    params : dict,
) -> list[float]:

    models = {
        'xgb' : [],
        'lgb' : [],
        'cat' : [],
        'soft' : [],
        'stacking' : [],
        'max' : []
    }

    scores = {
        'xgb' : [],
        'lgb' : [],
        'cat' : [],
        'soft' : [],
        'stacking' : [],
        'max' : []
    }

    preds = {
        'xgb' : [],
        'lgb' : [],
        'cat' : [],
        'soft' : [],
        'stacking' : [],
        'max' : []
    }

    for fold_id,fold in enumerate(folds):
        
        ### Get the data
        X_train,y_train = fold['train']
        X_val,y_val = fold['test']

        ### ** LGBM ** ###

        ### Training
        lgb_model = Pipeline([
            ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
            ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
            ('classifier', lgb.LGBMClassifier(**params["lgbm"])),
        ])

        lgb_model.fit(X_train,y_train)

        ### Predictions
        lgb_y_pred = lgb_model.predict_proba(X_val)[:,1]

        ### Evaluation
        lgb_pauc = pauc(y_val,lgb_y_pred)
        print(f'Fold {fold_id+1} LGBM : {lgb_pauc}')

        ### ** XGB ** ###

        ### Training
        xgb_model = Pipeline([
            ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
            ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
            ('classifier', xgb.XGBClassifier(**params["xgb"])),
        ])
        xgb_model.fit(X_train,y_train)

        ### Predictions
        xgb_y_pred = xgb_model.predict_proba(X_val)[:,1]

        ### Evaluation
        xgb_pauc = pauc(y_val,xgb_y_pred)
        print(f'Fold {fold_id+1} XGB : {xgb_pauc}')

        ### ** Cat ** ###

        ### Training
        cb_model = Pipeline([
            ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
            ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
            ('classifier', cb.CatBoostClassifier(**params["cat"])),
        ])

        cb_model.fit(X_train,y_train)

        ### Predictions
        cb_y_pred = cb_model.predict_proba(X_val)[:,1]

        ### Evaluation
        cb_pauc = pauc(y_val,cb_y_pred)
        print(f'Fold {fold_id+1} CatBoost : {cb_pauc}')

        ### ** Soft ** ###
        soft_model = Pipeline([
            ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
            ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
            ('classifier', VotingClassifier([
                ('lgb',lgb.LGBMClassifier(**params["lgbm"])),
                ('xgb',xgb.XGBClassifier(**params["xgb"])),
                ('cat',cb.CatBoostClassifier(**params["cat"])),
            ],voting='soft',weights=[w_lgbm,w_xgb,1.0 - w_lgbm - w_xgb])),
        ])

        soft_model.fit(X_train,y_train)

        ### Predictions
        soft_y_pred = soft_model.predict_proba(X_val)[:,1]

        ### Evaluation
        soft_pauc = pauc(y_val,soft_y_pred)
        print(f'Fold {fold_id+1} Soft : {soft_pauc}')

        ### ** Stacking ** ###
        stacking_model = Pipeline([
            ('sampler1', RandomOverSampler(sampling_strategy= 0.003 , random_state=SEED)),
            ('sampler2', RandomUnderSampler(sampling_strategy=0.01, random_state=SEED)),
            ('classifier', StackingClassifier([
                ('lgb',lgb.LGBMClassifier(**params["lgbm"])),
                ('xgb',xgb.XGBClassifier(**params["xgb"])),
                ('cat',cb.CatBoostClassifier(**params["cat"])),
            ],final_estimator=LogisticRegression(),stack_method='predict_proba')),
        ])

        stacking_model.fit(X_train,y_train)

        ### Predictions
        stacking_y_pred = stacking_model.predict_proba(X_val)[:,1]

        ### Evaluation
        stacking_pauc = pauc(y_val,stacking_y_pred)
        print(f'Fold {fold_id+1} Stacking : {stacking_pauc}')

        ### ** Max ** ###
        scores_ = [lgb_pauc,xgb_pauc,cb_pauc]
        max_model = [lgb_model,xgb_model,cb_model][scores_.index(max(scores_))]
        max_y_pred = max_model.predict_proba(X_val)[:,1]


        ### ** Save ** ###
        models['lgb'].append(lgb_model)
        models['xgb'].append(xgb_model)
        models['cat'].append(cb_model)
        models['soft'].append(soft_model)
        models['stacking'].append(stacking_model)
        models['max'].append(max_model)

        scores['lgb'].append(lgb_pauc)
        scores['xgb'].append(xgb_pauc)
        scores['cat'].append(cb_pauc)
        scores['soft'].append(soft_pauc)
        scores['stacking'].append(stacking_pauc)
        scores['max'].append(max(scores_))

        preds['lgb'].extend(lgb_y_pred)
        preds['xgb'].extend(xgb_y_pred)
        preds['cat'].extend(cb_y_pred)
        preds['soft'].extend(soft_y_pred)
        preds['stacking'].extend(stacking_y_pred)
        preds['max'].extend(max_y_pred)

        print()

    y_val = []

    for fold in folds:
        y_val.extend(fold['test'][1])

    y_val = np.array(y_val)

    for model,pred in preds.items():
        pred = np.array(pred)
        oof_score = pauc(y_val,pred)
        print(f'{model} OOF : {oof_score}')

    return models,scores,preds,y_val

- Baseline Model

In [21]:
"""params = {
    "lgbm" :  {
        'objective':        'binary',
        'verbosity':        -1,
        'n_iter':           500,
        'boosting_type':    'gbdt',
        'random_state':     SEED,
        'lambda_l1':        0.08758718919397321, 
        'lambda_l2':        0.0039689175176025465, 
        'learning_rate':    0.03231007103195577, 
        'max_depth':        4, 
        'num_leaves':       103, 
        'colsample_bytree': 0.8329551585827726, 
        'colsample_bynode': 0.4025961355653304, 
        'bagging_fraction': 0.7738954452473223, 
        'bagging_freq':     4, 
        'min_data_in_leaf': 85
    },
    "xgb" : {
        'enable_categorical': True,
        'tree_method':        'hist',
        'random_state':       SEED,
        'learning_rate':      0.08501257473292347, 
        'lambda':             8.879624125465703, 
        'alpha':              0.6779926606782505, 
        'max_depth':          6, 
        'subsample':          0.6012681388711075, 
        'colsample_bytree':   0.8437772277074493, 
        'colsample_bylevel':  0.5476090898823716, 
        'colsample_bynode':   0.9928601203635129, 
        'scale_pos_weight':   3.29440313334688,
    },
    "cat" : {
        'loss_function':     'Logloss',
        'iterations':        200,
        'verbose':           False,
        'random_state':      SEED,
        'max_depth':         7, 
        'learning_rate':     0.06936242010150652, 
        'scale_pos_weight':  2.6149345838209532, 
        'l2_leaf_reg':       6.216113851699493, 
        'subsample':         0.6249261779711819, 
        'min_data_in_leaf':  24,
        'cat_features':      X_train.columns[X_train.columns.str.startswith('cat')].tolist(),
    }
}"""

""

''

In [22]:
params = {
    "lgbm" :  {
        'random_state': SEED,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'n_iter': 217,
        'learning_rate': 0.03238691082828724,
        'min_data_in_leaf': 93,
        'colsample_bytree': 0.6336220063217671,
        'colsample_bynode': 0.6449520328952557,
        'bagging_fraction': 0.4759532875970691,
        'bagging_freq': 5,        
        'lambda_l1': 0.26214445542677706,
        'lambda_l2': 0.5374677609248407,
        'num_leaves': 107,
        'max_depth': 6,
        'min_child_samples': 38,
        'scale_pos_weight': 2.454410437484347,
        'verbosity': -1,
    },
    "xgb" : {
        'random_state': SEED,
        'tree_method': 'hist',
        'objective': 'binary:logistic',
        'learning_rate': 0.09211299188601348,
        'colsample_bytree': 0.8178496272146406,
        'colsample_bynode': 0.8688187031865214,
        'colsample_bylevel': 0.3476250413253686,
        'scale_pos_weight': 2.1101053079075625,
        'max_depth': 6,
        'subsample': 0.7286594187699229,
        'lambda': 4.586386144644716,
        'alpha': 0.21956352903435347,
        'enable_categorical': True,
        'verbosity': 0,
    },
    "cat" : {
        'random_state': SEED,
        'loss_function': 'Logloss',
        'iterations': 240,
        'learning_rate': 0.046047767547706654,
        'scale_pos_weight': 4.670443106586375,
        'reg_lambda': 6.153764402866752,
        'subsample': 0.4712307759317672,
        'min_data_in_leaf': 26,
        'max_depth': 4,
        'cat_features': X_train.columns[X_train.columns.str.startswith('cat')].tolist(),
        'verbose': False,
    }
}

In [23]:
models,scores,preds,y_val = train(folds,params=params)

Fold 1 LGBM : 0.18305228906509674
Fold 1 XGB : 0.18571332925291573
Fold 1 CatBoost : 0.18023608905028252
Fold 1 Soft : 0.1836374351812788
Fold 1 Stacking : 0.10859052196179377

Fold 2 LGBM : 0.1861106407198867


KeyboardInterrupt: 

In [None]:
eval_df = pd.DataFrame(scores)
eval_df.loc['mean'] = eval_df.mean(axis=0)
eval_df.loc['std'] = eval_df.std(axis=0)
eval_df

Unnamed: 0,xgb,lgb,cat,soft,stacking,max
0,0.185713,0.183052,0.180236,0.183637,0.108591,0.185713
1,0.187129,0.186111,0.185594,0.187303,0.102902,0.187129
2,0.188769,0.191145,0.191438,0.191408,0.138603,0.191438
3,0.165086,0.167145,0.163704,0.16996,0.035475,0.167145
4,0.18695,0.188936,0.189911,0.190397,0.049348,0.189911
mean,0.182729,0.183278,0.182177,0.184541,0.086984,0.184267
std,0.008875,0.008512,0.010023,0.007778,0.038612,0.008795


In [None]:
raise Exception

Exception: 

### Submission

- Dataset

In [None]:
__cols__ = ['age_approx','sex','anatom_site_general','clin_size_long_diam_mm','tbp_lv_A']
__cols__ += ['tbp_lv_Aext','tbp_lv_B','tbp_lv_Bext','tbp_lv_C','tbp_lv_Cext','tbp_lv_H','tbp_lv_Hext','tbp_lv_L']
__cols__ += ['tbp_lv_Lext','tbp_lv_areaMM2','tbp_lv_area_perim_ratio','tbp_lv_color_std_mean','tbp_lv_deltaA']
__cols__ += ['tbp_lv_deltaB','tbp_lv_deltaL','tbp_lv_deltaLB','tbp_lv_deltaLBnorm','tbp_lv_eccentricity','tbp_lv_location']
__cols__ += ['tbp_lv_minorAxisMM','tbp_lv_nevi_confidence','tbp_lv_norm_border','tbp_lv_norm_color']
__cols__ += ['tbp_lv_perimeterMM','tbp_lv_radial_color_std_max','tbp_lv_stdL','tbp_lv_stdLExt','tbp_lv_symm_2axis']
__cols__ += ['tbp_lv_symm_2axis_angle','tbp_lv_x','tbp_lv_y','tbp_lv_z']

class ISICDataset(Dataset):

    __cols__ = __cols__
    
    def __init__(self, 
        hdf5_file : str | h5py.File,
        metadata_file : str | pd.DataFrame,
        img_transform : Optional[Callable] = None,
        metadata_transform : Optional[Callable] = None,
        target_transform : Optional[Callable] = None,
        return_metadata : bool = False,
        target_col : str = "target",
        mode : str = "train"
    ) -> None:
        
        super().__init__()
        
        self.hdf5_file = hdf5_file
        self.metadata_file = metadata_file
        self.img_transform = img_transform
        self.target_transform = target_transform
        self.metadata_transform = metadata_transform
        self.mode = mode
        self.target_col = target_col
        
        self.metadata = pd.read_csv(self.metadata_file) if isinstance(self.metadata_file, str) else self.metadata_file
        self.return_metadata = return_metadata
        
        self.hdf5 = h5py.File(self.hdf5_file, "r") if isinstance(self.hdf5_file, str) else self.hdf5_file

    def get_labels(self) -> list[int]:
        return self.metadata['target'].tolist()
       
    def __len__(self):
        return self.metadata.shape[0]
    
    def __getitem__(self, index : int) -> tuple[tuple[Any,Any],Any]:
        
        ### Get the metadata row
        row = self.metadata.iloc[index]
        
        ### Get the target
        target = row[self.target_col] if self.mode != "test" else 0.0
        
        ### The image
        image_name = row['isic_id']
        dataset = self.hdf5[image_name]
        buffer = dataset[()]
        image_file = io.BytesIO(buffer)
        img = Image.open(image_file)
        img = np.array(img)

        ### The metadata
        metadata = row[ISICDataset.__cols__]
        
        ### Apply the transformations
        if self.target_transform is not None:
            target = self.target_transform(target)
        
        if self.img_transform is not None:
            img = self.img_transform(image=img)
            
        if self.metadata_transform is not None and self.return_metadata:
            metadata = self.metadata_transform(metadata)
        
        if self.return_metadata:
            return (img,metadata), target
        
        return img, target

In [None]:
dataset = ISICDataset(
    hdf5_file=TEST_IMAGES,
    metadata_file=TEST_METADATA,
    mode="test"
)

- Models

In [None]:
class BaseModule(nn.Module):

    def __init__(self,
        model_name : str,
        num_classes : int = 1,
        pretrained : bool = True,
        dropout : float = 0.0,
    ):

        super(BaseModule, self).__init__()

        self.model_name = model_name
        self.num_classes = num_classes
        self.pretrained = pretrained
        self.dropout = dropout

        self.model = self.create_model()

        if self.dropout > 0.0:

            classifier = nn.Sequential(
                nn.Dropout(self.dropout),
                nn.Linear(self.get_dim(), self.num_classes),
            )
            
        else:
            classifier = nn.Linear(self.get_dim(), self.num_classes)

        self.replace_classifier(classifier)


    def create_model(self) -> nn.Module:
        raise NotImplementedError
    
    def replace_classifier(self, classifier : nn.Module) -> 'BaseModule':
        raise NotImplementedError
    
    def get_dim(self) -> int:
        raise NotImplementedError
    
    def forward(self, x: Tensor) -> Tensor:
        
        x = self.model(x)

        if self.num_classes == 1:
            x = torch.squeeze(x, dim=-1)

        return x
    
    def predict(self, x: Tensor) -> Tensor:

        x =  self.forward(x)

        if self.num_classes == 1:
            x = torch.sigmoid(x)
        else:
            x = torch.softmax(x, dim=-1)

        return x
    
    def as_backbone(self) -> 'BaseModule':
        self.replace_classifier(nn.Identity())
        return self

In [None]:
class Coat(BaseModule):

    __models__ = {
        'coat_lite_tiny' : (timm.models.coat.coat_lite_tiny,320),
        'coat_lite_mini' : (timm.models.coat.coat_lite_mini,512),
    }

    def __init__(self, model_name: str, num_classes: int = 1, pretrained: bool = True, dropout: float = 0):
        super().__init__(model_name, num_classes, pretrained, dropout)

    def create_model(self) -> nn.Module:
        model_fn, _ = self.__models__[self.model_name]
        model = model_fn(pretrained=self.pretrained)
        return model
    
    def replace_classifier(self, classifier: nn.Module) -> 'Coat':
        self.model.head = classifier
        return self
    
    def get_dim(self) -> int:
        _, dim = self.__models__[self.model_name]
        return dim

In [None]:
class Eva02(BaseModule):

    __models = {
        'eva02_tiny_patch14_224' : timm.models.eva.eva02_tiny_patch14_224,
        'eva02_tiny_patch14_336' : timm.models.eva.eva02_tiny_patch14_336
    }

    __dims__ = {
        'eva02_tiny_patch14_224' : 192,
        'eva02_tiny_patch14_336' : 192
    }

    def __init__(self, model_name: str, num_classes: int = 1, pretrained: bool = True, dropout: float = 0):
        super().__init__(model_name, num_classes, pretrained, dropout)

    def create_model(self) -> nn.Module:
        model_fn = self.__models[self.model_name]
        model = model_fn(pretrained=self.pretrained, num_classes=self.num_classes)
        return model
    
    def get_dim(self) -> int:
        return self.__dims__[self.model_name]
    
    def replace_classifier(self, classifier: nn.Module) -> 'Eva02':
        self.model.head = classifier
        return self

In [None]:
def create_model(model_name : str):

    if model_name.startswith('coat'):
        return Coat(model_name,pretrained=False)
    elif model_name.startswith('eva02'):
        return Eva02(model_name,pretrained=False)
    else:
        raise ValueError(f"Unknown model {model_name}")

In [None]:
configs = [
    {
        "model_name" : "coat_lite_tiny",
        "checkpoints" : "/home/abdelnour/Documents/projects/skin-cancer-detection/expirements/coat_lite_tiny_2/checkpoints",
        "feature_name" : "coat_lite_tiny_2",
        "transforms" : A.Compose([
            A.Resize(224,224,p=1.0),
            A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225),max_pixel_value=255.0,p=1.0),
            AP.ToTensorV2(p=1.0),
        ],p=1.0),
        "order" : ['fold_0/model_fold=0.pt','fold_1/model_fold=1.pt','fold_2/model_fold=2.pt','fold_3/model_fold=3.pt','fold_4/model_fold=4.pt'],
    },
    {
        "model_name" : "eva02_tiny_patch14_224",
        "checkpoints" : "/home/abdelnour/Documents/projects/skin-cancer-detection/expirements/eva02_224/checkpoints",
        "feature_name" : "eva02_tiny_patch14_224",
        "transforms" : A.Compose([
            A.Resize(224,224,p=1.0),
            A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225),max_pixel_value=255.0,p=1.0),
            AP.ToTensorV2(p=1.0),
        ],p=1.0),
        "order" : ['fold_0/model_fold=0.pt','fold_1/model_fold=1.pt','fold_2/model_fold=2.pt','fold_3/model_fold=3.pt','fold_4/model_fold=4.pt'],
    },
]

In [None]:
def load_models(model_type : str,checkpoints_dir : str, files : list[str]):
    
    models = []
        
    for file in tqdm(files):
        
        model = create_model(model_type)
        
        state_dict = torch.load(os.path.join(checkpoints_dir, file),map_location=DEVICE)
        msg = model.load_state_dict(state_dict,strict=False)

        print(msg)
        
        model = model.to(DEVICE)
        model = model.eval()
        
        models.append(model)
        
    return models

- Adding features

In [None]:
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    prefetch_factor=2,
)

In [None]:
def predict(models : list[torch.nn.Module], dataloader: DataLoader):

    result = dict()
        
    for i,_ in enumerate(models):
        result[f"y_pred_{i}"] = []
    
    with torch.inference_mode():
        
        for x,_ in tqdm(dataloader):

            x = x["image"].to(DEVICE)

            for i,model in enumerate(models):
                y_hat = model.predict(x).detach().cpu().numpy()
                result[f"y_pred_{i}"].extend(y_hat)
            
    result['isic_id'] = dataloader.dataset.metadata['isic_id']

    return pd.DataFrame(result)

In [None]:
def get_features(configs : list[dict]):

    features = []

    for config in configs:

        ### Preprocessing
        dataset.img_transform = config['transforms']
        
        ### Load the models
        models = load_models(config['model_name'],config['checkpoints'],config['order'])

        ### Predict
        preds = predict(models,dataloader)

        ### Save the features
        features.append(pd.DataFrame({
            'isic_id' : preds['isic_id'],
            config['feature_name'] : preds.drop(columns='isic_id').mean(axis=1)
        }))

    merged_features = features[0]

    for feature in features[1:]:
        merged_features = merged_features.merge(feature,on='isic_id')

    return merged_features

In [None]:
test_features = get_features(configs)

  0%|          | 0/5 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
test_features.head()

Unnamed: 0,isic_id,coat_lite_tiny_2,eva02_tiny_patch14_224
0,ISIC_0015657,0.00722,0.00939
1,ISIC_0015729,0.000469,0.001132
2,ISIC_0015740,0.000926,0.000969


- Preprocessing

In [None]:
(test_df,_),_ = prepare_data(test_df,test_features,preprocessor)

- Predictions

In [None]:
def predict(
    models : list,
    test_df : pd.DataFrame,
):

    preds = []

    for i in range(5):
        pred = w_lgbm * models['lgb'][i].predict_proba(test_df)[:,1] \
            + (1 - w_xgb - w_lgbm) * models['cat'][i].predict_proba(test_df)[:,1] \
            + w_xgb * models['xgb'][i].predict_proba(test_df)[:,1]
        preds.append(pred)

    preds = np.mean(preds,axis=0)

    return preds

In [None]:
preds = predict(models,test_df)

In [None]:
submission = pd.DataFrame({
    'isic_id' : dataset.metadata.isic_id,
    'target' : preds
})

In [None]:
submission.head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.001045
1,ISIC_0015729,0.000259
2,ISIC_0015740,0.00036
