### Import necessary libraries

In [29]:
import os
import sys
import torch
import optuna
import random
import numpy as np
import pandas as pd
import optuna
from tqdm.auto import tqdm
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import sys
import logging
sys.path.append('../..')
from definitions import ISIS_2024_DIR
import pickle

In [2]:
import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell outputs short, but it also hides deprecation warnings raised by the
# libraries used below.
warnings.filterwarnings("ignore")

### Device for inference

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {DEVICE} for inference')

Using cuda for inference


### Reproducibility

In [4]:
def seed_everything(seed : int) -> None:
    """Seed all random number generators used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and torch (CPU and all
    CUDA devices), switches cuDNN to its deterministic, non-benchmarking mode,
    and writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these callables takes the seed as its only argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

### Global variables

In [5]:
# Kaggle input folder of the competition. In this notebook it is only
# referenced by the commented-out test-metadata path in the data loading cell.
DATA_DIR = "/kaggle/input/isic-2024-challenge"

In [6]:
# Number of cross-validation folds given to StratifiedGroupKFold.
N_SPLITS = 5
# Number of negative (target == 0) rows drawn in the downsampling cell.
N_NEGATIVE = 91700
# NOTE(review): presumably the blend weights for LightGBM / CatBoost
# predictions; neither constant is used by any cell visible here — confirm.
LGB_WEIGHTS = 0.4
CB_WEIGHTS = 0.6
# Patience passed to lgb.early_stopping through train_lgb.
EARLY_STOPPING_ROUNDS = 30

In [7]:
# Single seed shared by the RNGs, the negative sampling, LightGBM and Optuna.
SEED = 42
seed_everything(SEED)

### Data loading

In [8]:
# Training metadata lives in the project data directory exported by `definitions`.
train_meta_path = os.path.join(ISIS_2024_DIR, 'metadata.csv')
# Test metadata loading is disabled in this notebook.
# test_meta_path = os.path.join(DATA_DIR, 'test-metadata.csv')

In [9]:
# Read the full training metadata table into pandas.
train_metadata = pd.read_csv(train_meta_path)
# test_metadata = pd.read_csv(test_meta_path)

In [10]:
# Encode missing ages as -1 so the column stays numeric. Because age_approx is
# also a grouping key in aggregate_features, the -1 rows form their own group.
train_metadata['age_approx'] = train_metadata['age_approx'].fillna(-1)
# test_metadata['age_approx'] = test_metadata['age_approx'].fillna(-1)

### Columns

In [11]:
# Columns that exist only in the train metadata; they are dropped below
# (via drop_features) and never used as model inputs.
train_only_columns = ['lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence']

# Feature groups available in both train and test. They are concatenated,
# together with the engineered columns, into train_cols in the encoding cell.
general_features = ['age_approx', 'sex', 'tbp_tile_type']
color_features = ['tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_norm_color', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt']
shape_features = ['clin_size_long_diam_mm', 'tbp_lv_areaMM2', 'tbp_lv_perimeterMM', 'tbp_lv_area_perim_ratio', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM']
location_features = ['anatom_site_general', 'tbp_lv_location', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z']
additional_features = ['tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'attribution']
# Label column and the column used to group rows in cross-validation.
target = 'target'
group_col = 'patient_id'

# Identifier / bookkeeping columns plus the train-only columns: all removed
# from the working frame before feature engineering.
drop_features = ['isic_id', 'image_type', 'copyright_license', 'tbp_lv_location_simple'] + train_only_columns

### Downsampling

In [12]:
# Split the rows by label; negatives are down-sampled to N_NEGATIVE rows with a
# fixed seed.
positive_samples = train_metadata.loc[train_metadata['target'] == 1]
negative_samples = (
    train_metadata
    .loc[train_metadata['target'] == 0]
    .sample(n=N_NEGATIVE, random_state=SEED)
)

# The down-sampled frame is currently disabled; the full metadata is used instead.
#df = pd.concat([positive_samples, negative_samples], axis=0).reset_index(drop=True)

# drop() returns a new frame, so train_metadata itself keeps every column.
df = train_metadata.drop(columns=drop_features)

### Feature engineering and aggregation

In [13]:
def aggregate_features(df):
    """Add engineered colour ratio / contrast features to the metadata.

    For each colour channel ``ch`` in ``A, B, C, H, L`` the following Float32
    columns are appended, in this order:

    * ``tbp_lv_ratio_<ch>``: ``tbp_lv_<ch> / (tbp_lv_<ch>ext * min(tbp_lv_<ch>))``
      with the minimum taken within each ``age_approx`` group.
    * ``tbp_lv_contrast_<ch>``: ``tbp_lv_<ch> - tbp_lv_<ch>ext``.
    * ``tbp_lv_patient_ratio_<ch>`` / ``tbp_lv_patient_contrast_<ch>``: the
      ratio / contrast divided by its mean within each ``patient_id`` group.
    * ``tbp_lv_age_ratio_<ch>`` / ``tbp_lv_age_contrast_<ch>``: the same,
      divided by its mean within each ``age_approx`` group.

    Args:
        df: table accepted by ``pl.DataFrame`` that contains the
            ``tbp_lv_<ch>`` and ``tbp_lv_<ch>ext`` columns as well as
            ``age_approx`` and ``patient_id``.

    Returns:
        ``(df, columns)``: the polars DataFrame with the 30 new columns and the
        list of their names in creation order.
    """
    channels = ('A', 'B', 'C', 'H', 'L')
    kinds = ('ratio', 'contrast')
    # (name prefix of the derived feature, grouping column of the window)
    scopes = (('patient', 'patient_id'), ('age', 'age_approx'))

    def _relative_to_group_mean(kind, scope, group_key):
        # One expression per channel: value divided by its mean inside group_key.
        return [
            pl.col(f'tbp_lv_{kind}_{ch}')
            .truediv(pl.col(f'tbp_lv_{kind}_{ch}').mean())
            .over(group_key)
            .cast(pl.Float32)
            .alias(f'tbp_lv_{scope}_{kind}_{ch}')
            for ch in channels
        ]

    df = pl.DataFrame(df)

    # Ratio of each channel to the product of its "ext" value and the
    # per-age-group minimum of the channel.
    df = df.with_columns([
        pl.col(f'tbp_lv_{ch}')
        .truediv(pl.col(f'tbp_lv_{ch}ext').mul(pl.col(f'tbp_lv_{ch}').min()))
        .over('age_approx')
        .cast(pl.Float32)
        .alias(f'tbp_lv_ratio_{ch}')
        for ch in channels
    ])

    # Contrast between each channel and its "ext" value.
    df = df.with_columns([
        pl.col(f'tbp_lv_{ch}')
        .sub(pl.col(f'tbp_lv_{ch}ext'))
        .cast(pl.Float32)
        .alias(f'tbp_lv_contrast_{ch}')
        for ch in channels
    ])

    # Patient-relative features first, then age-relative ones; within each
    # scope the ratios come before the contrasts.
    for scope, group_key in scopes:
        for kind in kinds:
            df = df.with_columns(_relative_to_group_mean(kind, scope, group_key))

    columns = [f'tbp_lv_{kind}_{ch}' for kind in kinds for ch in channels]
    columns += [
        f'tbp_lv_{scope}_{kind}_{ch}'
        for scope, _ in scopes
        for kind in kinds
        for ch in channels
    ]

    return df, columns

In [14]:
# Add the engineered columns and go back to pandas for the rest of the notebook.
df, new_cols = aggregate_features(df)
df = df.to_pandas()

# The test-set counterpart is disabled by wrapping it in a string literal.
# As the last expression of the cell, that string is echoed as the cell output.
"""test_df = test_metadata.copy()

test_df, _ = aggregate_features(test_df)
test_df = test_df.to_pandas()
test_df.drop(columns=['isic_id', 'image_type', 'copyright_license', 'tbp_lv_location_simple', 'patient_id'], inplace=True)"""

"test_df = test_metadata.copy()\n\ntest_df, _ = aggregate_features(test_df)\ntest_df = test_df.to_pandas()\ntest_df.drop(columns=['isic_id', 'image_type', 'copyright_license', 'tbp_lv_location_simple', 'patient_id'], inplace=True)"

### Encoding

In [15]:
# Model inputs: the raw feature groups plus the engineered columns.
train_cols = general_features + color_features + shape_features + location_features + additional_features + new_cols
# Object-dtype inputs are treated as categorical; numeric inputs are listed
# separately so they can be down-cast.
categorical_cols = df[train_cols].select_dtypes(include=['object']).columns
num_features = df[train_cols].select_dtypes(include=[np.number]).columns

# cast numerical columns to float32
df[num_features] = df[num_features].astype('float32')
# test_df[num_features] = test_df[num_features].astype('float32')

In [16]:
# Count nulls per column and display only the columns that contain any.
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

sex                    11517
anatom_site_general     5756
dtype: int64

In [17]:
# Give missing categorical values an explicit 'missing' level instead of NaN.
df[categorical_cols] = df[categorical_cols].fillna('missing')
# test_df[categorical_cols] = test_df[categorical_cols].fillna('missing')

### Partial AUC

In [18]:
import pandas.api.types
from sklearn.metrics import roc_curve, auc, roc_auc_score

def pauc_score(solution: pd.DataFrame, submission: pd.DataFrame, min_tpr: float=0.80) -> float:
    '''
    2024 ISIC Challenge metric: pAUC

    Given a solution file and submission file, this function returns the
    partial area under the receiver operating characteristic (pAUC)
    above a given true positive rate (TPR), 0.80 by default.
    https://en.wikipedia.org/wiki/Partial_Area_Under_the_ROC_Curve.

    (c) 2024 Nicholas R Kurtansky, MSKCC

    Args:
        solution: ground truth pd.DataFrame of 1s and 0s
        submission: solution dataframe of predictions of scores ranging [0, 1]
        min_tpr: lower TPR bound of the integrated region, in [0, 1)

    Returns:
        Float value range [0, max_fpr], where max_fpr = 1 - min_tpr

    Raises:
        ValueError: if the submission is not numeric or min_tpr is outside [0, 1)
    '''

    # check submission is numeric
    if not pandas.api.types.is_numeric_dtype(submission.values):
        raise ValueError("Expected numeric values in submission, got: %r" % submission.values)

    # Validate min_tpr before the ROC computation. Deriving the bound with
    # abs(1 - min_tpr) would silently accept out-of-range values such as 1.5.
    if not 0 <= min_tpr < 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)
    max_fpr = 1 - min_tpr

    # rescale the target. set 0s to 1s and 1s to 0s (since sklearn only has max_fpr)
    v_gt = abs(np.asarray(solution.values)-1)

    # flip the submissions to their complements
    v_pred = -1.0*np.asarray(submission.values)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr == 1:
        # min_tpr == 0: the partial area is the full area under the curve.
        return auc(fpr, tpr)

    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)

#     # Equivalent code that uses sklearn's roc_auc_score
#     v_gt = abs(np.asarray(solution.values)-1)
#     v_pred = np.array([1.0 - x for x in submission.values])
#     max_fpr = abs(1-min_tpr)
#     partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
#     # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
#     # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
#     partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

### Cross Validation Setup

In [19]:
# Splitter shared by every train_lgb call: it takes the label and the patient
# id as groups. No shuffle / random_state is passed, so sklearn's defaults apply.
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS)
# Disabled code (kept as a string literal) that would store a fold id per row.
"""
df['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(sgkf.split(df, df[target], df[group_col])):
    df.loc[val_idx, 'fold'] = fold
"""

"\ndf['fold'] = -1\nfor fold, (train_idx, val_idx) in enumerate(sgkf.split(df, df[target], df[group_col])):\n    df.loc[val_idx, 'fold'] = fold\n"

### LightGBM Training

In [20]:
def train_lgb(data, cat_cols, train_cols, params, early_stoping_rounds,pos_weight=1.0):
    """Cross-validate a LightGBM model and return the per-fold boosters and scores.

    Relies on the notebook-level names ``sgkf`` (fold splitter), ``target``
    (label column), ``group_col`` (grouping column) and ``pauc_score``.

    Args:
        data: pandas DataFrame holding the feature, label and group columns.
            It is not modified.
        cat_cols: names of the columns in ``train_cols`` to treat as categorical.
        train_cols: names of the feature columns, in model input order.
        params: LightGBM parameter dict forwarded to ``lgb.train``.
        early_stoping_rounds: patience forwarded to ``lgb.early_stopping``.
        pos_weight: sample weight of the rows whose label is not 0; rows with
            label 0 get weight 1.0.

    Returns:
        ``(models, scores)``: one trained booster and one validation pAUC per fold.
    """
    # Cast the categorical columns on the selected feature frame. astype()
    # returns a new object, so the caller's DataFrame keeps its dtypes.
    X = data[train_cols].astype({col: 'category' for col in cat_cols})
    y = data[target]
    groups = data[group_col]

    # LightGBM receives the categorical features as positional indices.
    cat_features = [X.columns.get_loc(col) for col in cat_cols]

    models = []
    scores = []

    # Perform cross-validation
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, groups)):

        # Split data
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Vectorised per-row weights: 1.0 for label 0, pos_weight otherwise.
        weights = np.where(y_train.to_numpy() == 0, 1.0, pos_weight)

        # Create LightGBM datasets
        train_data = lgb.Dataset(X_train, label=y_train, weight=weights, categorical_feature=cat_features)
        val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_features)

        # Train with early stopping on the validation fold; logging is silenced.
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(early_stoping_rounds, verbose=0), lgb.log_evaluation(0)],
        )

        # Predict validation data
        y_pred = model.predict(X_val)

        # Calculate PAUC. to_frame() keeps whatever name the label column has
        # instead of assuming it is literally 'target'.
        score = pauc_score(y_val.to_frame(), pd.DataFrame(y_pred, columns=['prediction']))
        print(f'Fold {fold+1} - PAUC: {score}')

        models.append(model)
        scores.append(score)

    return models, scores


- Baseline model

In [21]:
### LightGBM parameters
# Baseline configuration passed to train_lgb in the next cell.
# NOTE(review): with max_depth=4 a tree has at most 2**4 = 16 leaves, so
# num_leaves=64 is presumably never the binding limit — confirm.
lgb_params = {
    'seed': SEED,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'min_child_samples': 48,
    'num_iterations': 3000,
    'learning_rate': 0.03,
    'extra_trees': True,
    'metric': 'binary_logloss',
    'reg_alpha': 0.1,
    'reg_lambda': 0.8,
    'num_leaves': 64,
    'device': 'gpu',
    'max_bin': 128,
    'max_depth': 4,
    'verbose': -1
}

### CatBoost parameters
# NOTE(review): cb_params is not used by any cell visible in this notebook.
# 'num_leaves': 3000 mirrors LightGBM's 'num_iterations' value above and looks
# like it may have been meant as an iteration count — confirm before use.
cb_params = {
    'random_seed': SEED,
    'grow_policy': 'Depthwise',
    'loss_function': 'Logloss',
    'min_child_samples': 48,
    'learning_rate': 0.03,
    'task_type': 'GPU',
    'reg_lambda': 0.8,
    'num_leaves': 3000,
    'depth': 4
}

In [22]:
# Cross-validate the baseline configuration (pos_weight keeps its default of 1.0).
models, scores = train_lgb(df, categorical_cols, train_cols, lgb_params, EARLY_STOPPING_ROUNDS)

Fold 1 - PAUC: 0.17517988538029966
Fold 2 - PAUC: 0.16151157736407795
Fold 3 - PAUC: 0.16284337545241287
Fold 4 - PAUC: 0.15494436012948978
Fold 5 - PAUC: 0.16490062543748288


In [23]:
# Mean and standard deviation of the per-fold pAUC scores.
print(f'Average PAUC: {np.mean(scores)}')
print(f'STD PAUC: {np.std(scores)}')

Average PAUC: 0.16387596475275262
STD PAUC: 0.006561382180697987


### Hyper-parameter tuning

In [24]:
# NOTE(review): presumably meant to keep the k best models found during tuning;
# neither name is used by any cell visible in this notebook — confirm.
top_k_models = {}
k = 5

In [33]:
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: mean cross-validated pAUC of one LightGBM configuration.

    Samples the LightGBM hyper-parameters and a positive-class sample weight
    from ``trial``, cross-validates them with ``train_lgb`` on the
    notebook-level ``df`` / ``categorical_cols`` / ``train_cols``, and returns
    the mean of the per-fold scores.

    Args:
        trial: Optuna trial used to sample the search space.

    Returns:
        Mean pAUC over the folds (the study maximises it).
    """
    lgb_params = {
        'seed': SEED,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'min_child_samples': trial.suggest_int("min_child_samples", 5, 500),
        # Sampled under the name "n_estimators" so existing trials stay comparable.
        'num_iterations': trial.suggest_int("n_estimators", 100, 10000, step=100),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform and
        # matches how reg_alpha / reg_lambda are sampled below.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 2, 128),
        'device': 'gpu',
        'verbose': -1,
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect when a bagging frequency (subsample_freq / bagging_freq) > 0
        # is set; none is set here, so this value presumably has no effect —
        # confirm before changing, since stored trials were run this way.
        "subsample": trial.suggest_float("subsample", 0.4, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1),
    }

    pos_class_weight = trial.suggest_float("pos_class_weight", 1, 1200)
    _, fold_scores = train_lgb(df, categorical_cols, train_cols, lgb_params, EARLY_STOPPING_ROUNDS, pos_class_weight)

    # Return a plain Python float, as the annotation promises.
    return float(np.mean(fold_scores))

In [37]:
def load_or_create_study(study_name: str) -> optuna.study.Study:
    """Open the SQLite-backed study ``study_name``, creating it when missing.

    The sampler is taken from ``./sampler.pkl`` when that file exists;
    otherwise a ``TPESampler`` seeded with ``SEED`` is created and pickled to
    that path straight away.

    Args:
        study_name: name of the study; also the stem of the ``.db`` file.

    Returns:
        The maximising study bound to the chosen sampler.
    """
    sampler_file = './sampler.pkl'

    if os.path.exists(sampler_file):
        # Reuse the sampler written by an earlier run. pickle.load executes
        # whatever the file contains, so only point this at a trusted file.
        with open(sampler_file, 'rb') as f:
            sampler = pickle.load(f)
    else:
        sampler = optuna.samplers.TPESampler(seed=SEED)
        with open(sampler_file, 'wb') as f:
            pickle.dump(sampler, f)

    return optuna.create_study(
        direction="maximize",
        study_name=study_name,
        storage=f"sqlite:///{study_name}.db",
        load_if_exists=True,
        sampler=sampler,
    )

In [38]:
# Open (or create) the study stored in lgbm_optuna.db.
study = load_or_create_study('lgbm_optuna')

Using an existing study with name 'lgbm_optuna' instead of creating a new one.


[I 2024-08-01 17:39:21,043] Using an existing study with name 'lgbm_optuna' instead of creating a new one.


In [39]:
# load_or_create_study only pickles the sampler before any trial has run, so
# without the dump below a resumed session restores the initial RNG state.
# Writing the sampler back after optimisation (also on interruption) lets the
# next session continue from the advanced state.
try:
    study.optimize(objective, n_trials=100, show_progress_bar=True)
finally:
    with open('./sampler.pkl', 'wb') as f:
        pickle.dump(study.sampler, f)

  0%|          | 0/100 [00:00<?, ?it/s]

Fold 1 - PAUC: 0.15875363311907095
Fold 2 - PAUC: 0.14701910483459799
Fold 3 - PAUC: 0.14391068438464638
Fold 4 - PAUC: 0.15342230720913655
Fold 5 - PAUC: 0.1496480283347117
Trial 1 finished with value: 0.15055075157643272 and parameters: {'min_child_samples': 190, 'n_estimators': 9600, 'learning_rate': 0.035093904781414616, 'reg_alpha': 0.24810409748678125, 'reg_lambda': 0.004207988669606638, 'num_leaves': 41, 'max_depth': 9, 'subsample': 0.9197056874649611, 'colsample_bytree': 0.7606690070459252, 'pos_class_weight': 849.9790207774586}. Best is trial 1 with value: 0.15055075157643272.
[I 2024-08-01 17:40:21,085] Trial 1 finished with value: 0.15055075157643272 and parameters: {'min_child_samples': 190, 'n_estimators': 9600, 'learning_rate': 0.035093904781414616, 'reg_alpha': 0.24810409748678125, 'reg_lambda': 0.004207988669606638, 'num_leaves': 41, 'max_depth': 9, 'subsample': 0.9197056874649611, 'colsample_bytree': 0.7606690070459252, 'pos_class_weight': 849.9790207774586}. Best is t

In [40]:
# Display the sampled parameters of the best trial stored in the study.
study.best_params

{'min_child_samples': 485,
 'n_estimators': 8400,
 'learning_rate': 0.007512103292915293,
 'reg_alpha': 0.0027291905128624325,
 'reg_lambda': 0.23105994775417862,
 'num_leaves': 191,
 'max_depth': 39,
 'subsample': 0.45990831207544924,
 'colsample_bytree': 0.4649846963529439,
 'pos_class_weight': 545.918879501255}

In [None]:
# Best trial of the study written out as a LightGBM parameter dict.
# The tuned pos_class_weight (545.918879501255) is not a LightGBM parameter:
# pass it as train_lgb's pos_weight argument instead.
{
    'seed': SEED,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'min_child_samples': 485,
    'num_iterations': 8400,
    'learning_rate': 0.007512103292915293,
    'reg_alpha': 0.0027291905128624325,
    'reg_lambda': 0.23105994775417862,
    "num_leaves": 191,
    "max_depth": 39,
    'device': 'gpu',
    'verbose': -1,
    "subsample": 0.45990831207544924,
    # Was 545.918879501255 (the pos_class_weight value); the tuned fraction is:
    "colsample_bytree": 0.4649846963529439,
}

### Inference

In [25]:
# NOTE(review): test_df is only built inside commented-out / string-literal
# code earlier in the notebook, so this cell raises NameError on a fresh
# Restart & Run All unless the test-set loading is re-enabled.
for col in categorical_cols:
    test_df[col] = test_df[col].astype('category')
    
# Average the predictions of the per-fold models.
lgb_preds = np.mean([model.predict(test_df[train_cols])for model in models], 0)

### Submission

In [26]:
# Two-column submission frame: image id and predicted probability.
sub = pd.DataFrame({
    'isic_id': test_metadata['isic_id'],
    'target': lgb_preds,
})

In [27]:
# Write the submission file without the index and preview its first lines.
sub[['isic_id', 'target']].to_csv('submission.csv', index=False)
!head submission.csv

isic_id,target
ISIC_0015657,0.00010571025692889189
ISIC_0015729,7.672189515860861e-05
ISIC_0015740,0.00022154247309468452
