In [None]:
# ==================================================
# Import
# ==================================================

import json
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

from imblearn.ensemble import BalancedRandomForestClassifier

from xgboost import XGBClassifier

# EDA

In [None]:
# ==================================================
# EDA
# ==================================================

# Load the training file (path is relative to the working directory); every
# EDA cell below reads from `df`.
df = pd.read_csv('train.csv')

# `dtype` & Missing Values
df.info()

In [None]:
# Categorical Variables VS 'Cancer'
# For each non-numeric column except the 'ID' column, print the class-count
# crosstab together with per-level odds (count of 1 / count of 0) and the
# weight of evidence relative to the odds of the 'All' margin row.
eda_cat_col = df.select_dtypes(exclude = 'number').columns
rule = '=' * 40
for c in eda_cat_col:
    if c == 'ID':
        continue
    ct = pd.crosstab(df[c], df['Cancer'], margins = True)
    level_odds = ct[1] / ct[0]
    ct['Odds'] = level_odds
    ct['WOE'] = np.log(ct['Odds'] / ct.loc['All', 'Odds'])
    print(rule)
    print(ct)
    print(rule, '\n')

In [None]:
# Numerical Variables VS 'Cancer'
# `eda_num_col` deliberately keeps the target: later cells select it together
# with the predictors.
eda_num_col = df.select_dtypes(include='number').columns.to_list()
n = len(eda_num_col)

# Only predictors get a panel. The grid is sized from this list and each axis
# is paired with its column directly, so the layout no longer depends on where
# 'Cancer' sits among the numeric columns or on there being exactly five
# predictors.
eda_box_col = [c for c in eda_num_col if c != 'Cancer']

fig, axes = plt.subplots(
    1, len(eda_box_col),
    figsize=(4 * len(eda_box_col), 4),
    squeeze=False  # always a 2-D array, even with a single panel
)
axes = axes.flatten()

for ax, c in zip(axes, eda_box_col):
    # One box per class of the target.
    data_0 = df.loc[df['Cancer'] == 0, c]
    data_1 = df.loc[df['Cancer'] == 1, c]

    ax.boxplot([data_0, data_1], tick_labels=['0', '1'])
    ax.set_title(f'{c}')
    ax.set_xlabel('Cancer')
    ax.set_ylabel(c)

plt.tight_layout()
plt.show()

In [None]:
# Binned Numerical Variables VS 'Cancer'
# Each predictor is cut into k equal-width bins (k = floor(log2(n_rows) + 1))
# and the weight of evidence of every bin is plotted against the bin index.
k = int(np.log2(len(df)) + 1)
bins = df.copy()[eda_num_col]

# Only predictors get a panel; pairing axes with this list keeps the layout
# correct wherever 'Cancer' sits among the numeric columns and for any number
# of predictors.
eda_bin_col = [c for c in eda_num_col if c != 'Cancer']

fig, axes = plt.subplots(
    1, len(eda_bin_col),
    figsize=(4 * len(eda_bin_col), 4),
    squeeze=False  # always a 2-D array, even with a single panel
)
axes = axes.flatten()

for ax, c in zip(axes, eda_bin_col):
    bins[f'{c}_bin'] = pd.cut(df[c], bins = k, labels = False)
    ct = pd.crosstab(bins[f'{c}_bin'], bins['Cancer'], margins = True)
    ct['Odds'] = ct[1] / ct[0]
    # WOE relative to the overall odds stored in the 'All' margin row.
    ct['WOE'] = np.log(ct['Odds'] / ct.loc['All', 'Odds'])
    ct = ct.drop('All')

    ax.plot(ct.index.astype(int), ct['WOE'])
    ax.set_title(c)
    ax.set_xlabel('Bin')
    ax.set_ylabel('WOE')

plt.tight_layout()
plt.show()

In [None]:
# Clustered Numerical Variables VS 'Cancer'
# Standardise the five numeric predictors, assign each row to one of 16
# k-means clusters, and chart the weight of evidence of every cluster.
eda_cluster_col = ['Age', 'Nodule_Size', 'TSH_Result', 'T4_Result', 'T3_Result']
clusters = df[eda_num_col].copy()

scaler = StandardScaler()
kmeans = KMeans(n_clusters = 16, random_state = 42) # Based on Silhouette Score

clusters[eda_cluster_col] = scaler.fit_transform(clusters[eda_cluster_col])
clusters['Cluster'] = kmeans.fit_predict(clusters[eda_cluster_col])

ct = pd.crosstab(clusters['Cluster'], clusters['Cancer'], margins = True)
cluster_odds = ct[1] / ct[0]
ct['Odds'] = cluster_odds
# WOE relative to the overall odds stored in the 'All' margin row.
ct['WOE'] = np.log(ct['Odds'] / ct.loc['All', 'Odds'])
ct = ct.drop('All')

cluster_fig, cluster_ax = plt.subplots(figsize=(4, 4))
cluster_ax.bar(ct.index.astype(int), ct['WOE'])
cluster_ax.set_title('Cluster')
cluster_ax.set_xlabel('Cluster')
cluster_ax.set_ylabel('WOE')

plt.tight_layout()
plt.show()

# Preprocessing

In [None]:
# ==================================================
# Preprocessing
# ==================================================

# Clustered & Binned Numerical Variables
def AddVariables(X, train_path = 'train.csv'):
    """Return a copy of ``X`` with binned and clustered versions of the numeric predictors.

    The discretiser, scaler and k-means model are always fitted on the numeric
    predictors of the training file, so train and test rows are mapped with
    the same bin edges and cluster centres. Note that they are fitted on the
    whole training file, i.e. before any cross-validation split is made.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Age', 'Nodule_Size', 'TSH_Result', 'T4_Result' and
        'T3_Result'. It is not modified.
    train_path : str, default 'train.csv'
        CSV file the transformers are fitted on.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` plus one '<col>_Binned' column per predictor (ordinal
        bin index out of 17 bins) and a 'Cluster' column (one of 16 k-means
        clusters), all stored with object dtype so that downstream
        `select_dtypes(exclude='number')` treats them as categorical.
    """
    cols_to_modify = [
        'Age', 'Nodule_Size','TSH_Result', 'T4_Result', 'T3_Result'
    ]
    cols_binned = [col + '_Binned' for col in cols_to_modify]

    train_num = pd.read_csv(train_path)[cols_to_modify]

    # Work on a copy: the caller's frame must not gain columns as a side effect.
    out = X.copy()

    binner = KBinsDiscretizer(
        n_bins = 17, encode = 'ordinal'
    ).fit(train_num)
    out[cols_binned] = binner.transform(out[cols_to_modify])

    # Keep the scaled data in DataFrames so k-means is fitted and applied with
    # the same feature names.
    scaler = StandardScaler()
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train_num),
        columns = cols_to_modify, index = train_num.index
    )
    cluster = KMeans(n_clusters = 16, random_state = 42).fit(train_scaled)

    X_scaled = pd.DataFrame(
        scaler.transform(out[cols_to_modify]),
        columns = cols_to_modify, index = out.index
    )
    out['Cluster'] = cluster.predict(X_scaled)

    out['Cluster'] = out['Cluster'].astype(object)
    out[cols_binned] = out[cols_binned].astype(object)

    return out

# Custom WOE Encoder
class WOEEncoder(BaseEstimator, TransformerMixin):
    """Replace each category by its weight of evidence (WOE) against a 0/1 target.

    For a category ``c`` of a column,
    ``WOE(c) = ln((pos_c / total_pos) / (neg_c / total_neg))``, where a zero
    positive or negative count is replaced by 0.5 before the ratio is taken.

    Parameters
    ----------
    columns : list-like or None, default None
        Columns to encode. ``None`` encodes every column passed to ``fit``.
    handle_unknown : str, default 'value'
        ``'value'`` fills values without a learned WOE (unseen categories and
        missing values) with ``unknown_value``; any other setting makes
        ``transform`` raise ``ValueError`` for them.
    unknown_value : float, default 0.0
        Fill value used when ``handle_unknown == 'value'``.

    Attributes
    ----------
    columns_ : list
        Columns actually encoded, resolved during ``fit``.
    woe_dict_ : dict
        ``{column: {category: woe}}``.
    inv_woe_dict_ : dict
        ``{column: {woe: category}}``; categories sharing a WOE value collapse
        to a single entry.
    """

    def __init__(self, columns = None, handle_unknown = 'value', unknown_value = 0.0):
        # Constructor arguments are stored untouched: sklearn.base.clone()
        # rejects estimators whose __init__ copies or alters a parameter.
        self.columns = columns
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self._output_transform = 'default'

    def set_output(self, *, transform = None):
        """Select the container returned by ``transform``: 'default' (NumPy) or 'pandas'."""
        transform = transform or 'default'
        if transform not in ('default', 'pandas'):
            raise ValueError("transform must be 'default' or 'pandas'")
        self._output_transform = transform
        return self

    def get_feature_names_out(self, input_features = None):
        """Return the encoded column names (the encoder keeps input names unchanged)."""
        if hasattr(self, 'columns_'):
            return list(self.columns_)
        return list(self.columns) if self.columns is not None else list(input_features)

    def fit(self, X, y):
        """Learn the WOE of every category of every encoded column.

        ``y`` is matched to the rows of ``X`` by position, so it may be a
        list, an array, or a Series whose index differs from that of ``X``.
        """
        X_df = pd.DataFrame(X)
        # Resolve the columns into a fitted attribute instead of overwriting
        # the `columns` hyperparameter, so a refit on other data starts clean.
        self.columns_ = X_df.columns.tolist() if self.columns is None else list(self.columns)
        X_df = X_df[self.columns_]

        # Positional alignment: label-based alignment would yield NaN targets
        # whenever the two indexes differ.
        target = pd.Series(np.asarray(y).ravel(), index = X_df.index, name = 'target')
        total_pos = target.sum()
        total_neg = len(target) - total_pos

        self.woe_dict_ = {}
        self.inv_woe_dict_ = {}

        for col in self.columns_:
            grp = target.groupby(X_df[col])
            n_pos = grp.sum()
            # Take the negative count from the raw positives; smoothing first
            # would understate it by 0.5 for categories without positives.
            n_neg = grp.count().sub(n_pos)
            pos = n_pos.replace(0, 0.5)
            neg = n_neg.replace(0, 0.5)

            rate_pos = pos / total_pos
            rate_neg = neg / total_neg
            woe = np.log(rate_pos / rate_neg)

            mapping = woe.to_dict()
            self.woe_dict_[col] = mapping
            self.inv_woe_dict_[col] = {v: k for k, v in mapping.items()}

        return self

    def transform(self, X):
        """Map categories to their learned WOE.

        Returns a DataFrame when pandas output was requested via
        ``set_output``. Otherwise returns a NumPy array, flattened to 1-D when
        exactly one column is encoded.

        Raises
        ------
        ValueError
            If a value has no learned WOE and ``handle_unknown != 'value'``.
        """
        X_df = pd.DataFrame(X)
        X_enc = X_df[self.columns_].copy()

        for col in self.columns_:
            mapping = self.woe_dict_[col]
            X_enc[col] = X_enc[col].map(mapping)
            if self.handle_unknown == 'value':
                X_enc[col] = X_enc[col].fillna(self.unknown_value)
            else:
                if X_enc[col].isnull().any():
                    unseen = set(X_df.loc[X_enc[col].isnull(), col])
                    raise ValueError(f"Unseen categories {unseen} in column '{col}'")

        arr = X_enc.values
        if self._output_transform == 'pandas':
            return pd.DataFrame(arr, columns=self.get_feature_names_out(), index = X_df.index)
        if arr.shape[1] == 1:
            return arr.ravel()
        return arr

    def inverse_transform(self, X_enc):
        """Map WOE values back to categories; values with no known category become ``None``.

        Raises
        ------
        ValueError
            If the number of columns differs from the number of encoded columns.
        """
        arr = np.array(X_enc)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        if arr.shape[1] != len(self.columns_):
            raise ValueError(f"Expected {len(self.columns_)} features, got {arr.shape[1]}")

        decoded = []
        for i, col in enumerate(self.columns_):
            inv_map = self.inv_woe_dict_[col]
            col_vals = arr[:, i]
            decoded.append([inv_map.get(v, None) for v in col_vals])
        decoded = np.array(decoded).T
        if decoded.shape[1] == 1:
            decoded = decoded.ravel()
        return decoded if self._output_transform == 'default' else pd.DataFrame(decoded, columns = self.columns_)


In [None]:
# ==================================================
# Preprocessing: Train Data
# ==================================================

# Add Derived Variable
train = AddVariables(pd.read_csv('train.csv'))

# Input-Output Split
y = train['Cancer']
x = train.drop(columns = ['ID', 'Cancer'])

# Categorize Columns
# Every column that is not numeric is treated as categorical.
num_col = x.select_dtypes(include = 'number').columns.to_list()
cat_col = [c for c in x.columns if c not in num_col]

# Preprocessor for Tree-Based Models
# WOE-encode the categoricals, pass the numerics through, return a DataFrame.
prep_tree = ColumnTransformer(transformers = [
    ('cat', WOEEncoder(), cat_col),
    ('num', 'passthrough', num_col),
])
prep_tree.set_output(transform = 'pandas')

# Preprocessor for Non-Tree-Based Models
# Same encoding, followed by standardisation of every column.
cat_scale = Pipeline(steps = [
    ('enc', WOEEncoder()),
    ('scale', StandardScaler()),
])
prep_scale = ColumnTransformer(transformers = [
    ('cat', cat_scale, cat_col),
    ('num', StandardScaler(), num_col),
])

# Set CV
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

In [None]:
# ==================================================
# Preprocessing: Test Data
# ==================================================

# Add Derived Variable
# The test rows get the same derived columns as the training rows; only the
# identifier is dropped before prediction.
test = AddVariables(pd.read_csv('test.csv'))
x_test = test.drop(columns = 'ID')

# LGBM

In [None]:
# ==================================================
# LGBM
# ==================================================

# Define Pipeline
lgbm = Pipeline(steps = [
    ('prep', prep_tree),
    ('model', LGBMClassifier(
        boosting_type = 'gbdt',
        objective = 'binary',
        is_unbalance = True,
        metric = 'auc',
        random_state = 42,
        verbosity = -1,
        device = 'gpu'
    )),
])

# Optuna Objective
def lgbm_objective(trial):
    """Return the mean cross-validated F1 of the LGBM pipeline for one trial.

    Hyperparameters are suggested under their bare names, then applied to the
    'model' step of a fresh clone of `lgbm`, which is scored with
    `cross_validate` on (`x`, `y`) using the shared `cv` splitter.
    """
    suggested = {
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log = True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 2000),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    step_params = {f'model__{name}': value for name, value in suggested.items()}

    candidate = clone(lgbm).set_params(**step_params)

    cv_result = cross_validate(
        candidate, x, y,
        scoring = 'f1',
        cv = cv,
        n_jobs = 3,
        verbose = 1
    )

    return cv_result['test_score'].mean()

In [None]:
# Run Optuna
# PYTHONHASHSEED is read at interpreter start-up, so this assignment does not
# affect the running kernel; it is only inherited by processes started later.
os.environ['PYTHONHASHSEED'] = str(42)
lgbm_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'lgbm',
    sampler = optuna.samplers.TPESampler(seed = 42)
)
# Trials run one after another: a seeded sampler only reproduces the same
# suggestions when trials are not executed in parallel, and each trial already
# runs its CV folds with n_jobs = 3, so parallel trials would stack 3 x 3
# concurrent fits on one device.
lgbm_study.optimize(lgbm_objective, n_trials = 30, n_jobs = 1, show_progress_bar = True)

In [None]:
# Optuna Result
lgbm_best = lgbm_study.best_trial
print('Best parameters:')
print(lgbm_study.best_params)
print('Best CV F1 Score:')
print(lgbm_best.value)

# Save Best Parameters
# Keys get the 'model__' prefix so the file can be fed straight to
# Pipeline.set_params.
lgbm_best_prefixed = {f'model__{k}': v for k, v in lgbm_best.params.items()}
with open('lgbm_params.json', 'w') as f:
    json.dump(lgbm_best_prefixed, f, indent = 4)

# Save Best CV F1 Score
with open('lgbm_cv_f1.txt', 'w') as f:
    f.write(str(lgbm_best.value))

# RF

In [None]:
# ==================================================
# RF
# ==================================================

# Define Pipeline
# Class weights are total rows divided by the rows of each class, in the
# sorted class order returned by np.unique.
classes, counts = np.unique(y, return_counts = True)
n_rows = counts.sum()
rf_model = BalancedRandomForestClassifier(
    n_estimators = 100,
    n_jobs = -1,
    random_state = 42,
    verbose = 1,
    class_weight = {0: n_rows / counts[0], 1: n_rows / counts[1]}
)
rf = Pipeline(steps = [
    ('prep', prep_tree),
    ('model', rf_model),
])

# Parameter Grid
# Forest sizes 100, 200, ..., 1500.
rf_grid = {'model__n_estimators': np.arange(100, 1501, 100)}

rf_study = GridSearchCV(
    estimator = rf,
    param_grid = rf_grid,
    scoring = 'f1',
    cv = cv,
    n_jobs = -1,
    verbose = 1
)

In [None]:
# Run GridSearchCV
# Scores every `n_estimators` candidate of `rf_grid` with the shared `cv`
# splitter on the full training data.
# NOTE(review): both the search and the forest request n_jobs = -1 — confirm
# the nested parallelism is intended.
rf_study.fit(x, y)

In [None]:
# GridSearchCV Result
print('Best parameter:')
print(rf_study.best_params_)
print('Best CV F1 Score:')
print(rf_study.best_score_)

# Save Best Parameters
# The grid is built with np.arange, so the winning values are NumPy scalars,
# which json cannot serialise. Convert them in a separate dict instead of
# editing the fitted search's `best_params_` through an alias, and dump that
# dict.
rf_params = {
    name: value.item() if isinstance(value, np.generic) else value
    for name, value in rf_study.best_params_.items()
}
with open('rf_params.json', 'w') as f:
    json.dump(rf_params, f, indent = 4)

# Save Best CV F1 Score
with open('rf_cv_f1.txt', 'w') as f:
    f.write(str(rf_study.best_score_))

# XGB

In [None]:
# ==================================================
# XGB
# ==================================================

# Define Pipeline
# Count the classes here rather than reusing `counts` from the RF cell, so
# this cell also runs when the RF section has been skipped.
xgb_classes, xgb_counts = np.unique(y, return_counts = True)

xgb = Pipeline([
    ('prep', prep_tree),
    ('model', XGBClassifier(
        verbosity = 1,
        objective = 'binary:logistic',
        eval_metric = 'aucpr',
        # Ratio of negative to positive rows (classes sorted as 0, 1).
        scale_pos_weight = xgb_counts[0] / xgb_counts[1],
        max_delta_step = 1,
        random_state = 42,
        tree_method = 'hist',
        device = 'cuda'
    ))
])

# Optuna Objective
def xgb_objective(trial):
    """Return the mean cross-validated F1 of the XGB pipeline for one trial.

    The suggested hyperparameters are applied to the 'model' step of a fresh
    clone of `xgb`, which is scored with `cross_validate` on (`x`, `y`) using
    the shared `cv` splitter.
    """
    params = {
        'model__n_estimators': trial.suggest_int('n_estimators', 300, 2000),
        'model__max_depth': trial.suggest_int('max_depth', 4, 16),
        'model__min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'model__gamma': trial.suggest_float('gamma', 0, 10.0),
        'model__learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log = True),
        'model__subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'model__colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'model__reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'model__reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0)
    }

    optuna_pipeline = clone(xgb).set_params(**params)

    scores = cross_validate(
        optuna_pipeline, x, y,
        scoring = 'f1',
        cv = cv,
        n_jobs = 3,
        verbose = 1
    )

    return scores['test_score'].mean()

In [None]:
# Run Optuna
# PYTHONHASHSEED is read at interpreter start-up, so this assignment does not
# affect the running kernel; it is only inherited by processes started later.
os.environ['PYTHONHASHSEED'] = str(42)
xgb_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'xgb',
    sampler = optuna.samplers.TPESampler(seed = 42)
)
# Trials run one after another: a seeded sampler only reproduces the same
# suggestions when trials are not executed in parallel, and each trial already
# runs its CV folds with n_jobs = 3, so parallel trials would stack 3 x 3
# concurrent fits on one device.
xgb_study.optimize(xgb_objective, n_trials = 30, n_jobs = 1, show_progress_bar = True)

In [None]:
# Optuna Result
xgb_best = xgb_study.best_trial
print('Best parameters:')
print(xgb_study.best_params)
print('Best CV F1 Score:')
print(xgb_best.value)

# Save Best Parameters
# Keys get the 'model__' prefix so the file can be fed straight to
# Pipeline.set_params.
xgb_best_prefixed = {f'model__{k}': v for k, v in xgb_best.params.items()}
with open('xgb_params.json', 'w') as f:
    json.dump(xgb_best_prefixed, f, indent = 4)

# Save Best CV F1 Score
with open('xgb_cv_f1.txt', 'w') as f:
    f.write(str(xgb_best.value))

# Ensemble

In [None]:
# ==================================================
# Ensemble
# ==================================================

def read_params(path):
    """Load a JSON file of pipeline parameters saved by a tuning cell."""
    with open(path, 'r') as f:
        return json.load(f)

# Each base pipeline is updated in place with its saved parameters and refitted
# on the full training data; `best_*` is the same object as the base pipeline.

# Refit LGBM
lgbm_params = read_params('lgbm_params.json')
best_lgbm = lgbm.set_params(**lgbm_params).fit(x, y)

# Refit RF
rf_params = read_params('rf_params.json')
best_rf = rf.set_params(**rf_params).fit(x, y)

# Refit XGB
xgb_params = read_params('xgb_params.json')
best_xgb = xgb.set_params(**xgb_params).fit(x, y)

In [None]:
# Meta Features
# Use out-of-fold probabilities: every row is scored by a model that did not
# see it during fitting. Scoring the training rows with the models refitted on
# those same rows would tune the threshold on in-sample predictions.
def oof_positive_proba(estimator):
    """Return out-of-fold P(class 1) for `estimator` on (`x`, `y`) under the shared `cv`."""
    return cross_val_predict(estimator, x, y, cv = cv, method = 'predict_proba')[:, 1]

meta_x = pd.DataFrame({
    'lgbm': oof_positive_proba(best_lgbm),
    'rf': oof_positive_proba(best_rf),
    'xgb': oof_positive_proba(best_xgb),
})

# The summed probability does not depend on the trial, so compute it once.
meta_proba_sum = meta_x.sum(axis = 1)

# Optuna Objective
def ensemble_objective(trial):
    """Return the F1 of the averaged ensemble at the suggested threshold.

    The three probabilities are summed, so the per-model threshold in
    [0.2, 0.5] is multiplied by 3 before the comparison.
    """
    params = {
        'threshold': trial.suggest_float('threshold', 0.2, 0.5)
    }

    preds = (meta_proba_sum >= 3 * params['threshold']).astype(int)
    score = f1_score(y, preds)

    return score

In [None]:
# Run Optuna
# PYTHONHASHSEED is read at interpreter start-up, so this assignment does not
# affect the running kernel; it is only inherited by processes started later.
os.environ['PYTHONHASHSEED'] = str(42)
ensemble_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'ensemble',
    sampler = optuna.samplers.TPESampler(seed = 42)
)
# The objective is a single cheap F1 computation, so parallel trials bring no
# speed-up, and a seeded sampler only reproduces the same suggestions when
# trials run one after another.
ensemble_study.optimize(ensemble_objective, n_trials = 200, n_jobs = 1, show_progress_bar = True)

In [None]:
# Optuna Result
ensemble_best = ensemble_study.best_trial
print('Best parameters:')
print(ensemble_study.best_params)
print('Best CV F1 Score:')
print(ensemble_best.value)

# Save Best Parameters
# Only the decision threshold is tuned, so the file holds a single key.
with open('ensemble_params.json', 'w') as f:
    json.dump(ensemble_best.params, f, indent = 4)

# Save Best CV F1 Score
with open('ensemble_cv_f1.txt', 'w') as f:
    f.write(str(ensemble_best.value))

In [None]:
# ==================================================
# Submission
# ==================================================

# Meta Features for Ensemble Model
# Positive-class probability of each refitted base model on the test rows.
fitted_models = (('lgbm', best_lgbm), ('rf', best_rf), ('xgb', best_xgb))
meta_x_test = pd.DataFrame({
    name: model.predict_proba(x_test)[:, 1] for name, model in fitted_models
})

# Prediction
# The saved threshold applies to the mean probability; the sum of the three
# probabilities is therefore compared with three times that threshold.
with open('ensemble_params.json', 'r') as f:
    ensemble_params = json.load(f)
probas = meta_x_test.sum(axis = 1)
cutoff = 3 * ensemble_params['threshold']
preds = (probas >= cutoff).astype(int)

# Submission
# Re-read the raw test file to recover the identifiers, attach the predicted
# labels, and keep only the two submission columns.
submission = pd.read_csv('test.csv').assign(Cancer = preds)[['ID', 'Cancer']]
submission.to_csv('submission.csv', index = False)