In [103]:
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, \
  HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import pandas as pd
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_val_score

from modules.modules_2.topic_2_3.homework.process_bank_churn import preprocess_data, preprocess_new_data
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [86]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Функція для навчання та оцінки AUROC
def evaluate_model(model, X_train, train_targets, X_val, val_targets):
    model.fit(X_train, train_targets)

    train_preds = model.predict_proba(X_train)[:, 1]
    val_preds = model.predict_proba(X_val)[:, 1]

    train_auc = roc_auc_score(train_targets, train_preds)
    val_auc = roc_auc_score(val_targets, val_preds)

    print(f"Model: {model.__class__.__name__}")
    print(f"Train AUC: {train_auc:.4f}")
    print(f"Val AUC: {val_auc:.4f}")
    print('-' * 40)

    return train_auc, val_auc

In [87]:
raw_df = pd.read_csv('../../topic_2_2/homeworks/bank-customer-churn-prediction-dlu/train.csv', index_col=0) # оскільки в наборі даних є колонка id - варто одразу її зчитати як індекс датасета

preprocessed_data = preprocess_data(raw_df, scaler_numeric=False)

X_train = preprocessed_data['X_train']
train_targets = preprocessed_data['train_targets']
X_val = preprocessed_data['X_val']
val_targets = preprocessed_data['val_targets']
input_cols = preprocessed_data['input_cols']
scaler = preprocessed_data['scaler']
encoder = preprocessed_data['encoder']

Numeric columns: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
Binary columns: ['IsActiveMember', 'HasCrCard']
Categorical columns: ['Geography', 'Gender']


In [88]:
# Автоматичний підбір параметрів для Bagging (Decision Tree)
param_grid = {
    "estimator__max_depth": [3, 5, 7, 10, None],  # Оптимальна глибина дерева
    "estimator__min_samples_leaf": [1, 2, 5, 10], # Мінімальна кількість зразків у листі
    "n_estimators": [50, 100, 200],  # Кількість дерев у ансамблі
    "max_samples": [0.5, 0.7, 1.0]  # Скільки вибірки використовувати
}

bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)

# RandomizedSearchCV для швидшого пошуку
bagging_search = RandomizedSearchCV(bagging, param_distributions=param_grid, n_iter=20, cv=3, scoring='roc_auc', random_state=42, n_jobs=-1)
bagging_search.fit(X_train, train_targets)

print(f"Найкращі параметри: {bagging_search.best_params_}")
best_bagging = bagging_search.best_estimator_

Найкращі параметри: {'n_estimators': 50, 'max_samples': 0.7, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 7}


In [89]:
# Автоматичний підбір max_features для Random Forest
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15, None],  # Обмеження на глибину
    "max_features": ["sqrt", "log2", None],  # Скільки ознак використовувати на кожному поділі
    "min_samples_split": [2, 5, 10],  # Мінімальна кількість зразків для розбиття
}

rf = RandomForestClassifier(random_state=42)

rf_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=20, cv=3, scoring='roc_auc', random_state=42, n_jobs=-1)
rf_search.fit(X_train, train_targets)

print(f"Найкращі параметри: {rf_search.best_params_}")
best_rf = rf_search.best_estimator_

Найкращі параметри: {'n_estimators': 50, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 10}


In [107]:
# Автоматичний підбір параметрів для Gradient Boosting
param_grid = {
    "n_estimators": [300, 500, 700, 1000],  # Increase max estimators
    "learning_rate": [0.01, 0.02, 0.05],  # More learning rates
    "max_depth": [3, 4, 5, 6, 8, 10],  # Allow deeper trees for capturing interactions
    "subsample": [0.6, 0.7, 0.8, 0.9],  # Test lower subsampling for regularization
    "min_samples_split": [10, 20, 50, 100],  # Prevent overfitting by limiting node splits
    "min_samples_leaf": [5, 10, 20, 50],  # Minimum samples per leaf
    "max_features": ["sqrt", "log2", None],  # Feature selection strategy
    "max_leaf_nodes": [None, 20, 50, 100],  # Additional control for complexity
    "min_weight_fraction_leaf": [0.0, 0.01, 0.05, 0.1]  # Regularization strategy
}

gb = GradientBoostingClassifier(random_state=42)

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb_search = RandomizedSearchCV(
    gb,
    param_distributions=param_grid,
    n_iter=100,  # More iterations
    cv=cv_strategy,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=2
)
gb_search.fit(X_train, train_targets)

print(f"Найкращі параметри: {gb_search.best_params_}")
best_gb_native = gb_search.best_estimator_

evaluate_model(best_gb_native, X_train, train_targets, X_val, val_targets)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Найкращі параметри: {'subsample': 0.8, 'n_estimators': 1000, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_leaf_nodes': 100, 'max_features': None, 'max_depth': 3, 'learning_rate': 0.01}
Model: GradientBoostingClassifier
Train AUC: 0.9442
Val AUC: 0.9375
----------------------------------------


(np.float64(0.944247470377414), np.float64(0.937486110158447))

In [104]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 50),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
    }

    model = GradientBoostingClassifier(random_state=42, **params)
    score = cross_val_score(model, X_train, train_targets, cv=5, scoring="roc_auc").mean()

    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=-1)

print("Best parameters:", study.best_params)

# Train best model
best_gb = GradientBoostingClassifier(random_state=42, **study.best_params)
best_gb.fit(X_train, train_targets)

evaluate_model(best_gb, X_train, train_targets, X_val, val_targets)

[I 2025-02-09 21:29:19,196] A new study created in memory with name: no-name-43ea2a81-a838-4323-9abb-011fe5d0574f
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1)

Best parameters: {'n_estimators': 900, 'learning_rate': 0.01301172578306524, 'max_depth': 4, 'subsample': 0.8972757829563655, 'min_samples_split': 9, 'min_samples_leaf': 32, 'max_features': 'log2'}
Model: GradientBoostingClassifier
Train AUC: 0.9486
Val AUC: 0.9367
----------------------------------------


(np.float64(0.9485981016112842), np.float64(0.9366609506824886))

In [91]:
# Define the parameter grid for RandomizedSearchCV
param_grid_xgb = {
    "n_estimators": [100, 200, 300, 500],
    "learning_rate": [0.005, 0.01, 0.02, 0.05],
    "max_depth": [3, 4, 5, 6],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.5]
}

# Initialize XGBoost classifier
xgb = XGBClassifier(
    random_state=42,
    enable_categorical=False,
    eval_metric="logloss"
)

# Perform hyperparameter tuning using RandomizedSearchCV
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_xgb,
    n_iter=30,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1
)

xgb_search.fit(X_train, train_targets)

# Retrieve the best model from the search
best_xgb = xgb_search.best_estimator_

In [92]:
# Автоматичний підбір параметрів для LightGBM
param_grid_lgb = {
    "n_estimators": [100, 200, 300, 500],
    "learning_rate": [0.005, 0.01, 0.02, 0.05],
    "max_depth": [3, 4, 5, 6],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "num_leaves": [20, 31, 40, 50],  # Контроль складності моделі
    "min_child_samples": [5, 10, 20, 50]  # Мінімум вибірки для розбиття
}

lgb = LGBMClassifier(random_state=42)

lgb_search = RandomizedSearchCV(
    lgb, param_distributions=param_grid_lgb,
    n_iter=30, cv=3, scoring='roc_auc', random_state=42, n_jobs=-1
)
lgb_search.fit(X_train, train_targets)

print(f"Найкращі параметри LightGBM: {lgb_search.best_params_}")
best_lgb = lgb_search.best_estimator_

[LightGBM] [Info] Number of positive: 1628, number of negative: 6372
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 842
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203500 -> initscore=-1.364561
[LightGBM] [Info] Start training from score -1.364561
[LightGBM] [Info] Number of positive: 1628, number of negative: 6372
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 845
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 12
[LightGBM] [Info] [binary:

In [109]:
# Ініціалізація моделей
models = {
    "Decision Tree (Depth=4)": DecisionTreeClassifier(max_depth=4, random_state=42),
    "Best Bagging (Decision Tree)": best_bagging,
    "Best Random Forest": best_rf,
    "Best Gradient Boosting (Native)": best_gb_native,
    "Best Gradient Boosting": best_gb,
    "Best XGBoost": best_xgb,
    "Best LightGBM": best_lgb,
    "Stacking (RF + SVM -> LR)": StackingClassifier(
        estimators=[
            ('rf', best_rf),
            ('xgb', best_xgb),
            ('lgbm', best_lgb)
        ],
        final_estimator=HistGradientBoostingClassifier(max_iter=100),
        cv=5
    )
}

In [95]:
# Навчання моделей та оцінка
results = {}
print(type(models), models)
for name, model in models.items():
    train_auc, val_auc = evaluate_model(model, X_train, train_targets, X_val, val_targets)
    results[name] = {"Train AUC": train_auc, "Val AUC": val_auc}

<class 'dict'> {'Decision Tree (Depth=4)': DecisionTreeClassifier(max_depth=4, random_state=42), 'Best Bagging (Decision Tree)': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=7,
                                                   min_samples_leaf=5),
                  max_samples=0.7, n_estimators=50, random_state=42), 'Best Random Forest': RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50,
                       random_state=42), 'Best Gradient Boosting': GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,
                           random_state=42), 'Best XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0, grow_policy=None,
              importance_type=None, interaction_cons

In [96]:
# Навчання моделей та оцінка
results = {}
print(type(models), models)
for name, model in models.items():
    train_auc, val_auc = evaluate_model(model, X_train, train_targets, X_val, val_targets)
    results[name] = {"Train AUC": train_auc, "Val AUC": val_auc}

<class 'dict'> {'Decision Tree (Depth=4)': DecisionTreeClassifier(max_depth=4, random_state=42), 'Best Bagging (Decision Tree)': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=7,
                                                   min_samples_leaf=5),
                  max_samples=0.7, n_estimators=50, random_state=42), 'Best Random Forest': RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50,
                       random_state=42), 'Best Gradient Boosting': GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,
                           random_state=42), 'Best XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0, grow_policy=None,
              importance_type=None, interaction_cons

In [97]:
# Відображення результатів у вигляді таблиці
results_df = pd.DataFrame(results).T
print(results_df)

                              Train AUC   Val AUC
Decision Tree (Depth=4)        0.911281  0.911264
Best Bagging (Decision Tree)   0.948505  0.933781
Best Random Forest             0.972375  0.931799
Best Gradient Boosting         0.944001  0.937495
Best XGBoost                   0.945045  0.936911
Best LightGBM                  0.943621  0.937213
Stacking (RF + SVM -> LR)      0.938795  0.930715


In [110]:
IS_SAVE_RESULT = True
BEST_MODEL = "Best Gradient Boosting (Native)"

if IS_SAVE_RESULT:
    test_raw_df = pd.read_csv('../../topic_2_2/homeworks/bank-customer-churn-prediction-dlu/test.csv')
    test_processed = preprocess_new_data(test_raw_df, scaler=scaler, encoder=encoder, scaler_numeric=False)
    sample_submission = pd.read_csv('../../topic_2_2/homeworks/bank-customer-churn-prediction-dlu/sample_submission.csv')

    best_model = models[BEST_MODEL]
    best_model.fit(X_train, train_targets)
    test_preds = best_model.predict_proba(test_processed)[:, 1]
    sample_submission['Exited'] = test_preds
    # Save the submission
    sample_submission.to_csv(f'submission_{BEST_MODEL}.csv', index=False)

Numeric columns: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
Binary columns: ['IsActiveMember', 'HasCrCard']
Categorical columns: ['Geography', 'Gender']
