In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import catboost 
import lightgbm as lgb
import optuna
from imblearn.under_sampling import RandomUnderSampler
import optuna.visualization as vis
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import sys
print(sys.executable)


c:\Users\emanu\Analysis-on-Churn-Banking-Modeling-Dataset\.venv\Scripts\python.exe


In [2]:
df = pd.read_csv(r"Churn_Banking_Modeling.csv")

In [3]:
X = df.drop('Flag_Richiesta_Estinzione_cc', axis=1)
y = df['Flag_Richiesta_Estinzione_cc'].map({'no': 0, 'si': 1})  # Converte in 0 e 1

In [4]:
def clean_feature_names(df):
    df.columns = [col.replace(' ', '_').replace('[', '').replace(']', '').replace('<', '') for col in df.columns]
    return df

LightGBM : Tuning hyperparametri

In [6]:
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
    sampling_strategy = trial.suggest_float('sampling_strategy', 0.1, 1.0)
    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    y_train_res.reset_index(drop=True, inplace=True)  # Reset index here
    X_train_res = pd.get_dummies(X_train_res)
    X_test = pd.get_dummies(X_test)
    X_train_res, X_test = X_train_res.align(X_test, join='inner', axis=1)
    X_train_res.columns = [col.replace(' ', '_') for col in X_train_res.columns]
    X_test.columns = [col.replace(' ', '_') for col in X_test.columns]

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 2**trial.suggest_int('max_depth', 5, 15)),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
        'random_state': 42,
        'force_col_wise': True,
        'verbosity': -1
    }
# CV : ogni fold è usato 1 volta per la validation e 9 per il training, in questo modo riduciamo la varianza al minimo
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_index, val_index in kf.split(X_train_res, y_train_res):
        X_fold_train, X_fold_val = X_train_res.iloc[train_index], X_train_res.iloc[val_index]
        y_fold_train, y_fold_val = y_train_res[train_index], y_train_res[val_index]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(X_fold_train, y_fold_train)
        y_fold_pred = model.predict_proba(X_fold_val)[:, 1]
        score = roc_auc_score(y_fold_val, y_fold_pred)
        scores.append(score)
    
    return np.mean(scores)


sampler = optuna.samplers.TPESampler(seed=42)  

study = optuna.create_study(direction='maximize', sampler = sampler)

study.optimize(objective, n_trials=50)

best_params = study.best_params
best_roc_auc = study.best_value

print("Best Hyperparameters:", best_params)
print("Best ROC AUC Score:", best_roc_auc)


history_plot = vis.plot_optimization_history(study)
history_plot.show()

importance_plot = vis.plot_param_importances(study)
importance_plot.show()

slice_plot = vis.plot_slice(study)
slice_plot.show()


[I 2024-04-18 14:12:53,259] A new study created in memory with name: no-name-fce03281-5b98-452e-a19b-62d70abc1af2
[W 2024-04-18 14:12:55,805] Trial 0 failed with parameters: {'sampling_strategy': 0.4370861069626263, 'n_estimators': 291, 'learning_rate': 0.0483437145318464, 'max_depth': 11, 'num_leaves': 336, 'min_child_samples': 16, 'subsample': 0.8116167224336399, 'subsample_freq': 6, 'colsample_bytree': 0.8404460046972835, 'reg_alpha': 0.7080725777960455, 'reg_lambda': 0.020584494295802447, 'max_bin': 297} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\emanu\Analysis-on-Churn-Banking-Modeling-Dataset\.venv\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\emanu\AppData\Local\Temp\ipykernel_3812\350554166.py", line 40, in objective
    model.fit(X_fold_train, y_fold_train)
  File "c:\Users\emanu\Analysis-on-Churn-Banking-Mod

KeyboardInterrupt: 

XGBoost: Tuning hyperparametri

In [5]:
import xgboost as xgb
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
    sampling_strategy = trial.suggest_float('sampling_strategy', 0.1, 1.0)
    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    y_train_res.reset_index(drop=True, inplace=True)
    X_train_res = pd.get_dummies(X_train_res)
    X_test = pd.get_dummies(X_test)

    X_train_res = clean_feature_names(X_train_res)
    X_test = clean_feature_names(X_test)

    X_train_res, X_test = X_train_res.align(X_test, join='inner', axis=1)

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10),
        'verbosity': 0
    }

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_index, val_index in kf.split(X_train_res, y_train_res):
        X_fold_train, X_fold_val = X_train_res.iloc[train_index], X_train_res.iloc[val_index]
        y_fold_train, y_fold_val = y_train_res[train_index], y_train_res[val_index]
        
        model = xgb.XGBClassifier(**params)
        model.fit(X_fold_train, y_fold_train, eval_metric="auc")
        y_fold_pred = model.predict_proba(X_fold_val)[:, 1]
        score = roc_auc_score(y_fold_val, y_fold_pred)
        scores.append(score)
    
    return np.mean(scores)

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_roc_auc = study.best_value

print("Best Hyperparameters:", best_params)
print("Best ROC AUC Score:", best_roc_auc)

history_plot = vis.plot_optimization_history(study)
history_plot.show()

importance_plot = vis.plot_param_importances(study)
importance_plot.show()

slice_plot = vis.plot_slice(study)
slice_plot.show()


[I 2024-04-18 14:38:31,443] A new study created in memory with name: no-name-eec26700-35d2-425a-817b-0d43a2670d6d
[I 2024-04-18 14:38:55,023] Trial 0 finished with value: 0.9868163975297556 and parameters: {'sampling_strategy': 0.4370861069626263, 'n_estimators': 956, 'learning_rate': 0.17524101118128144, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'gamma': 4.330880728874676, 'reg_alpha': 3.005575058716044, 'reg_lambda': 7.10991852018085}. Best is trial 0 with value: 0.9868163975297556.
[I 2024-04-18 14:39:42,346] Trial 1 finished with value: 0.9895721047141135 and parameters: {'sampling_strategy': 0.1185260448662222, 'n_estimators': 973, 'learning_rate': 0.2595942550311264, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 'gamma': 2.6237821581611893, 'reg_alpha': 2.1597250932105787, 'reg_lambda': 2.983168487960615}. Best is trial 1 with value: 0.9895721047

Best Hyperparameters: {'sampling_strategy': 0.1185260448662222, 'n_estimators': 973, 'learning_rate': 0.2595942550311264, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 'gamma': 2.6237821581611893, 'reg_alpha': 2.1597250932105787, 'reg_lambda': 2.983168487960615}
Best ROC AUC Score: 0.9895721047141135


Catboost: Tuning hyperparametri

In [8]:
import catboost as cb
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
    sampling_strategy = trial.suggest_float('sampling_strategy', 0.1, 1.0)
    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    y_train_res.reset_index(drop=True, inplace=True)
    X_train_res = pd.get_dummies(X_train_res)
    X_test = pd.get_dummies(X_test)

    X_train_res = clean_feature_names(X_train_res)
    X_test = clean_feature_names(X_test)

    X_train_res, X_test = X_train_res.align(X_test, join='inner', axis=1)

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'random_state': 42,
        'verbose': False
    }
    bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    params['bootstrap_type'] = bootstrap_type

    if bootstrap_type == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    else:
        params['subsample'] = trial.suggest_float('subsample', 0.8, 1.0)

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_index, val_index in kf.split(X_train_res, y_train_res):
        X_fold_train, X_fold_val = X_train_res.iloc[train_index], X_train_res.iloc[val_index]
        y_fold_train, y_fold_val = y_train_res[train_index], y_train_res[val_index]
        
        model = cb.CatBoostClassifier(**params)
        model.fit(X_fold_train, y_fold_train)
        y_fold_pred = model.predict_proba(X_fold_val)[:, 1]
        score = roc_auc_score(y_fold_val, y_fold_pred)
        scores.append(score)
    
    return np.mean(scores)

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_roc_auc = study.best_value

print("Best Hyperparameters:", best_params)
print("Best ROC AUC Score:", best_roc_auc)

history_plot = vis.plot_optimization_history(study)
history_plot.show()

importance_plot = vis.plot_param_importances(study)
importance_plot.show()

slice_plot = vis.plot_slice(study)
slice_plot.show()

[I 2024-04-18 14:51:33,269] A new study created in memory with name: no-name-fd64b82e-478f-4ce3-9c20-57aa902edfa6
[I 2024-04-18 14:56:06,298] Trial 0 finished with value: 0.9867219800016741 and parameters: {'sampling_strategy': 0.4370861069626263, 'n_estimators': 291, 'learning_rate': 0.08960785365368121, 'max_depth': 11, 'subsample': 0.8312037280884873, 'l2_leaf_reg': 1.5607892088416901, 'border_count': 45, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 2, 'leaf_estimation_iterations': 10, 'leaf_estimation_method': 'Newton', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9867219800016741.
[I 2024-04-18 14:56:46,600] Trial 1 finished with value: 0.9857932829128295 and parameters: {'sampling_strategy': 0.5722807884690141, 'n_estimators': 186, 'learning_rate': 0.023927528765580644, 'max_depth': 11, 'subsample': 0.8278987721304084, 'l2_leaf_reg': 2.9221543407036465, 'border_count': 114, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 26, 'leaf_estimation_iterations': 6, 'lea

KeyboardInterrupt: 

COPIA MIGLIORI HYPER PER CIASCUNO E PLOTTA OGNI METRICA

Iterazione Metodo Brute Force per trovare i migliori pesi da dare al nostro modello per massimizzare la ROC Curve

In [None]:
def objective(trial):
    w_xgboost = trial.suggest_float("w_xgboost", 0.1, 0.8)
    w_lgbm = trial.suggest_float("w_lgbm", 0.1, 0.3)
    w_catboost = 1.0 - w_lgbm - w_xgboost  # Ensure weights sum to 1

    ensemble_predictions = (
        w_lgbm * lgbm_predictions
        + w_catboost * catboost_predictions
        + w_xgboost * xgboost_predictions
    )

    ensemble_roc_auc = roc_auc_score(y_test, ensemble_predictions)

    return ensemble_roc_auc

# Assuming you have your data (X and y) and test set (X_test, y_test) defined somewhere in your code.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Define your cross-validation strategy

study = optuna.create_study(direction='maximize', sampler=sampler)

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Train your models here (lgbm_model, catboost_model, xgboost_model)

    lgbm_predictions = lgbm_model.predict_proba(X_val)[:, 1]
    catboost_predictions = catboost_model.predict_proba(X_val)[:, 1]
    xgboost_predictions = xgboost_model.predict_proba(X_val)[:, 1]

    study.optimize(objective, n_trials=1000)  # You can adjust the number of trials

best_weights = {
    "w_xgboost": study.best_params["w_xgboost"],
    "w_lgbm": study.best_params["w_lgbm"],
    "w_catboost": 1.0 - study.best_params["w_lgbm"] - study.best_params["w_xgboost"],
}
best_roc_auc = study.best_value

print("Best Weights:", best_weights)
print("Best ROC AUC Score:", best_roc_auc)

history_plot = vis.plot_optimization_history(study)
history_plot.show()

importance_plot = vis.plot_param_importances(study)
importance_plot.show()

slice_plot = vis.plot_slice(study)
slice_plot.show()