# 1) Read in Files

In [1]:
%run Imports.ipynb
# Dataset identifier; it is interpolated into the pickle paths read below.
name = 'Kred'

# Discrete search space handed to the XGBoost tuner: every key is an
# XGBClassifier keyword argument, every value the list of candidates to try.
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 0.3, 0.5, 1.0],   # boosting step size
    'max_depth': [5, 7, 10, 15, 20, 50],                  # maximum tree depth
    'n_estimators': [50, 100, 200, 500, 1000],            # number of boosting rounds
    'subsample': [0.6, 0.8, 1.0],                         # row sampling ratio per tree
    'colsample_bytree': [0.6, 0.8, 1.0],                  # column sampling ratio per tree
    'gamma': [0, 0.1, 0.2, 0.5, 1.0],                     # minimum loss reduction to split
    'min_child_weight': [1, 3, 5, 10],                    # minimum child weight
    'reg_alpha': [0, 0.01, 0.1, 1, 10, 100],              # L1 regularisation
    'reg_lambda': [0, 0.01, 0.1, 1, 10, 100],             # L2 regularisation
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the feature-subgroup mapping and the modelling DataFrame stored for dataset `name`.
key_featsubgroups = pd.read_pickle(f'../pickle/2_FS/{name}/key_featsubgroups.pkl')
df = pd.read_pickle(f'../pickle/2_FS/{name}/2_df_new_.pkl')

In [3]:
# Share of rows per target label, followed by the overall frame dimensions.
print(df[target].value_counts().div(len(df)))
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


# 2) Create Model prediction functions

## 2.1) Split dataset into train/testing while excluding demographic features

In [4]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets while excluding demographic features.

    The demographic columns are looked up in `key_featsubgroups`: the row whose
    'subgroup' column equals 'demo' supplies, in its 'list_features' column, the
    column names to drop from `df` before splitting.

    Parameters:
    df (DataFrame): The dataset containing features and target variable.
    key_featsubgroups (DataFrame): Mapping with 'subgroup' and 'list_features' columns.
        Defaults to the module-level `key_featsubgroups` bound when this function is defined.
    target (str): The name of the target variable.
        Defaults to the module-level `target` bound when this function is defined.
    test_size (float, optional): The proportion of the dataset to allocate for testing. Default is 0.2.
    random_state (int, optional): Random seed for reproducibility. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test (training and testing datasets)

    Raises:
    ValueError: If `key_featsubgroups` has no row with subgroup == 'demo'.
    """

    # Extract demographic features; fail with a clear message instead of a bare
    # IndexError when the 'demo' subgroup is missing.
    demo_rows = key_featsubgroups.loc[key_featsubgroups['subgroup'] == 'demo', 'list_features']
    if demo_rows.empty:
        raise ValueError("key_featsubgroups has no row with subgroup == 'demo'")

    demo_feat = demo_rows.values[0]
    # Normalise to a plain list so the concatenation below also works when the
    # features were stored as a single name, a tuple or an array.
    demo_feat = [demo_feat] if isinstance(demo_feat, str) else list(demo_feat)
    print("Demographic Features:", demo_feat)

    # Separate features (X) and target variable (y), excluding demographic features
    X = df.drop(columns=[target] + demo_feat)
    y = df[target]

    # Split into training and testing sets; `test_size` is the held-out fraction
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print dataset shapes
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model training and prediction

In [5]:
# Save default_params to a pickle file
# with open('../pickle/3_Model/xgb_default_params.pkl', 'wb') as f:
#     pickle.dump(default_params, f, pickle.HIGHEST_PROTOCOL)

# Save results_dict_updated to a pickle file
# with open('../pickle/3_Model/results_dict_updated_pre-Bayes.pkl', 'wb') as f:
#     pickle.dump(results_dict_updated, f, pickle.HIGHEST_PROTOCOL)

In [6]:

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'numpy._core.numeric'", which usually
# means the pickles were written under a newer NumPy than the one loading
# them - confirm the kernel environment before re-running.
# Only load pickle files from a trusted source: unpickling can execute code.

# Load default_params from the pickle file
with open('../pickle/3_Model/xgb_default_params.pkl', 'rb') as params_file:
    default_params = pickle.load(params_file)

# Load results_dict_updated from the pickle file
with open('../pickle/3_Model/results_dict_updated_pre-Bayes.pkl', 'rb') as results_file:
    results_dict_updated = pickle.load(results_file)

ModuleNotFoundError: No module named 'numpy._core.numeric'

# 6a) Run Bayesian hyperparameter optimization for XGBoost using Optuna

In [7]:
def run_BayesSearch_XGB_Optuna(default_params, param_grid, df, results_dict, n_trials=None, random_state=42):
    """
    Performs Bayesian hyperparameter optimization for an XGBoost classifier using Optuna.

    This function:
      1. Splits the input dataframe into training and testing sets (via split_data_4).
      2. Determines the number of trials (total count of candidate values in
         param_grid unless `n_trials` is given).
      3. Defines an Optuna objective: mean 5-fold cross-validated accuracy on the
         training split for an XGBClassifier built from the suggested parameters.
      4. Runs the optimization with a seeded TPE sampler.
      5. Builds an (unfitted) XGBClassifier from the best parameters and hands it to
         `model_pred`, which is defined outside this function.
         NOTE(review): model_pred presumably fits and evaluates the model - confirm.
      6. Returns the best parameters together with the updated results dictionary.

    Parameters:
    -----------
    default_params : dict
        Dictionary of default hyperparameters for XGBoost. Accepted for interface
        compatibility; it is not currently merged into the trial or final parameters.
    param_grid : dict
        The hyperparameter search space: each key is an XGBClassifier keyword and
        each value a list of candidate values (sampled with suggest_categorical).
    df : DataFrame
        The dataset containing features and the target variable.
    results_dict : dict
        Dictionary to store the model evaluation results.
    n_trials : int, optional
        Number of Optuna trials. Default None uses the sum of the lengths of all
        candidate lists in `param_grid`.
    random_state : int or None, optional
        Seed for the Optuna TPE sampler so that the search is reproducible.
        Pass None for an unseeded search. Default is 42.

    Returns:
    --------
    best_params : dict
        The best hyperparameters found by the study.
    results_dict : dict
        The results dictionary as returned by `model_pred`.

    Raises:
    -------
    ValueError
        If `param_grid` is empty.
    """
    start_time = time.time()

    if not param_grid:
        raise ValueError("param_grid must contain at least one hyperparameter")

    # 1. Split the dataset into training and testing sets.
    X_train, X_test, y_train, y_test = split_data_4(df)

    # 2. Determine the number of trials for Bayesian optimization
    if n_trials is None:
        n_trials = sum(len(values) for values in param_grid.values())
    print("No. of trials: ", n_trials)

    # 3. Define the objective function for Optuna
    def objective(trial):
        # Build the parameters dictionary from param_grid using trial.suggest_categorical for each key
        params = {key: trial.suggest_categorical(key, values) for key, values in param_grid.items()}

        # Score the candidate by mean 5-fold cross-validated accuracy on the training split
        xgbc = xgb.XGBClassifier(**params)
        return cross_val_score(xgbc, X_train, y_train, cv=5, scoring='accuracy').mean()

    # 4. Run Bayesian Optimization using Optuna (seeded sampler => repeatable trial sequence)
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    # 5. Build the best model
    best_params = study.best_params
    best_xgbc = xgb.XGBClassifier(**best_params)

    # 6. Evaluate the final model
    results_dict = model_pred(
        X_train, X_test, y_train, y_test, best_xgbc, 'xgbc_optuna', 'opt', results_dict
    )

    end_time = time.time()
    print(f"Optimization completed in {(end_time - start_time) / 60:.2f} minutes")

    return best_params, results_dict


In [8]:
# Run the Optuna search; yields the best hyperparameters and the updated results dict.
# Long-running cell: the recorded run reports "Optimization completed in 37.90 minutes".
best_params, results_dict = run_BayesSearch_XGB_Optuna(default_params, param_grid, df, results_dict_updated)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']


[I 2025-05-22 15:40:07,204] A new study created in memory with name: no-name-a4ea9bb4-f5e5-4024-a166-e28bc0d39e9a


Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)
No. of trials:  44


[I 2025-05-22 15:41:28,679] Trial 0 finished with value: 0.697021194418964 and parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 1000, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0.5, 'min_child_weight': 10, 'reg_alpha': 0.01, 'reg_lambda': 0}. Best is trial 0 with value: 0.697021194418964.
[I 2025-05-22 15:42:10,628] Trial 1 finished with value: 0.65590691836045 and parameters: {'learning_rate': 0.5, 'max_depth': 50, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 1.0, 'gamma': 0.1, 'min_child_weight': 5, 'reg_alpha': 0, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.697021194418964.
[I 2025-05-22 15:46:11,413] Trial 2 finished with value: 0.699000627625163 and parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 1000, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.5, 'min_child_weight': 3, 'reg_alpha': 0.1, 'reg_lambda': 100}. Best is trial 2 with value: 0.699000627625163.
[I 2025-05-22 15:46:50,786] Trial 3 finished with 


Starting model training and evaluation...
Total execution time: 0.23 minutes
Optimization completed in 37.90 minutes


In [9]:
# Persist results_dict (as returned by the Optuna search) to a pickle file
with open('../pickle/4_Model_Optuna/results_dict_updated.pkl', 'wb') as out_file:
    pickle.dump(results_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def train_best_XGB(df, best_params, results_dict):
    """
    Fit an XGBoost classifier with the given hyperparameters, score it, and
    record the outcome under the "xgbc_optimized" key of `results_dict`.

    "Train" metrics are computed from 5-fold `cross_val_predict` predictions on
    the training split; "test" metrics come from the model fitted on the whole
    training split and applied to the held-out test split.

    Parameters:
    -----------
    df : DataFrame
        The dataset containing features and the target variable.
    best_params : dict
        Hyperparameters passed as keyword arguments to XGBClassifier.
    results_dict : dict
        Dictionary to store model performance metrics; updated in place.

    Returns:
    --------
    dict : The same `results_dict`, now holding the "xgbc_optimized" entry.
    """
    t0 = time.time()
    print("\nStarting XGBoost model training and evaluation...")

    # Train/test split without demographic columns
    X_train, X_test, y_train, y_test = split_data_4(df)

    # Fit on the full training split
    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    # 5-fold cross-validated predictions for the training split,
    # plain predictions for the test split
    train_pred = cross_val_predict(model, X_train, y_train, cv=5)
    test_pred = model.predict(X_test)

    # Confusion matrices, accuracy, per-label F1 and test ROC-AUC, kept in the
    # order in which they appear in the stored entry
    metrics = {
        "cfm_train": confusion_matrix(y_train, train_pred),
        "cfm_test": confusion_matrix(y_test, test_pred),
        "train_accuracy": accuracy_score(y_train, train_pred),
        "test_accuracy": accuracy_score(y_test, test_pred),
        "train F1-score label 1": f1_score(y_train, train_pred, pos_label=1),
        "train F1-score label 0": f1_score(y_train, train_pred, pos_label=0),
        "test F1-score label 1": f1_score(y_test, test_pred, pos_label=1),
        "test F1-score label 0": f1_score(y_test, test_pred, pos_label=0),
        "test roc auc score": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
    }

    # Wall-clock runtime in minutes
    elapsed_min = (time.time() - t0) / 60
    print(f"XGBoost Model training completed in {elapsed_min:.2f} minutes")

    # Assemble the entry: fitted model copy first, then metrics, params and runtime
    results_dict["xgbc_optimized"] = {
        "classifier": deepcopy(model),
        **metrics,
        "best_params": best_params,
        "time_m": elapsed_min,
    }

    return results_dict

In [11]:
# Refit with the Optuna best parameters and add the "xgbc_optimized" entry to results_dict.
results_dict = train_best_XGB(df, best_params, results_dict)


Starting XGBoost model training and evaluation...
Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)
XGBoost Model training completed in 1.07 minutes


In [13]:
# Show the full results dictionary as the cell output (one entry per evaluated model).
results_dict

{'rf0': {'classifier': RandomForestClassifier(random_state=42),
  'cfm_train': array([[36755,     0],
         [    0, 66810]]),
  'cfm_test': array([[ 2682,  6452],
         [ 1631, 15127]]),
  'train_accuracy': 1.0,
  'test_accuracy': 0.6878186312374479,
  'train F1-score label 1': 1.0,
  'train F1-score label 0': 1.0,
  'test F1-score label 1': 0.7891592978062968,
  'test F1-score label 0': 0.39889938276195436,
  'test roc auc score': np.float64(0.7036737409018287),
  'best_params': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'sqrt',
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'monotonic_cst': None,
   'n_estimators': 100,
   'n_jobs': None,
   'oob_score': False,
   'random_state': 42,
   'verbose': 0,
   'warm_start': False},
  'time_m': 1.809039008617401},
 'xgbc0'

In [14]:
# Persist results_dict again, now including the "xgbc_optimized" entry
# (writes to the same pickle path as the earlier save in this notebook)
with open('../pickle/4_Model_Optuna/results_dict_updated.pkl', 'wb') as out_file:
    pickle.dump(results_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# 7) Save results

In [15]:
# Pretty-print the results dictionary for easier reading.
pprint(results_dict)

{'rf0': {'best_params': {'bootstrap': True,
                         'ccp_alpha': 0.0,
                         'class_weight': None,
                         'criterion': 'gini',
                         'max_depth': None,
                         'max_features': 'sqrt',
                         'max_leaf_nodes': None,
                         'max_samples': None,
                         'min_impurity_decrease': 0.0,
                         'min_samples_leaf': 1,
                         'min_samples_split': 2,
                         'min_weight_fraction_leaf': 0.0,
                         'monotonic_cst': None,
                         'n_estimators': 100,
                         'n_jobs': None,
                         'oob_score': False,
                         'random_state': 42,
                         'verbose': 0,
                         'warm_start': False},
         'cfm_test': array([[ 2682,  6452],
       [ 1631, 15127]]),
         'cfm_train': array([[36755,     0

# 8) Compile results: AUC, Accuracy and Time

In [16]:
# Print test-set ROC AUC / accuracy and runtime for the two benchmark models
# and the two Optuna-tuned XGBoost entries stored in results_dict.
for label, key in [
    ("Benchmark RandomForest", "rf0"),
    ("Benchmark XGBoost", "xgbc0"),
    ("XGBoost w/ Optuna", "xgbc_optuna"),
    ("Optimized XGBoost w/ Optuna", "xgbc_optimized"),
]:
    entry = results_dict[key]
    print(f"{label} - AUC/accuracy score: {np.round(entry['test roc auc score'],4)} / {np.round(entry['test_accuracy'],4)} @ {np.round(entry['time_m'],2)} minutes")

print("")

Benchmark RandomForest - AUC/accuracy score: 0.7037 / 0.6878 @ 1.81 minutes
Benchmark XGBoost - AUC/accuracy score: 0.723 / 0.6962 @ 0.43 minutes
XGBoost w/ Optuna - AUC/accuracy score: 0.7363 / 0.7041 @ 0.23 minutes
Optimized XGBoost w/ Optuna - AUC/accuracy score: 0.7363 / 0.7041 @ 1.07 minutes

