In [1]:
# Execute Imports.ipynb inside this kernel so that the names it defines are available here.
# NOTE(review): pd, time, train_test_split, cross_val_score, model_pred and `target` are used
# further down without being imported in this notebook - presumably they come from
# Imports.ipynb; confirm.
%run Imports.ipynb
# Dataset identifier; used below as the sub-directory name when building the pickle paths.
name = 'Kred'

  from .autonotebook import tqdm as notebook_tqdm


# 1) Read files

In [2]:
# Load the two inputs of this notebook from the pickle folder of dataset `name`.
# NOTE: unpickling can execute arbitrary code - only read files from a trusted source.
key_featsubgroups = pd.read_pickle(f'../../pickle/2_FS/{name}/key_featsubgroups.pkl')
df = pd.read_pickle(f'../../pickle/2_FS/{name}/2_df_new_.pkl')

# 2) Create Model Prediction functions

## 2.1) Split dataset into train/testing while excluding demographic features

In [3]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Build a train/test split of `df` with the demographic columns removed.

    Parameters:
    df (DataFrame): Data holding the feature columns and the target column.
    key_featsubgroups (DataFrame): Table with a 'subgroup' column and a 'list_features'
        column; the first row whose subgroup equals 'demo' supplies the columns to drop.
    target (str): Name of the target column in `df`.
    test_size (float, optional): Share of rows placed in the test set. Default is 0.2.
    random_state (int, optional): Seed forwarded to train_test_split. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test
    """

    # Feature names registered under the 'demo' subgroup
    demo_mask = key_featsubgroups['subgroup'] == 'demo'
    demo_feat = key_featsubgroups.loc[demo_mask, 'list_features'].values[0]
    print("Demographic Features:", demo_feat)

    # Predictors are every column except the target and the demographic ones
    excluded_cols = [target] + demo_feat
    X = df.drop(columns=excluded_cols)
    y = df[target]

    # Random split controlled by test_size / random_state
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Report the size of every partition
    partitions = (
        ("Training Features Shape", X_train),
        ("Training Labels Shape", y_train),
        ("Testing Features Shape", X_test),
        ("Testing Labels Shape", y_test),
    )
    for label, part in partitions:
        print(f"{label}: {part.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model training and prediction

In [4]:
# Split the full frame once at module level with split_data_4's default
# test_size / random_state; the function prints the dropped demographic
# columns and the resulting shapes.
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


In [5]:
import optuna
from sklearn.ensemble import RandomForestClassifier


# 3) Define Optuna objective
def objective(trial, param_grid=None, X_train=None, y_train=None, cv=5, random_state=42):
    """
    Optuna objective: mean cross-validated accuracy of a Random Forest.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter combination.
    param_grid : dict, optional
        Candidate values per hyperparameter. "n_estimators" is required;
        "max_depth" falls back to [None] and "min_samples_split" to [2].
    X_train, y_train : optional
        Training features / labels used for the cross-validation.
    cv : int, optional
        Number of cross-validation folds. Default is 5.
    random_state : int, optional
        Seed of the Random Forest. Default is 42.

    Any of param_grid / X_train / y_train left as None is looked up under the
    same name in the notebook's global namespace, which is what the original
    one-argument version did implicitly. Prefer passing them explicitly.

    Returns:
    --------
    float
        Mean accuracy over the CV folds (the value Optuna maximizes).

    Raises:
    -------
    NameError
        If an omitted argument is not defined at module level either.
    """
    # Resolve omitted inputs from module level (backward compatibility only)
    resolved = {"param_grid": param_grid, "X_train": X_train, "y_train": y_train}
    module_names = globals()
    for key in list(resolved):
        if resolved[key] is None:
            if key not in module_names:
                raise NameError(
                    f"name '{key}' is not defined; pass it to objective() explicitly"
                )
            resolved[key] = module_names[key]
    param_grid = resolved["param_grid"]
    X_train = resolved["X_train"]
    y_train = resolved["y_train"]

    # sample hyperparameters
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", param_grid["n_estimators"]),
        "max_depth": trial.suggest_categorical("max_depth", param_grid.get("max_depth", [None])),
        "min_samples_split": trial.suggest_categorical("min_samples_split", param_grid.get("min_samples_split", [2])),
    }
    rf = RandomForestClassifier(**params, random_state=random_state)
    # evaluate with k-fold CV on the training set only
    return cross_val_score(rf, X_train, y_train, cv=cv, scoring="accuracy").mean()


def run_optuna_RF(df, results_dict, param_grid, n_trials=None, random_state=42):
    """
    Tunes a Random Forest with Optuna over the given search space, then hands
    the best configuration to model_pred for training and evaluation.

    Parameters:
    -----------
    df : pd.DataFrame
        Dataset containing features and target.
    results_dict : dict
        Dictionary passed to model_pred to collect the evaluation metrics.
    param_grid : dict
        Search space for RF hyperparameters, e.g.:
            {
              "n_estimators": [100, 200, 300, 400, 500],
              "max_depth": [None, 5, 10, 20],
              "min_samples_split": [2, 5, 10]
            }
        "n_estimators" is required; the other two keys are optional.
    n_trials : int, optional
        Number of Optuna trials. Defaults to the sum of the grid lengths.
    random_state : int, optional
        Seed for the Optuna sampler and for the Random Forests, so repeated
        runs explore the same trials. Default is 42. The train/test split keeps
        split_data_4's own default seed.

    Returns:
    --------
    best_params : dict
        Best hyperparameters found.
    results_dict : dict
        Whatever model_pred returns for the "rf_optuna" model.
    """
    # 1) Split once
    X_train, X_test, y_train, y_test = split_data_4(df)

    # 2) Determine number of trials
    if n_trials is None:
        n_trials = sum(len(v) for v in param_grid.values())
    print("Optuna RF trials:", n_trials)

    # 4) Run Optuna study
    start = time.time()
    # TPE is Optuna's default single-objective sampler; seeding it makes the search repeatable
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    # Bind this call's grid and training split so the search cannot pick up
    # unrelated module-level variables
    study.optimize(
        lambda trial: objective(
            trial, param_grid, X_train, y_train, random_state=random_state
        ),
        n_trials=n_trials,
    )
    best_params = study.best_params
    print(f"Best RF params: {best_params}")

    # 5) Train final model (model_pred is defined outside this cell)
    best_rf = RandomForestClassifier(**best_params, random_state=random_state)
    results_dict = model_pred(
        X_train, X_test, y_train, y_test,
        best_rf, "rf_optuna", "opt", results_dict
    )

    # Elapsed time covers both the search and the final model_pred call
    elapsed = (time.time() - start) / 60
    print(f"RF Optuna completed in {elapsed:.2f} minutes")

    return best_params, results_dict


# Random Forest Optuna