In [1]:
import os
os.chdir('../')

import pandas as pd
import numpy as np

data = pd.read_csv("Datasets/analysis_data.csv")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# ------------------------------
# Load Data
# ------------------------------
df = data.copy()
y = df["monthly_spend"]
X = df.drop(columns=["monthly_spend"])

# ------------------------------
# Train-test split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------
# Identify column types
# ------------------------------
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# ------------------------------
# Preprocessing: impute + encode
# ------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols)
    ]
)

X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

# ----------------------------------------------------------
# üöÄ 1. LightGBM (SOTA for tabular data)
# ----------------------------------------------------------
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=-1,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42
)

lgbm.fit(X_train_p, y_train)
pred_lgbm = lgbm.predict(X_test_p)
rmse_lgbm = mean_squared_error(y_test, pred_lgbm, squared=False)

print('----------------------------------------------------------------')
print('LGBM:',rmse_lgbm)
print('----------------------------------------------------------------')

# ----------------------------------------------------------
# üöÄ 2. XGBoost (classic boosted trees)
# ----------------------------------------------------------
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    objective="reg:squarederror",
    random_state=42
)

xgb.fit(X_train_p, y_train)
pred_xgb = xgb.predict(X_test_p)
rmse_xgb = mean_squared_error(y_test, pred_xgb, squared=False)

print('----------------------------------------------------------------')
print('XGB:',rmse_xgb)
print('----------------------------------------------------------------')

# ----------------------------------------------------------
# FIX categorical columns for CatBoost
# ----------------------------------------------------------
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()

# Convert NaN categorical values to string "Unknown"
X_train_cat[cat_cols] = X_train_cat[cat_cols].astype(str).fillna("Unknown")
X_test_cat[cat_cols] = X_test_cat[cat_cols].astype(str).fillna("Unknown")

cat_idx = [X.columns.get_loc(c) for c in cat_cols]

from catboost import CatBoostRegressor

cat = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3.0,
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)

cat.fit(X_train_cat, y_train, cat_features=cat_idx)

pred_cat = cat.predict(X_test_cat)
rmse_cat = mean_squared_error(y_test, pred_cat, squared=False)

print(f"‚≠ê CatBoost RMSE: {rmse_cat:.4f}")


# ----------------------------------------------------------
# FINAL COMPARISON
# ----------------------------------------------------------
print("üî• STATE-OF-THE-ART MODEL PERFORMANCE (RMSE):")
print(f"‚≠ê LightGBM: {rmse_lgbm:.4f}")
print(f"‚≠ê XGBoost:   {rmse_xgb:.4f}")
print(f"‚≠ê CatBoost:  {rmse_cat:.4f}")

best_model = min(
    [("LightGBM", rmse_lgbm), ("XGBoost", rmse_xgb), ("CatBoost", rmse_cat)],
    key=lambda x: x[1]
)

print("\nüèÜ BEST MODEL SO FAR:", best_model[0], "with RMSE =", best_model[1])


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1791
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 22
[LightGBM] [Info] Start training from score 1665.647021




----------------------------------------------------------------
LGBM: 256.17664189023856
----------------------------------------------------------------




----------------------------------------------------------------
XGB: 259.661225798345
----------------------------------------------------------------


CatBoostError: Invalid type for cat_feature[non-default value idx=63,feature_idx=4]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string.

‚≠ê CatBoost RMSE: 254.4923




CATBOOST optimization

In [7]:
# ============================================
# ADVANCED CATBOOST PIPELINE TO BEAT ELASTICNET
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

# You may need to install these if not present:
# !pip install catboost optuna

from catboost import CatBoostRegressor
import optuna

# --------------------------------------------
# 1. Load data and basic setup
# --------------------------------------------
df = data.copy()  # assuming `data` is your cleaned dataset
target_col = "monthly_spend"

y = df[target_col]
X = df.drop(columns=[target_col])

# --------------------------------------------
# 2. Identify column types
# --------------------------------------------
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

# --------------------------------------------
# 3. Light feature engineering (boosting-friendly)
#    - log1p for positive numeric columns (reduces skew)
# --------------------------------------------
X_fe = X.copy()

for col in num_cols:
    # Only log-transform strictly positive columns
    if (X_fe[col] > 0).all():
        X_fe[f"{col}_log1p"] = np.log1p(X_fe[col])

# Recalculate numeric columns after adding engineered features
num_cols_fe = X_fe.select_dtypes(include=["int64", "float64"]).columns.tolist()

# --------------------------------------------
# 4. Train-test split
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_fe, y, test_size=0.2, random_state=42
)

# --------------------------------------------
# 5. Prepare data for CatBoost
#    - CatBoost can handle numeric NaNs but NOT NaNs in categoricals
#    - Convert categoricals to string + fill missing with "Missing"
# --------------------------------------------
X_train_cb = X_train.copy()
X_test_cb = X_test.copy()

for col in cat_cols:
    if col in X_train_cb.columns:
        X_train_cb[col] = X_train_cb[col].astype(str).fillna("Missing")
        X_test_cb[col] = X_test_cb[col].astype(str).fillna("Missing")

# Categorical feature indices for CatBoost (in the *current* DataFrame)
cat_idx = [X_train_cb.columns.get_loc(c) for c in cat_cols if c in X_train_cb.columns]

print("CatBoost categorical indices:", cat_idx)

# --------------------------------------------
# 6. Optuna objective function for CatBoost
#    - 5-fold CV on training data
# --------------------------------------------
def objective(trial):
    params = {
        "loss_function": "RMSE",
        "random_seed": 42,
        "verbose": False,
        "iterations": trial.suggest_int("iterations", 300, 1500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
        "border_count": trial.suggest_int("border_count", 64, 255),
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmses = []

    for train_idx, valid_idx in kf.split(X_train_cb):
        X_tr = X_train_cb.iloc[train_idx]
        X_val = X_train_cb.iloc[valid_idx]
        y_tr = y_train.iloc[train_idx]
        y_val = y_train.iloc[valid_idx]

        model = CatBoostRegressor(**params)
        model.fit(
            X_tr, y_tr,
            cat_features=cat_idx,
            eval_set=(X_val, y_val),
            use_best_model=True
        )

        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        rmses.append(rmse)

    return np.mean(rmses)

# --------------------------------------------
# 7. Run Optuna study to find best hyperparameters
#    - You can increase n_trials for more thorough search
# --------------------------------------------
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30, show_progress_bar=True)

print("Best trial:")
print(study.best_trial.params)

# --------------------------------------------
# 8. Train final CatBoost model on full training set
#    using best hyperparameters from Optuna
# --------------------------------------------
best_params = study.best_trial.params
best_params.update({
    "loss_function": "RMSE",
    "random_seed": 42,
    "verbose": False
})

best_cat = CatBoostRegressor(**best_params)
best_cat.fit(
    X_train_cb, y_train,
    cat_features=cat_idx,
    eval_set=(X_test_cb, y_test),
    use_best_model=True
)

# --------------------------------------------
# 9. Evaluate on test set
# --------------------------------------------
y_pred_test = best_cat.predict(X_test_cb)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)

print(f"\nüî• Tuned CatBoost Test RMSE: {rmse_test:.4f}")

# --------------------------------------------
# 10. Feature importance (top 25)
# --------------------------------------------
feature_importances = best_cat.get_feature_importance()
fi_df = pd.DataFrame({
    "feature": X_train_cb.columns,
    "importance": feature_importances
}).sort_values("importance", ascending=False)

print("\nTop 25 CatBoost feature importances:")
print(fi_df.head(25))


[I 2025-12-08 00:14:04,183] A new study created in memory with name: no-name-d88e5061-ef76-4f66-abb6-6bd8048fabb4


Categorical columns: ['gender', 'marital_status', 'education_level', 'region', 'employment_status', 'card_type']
Numeric columns: ['customer_id', 'age', 'owns_home', 'has_auto_loan', 'annual_income', 'credit_score', 'credit_limit', 'tenure', 'num_transactions', 'avg_transaction_value', 'online_shopping_freq', 'reward_points_balance', 'travel_frequency', 'utility_payment_count', 'num_children', 'num_credit_cards']
CatBoost categorical indices: [2, 3, 4, 5, 6, 13]


  0%|          | 0/30 [00:00<?, ?it/s]



[I 2025-12-08 00:37:47,086] Trial 0 finished with value: 258.14713013478377 and parameters: {'iterations': 1370, 'depth': 7, 'learning_rate': 0.061176893123203936, 'l2_leaf_reg': 4.268145798210141, 'bagging_temperature': 1.8455095303472118, 'border_count': 161, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 57}. Best is trial 0 with value: 258.14713013478377.




[I 2025-12-08 00:45:25,605] Trial 1 finished with value: 257.8112529844229 and parameters: {'iterations': 456, 'depth': 6, 'learning_rate': 0.06953317021729892, 'l2_leaf_reg': 8.707235740187038, 'bagging_temperature': 1.3012618701671341, 'border_count': 251, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 77}. Best is trial 1 with value: 257.8112529844229.




[I 2025-12-08 01:01:06,552] Trial 2 finished with value: 256.4113276331638 and parameters: {'iterations': 1406, 'depth': 9, 'learning_rate': 0.0786734917452061, 'l2_leaf_reg': 7.179486990625046, 'bagging_temperature': 0.4434090946208713, 'border_count': 228, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 192}. Best is trial 2 with value: 256.4113276331638.




[I 2025-12-08 01:36:58,360] Trial 3 finished with value: 257.7250504751947 and parameters: {'iterations': 1387, 'depth': 9, 'learning_rate': 0.010651674343838166, 'l2_leaf_reg': 7.650437556684703, 'bagging_temperature': 4.917429221430102, 'border_count': 97, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 121}. Best is trial 2 with value: 256.4113276331638.




[I 2025-12-08 01:41:56,394] Trial 4 finished with value: 257.19803195555824 and parameters: {'iterations': 1299, 'depth': 6, 'learning_rate': 0.010508368598932323, 'l2_leaf_reg': 8.856379857477247, 'bagging_temperature': 1.731579863750392, 'border_count': 219, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 25}. Best is trial 2 with value: 256.4113276331638.




[I 2025-12-08 01:45:01,079] Trial 5 finished with value: 254.5991860858356 and parameters: {'iterations': 1059, 'depth': 4, 'learning_rate': 0.022329119249118796, 'l2_leaf_reg': 4.7623306058100825, 'bagging_temperature': 2.9544094972130552, 'border_count': 67, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 73}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 01:48:52,268] Trial 6 finished with value: 255.33446114816792 and parameters: {'iterations': 582, 'depth': 8, 'learning_rate': 0.09079573478838282, 'l2_leaf_reg': 2.0023692129089077, 'bagging_temperature': 3.7856898541118857, 'border_count': 194, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 15}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 01:50:18,960] Trial 7 finished with value: 272.78974565695256 and parameters: {'iterations': 485, 'depth': 4, 'learning_rate': 0.011705822893943972, 'l2_leaf_reg': 5.615889584568017, 'bagging_temperature': 2.563014357576087, 'border_count': 191, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 50}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 02:10:35,331] Trial 8 finished with value: 258.80449378469814 and parameters: {'iterations': 1368, 'depth': 10, 'learning_rate': 0.08312594429330014, 'l2_leaf_reg': 7.620812345869749, 'bagging_temperature': 3.491942794256031, 'border_count': 174, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 144}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 02:24:18,581] Trial 9 finished with value: 257.992054121207 and parameters: {'iterations': 829, 'depth': 10, 'learning_rate': 0.0397587138197512, 'l2_leaf_reg': 6.205942225805641, 'bagging_temperature': 4.339315892162629, 'border_count': 148, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 98}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 02:26:56,867] Trial 10 finished with value: 256.409246271576 and parameters: {'iterations': 1074, 'depth': 4, 'learning_rate': 0.020701901741254605, 'l2_leaf_reg': 3.784297540594781, 'bagging_temperature': 2.9433430359474295, 'border_count': 69, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 155}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 02:31:07,566] Trial 11 finished with value: 254.73399409050626 and parameters: {'iterations': 724, 'depth': 8, 'learning_rate': 0.023392104646301603, 'l2_leaf_reg': 1.256443382510927, 'bagging_temperature': 3.7818202479226133, 'border_count': 130, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 22}. Best is trial 5 with value: 254.5991860858356.




[I 2025-12-08 02:34:42,804] Trial 12 finished with value: 254.34125441938613 and parameters: {'iterations': 808, 'depth': 6, 'learning_rate': 0.022946745644434663, 'l2_leaf_reg': 1.267522349034759, 'bagging_temperature': 3.3449201935929764, 'border_count': 122, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 48}. Best is trial 12 with value: 254.34125441938613.




[I 2025-12-08 02:38:12,344] Trial 13 finished with value: 254.2919480815936 and parameters: {'iterations': 1047, 'depth': 5, 'learning_rate': 0.0206228604354108, 'l2_leaf_reg': 2.8309725744966396, 'bagging_temperature': 2.9887374462352283, 'border_count': 111, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 77}. Best is trial 13 with value: 254.2919480815936.




[I 2025-12-08 02:41:48,381] Trial 14 finished with value: 254.85299536171087 and parameters: {'iterations': 1068, 'depth': 5, 'learning_rate': 0.016718162396187432, 'l2_leaf_reg': 2.87321861415798, 'bagging_temperature': 3.2271523199858017, 'border_count': 115, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 102}. Best is trial 13 with value: 254.2919480815936.




[I 2025-12-08 02:45:31,556] Trial 15 finished with value: 253.94667008171464 and parameters: {'iterations': 920, 'depth': 6, 'learning_rate': 0.03851293092852618, 'l2_leaf_reg': 1.0930376467668188, 'bagging_temperature': 2.3114491534233137, 'border_count': 102, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 61}. Best is trial 15 with value: 253.94667008171464.




[I 2025-12-08 02:48:24,389] Trial 16 finished with value: 256.74683941359035 and parameters: {'iterations': 961, 'depth': 5, 'learning_rate': 0.04038789910726558, 'l2_leaf_reg': 2.4928121587250063, 'bagging_temperature': 2.1011595917068444, 'border_count': 95, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 80}. Best is trial 15 with value: 253.94667008171464.




[I 2025-12-08 02:52:24,399] Trial 17 finished with value: 253.66785248464674 and parameters: {'iterations': 1198, 'depth': 5, 'learning_rate': 0.03409139626079436, 'l2_leaf_reg': 3.2930961662205993, 'bagging_temperature': 0.9431883663529017, 'border_count': 104, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 139}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 02:58:04,799] Trial 18 finished with value: 254.35721745077518 and parameters: {'iterations': 1175, 'depth': 7, 'learning_rate': 0.03272189646006651, 'l2_leaf_reg': 3.5206553094969917, 'bagging_temperature': 0.2734259477027353, 'border_count': 87, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 130}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 03:04:07,453] Trial 19 finished with value: 254.4190843600915 and parameters: {'iterations': 1229, 'depth': 7, 'learning_rate': 0.0521899858617669, 'l2_leaf_reg': 1.8165885374163309, 'bagging_temperature': 0.9987888968594825, 'border_count': 145, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 171}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 03:06:09,969] Trial 20 finished with value: 256.9173185140543 and parameters: {'iterations': 674, 'depth': 5, 'learning_rate': 0.031235216250251937, 'l2_leaf_reg': 4.8885718808191125, 'bagging_temperature': 0.9112402644008675, 'border_count': 84, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 118}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 03:09:33,175] Trial 21 finished with value: 253.80781925104702 and parameters: {'iterations': 920, 'depth': 5, 'learning_rate': 0.030870234951343403, 'l2_leaf_reg': 2.7639239459307405, 'bagging_temperature': 2.375159353929024, 'border_count': 115, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 91}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 03:13:18,474] Trial 22 finished with value: 254.0618492436293 and parameters: {'iterations': 917, 'depth': 6, 'learning_rate': 0.045857658083142155, 'l2_leaf_reg': 1.0364834682949817, 'bagging_temperature': 2.390104215037928, 'border_count': 105, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 94}. Best is trial 17 with value: 253.66785248464674.




[I 2025-12-08 03:17:17,302] Trial 23 finished with value: 253.61207846741004 and parameters: {'iterations': 1209, 'depth': 5, 'learning_rate': 0.0309085977972149, 'l2_leaf_reg': 3.4102959508023423, 'bagging_temperature': 1.415415829558291, 'border_count': 133, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 139}. Best is trial 23 with value: 253.61207846741004.




[I 2025-12-08 03:22:09,080] Trial 24 finished with value: 253.48184184124966 and parameters: {'iterations': 1477, 'depth': 5, 'learning_rate': 0.029395760540202612, 'l2_leaf_reg': 3.2884380767046397, 'bagging_temperature': 1.418953966133421, 'border_count': 134, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 146}. Best is trial 24 with value: 253.48184184124966.




[I 2025-12-08 03:26:06,250] Trial 25 finished with value: 253.46250044727435 and parameters: {'iterations': 1482, 'depth': 4, 'learning_rate': 0.026962335192820312, 'l2_leaf_reg': 3.6596329909052625, 'bagging_temperature': 1.4144940444697018, 'border_count': 136, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 160}. Best is trial 25 with value: 253.46250044727435.




[I 2025-12-08 03:30:02,411] Trial 26 finished with value: 254.46575731955437 and parameters: {'iterations': 1490, 'depth': 4, 'learning_rate': 0.01620877998727012, 'l2_leaf_reg': 4.098104209639874, 'bagging_temperature': 1.45347423509438, 'border_count': 140, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 168}. Best is trial 25 with value: 253.46250044727435.




[I 2025-12-08 03:33:57,074] Trial 27 finished with value: 253.62787911923579 and parameters: {'iterations': 1498, 'depth': 4, 'learning_rate': 0.026168001761837246, 'l2_leaf_reg': 5.428076633070997, 'bagging_temperature': 0.06282851298799841, 'border_count': 164, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 198}. Best is trial 25 with value: 253.46250044727435.




[I 2025-12-08 03:37:00,598] Trial 28 finished with value: 256.3545422617509 and parameters: {'iterations': 1282, 'depth': 4, 'learning_rate': 0.01612751235930232, 'l2_leaf_reg': 4.457107069219993, 'bagging_temperature': 0.590928751693069, 'border_count': 130, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 178}. Best is trial 25 with value: 253.46250044727435.




[I 2025-12-08 03:42:00,043] Trial 29 finished with value: 253.46006493182344 and parameters: {'iterations': 1430, 'depth': 5, 'learning_rate': 0.02763204350349723, 'l2_leaf_reg': 2.2076729630825316, 'bagging_temperature': 1.8122912929392145, 'border_count': 174, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 151}. Best is trial 29 with value: 253.46006493182344.
Best trial:
{'iterations': 1430, 'depth': 5, 'learning_rate': 0.02763204350349723, 'l2_leaf_reg': 2.2076729630825316, 'bagging_temperature': 1.8122912929392145, 'border_count': 174, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 151}

üî• Tuned CatBoost Test RMSE: 252.3193

Top 25 CatBoost feature importances:
                        feature  importance
20                 num_children   19.216545
14             num_transactions    9.759743
13                    card_type    8.576299
26           credit_limit_log1p    8.060238
11                 credit_limit    6.912150
27  avg_transaction_value_log1p    6.493774
9   



In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet

# --- Assuming these variables are defined and your model is fitted ---
# 'model' is your fitted ElasticNet instance
# 'X' is your training data DataFrame (used for feature names)
# --------------------------------------------------------------------

# 1. Get the feature names
# Assuming 'poly_transformer' is your fitted PolynomialFeatures instance
# and 'X' is your original feature DataFrame
feature_names = poly.get_feature_names_out(input_features=X_imputed_df.columns)

# 2. Get the estimated coefficients
coefficients = enet.coef_

# 3. Combine them into a DataFrame for easy viewing
coef_summary = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# 4. Display the results
print("### All Feature Coefficients ###")
print(coef_summary)

# 5. Identify features shrunk to ZERO
# The tolerance for zero might be very small, so we use a small threshold
zero_coef_features = coef_summary[np.isclose(coef_summary['Coefficient'], 0, atol=1e-6)]

print("\n### Features Shrunk to Zero (Eliminated) ###")
print(zero_coef_features[zero_coef_features['Coefficient']!=0])

### All Feature Coefficients ###
                                    Feature  Coefficient
0                             gender_female          0.0
1                               gender_male         -0.0
2                    marital_status_married          0.0
3                     marital_status_single         -0.0
4                 education_level_bachelors          0.0
..                                      ...          ...
625           travel_frequency num_children          0.0
626       travel_frequency num_credit_cards          0.0
627      utility_payment_count num_children         -0.0
628  utility_payment_count num_credit_cards          0.0
629           num_children num_credit_cards          0.0

[630 rows x 2 columns]

### Features Shrunk to Zero (Eliminated) ###
                                         Feature   Coefficient
53                     gender_female customer_id  2.104586e-07
86                       gender_male customer_id  1.888470e-07
118           marital_st

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet

# --- Assuming these variables are defined and your model is fitted ---
# 'model' is your fitted ElasticNet instance
# 'X' is your training data DataFrame (used for feature names)
# --------------------------------------------------------------------

# 1. Get the feature names
# Assuming 'poly_transformer' is your fitted PolynomialFeatures instance
# and 'X' is your original feature DataFrame
feature_names = poly.get_feature_names_out(input_features=X_imputed_df.columns)

# 2. Get the estimated coefficients
coefficients = enet.coef_

# 3. Combine them into a DataFrame for easy viewing
coef_summary = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# 4. Display the results
print("### All Feature Coefficients ###")
print(coef_summary)

# 5. Identify features shrunk to ZERO
# The tolerance for zero might be very small, so we use a small threshold
zero_coef_features = coef_summary[np.isclose(coef_summary['Coefficient'], 0, atol=1e-6)]

print("\n### Features Shrunk to Zero (Eliminated) ###")
print(zero_coef_features[zero_coef_features['Coefficient']!=0])

### All Feature Coefficients ###
                                    Feature  Coefficient
0                             gender_female          0.0
1                               gender_male         -0.0
2                    marital_status_married          0.0
3                     marital_status_single         -0.0
4                 education_level_bachelors          0.0
..                                      ...          ...
625           travel_frequency num_children          0.0
626       travel_frequency num_credit_cards          0.0
627      utility_payment_count num_children         -0.0
628  utility_payment_count num_credit_cards          0.0
629           num_children num_credit_cards          0.0

[630 rows x 2 columns]

### Features Shrunk to Zero (Eliminated) ###
                                         Feature   Coefficient
53                     gender_female customer_id  2.104586e-07
86                       gender_male customer_id  1.888470e-07
118           marital_st