In [1]:
import os

# Later cells read the data through a path relative to the working directory
# ("Datasets/analysis_data.csv"), so the kernel has to sit in the folder that
# contains "Datasets" (presumably the project root, one level above this
# notebook -- confirm against the repo layout).
#
# An unconditional os.chdir('../') is not idempotent: every re-run of this
# cell in a live kernel climbs one more level and silently breaks the relative
# paths below.  Only move up while the data folder is not yet reachable.
if not os.path.isdir("Datasets"):
    os.chdir('../')

In [2]:
import numpy as np
import pandas as pd

# Raw analysis table, read relative to the current working directory.
data = pd.read_csv(filepath_or_buffer="Datasets/analysis_data.csv")

In [3]:
import tempfile

from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, ElasticNet
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

from sklearn.metrics import root_mean_squared_error

# ============================================================
# 1. Prepare data
# ============================================================
# Work on a copy so the frame loaded from disk is never modified.
target_col = "monthly_spend"
df = data.copy()

# Split features / target; both get a fresh 0..n-1 index.
X = df.drop(columns=target_col).reset_index(drop=True)
y = df[target_col].reset_index(drop=True)

# Column groups by dtype.  Only `cat_cols` is consumed by the preprocessing
# below; `num_cols` is kept for inspection / later cells.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.to_list()
num_cols = X.select_dtypes(include=["number"]).columns.to_list()

# ============================================================
# 2. Preprocessing: OHE + MICE + Polynomial (degree 2)
# ============================================================
# One-hot encode the categorical columns; every remaining column is passed
# through unchanged.  Categories unseen during fit are encoded as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ],
    remainder="passthrough"
)

# MICE-style imputation: each feature with missing values is regressed on the
# others with a BayesianRidge model, starting from a median fill.
mice = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    initial_strategy="median",
    random_state=42
)

# Pairwise interaction terms only (no squares, no bias column).
poly = PolynomialFeatures(
    degree=2,
    include_bias=False,
    interaction_only=True
)

# NOTE(review): the features are not standardised before the penalised model;
# adding a scaler would change what the alpha grid means, so it is left as is.
enet = ElasticNet(max_iter=20000, random_state=42)

# The hyperparameter grid only varies `model__*` parameters, so the
# OHE -> MICE -> poly chain is identical for every candidate on a given fold.
# Without a cache it is re-fitted for each of the 375 (candidate, fold) fits,
# which made the search slow enough that the recorded run was interrupted.
# `memory=` makes the Pipeline cache fitted transformers on disk, so each fold
# pays for the expensive preprocessing once and later candidates reuse it.
#
# The TemporaryDirectory object must stay referenced for as long as `pipe` is
# being fitted; the directory is deleted when the object is garbage collected
# or the kernel exits.
#
# Caveat: with caching enabled the Pipeline fits *clones* of its transformers,
# so `preprocessor`, `mice` and `poly` above stay unfitted -- inspect fitted
# steps through `<fitted pipeline>.named_steps[...]` instead.
pipe_cache = tempfile.TemporaryDirectory(prefix="sklearn_pipe_cache_")

pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("imputer", mice),
        ("poly", poly),
        ("model", enet),
    ],
    memory=pipe_cache.name,
)

# ============================================================
# 3. Focused, high-performance hyperparameter grid
# ============================================================
# Search space for the final estimator.  The "model__" prefix routes each
# parameter to the pipeline step named "model" (the ElasticNet):
#   alpha    -- overall penalty strength
#   l1_ratio -- L1/L2 mix; values close to 1 lean towards the L1 penalty
# 5 x 5 values = 25 candidates.
param_grid = dict(
    model__alpha=[5, 8, 10, 12, 15],
    model__l1_ratio=[0.8, 0.85, 0.9, 0.95, 0.98],
)

# ============================================================
# 4. Very robust CV (5×3 repeated)
# ============================================================
# 5 folds repeated 3 times -> 15 train/validation splits per candidate; the
# fixed seed keeps the splits reproducible across runs.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Exhaustive search over `param_grid`, scored by (negated) RMSE so that
# "higher is better" holds; uses every available core and logs each fit.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# ============================================================
# 5. Fit gridsearch on full training data
# ============================================================
# Run the search; the best candidate is refitted on all of (X, y) afterwards.
grid.fit(X, y)

# Banner + winning hyperparameters.
print(
    "\n==========================",
    " Best Hyperparameters ",
    "==========================",
    sep="\n",
)
print(grid.best_params_)

# The scorer is negated RMSE, so flip the sign back for reporting.
print("\nBest CV RMSE:", -grid.best_score_)

# In-sample error of the refitted best pipeline -- a sanity check only, not an
# estimate of generalisation error.
best_model = grid.best_estimator_
y_pred_train = best_model.predict(X)
print("\nTrain RMSE:", root_mean_squared_error(y_true=y, y_pred=y_pred_train))


Fitting 15 folds for each of 25 candidates, totalling 375 fits


KeyboardInterrupt: 