In [1]:
import os
# Work from the project root so relative paths such as "Datasets/..." resolve.
# The guard keeps this cell idempotent: an unconditional os.chdir('../') climbs
# one more directory on every re-run, which breaks the relative data path the
# second time the cell is executed in the same kernel.
if not os.path.isdir("Datasets"):
    os.chdir('../')

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, ElasticNetCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the modelling dataset; the path is relative to the current working
# directory, so it depends on the chdir performed in the first cell.
data = pd.read_csv(filepath_or_buffer="Datasets/analysis_data.csv")

In [7]:
# -----------------------------------------------------------
# RANDOM FOREST ON ONE-HOT ENCODED + MICE-IMPUTED FEATURES
# -----------------------------------------------------------
# Flow: train/test split -> one-hot encode categoricals -> iterative (MICE)
# imputation with BayesianRidge -> random forest -> RMSE on the held-out set.
#
# NOTE(review): no ElasticNet (or any other) feature selection happens in this
# cell, so the forest is trained on every encoded column. X_train_sel and
# X_test_sel are plain aliases of the imputed frames, kept so downstream code
# that expects those names keeps working.

# One seed shared by the split, the imputer and the forest.
RANDOM_STATE = 42

# -----------------------------------------------------------
# 1. SPLIT DATA
# -----------------------------------------------------------
y = data["monthly_spend"]
X = data.drop(columns=["monthly_spend"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# -----------------------------------------------------------
# 2. COLUMN IDENTIFICATION
# -----------------------------------------------------------
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()

# remainder="passthrough" forwards *every* column that is not one-hot encoded
# (in original column order), not just the numeric ones. Labelling the output
# with numeric_columns alone fails as soon as X holds e.g. a bool column, so
# the names are derived from what is actually passed through.
passthrough_columns = [col for col in X.columns if col not in categorical_columns]

# -----------------------------------------------------------
# 3. ONE-HOT ENCODING (original note: same as notebook)
# -----------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns)
    ],
    remainder="passthrough"
)

X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# With no categorical columns there are no encoded names to ask for, so skip
# the lookup on the "cat" transformer in that case.
if categorical_columns:
    encoded_cat_cols = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_columns)
else:
    encoded_cat_cols = []

# Output layout is [one-hot columns..., passthrough columns...].
new_cols = list(encoded_cat_cols) + passthrough_columns
assert X_train_enc.shape[1] == len(new_cols), (
    f"encoded matrix has {X_train_enc.shape[1]} columns but {len(new_cols)} names were built"
)

# Keep the original row labels so these frames stay aligned with
# y_train / y_test by index, not only by position.
X_train_df = pd.DataFrame(X_train_enc, columns=new_cols, index=X_train.index)
X_test_df = pd.DataFrame(X_test_enc, columns=new_cols, index=X_test.index)

# -----------------------------------------------------------
# 4. MICE IMPUTATION WITH BAYESIAN RIDGE
# -----------------------------------------------------------
# The imputer is fitted on the training split only and then applied to the
# test split, so no test-set information leaks into the imputation model.
mice = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    initial_strategy="median",
    random_state=RANDOM_STATE
)

X_train_imp = pd.DataFrame(
    mice.fit_transform(X_train_df), columns=new_cols, index=X_train_df.index
)
X_test_imp = pd.DataFrame(
    mice.transform(X_test_df), columns=new_cols, index=X_test_df.index
)

# -----------------------------------------------------------
# 5. FEATURE SELECTION (none applied: all features are used)
# -----------------------------------------------------------
X_train_sel = X_train_imp
X_test_sel = X_test_imp

# -----------------------------------------------------------
# 6. RANDOM FOREST
# -----------------------------------------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf.fit(X_train_sel, y_train)

# -----------------------------------------------------------
# 7. MODEL EVALUATION
# -----------------------------------------------------------
y_pred_rf = rf.predict(X_test_sel)
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)

print(f"\nüî• RANDOM FOREST RMSE on Test Set: {rmse_rf:.4f}")




üî• RANDOM FOREST RMSE on Test Set: 279.2283
