In [3]:
import pickle
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# Load the data
test_data = pd.read_csv('test_data.csv')
X_test = test_data.drop(columns=['median_house_value'])
y_test = test_data['median_house_value']

# Load the optimized models
with open('optimized_random_forest.pkl', 'rb') as f:
    best_rf = pickle.load(f)

with open('optimized_xgboost.pkl', 'rb') as f:
    best_xgb = pickle.load(f)

with open('optimized_catboost.pkl', 'rb') as f:
    best_cat = pickle.load(f)

# Create the ensemble model using Voting Regressor
ensemble_model = VotingRegressor(estimators=[
    ('random_forest', best_rf),
    ('xgboost', best_xgb),
    ('catboost', best_cat)
])

# Train the ensemble model on the combined train + val data
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
X_train = pd.concat([train_data.drop(columns=['median_house_value']), val_data.drop(columns=['median_house_value'])])
y_train = pd.concat([train_data['median_house_value'], val_data['median_house_value']])

ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model on the test data
y_pred = ensemble_model.predict(X_test)

# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nEnsemble Model - Test Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R²: {r2:.2f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   7.2s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   7.5s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   8.4s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   8.0s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   7.9s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=

80 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dungp\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dungp\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\dungp\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\dungp\AppData\Local\Programs\Python\Python312\Lib


Ensemble Model - Test Metrics:
Mean Absolute Error (MAE): 29078.54
Mean Squared Error (MSE): 2045927851.63
R²: 0.84


In [4]:
print("\nEnsemble Model - Test Metrics:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R²: {r2}")


Ensemble Model - Test Metrics:
Mean Absolute Error (MAE): 29078.536150608183
Mean Squared Error (MSE): 2045927851.6265733
R²: 0.8430275969441167
