# Ensemble Model

In [86]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2ORandomForestEstimator, H2OGradientBoostingEstimator, H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
import pandas as pd

In [87]:
# Initialize H2O
# Attach this Python session to an H2O cluster. The recorded output shows the call
# connecting to an instance that was already running at http://localhost:54321
# (uptime over an hour), i.e. the cluster was not started fresh by this cell.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 17 mins
H2O_cluster_timezone:,Asia/Singapore
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 25 days
H2O_cluster_name:,chunhong
H2O_cluster_total_nodes:,2
H2O_cluster_free_memory:,5.385 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


In [88]:
# Read the six pre-split CSV files into H2OFrames, in the same order as before:
# features then target for train, validation and test.
X_train, y_train, X_val, y_val, X_test, y_test = [
    h2o.import_file(csv_path)
    for csv_path in (
        "./dataset/X_train.csv",
        "./dataset/y_train.csv",
        "./dataset/X_val.csv",
        "./dataset/y_val.csv",
        "./dataset/X_test.csv",
        "./dataset/y_test.csv",
    )
]

# H2O trains from a single frame that holds both predictors and response,
# so each target frame is appended column-wise to its feature frame.
train, val, test = X_train.cbind(y_train), X_val.cbind(y_val), X_test.cbind(y_test)

# Response column name; every other column of the combined frame is a predictor.
# NOTE(review): the recorded variable-importance tables list predictors such as
# country_SUMprofileTable_yearly_compensation, whose names embed the target name.
# Confirm these aggregates were computed from training rows only - otherwise the
# validation/test metrics are optimistic.
target = "yearly_compensation"
features = [name for name in train.columns if name != target]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Bagging

In [89]:
# Random Forest: the bagging model of this notebook, and also one of the
# level-0 base models referenced by the ensemble cells.
rf_params = dict(
    ntrees=100,       # size of the forest
    max_depth=20,     # depth cap for every tree
    min_rows=5,       # smallest number of rows allowed in a leaf
    sample_rate=0.8,  # share of rows sampled for each tree
    seed=42,          # fixed seed for reproducibility
    nfolds=5,         # 5-fold cross-validation
    keep_cross_validation_predictions=True,  # retain the out-of-fold predictions
)
rf_model = H2ORandomForestEstimator(**rf_params)

# Fit on the training frame with `val` as the validation frame. train() is the
# last expression of the cell, so the fitted model summary is displayed.
rf_model.train(x=features, y=target, training_frame=train, validation_frame=val)

drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,100.0,100.0,843356.0,18.0,20.0,19.78,636.0,695.0,667.3

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,2.5341358,0.0510449,2.5657077,2.5103388,2.5565422,2.4561892,2.5819008
mean_residual_deviance,10.305171,0.4351854,10.397884,10.159893,10.850405,9.665643,10.452029
mse,10.305171,0.4351854,10.397884,10.159893,10.850405,9.665643,10.452029
r2,0.8203849,0.0095904,0.8154147,0.8251508,0.8058461,0.8272909,0.8282217
residual_deviance,10.305171,0.4351854,10.397884,10.159893,10.850405,9.665643,10.452029
rmse,3.2095902,0.0680272,3.224575,3.1874588,3.2939954,3.1089616,3.2329597
rmsle,0.5920789,0.024467,0.594752,0.5946686,0.5727473,0.5682262,0.6300002

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-12-27 23:33:00,44.795 sec,0.0,,,,,,
,2024-12-27 23:33:00,44.911 sec,1.0,5.2904887,3.5720561,27.9892706,5.2741266,3.5039631,27.8164112
,2024-12-27 23:33:00,45.005 sec,2.0,5.0364267,3.3661247,25.3655943,4.2133372,3.0000374,17.7522103
,2024-12-27 23:33:00,45.114 sec,3.0,5.0626162,3.4409417,25.6300824,4.0200985,2.9604855,16.1611923
,2024-12-27 23:33:00,45.236 sec,4.0,4.8948722,3.3472129,23.9597735,3.8147203,2.8075370,14.5520911
,2024-12-27 23:33:00,45.337 sec,5.0,4.8090316,3.2700208,23.1267848,3.6212303,2.6942962,13.1133087
,2024-12-27 23:33:00,45.446 sec,6.0,4.7672725,3.2640050,22.7268868,3.5410762,2.6563002,12.5392208
,2024-12-27 23:33:00,45.558 sec,7.0,4.7055978,3.2282565,22.1426507,3.4662232,2.6040782,12.0147030
,2024-12-27 23:33:00,45.654 sec,8.0,4.6320780,3.1941199,21.4561464,3.4484284,2.5907840,11.8916583
,2024-12-27 23:33:00,45.767 sec,9.0,4.5734023,3.1747059,20.9160086,3.3988538,2.5808907,11.5522069

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,5081440.5,1.0,0.2250043
job_title_Product/Project_Manager,2562027.2500000,0.5041931,0.1134456
country_Ukraine,2285159.2500000,0.4497070,0.1011860
ml_spending,1980804.3750000,0.3898116,0.0877093
used_tpu,1778038.5,0.3499084,0.0787309
country_United_States_of_America,888786.5,0.1749084,0.0393551
Total_Experience,858759.3750000,0.1689992,0.0380255
country_Republic_of_Korea,829853.25,0.1633106,0.0367456
country_SUMprofileTable_yearly_compensation,675374.0,0.1329099,0.0299053
country_Thailand,509345.9062500,0.1002365,0.0225536


In [90]:
# Compute the random forest's metrics on the test frame; the bare name on the
# last line makes the metrics object the cell's output.
bagged_perf = rf_model.model_performance(test_data=test)
bagged_perf 

In [91]:
# Score the test frame with the random forest and preview the first rows of the
# resulting prediction frame (a single `predict` column in the recorded output).
bagged_predictions = rf_model.predict(test)
bagged_predictions.head()

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict
21.0081
8.68144
12.2096
17.5071
15.5879
4.93371
10.7269
16.7793
2.53943
12.4879


### Stacking

In [92]:
# Gradient Boosting Machine - a level-0 base model for the ensembles.
gbm_params = dict(
    ntrees=100,      # number of boosting rounds
    max_depth=5,     # depth cap for every tree
    learn_rate=0.1,  # shrinkage applied to each tree
    seed=42,         # fixed seed for reproducibility
    nfolds=5,        # 5-fold cross-validation
    keep_cross_validation_predictions=True,  # retain the out-of-fold predictions
)
gbm_model = H2OGradientBoostingEstimator(**gbm_params)

# Fit on the training frame with `val` as the validation frame; the cell ends on
# train() so the fitted model summary is displayed.
gbm_model.train(x=features, y=target, training_frame=train, validation_frame=val)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,100.0,100.0,41143.0,5.0,5.0,5.0,17.0,32.0,28.05

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,1.748205,0.0612959,1.7578387,1.7624997,1.8243537,1.6538647,1.7424679
mean_residual_deviance,5.7297006,0.5049402,5.812983,6.018807,6.3375645,5.040335,5.4388123
mse,5.7297006,0.5049402,5.812983,6.018807,6.3375645,5.040335,5.4388123
r2,0.9000747,0.0101739,0.8968068,0.8964179,0.8865975,0.9099375,0.9106135
residual_deviance,5.7297006,0.5049402,5.812983,6.018807,6.3375645,5.040335,5.4388123
rmse,2.391797,0.106106,2.411013,2.4533257,2.517452,2.245069,2.3321261
rmsle,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-12-27 23:33:22,10.409 sec,0.0,7.5755655,6.8359983,57.3891923,7.5782556,6.8386806,57.4299575
,2024-12-27 23:33:22,10.445 sec,1.0,7.1292975,6.4120023,50.8268832,7.1759811,6.4549020,51.4947048
,2024-12-27 23:33:22,10.466 sec,2.0,6.7122643,6.0117010,45.0544917,6.7874159,6.0767282,46.0690142
,2024-12-27 23:33:22,10.488 sec,3.0,6.3604097,5.6791362,40.4548119,6.4731907,5.7694044,41.9021975
,2024-12-27 23:33:22,10.511 sec,4.0,6.0178540,5.3444293,36.2145669,6.1632880,5.4507152,37.9861194
,2024-12-27 23:33:22,10.533 sec,5.0,5.7480943,5.0840578,33.0405883,5.9347744,5.2266189,35.2215467
,2024-12-27 23:33:22,10.557 sec,6.0,5.4850039,4.8286667,30.0852674,5.7040243,4.9917290,32.5358931
,2024-12-27 23:33:22,10.580 sec,7.0,5.2702481,4.6177933,27.7755153,5.5320565,4.8132664,30.6036497
,2024-12-27 23:33:22,10.618 sec,8.0,4.9871387,4.3466556,24.8715523,5.2685287,4.5550260,27.7573948
,2024-12-27 23:33:22,10.641 sec,9.0,4.7950975,4.1702178,22.9929596,5.0985426,4.3936876,25.9951367

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,392324.0625000,1.0,0.2443427
job_title_Product/Project_Manager,259214.1093750,0.6607143,0.1614407
country_Ukraine,157434.5156250,0.4012869,0.0980515
ml_spending,154485.6875000,0.3937706,0.0962150
used_tpu,127075.1250000,0.3239035,0.0791435
country_United_States_of_America,71365.625,0.1819048,0.0444471
country_SUMprofileTable_yearly_compensation,67307.0,0.1715597,0.0419194
Total_Experience,58029.4257812,0.1479120,0.0361412
country_Republic_of_Korea,50283.7265625,0.1281689,0.0313171
country_Thailand,44625.3046875,0.1137460,0.0277930


In [93]:
# Generalized Linear Model
# One of the Base Models (Level 0 Models)
glm_model = H2OGeneralizedLinearEstimator(
    family="gaussian",                       # continuous response, identity link
    nfolds=5,                                # 5-fold cross-validation, like the other base models
    keep_cross_validation_predictions=True,  # retain the out-of-fold predictions
    # Same seed as the RF and GBM base models. With no explicit fold assignment the
    # cross-validation split is seed-driven, so an unseeded GLM is not reproducible
    # and may not share its folds with the other base models used for stacking.
    seed=42,
)

# Fit on the training frame with `val` as the validation frame; the cell ends on
# train() so the fitted model summary is displayed.
glm_model.train(x=features, y=target, training_frame=train, validation_frame=val)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.00595 )",50,48,1,py_36_sid_a7f3

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,2608.5618,82.566505,2502.5393,2556.1562,2718.6091,2633.4846,2632.0188
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.542622,0.0143119,0.5349909,0.5449228,0.5666661,0.5343979,0.5321322
mean_residual_deviance,0.5827644,0.0331848,0.5601175,0.5550763,0.638499,0.5785654,0.5815641
mse,0.5827644,0.0331848,0.5601175,0.5550763,0.638499,0.5785654,0.5815641
null_deviance,62701.91,1357.659,60384.79,63390.65,63142.43,63857.074,62734.6
r2,0.9898315,0.0005782,0.9901137,0.9904369,0.9888971,0.9899467,0.989763
residual_deviance,636.424,40.76543,595.4049,606.1433,699.7949,639.8933,640.8836
rmse,0.7631488,0.0214575,0.74841,0.7450344,0.7990613,0.7606348,0.7626035
rmsle,0.2554063,0.0,0.2554063,,,,

Unnamed: 0,timestamp,duration,iterations,negative_log_likelihood,objective,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2024-12-27 23:33:25,0.000 sec,0,313287.6006595,57.3891923,,,,,,,,
,2024-12-27 23:33:25,0.010 sec,1,,,0.7510358,0.5640547,0.5337682,0.9901714,0.7506743,0.563512,0.5451096,0.9901878

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,2.8772113,1.0,0.0722130
country_Ukraine,2.4107668,0.8378831,0.0605060
used_tpu,2.3956559,0.8326312,0.0601268
ml_spending,2.3848531,0.8288766,0.0598557
country_United_States_of_America,1.9691713,0.6844027,0.0494228
country_SUMprofileTable_yearly_compensation,1.7913967,0.6226156,0.0449609
Total_Experience,1.5754886,0.5475749,0.0395420
company_MEANprofileTable_yearly_compensation,1.5475025,0.5378481,0.0388396
country_Thailand,1.5285199,0.5312505,0.0383632
RecommendedLanguage_MEANprofileTable_yearly_compensation,1.3835822,0.4808761,0.0347255


In [94]:
# Stacked ensemble: a GLM metalearner is trained on top of the three level-0
# base models (random forest, GBM and GLM), which are referenced by model ID.
stack_base_ids = [rf_model.model_id, gbm_model.model_id, glm_model.model_id]
stacked_ensemble = H2OStackedEnsembleEstimator(
    seed=42,
    metalearner_algorithm="glm",
    base_models=stack_base_ids,
)

# Fit the ensemble on the training frame, with `val` as the validation frame.
stacked_ensemble.train(training_frame=train, validation_frame=val, x=features, y=target)

stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%


In [95]:
# Compute the stacked ensemble's metrics on the test frame and show them as the cell output.
stacked_perf = stacked_ensemble.model_performance(test)
stacked_perf

In [96]:
# Score the test frame with the stacked ensemble and preview the first rows of predictions.
stacked_predictions = stacked_ensemble.predict(test)
stacked_predictions.head()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict
24.0241
6.88873
15.6187
17.4896
16.7289
4.26075
11.2072
18.3169
0.731514
17.9857


### Voting

In [97]:
# "Voted" ensemble as implemented here: a second H2O stacked ensemble over the same
# three base models. No metalearner_algorithm is passed, so H2O picks its default;
# the model summary printed by this cell reports a cross_validation stacking
# strategy with a GLM metalearner that uses 2 of the 3 base models (DRF: 0/1).
# NOTE(review): this is a learned combination, not an equal-weight vote/average of
# the base models - confirm that this is what the "Voting" section intends.
voting_base_ids = [model.model_id for model in (rf_model, gbm_model, glm_model)]
voted_ensemble = H2OStackedEnsembleEstimator(base_models=voting_base_ids, seed=42)

# Fit the ensemble; the cell ends on train() so the fitted model summary is displayed.
voted_ensemble.train(training_frame=train, validation_frame=val, x=features, y=target)

stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),2/3
# GBM base models (used / total),1/1
# DRF base models (used / total),0/1
# GLM base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,
Custom metalearner hyperparameters,


In [98]:
# Compute the "voted" ensemble's metrics on the test frame and show them as the cell output.
voted_perf = voted_ensemble.model_performance(test)
voted_perf

In [99]:
# Score the test frame with the "voted" ensemble and preview the first rows of predictions.
voted_predictions = voted_ensemble.predict(test)
voted_predictions.head()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict
24.0822
6.88326
15.8439
17.386
16.7153
4.26303
11.3443
18.2658
0.751958
18.024


### Summary
Based on the performance metrics provided for **Bagging**, **Stacking**, and **Voting** models, we can compare the models using several metrics:

### **Key Performance Metrics**:
1. **Mean Squared Error (MSE)**: Lower MSE indicates better performance.
2. **Root Mean Squared Error (RMSE)**: Lower RMSE indicates better performance.
3. **Mean Absolute Error (MAE)**: Lower MAE indicates better performance.
4. **R² (R-squared)**: Higher R² indicates better model performance, as it shows the proportion of variance explained by the model.
5. **AIC**: Lower AIC indicates a better model (penalizes models for overfitting).
6. **Residual Deviance**: Lower residual deviance indicates better model performance.
   
### **Comparison of Models**:

#### **Bagged Model (DRF)**:
- **MSE**: 9.64
- **RMSE**: 3.10
- **MAE**: 2.43
- **R²**: Not shown in the printed DRF test metrics (it can be read with `bagged_perf.r2()`)
- **Residual Deviance**: 9.64

#### **Stacked Ensemble (GLM)**:
- **MSE**: 0.57
- **RMSE**: 0.75
- **MAE**: 0.55
- **R²**: 0.99
- **Residual Deviance**: 0.57
- **AIC**: 2671.23

#### **Voted Ensemble (GLM)**:
- **MSE**: 0.55
- **RMSE**: 0.74
- **MAE**: 0.54
- **R²**: 0.99
- **Residual Deviance**: 0.55
- **AIC**: 2632.42

### **Analysis**:
- **MSE, RMSE, and MAE**: The **Voted Ensemble** has the lowest MSE (0.55), RMSE (0.74), and MAE (0.54), which are all indicators of the best overall performance in terms of prediction error.
  
- **R²**: Both the **Stacked** and **Voted** models have very high R² values (~0.99), indicating that they explain almost all of the variance in the data, suggesting that both these models have excellent predictive power. At the two-decimal precision reported above the two are tied at 0.99, so R² does not separate them.

- **Residual Deviance**: Both **Stacked** and **Voted** models have very low residual deviance (around 0.55–0.57). For each model the reported residual deviance equals its MSE, so this repeats the MSE comparison rather than adding independent evidence.

- **AIC**: The **Voted Ensemble** has the lowest AIC (2632.42), indicating that it has a better balance of model fit and complexity compared to the **Stacked Ensemble** (2671.23). Lower AIC values are preferable.

### **Conclusion**:
- The **Voted Ensemble** model shows the best performance based on most of the metrics, including **MSE**, **RMSE**, **MAE**, and **AIC**; **R²** is tied with the **Stacked Ensemble** at 0.99.
- The **Stacked Ensemble** also performs very well, but it has slightly higher values for **MSE**, **RMSE**, and **AIC** than the **Voted Ensemble**.
- The **Bagged Model (DRF)** performs the worst overall with higher error metrics (MSE, RMSE, MAE), and no R² value is provided for comparison.

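Therefore, the **Voted Ensemble** is the best-performing model among the three, although its margin over the **Stacked Ensemble** is small (MSE 0.55 vs. 0.57) and both are H2O stacked ensembles with a GLM metalearner.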