In [1]:
import graphviz
import h2o
import matplotlib.pyplot as plt
import pandas as pd
from h2o.automl import H2OAutoML
from h2o.tree import H2OTree
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [3]:
# Initialize H2O (checks for a running instance at the default local address
# and connects to it; every H2OFrame / AutoML call below goes through this session)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,18 mins 36 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_User_wqc811
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.370 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
# Load the pre-split feature / target CSVs (train, validation, test).
# The unpacking order must match the order of the paths below.
X_train, y_train, X_val, y_val, X_test, y_test = map(
    pd.read_csv,
    (
        "../04_modelling/dataset/X_train.csv",
        "../04_modelling/dataset/y_train.csv",
        "../04_modelling/dataset/X_val.csv",
        "../04_modelling/dataset/y_val.csv",
        "../04_modelling/dataset/X_test.csv",
        "../04_modelling/dataset/y_test.csv",
    ),
)

In [5]:
# H2O AutoML expects predictors and target in a single table, so glue each
# target frame onto its feature frame column-wise (one table per split).
train_df, val_df, test_df = (
    pd.concat([feature_part, target_part], axis=1)
    for feature_part, target_part in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)


In [6]:
# Upload each pandas table to the H2O cluster as an H2OFrame
# (train first, then validation, then test).
train_h2o, val_h2o, test_h2o = [
    h2o.H2OFrame(split_df) for split_df in (train_df, val_df, test_df)
]


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
# Name of the response column; every other column of the training frame
# is used as a predictor.
target = "yearly_compensation"
features = [column for column in train_h2o.columns if column != target]

In [8]:
# AutoML search settings: at most 10 models, fixed seed, tree-based
# algorithms only (gradient boosting and random forest).
automl_settings = {
    "max_models": 10,
    "seed": 42,
    "include_algos": ["GBM", "DRF"],
}
aml = H2OAutoML(**automl_settings)

# Kept as the cell's last expression so the notebook still displays the
# summary that train() returns.
aml.train(training_frame=train_h2o, y=target, x=features)


AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,273.0,273.0,126880.0,7.0,15.0,13.179487,20.0,36.0,32.06227

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,1.3748565,0.0882278,1.3866962,1.3488008,1.3445948,1.5162853,1.277905
mean_residual_deviance,3.521788,0.4191825,3.5632846,3.4695508,3.3629441,4.180511,3.032648
mse,3.521788,0.4191825,3.5632846,3.4695508,3.3629441,4.180511,3.032648
r2,0.9386179,0.0052585,0.9364745,0.9393604,0.9400091,0.9314282,0.9458174
residual_deviance,3.521788,0.4191825,3.5632846,3.4695508,3.3629441,4.180511,3.032648
rmse,1.8740505,0.1102412,1.8876665,1.862673,1.8338332,2.0446298,1.74145
rmsle,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2024-12-30 23:18:05,15.579 sec,0.0,7.5755655,6.8359983,57.3891923
,2024-12-30 23:18:05,15.644 sec,5.0,5.8810450,5.2317359,34.5866905
,2024-12-30 23:18:05,15.714 sec,10.0,4.7589262,4.1257682,22.6473786
,2024-12-30 23:18:05,15.769 sec,15.0,4.0623955,3.4352620,16.5030572
,2024-12-30 23:18:05,15.823 sec,20.0,3.5096071,2.8883484,12.3173420
,2024-12-30 23:18:05,15.879 sec,25.0,3.1191230,2.4962184,9.7289282
,2024-12-30 23:18:05,15.943 sec,30.0,2.8152226,2.2043730,7.9254783
,2024-12-30 23:18:05,16.007 sec,35.0,2.5719044,1.9738716,6.6146923
,2024-12-30 23:18:05,16.063 sec,40.0,2.3641333,1.7950226,5.5891265
,2024-12-30 23:18:05,16.122 sec,45.0,2.1935377,1.6504052,4.8116076

variable,relative_importance,scaled_importance,percentage
job_title_Data_Engineer,331056.4062500,1.0,0.2346578
job_title_Product/Project_Manager,186515.5312500,0.5633950,0.1322050
country_Ukraine,146831.1562500,0.4435231,0.1040761
used_tpu,116450.7187500,0.3517549,0.0825420
ml_spending,108790.6640625,0.3286167,0.0771125
country_United_States_of_America,65088.1367188,0.1966074,0.0461355
country_SUMprofileTable_yearly_compensation,54645.1328125,0.1650629,0.0387333
Total_Experience,49943.1445312,0.1508599,0.0354005
country_Republic_of_Korea,49515.4570312,0.1495680,0.0350973
country_Thailand,49257.8046875,0.1487898,0.0349147


In [12]:
# Print a heading followed by the ranked table of models AutoML trained
print("H2O AutoML Leaderboard:", aml.leaderboard, sep="\n")


H2O AutoML Leaderboard:
model_id                                        rmse       mse      mae       rmsle    mean_residual_deviance
GBM_1_AutoML_1_20241230_230531               1.87667   3.52188  1.37487  nan                          3.52188
GBM_grid_1_AutoML_1_20241230_230531_model_1  2.14408   4.59709  1.56718  nan                          4.59709
GBM_2_AutoML_1_20241230_230531               2.19528   4.81924  1.59312  nan                          4.81924
GBM_5_AutoML_1_20241230_230531               2.219     4.92397  1.60779  nan                          4.92397
GBM_grid_1_AutoML_1_20241230_230531_model_2  2.22596   4.95491  1.57824  nan                          4.95491
GBM_3_AutoML_1_20241230_230531               2.25622   5.09054  1.64942  nan                          5.09054
GBM_4_AutoML_1_20241230_230531               2.34223   5.48606  1.71535  nan                          5.48606
DRF_1_AutoML_1_20241230_230531               3.1552    9.95527  2.46362    0.580605             

In [13]:
# Extract the best model and, if it is a random forest (DRF), show its first tree
model = aml.get_best_model()
# Check the estimator's algorithm name rather than searching the model id:
# AutoML ids are upper-case ("DRF_1_AutoML_..."), so a lower-case substring
# test on the id never matches.
if model.algo == "drf":
    # H2OTree fetches the structure of a single tree from a tree-based model;
    # tree_number is zero-based, so 0 is the first tree that was built.
    tree = H2OTree(model=model, tree_number=0)
    print("Decision Tree Details:")
    print(tree)


In [14]:
# Baseline scikit-learn decision tree: depth capped at 3, Gini impurity,
# fixed seed so the fit is repeatable.
baseline_tree_params = {
    "criterion": "gini",
    "max_depth": 3,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**baseline_tree_params)

# Kept as the last expression so the fitted estimator is displayed by the cell
clf.fit(X_train, y_train)


In [None]:
# Export the decision tree with Graphviz (currently disabled; uncomment the lines below to render it)
# dot_data = export_graphviz(clf, out_file=None, feature_names=X_train.columns, filled=True, rounded=True, special_characters=True)
# graph = graphviz.Source(dot_data)
# graph.render("DecisionTree")

In [15]:
# Model evaluation: per-class precision / recall / F1 of `clf` on the test split
print("Evaluation on Test Set:")
y_pred = clf.predict(X_test)
print("Classification Report:")
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those cells as 0.0 explicitly instead of emitting
# an undefined-metric warning for every averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))

Evaluation on Test Set:
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.62      0.57       122
           1       0.00      0.00      0.00        49
           2       0.96      0.54      0.69        79
           3       0.28      0.91      0.42        74
           4       0.00      0.00      0.00        49
           5       0.35      0.53      0.42        51
           6       0.00      0.00      0.00        45
           7       0.21      0.42      0.28        36
           8       0.00      0.00      0.00        52
           9       0.00      0.00      0.00        14
          10       0.26      0.45      0.33        47
          11       0.00      0.00      0.00         6
          12       0.00      0.00      0.00        26
          13       0.32      0.79      0.45        73
          14       0.00      0.00      0.00         7
          15       0.00      0.00      0.00        27
          16       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Rows are true classes, columns are predicted classes (test split)
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", test_confusion, sep="\n")

Confusion Matrix:
[[76  0  0  1  0  0  0 13  0  0  0  0  0  2  0  0  0  0  0  0  0 30  0  0
   0]
 [26  0  0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0  0  0 17  0  0
   0]
 [ 1  0 43  0  0 26  0  0  0  0  3  0  0  6  0  0  0  0  0  0  0  0  0  0
   0]
 [ 4  0  0 67  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0
   0]
 [ 6  0  0 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0
   0]
 [ 2  0  2  0  0 27  0  0  0  0 18  0  0  2  0  0  0  0  0  0  0  0  0  0
   0]
 [ 2  0  0 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 27  0  0
   0]
 [ 1  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0 20  0  0
   0]
 [ 0  0  0  0  0 15  0  0  0  0 23  0  0 14  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0  0
   0]
 [ 0  0  0  0  0  6  0  0  0  0 21  0  0 20  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0
   0]
 [ 0  0  0  0  0  0  0

In [10]:
from bayes_opt import BayesianOptimization
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the function to optimize
def objective(max_depth, min_samples_split, min_samples_leaf):
    """Score one decision-tree configuration for the Bayesian optimiser.

    The three arguments arrive as floats, so each one is truncated with
    ``int()`` before it is handed to ``DecisionTreeClassifier``.

    Returns:
        Mean accuracy over 10-fold cross-validation on the notebook-level
        ``X_train`` / ``y_train``.
    """
    tree_kwargs = {
        "max_depth": int(max_depth),
        "min_samples_split": int(min_samples_split),
        "min_samples_leaf": int(min_samples_leaf),
    }

    # Fixed seed so the same configuration always yields the same score
    candidate = DecisionTreeClassifier(random_state=42, **tree_kwargs)

    fold_scores = cross_val_score(
        candidate, X_train, y_train, scoring='accuracy', cv=10
    )
    return np.mean(fold_scores)

# Search ranges for each hyperparameter (continuous; objective() truncates
# the sampled values to integers)
param_bounds = dict(
    max_depth=(10, 20),
    min_samples_split=(30, 50),
    min_samples_leaf=(10, 20),
)

# Maximise cross-validated accuracy: 15 initial points, then 30 iterations
optimizer = BayesianOptimization(f=objective, pbounds=param_bounds, random_state=42)
optimizer.maximize(n_iter=30, init_points=15)

# Best score found together with the parameters that produced it
print("Best parameters:", optimizer.max)

# Train with best parameters
best_params = optimizer.max['params']
# The optimiser reports floats; truncate with int() exactly as objective()
# does so the final tree uses the configuration that was actually scored.
clf = DecisionTreeClassifier(
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    random_state=42
)
clf.fit(X_train, y_train)

# Accuracy of the tuned tree on the held-out test split
accuracy = clf.score(X_test, y_test)
# Format the value inside an f-string: bare braces around the variable would
# build a one-element set and print it as "{...}".
print(f"Test set accuracy: {accuracy}")

|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.436    [39m | [39m13.75    [39m | [39m19.51    [39m | [39m44.64    [39m |
| [39m2        [39m | [39m0.4319   [39m | [39m15.99    [39m | [39m11.56    [39m | [39m33.12    [39m |
| [39m3        [39m | [39m0.4294   [39m | [39m10.58    [39m | [39m18.66    [39m | [39m42.02    [39m |
| [39m4        [39m | [39m0.432    [39m | [39m17.08    [39m | [39m10.21    [39m | [39m49.4     [39m |
| [39m5        [39m | [39m0.4308   [39m | [39m18.32    [39m | [39m12.12    [39m | [39m33.64    [39m |
| [39m6        [39m | [39m0.4332   [39m | [39m11.83    [39m | [39m13.04    [39m | [39m40.5     [39m |
| [39m7        [39m | [39m0.4318   [39m | [39m14.32    [39m | [39m12.91    [39m | [39m42.24    [39m |
| [39m8        [39m | [39m0.431    [39m | [39m11.39    [39m | [39m12.92    [39m | [