In [5]:

# In recent years, the demand for machine learning experts has outpaced the supply, despite the surge of people entering the field. To address this gap, there have been big strides in the development of user-friendly machine learning software that can be used by non-experts. The first steps toward simplifying machine learning involved developing simple, unified interfaces to a variety of machine learning algorithms (e.g. H2O).

# H2O’s AutoML can be used for automating the machine learning workflow, which includes automatic training and tuning of many models within a user-specified time limit.

import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O cluster (the captured output shows an already-running
# local instance being reused).
h2o.init()

# Load the sample binary-outcome train/test CSV files into H2O frames
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Response column name; every remaining column is used as a predictor
y = "response"
x = train.columns
x.remove(y)

# Binary classification needs the response encoded as a factor (categorical)
# in both the training and the test frame
test[y] = test[y].asfactor()
train[y] = train[y].asfactor()

# Train AutoML capped at 20 base models, with a fixed seed.
# NOTE(review): the original remark says runs are limited to 1 hour max
# runtime by default -- confirm against the installed H2O version.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# Show the complete AutoML leaderboard; head() alone would stop at 10 rows
lb = aml.leaderboard
lb.head(rows=lb.nrows)


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,20 hours 21 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,3 months and 10 days !!!
H2O_cluster_name:,H2O_from_python_ryans_w53o2h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.289 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |█
14:11:11.709: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████Failed polling AutoML progress log: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\ryans\\AppData\\Local\\Temp\\tmpsbz_nq45.csv'
████| 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_AutoML_20200714_141111,0.787916,0.553829,0.805132,0.324327,0.43328,0.187732
StackedEnsemble_BestOfFamily_AutoML_20200714_141111,0.785161,0.556351,0.802576,0.33263,0.434448,0.188745
GBM_5_AutoML_20200714_141111,0.780862,0.559708,0.799145,0.325399,0.436083,0.190168
GBM_grid__1_AutoML_20200714_141111_model_6,0.779677,0.559886,0.801394,0.328174,0.436728,0.190731
GBM_grid__1_AutoML_20200714_141111_model_3,0.779386,0.562607,0.797915,0.334245,0.437242,0.191181
GBM_1_AutoML_20200714_141111,0.778997,0.56159,0.797276,0.326697,0.437003,0.190971
GBM_2_AutoML_20200714_141111,0.778338,0.561527,0.796497,0.329805,0.437199,0.191143
GBM_grid__1_AutoML_20200714_141111_model_1,0.776443,0.570148,0.792348,0.333251,0.440213,0.193787
GBM_3_AutoML_20200714_141111,0.776389,0.563906,0.794032,0.328065,0.438274,0.192084
GBM_grid__1_AutoML_20200714_141111_model_5,0.773405,0.571628,0.791952,0.328845,0.4411,0.19457




In [6]:

# The best model found by AutoML is available as `aml.leader`.
# NOTE: as a bare expression in the middle of a cell it is evaluated but not
# rendered -- only the last expression of a cell is displayed.
aml.leader


# Two equivalent ways to score the test set: call predict() on the
# `"H2OAutoML"` object itself ...
preds = aml.predict(test)

# ... or on the leader model object. This second call overwrites `preds`.
preds = aml.leader.predict(test)


# Leaderboard again, this time with `extra_columns` = 'ALL'
# (adds training_time_ms and predict_time_per_row_ms, as the output shows)
lb = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
lb


stackedensemble prediction progress: |████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20200714_141111,0.787916,0.553829,0.805132,0.324327,0.43328,0.187732,2943,0.18505
StackedEnsemble_BestOfFamily_AutoML_20200714_141111,0.785161,0.556351,0.802576,0.33263,0.434448,0.188745,1156,0.049995
GBM_5_AutoML_20200714_141111,0.780862,0.559708,0.799145,0.325399,0.436083,0.190168,1266,0.00476
GBM_grid__1_AutoML_20200714_141111_model_6,0.779677,0.559886,0.801394,0.328174,0.436728,0.190731,1389,0.003832
GBM_grid__1_AutoML_20200714_141111_model_3,0.779386,0.562607,0.797915,0.334245,0.437242,0.191181,684,0.004061
GBM_1_AutoML_20200714_141111,0.778997,0.56159,0.797276,0.326697,0.437003,0.190971,1164,0.004689
GBM_2_AutoML_20200714_141111,0.778338,0.561527,0.796497,0.329805,0.437199,0.191143,1063,0.007222
GBM_grid__1_AutoML_20200714_141111_model_1,0.776443,0.570148,0.792348,0.333251,0.440213,0.193787,696,0.011849
GBM_3_AutoML_20200714_141111,0.776389,0.563906,0.794032,0.328065,0.438274,0.192084,1299,0.006564
GBM_grid__1_AutoML_20200714_141111_model_5,0.773405,0.571628,0.791952,0.328845,0.4411,0.19457,751,0.004627




In [4]:

# Data:
# Sample HIGGS train/test CSV files imported from AWS S3 (see the import_file URLs in the first cell).
# Reference: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
