In [1]:
# imports
import h2o
from h2o.automl import H2OAutoML, get_leaderboard
import mlflow
import mlflow.h2o
from mlflow.tracking import MlflowClient

In [2]:
# Connect to an H2O instance, starting a local server if none is already
# running. All later h2o calls in this notebook (file import, AutoML
# training, prediction) go through this connection.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_144"; Java(TM) SE Runtime Environment (build 1.8.0_144-b01); Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)
  Starting server from /usr/local/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/s_/30r12bzn3k7dn150jq07b8m80000gn/T/tmpx31rdfev
  JVM stdout: /var/folders/s_/30r12bzn3k7dn150jq07b8m80000gn/T/tmpx31rdfev/h2o_pax_data_started_from_python.out
  JVM stderr: /var/folders/s_/30r12bzn3k7dn150jq07b8m80000gn/T/tmpx31rdfev/h2o_pax_data_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,"14 days, 19 hours and 21 minutes"
H2O cluster name:,H2O_from_python_pax_data_ygnyfi
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0


In [3]:
# Notebook configuration.

# Predictor columns passed to AutoML as `x`.
# NOTE(review): the C1..C5 names presumably come from a header-less iris.csv
# being auto-named on import -- confirm against the data file.
x_cols = [
    'C1',
    'C2',
    'C3',
    'C4',
]

# Response column passed to AutoML as `y`.
y_cols = 'C5'

# Name of the MLflow experiment that the training runs are recorded under.
experiment_name = 'automl_mlflow'

In [4]:
# Tracking client used to look up the experiment and, later, its runs.
client = MlflowClient()

# set_experiment() activates the named experiment and creates it first if it
# does not exist yet, so no create/except dance is needed.
mlflow.set_experiment(experiment_name)

# Always fetch the Experiment entity afterwards. mlflow.create_experiment()
# returns only the experiment ID string, so keeping its return value would
# leave `experiment` as a str on the first run and as an Experiment on later
# runs, breaking `experiment.experiment_id` downstream.
experiment = client.get_experiment_by_name(experiment_name)

In [5]:
# Load the dataset into the H2O cluster and split it into training and
# validation frames. split_frame assigns rows at random, so the 0.7 ratio is
# approximate; the seed (same value as the AutoML seed used for training)
# makes the split repeatable across Restart & Run All.
iris = h2o.import_file('iris.csv')
train, valid = iris.split_frame(ratios=[0.7], seed=24)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
# Train an AutoML ensemble of candidates inside a single MLflow run, then
# record the leader's metrics and the leader model itself as run artifacts.
with mlflow.start_run():
    # Up to 10 models or 300 seconds, whichever limit is hit first, with
    # 6-fold cross-validation. Because cross-validation is enabled, H2O uses
    # the validation frame only for informative metrics (see the warning
    # printed during training).
    model = H2OAutoML(max_models=10, max_runtime_secs=300, seed=24, nfolds=6)
    model.train(x=x_cols, y=y_cols, training_frame=train, validation_frame=valid)

    # NOTE(review): these accessors are called without train/valid/xval
    # flags, which per the H2O docs yields training-set metrics -- confirm
    # that is intended rather than the cross-validated leaderboard values.
    leader = model.leader
    mlflow.log_metric("rmse", leader.rmse())
    mlflow.log_metric("log_loss", leader.logloss())
    mlflow.log_metric("mean_per_class_error", leader.mean_per_class_error())

    # Persist the best model under the run's "model" artifact path.
    mlflow.h2o.log_model(leader, "model")

    # Full leaderboard including the extra columns (training time,
    # per-row prediction time); print every row.
    lb = get_leaderboard(model, extra_columns='ALL')
    print(lb.head(rows=lb.nrows))

AutoML progress: |
14:23:49.708: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

██████████
14:24:00.811: Skipping training of model GBM_5_AutoML_20200204_142349 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200204_142349.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 108.0.


██████████████████████████████████████████████| 100%


model_id,mean_per_class_error,logloss,rmse,mse,training_time_ms,predict_time_per_row_ms
GLM_1_AutoML_20200204_142349,0.03465,0.0906292,0.166896,0.0278542,298,0.006844
XGBoost_3_AutoML_20200204_142349,0.0526681,0.234201,0.23652,0.0559418,96,0.008694
XGBoost_1_AutoML_20200204_142349,0.0526681,0.25637,0.247727,0.0613685,221,0.007991
GBM_4_AutoML_20200204_142349,0.0526681,0.196108,0.223568,0.0499824,54,0.014902
DRF_1_AutoML_20200204_142349,0.0526681,0.131968,0.201344,0.0405392,56,0.01741
DeepLearning_1_AutoML_20200204_142349,0.0535921,0.251908,0.261355,0.0683066,1163,0.011231
GBM_1_AutoML_20200204_142349,0.0612151,0.203541,0.224734,0.0505053,125,0.030553
GBM_3_AutoML_20200204_142349,0.0616771,0.205754,0.233191,0.0543783,59,0.022955
GBM_2_AutoML_20200204_142349,0.0616771,0.193319,0.223642,0.0500158,55,0.015614
XGBoost_2_AutoML_20200204_142349,0.0868561,0.549568,0.423578,0.179418,94,0.010254





In [12]:
# Reload the model logged by the most recent run of the experiment and score
# the validation frame with it.

# Resolve the experiment by name here so this cell does not depend on what
# type the earlier `experiment` variable happens to hold.
tracked_experiment = client.get_experiment_by_name(experiment_name)
if tracked_experiment is None:
    raise RuntimeError('Run the training first')

# Ask the tracking server for the newest run explicitly instead of relying on
# the position of an element in an unspecified-order list.
latest_runs = client.search_runs(
    experiment_ids=[tracked_experiment.experiment_id],
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if not latest_runs:
    raise RuntimeError('Run the training first')

run_info = latest_runs[0].info

# A runs:/ URI lets MLflow resolve the artifact location itself, so this works
# for any tracking/artifact store rather than only a local ./mlruns directory.
model = mlflow.h2o.load_model("runs:/{run_id}/model".format(run_id=run_info.run_id))
result = model.predict(valid)

glm prediction progress: |████████████████████████████████████████████████| 100%
