Here is the full Scala code:
// End-to-end H2O AutoML example, meant to be entered in a Scala shell session.
// Expects a SparkContext named `sc` to already be in scope.
import ai.h2o.automl.AutoML
import ai.h2o.automl.AutoMLBuildSpec
import org.apache.spark.h2o._

// Obtain the H2OContext for this Spark application.
// NOTE(review): the recorded run logs a warning that getOrCreate with a SparkContext argument
// is deprecated and that a SparkSession argument is preferred.
val h2oContext = H2OContext.getOrCreate(sc)
import h2oContext._
import java.io.File
import h2oContext.implicits._
import water.Key

// Load the prostate CSV into an H2O frame (the path is machine-specific; adjust as needed).
val prostateData = new H2OFrame(new File("/Users/avkashchauhan/src/github.com/h2oai/sparkling-water/examples/smalldata/prostate.csv"))

// Describe the AutoML run: training data, response column, loss and time budget.
val autoMLBuildSpec = new AutoMLBuildSpec()
// training_frame is typed water.Key[water.fvec.Frame], so assign the frame's key explicitly.
autoMLBuildSpec.input_spec.training_frame = prostateData._key
autoMLBuildSpec.input_spec.response_column = "CAPSULE"
autoMLBuildSpec.build_control.loss = "AUTO"
autoMLBuildSpec.build_control.stopping_criteria.set_max_runtime_secs(5)

import java.util.Date
val aml = AutoML.makeAutoML(Key.make(), new Date(), autoMLBuildSpec)
AutoML.startAutoML(aml)
// Note: in some cases the call above is non-blocking.
// The alternative below blocks the next command until the AutoML action has finished executing.
// NOTE(review): it is started from the build spec rather than from `aml`, so it looks like a
// separate run whose result is not bound to `aml` — confirm which of the two calls you want.
AutoML.startAutoML(autoMLBuildSpec).get() // This is a forced blocking call

// Inspect the results: the leading model and the full leaderboard.
aml.leader
aml.leaderboard
Here is the full output from executing the code in the Scala REPL:
scala> import ai.h2o.automl.AutoML;
import ai.h2o.automl.AutoML
scala> import ai.h2o.automl.AutoMLBuildSpec
import ai.h2o.automl.AutoMLBuildSpec
scala> import org.apache.spark.h2o._
import org.apache.spark.h2o._
scala> val h2oContext = H2OContext.getOrCreate(sc)
17/09/15 20:21:15 WARN H2OContext: Method H2OContext.getOrCreate with an argument of type SparkContext is deprecated and parameter of type SparkSession is preferred.
17/09/15 20:21:15 WARN InternalH2OBackend: Increasing 'spark.locality.wait' to value 30000
17/09/15 20:21:15 WARN InternalH2OBackend: Due to non-deterministic behavior of Spark broadcast-based joins
We recommend to disable them by
configuring `spark.sql.autoBroadcastJoinThreshold` variable to value `-1`:
sqlContext.sql("SET spark.sql.autoBroadcastJoinThreshold=-1")
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 2:
[rdd_0_2]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 5:
[rdd_0_5]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 4:
[rdd_0_4]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 7:
[rdd_0_7]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 3:
[rdd_0_3]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 0:
[rdd_0_0]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 1:
[rdd_0_1]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 6:
[rdd_0_6]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 9:
[rdd_0_9]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 8:
[rdd_0_8]
17/09/15 20:21:16 WARN Executor: 1 block locks were not released by TID = 10:
[rdd_0_10]
09-15 20:21:17.596 10.0.0.46:54323 51356 #r thread INFO: Found XGBoost backend with library: xgboost4j
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Your system supports only minimal version of XGBoost (no GPUs, no multithreading)!
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: ----- H2O started -----
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Build git branch: rel-weierstrass
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Build git hash: 03e64d5c87f1eb7bcad9372bb4a73c4aab4f52d9
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Build git describe: jenkins-3.14.0.1-6-g03e64d5
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Build project version: 3.14.0.2 (latest version: 3.14.0.2)
09-15 20:21:17.614 10.0.0.46:54323 51356 #r thread INFO: Build age: 24 days
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Built by: 'jenkins'
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Built on: '2017-08-21 22:18:30'
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Build git branch: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Build git hash: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Build git describe: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Build project version: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Built by: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Watchdog Built on: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Build git branch: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Build git hash: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Build git describe: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Build project version: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Built by: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: XGBoost Built on: (unknown)
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Processed H2O arguments: [-name, sparkling-water-avkashchauhan_local-1505532002512, -ga_opt_out, -log_level, INFO, -baseport, 54321, -ip, 10.0.0.46, -log_dir, /Volumes/OSxexT/tools/sw2/sparkling-water-2.1.14/h2ologs/local-1505532002512]
09-15 20:21:17.615 10.0.0.46:54323 51356 #r thread INFO: Java availableProcessors: 8
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Java heap totalMemory: 735.5 MB
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Java heap maxMemory: 2.67 GB
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Java version: Java 1.8.0_101 (from Oracle Corporation)
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: JVM launch parameters: [-Dscala.usejavacp=true, -Xmx3G, -XX:MaxPermSize=384m]
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: OS version: Mac OS X 10.12.6 (x86_64)
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Machine physical memory: 16.00 GB
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: X-h2o-cluster-id: 1505532075841
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: User name: 'avkashchauhan'
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Opted out of sending usage metrics.
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: IPv6 stack selected: false
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Network address/interface is not reachable in 150ms: /fe80:0:0:0:3eea:c3b7:1ad4:317b%utun0/name:utun0 (utun0)
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Network address/interface is not reachable in 150ms: /fe80:0:0:0:38e4:bff:febb:63e1%awdl0/name:awdl0 (awdl0)
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: en0 (en0), 2601:646:c401:818d:0:0:0:5101
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: en0 (en0), 2601:646:c401:818d:49b0:aebf:b647:bf93
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: en0 (en0), 2601:646:c401:818d:1c6c:26:e862:7761
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: en0 (en0), fe80:0:0:0:d5:151e:593:4a60%en0
09-15 20:21:17.616 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: en0 (en0), 10.0.0.46
09-15 20:21:17.617 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: lo0 (lo0), fe80:0:0:0:0:0:0:1%lo0
09-15 20:21:17.617 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: lo0 (lo0), 0:0:0:0:0:0:0:1
09-15 20:21:17.617 10.0.0.46:54323 51356 #r thread INFO: Possible IP Address: lo0 (lo0), 127.0.0.1
09-15 20:21:17.617 10.0.0.46:54323 51356 #r thread INFO: H2O node running in unencrypted mode.
09-15 20:21:17.619 10.0.0.46:54323 51356 #r thread INFO: Internal communication uses port: 54324
09-15 20:21:17.619 10.0.0.46:54323 51356 #r thread INFO: Listening for HTTP and REST traffic on http://10.0.0.46:54323/
09-15 20:21:17.653 10.0.0.46:54323 51356 #r thread INFO: H2O cloud name: 'sparkling-water-avkashchauhan_local-1505532002512' on /10.0.0.46:54323, discovery address /235.200.37.90:60360
09-15 20:21:17.654 10.0.0.46:54323 51356 #r thread INFO: If you have trouble connecting, try SSH tunneling from your local machine (e.g., via port 55555):
09-15 20:21:17.654 10.0.0.46:54323 51356 #r thread INFO: 1. Open a terminal and run 'ssh -L 55555:localhost:54323 avkashchauhan@10.0.0.46'
09-15 20:21:17.654 10.0.0.46:54323 51356 #r thread INFO: 2. Point your browser to http://localhost:55555
09-15 20:21:17.793 10.0.0.46:54323 51356 #r thread INFO: Log dir: '/Volumes/OSxexT/tools/sw2/sparkling-water-2.1.14/h2ologs/local-1505532002512'
09-15 20:21:17.793 10.0.0.46:54323 51356 #r thread INFO: Cur dir: '/Volumes/OSxexT/tools/sw2/sparkling-water-2.1.14'
09-15 20:21:17.800 10.0.0.46:54323 51356 #r thread INFO: HDFS subsystem successfully initialized
09-15 20:21:17.806 10.0.0.46:54323 51356 #r thread INFO: S3 subsystem successfully initialized
09-15 20:21:17.806 10.0.0.46:54323 51356 #r thread INFO: Flow dir: '/Users/avkashchauhan/h2oflows'
09-15 20:21:17.817 10.0.0.46:54323 51356 #r thread INFO: Cloud of size 1 formed [/10.0.0.46:54323]
09-15 20:21:17.861 10.0.0.46:54323 51356 #r thread INFO: Registered parsers: [GUESS, ARFF, XLS, SVMLight, AVRO, ORC, PARQUET, CSV]
09-15 20:21:17.861 10.0.0.46:54323 51356 #r thread INFO: Watchdog extension initialized
09-15 20:21:17.861 10.0.0.46:54323 51356 #r thread INFO: XGBoost extension initialized
09-15 20:21:17.861 10.0.0.46:54323 51356 #r thread INFO: Registered 2 core extensions in: 24ms
09-15 20:21:17.862 10.0.0.46:54323 51356 #r thread INFO: Registered H2O core extensions: [Watchdog, XGBoost]
09-15 20:21:18.355 10.0.0.46:54323 51356 #r thread INFO: Registered: 160 REST APIs in: 493ms
09-15 20:21:18.355 10.0.0.46:54323 51356 #r thread INFO: Registered REST API extensions: [XGBoost, Algos, AutoML, Core V3, Core V4]
09-15 20:21:18.448 10.0.0.46:54323 51356 #r thread INFO: Registered: 244 schemas in 93ms
09-15 20:21:18.449 10.0.0.46:54323 51356 #r thread INFO: H2O started in 1848ms
09-15 20:21:18.449 10.0.0.46:54323 51356 #r thread INFO:
09-15 20:21:18.449 10.0.0.46:54323 51356 #r thread INFO: Open H2O Flow in your web browser: http://10.0.0.46:54323
09-15 20:21:18.449 10.0.0.46:54323 51356 #r thread INFO:
09-15 20:21:20.541 10.0.0.46:54323 51356 main TRACE: H2OContext initialized
h2oContext: org.apache.spark.h2o.H2OContext =
Sparkling Water Context:
* H2O name: sparkling-water-avkashchauhan_local-1505532002512
* cluster size: 1
* list of used nodes:
(executorId, host, port)
------------------------
(driver,10.0.0.46,54323)
------------------------
Open H2O Flow in browser: http://10.0.0.46:54323 (CMD + click in Mac OSX)
scala> import h2oContext._
import h2oContext._
scala> import java.io.File
import java.io.File
scala> import h2oContext.implicits._
import h2oContext.implicits._
scala> import water.Key
import water.Key
scala> val prostateData = new H2OFrame(new File("/Users/avkashchauhan/src/github.com/h2oai/sparkling-water/examples/smalldata/prostate.csv"))
09-15 20:22:43.228 10.0.0.46:54323 51356 main INFO: Locking cloud to new members, because water.fvec.NFSFileVec
09-15 20:22:43.464 10.0.0.46:54323 51356 main INFO: ParseSetup heuristic: cloudSize: 1, cores: 8, numCols: 9, maxLineLength: 28, totalSize: 9254, localParseSize: 9254, chunkSize: 4194304, numChunks: 1, numChunks * cols: 9
09-15 20:22:43.466 10.0.0.46:54323 51356 main INFO: Total file size: 9.0 KB
09-15 20:22:43.484 10.0.0.46:54323 51356 main INFO: Parse chunk size 4194304
09-15 20:22:43.585 10.0.0.46:54323 51356 FJ-1-15 INFO: Parse result for prostate.hex (380 rows):
09-15 20:22:43.598 10.0.0.46:54323 51356 FJ-1-15 INFO: ColV2 type min max mean sigma NAs constant cardinality
09-15 20:22:43.598 10.0.0.46:54323 51356 FJ-1-15 INFO: ID: numeric 1.00000 380.000 190.500 109.841
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: CAPSULE: numeric 0.00000 1.00000 0.402632 0.491074
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: AGE: numeric 43.0000 79.0000 66.0395 6.52707
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: RACE: numeric 0.00000 2.00000 1.08684 0.308773
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: DPROS: numeric 1.00000 4.00000 2.27105 1.00011
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: DCAPS: numeric 1.00000 2.00000 1.10789 0.310656
09-15 20:22:43.599 10.0.0.46:54323 51356 FJ-1-15 INFO: PSA: numeric 0.300000 139.700 15.4086 19.9976
09-15 20:22:43.600 10.0.0.46:54323 51356 FJ-1-15 INFO: VOL: numeric 0.00000 97.6000 15.8129 18.3476
09-15 20:22:43.600 10.0.0.46:54323 51356 FJ-1-15 INFO: GLEASON: numeric 0.00000 9.00000 6.38421 1.09195
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: Chunk compression summary:
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: Chunk Type Chunk Name Count Count Percentage Size Size Percentage
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: CBS Binary 1 11.111 % 118 B 2.421 %
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: C1N 1-Byte Integers (w/o NAs) 5 55.556 % 2.2 KB 45.958 %
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: C2 2-Byte Integers 1 11.111 % 828 B 16.988 %
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: C2S 2-Byte Fractions 2 22.222 % 1.6 KB 34.633 %
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: Frame distribution summary:
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: Size Number of Rows Number of Chunks per Column Number of Chunks
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: 10.0.0.46:54323 4.8 KB 380 1 9
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: mean 4.8 KB 380.000000 1.000000 9.000000
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: min 4.8 KB 380.000000 1.000000 9.000000
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: max 4.8 KB 380.000000 1.000000 9.000000
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: stddev 0 B 0.000000 0.000000 0.000000
09-15 20:22:43.612 10.0.0.46:54323 51356 FJ-1-15 INFO: total 4.8 KB 380 1 9
prostateData: water.fvec.H2OFrame =
Frame key: prostate.hex
cols: 9
rows: 380
chunks: 1
size: 4874
scala> val autoMLBuildSpec = new AutoMLBuildSpec()
autoMLBuildSpec: ai.h2o.automl.AutoMLBuildSpec = ai.h2o.automl.AutoMLBuildSpec@5cfadadc
scala> autoMLBuildSpec.input_spec.training_frame = prostateData.
VecSelector anyVec clone domains hasNAs isVec makeSchema name readExternal rename toCSV uniquify vec
VecTransformation apply colToEnum extractFrame hashCode key means names readJSON replace toJsonString unlock vecs
_kb asBytes compareTo find home keys modes numCols read_lock restructure toString unlock_all write
_key bulkRollups deepCopy frozenType home_node keysList moveFirst numRows reloadFromBytes setNames toTwoDimTable update writeAll
_lockers byteSize deepSlice get insertVec lastVec mults postWrite reloadVecs sort type user_allowed writeExternal
_names cardinality delete getVecKey isChunkKey lastVecName naCount prepend remove subframe types valueClass writeJSON
add checksum delete_and_lock hasInfs isCompatible makeCompatible naFraction read removeAll swap typesStr valueClassSimple write_lock
scala> autoMLBuildSpec.input_spec.training_frame = prostateData._key
autoMLBuildSpec.input_spec.training_frame: water.Key[water.fvec.Frame] = prostate.hex
scala> autoMLBuildSpec.input_spec.response_column = "CAPSULE";
autoMLBuildSpec.input_spec.response_column: String = CAPSULE
scala> autoMLBuildSpec.build_control.loss = "AUTO"
autoMLBuildSpec.build_control.loss: String = AUTO
scala> autoMLBuildSpec.build_control.stopping_criteria.set_max_runtime_secs(5)
scala> import java.util.Date;
import java.util.Date
scala> val aml = AutoML.makeAutoML(Key.make(), new Date(), autoMLBuildSpec)
aml: ai.h2o.automl.AutoML = ai.h2o.automl.AutoML@3ad9012f
scala> AutoML.startAutoML(aml)
..............
...
... This will start the AML process which will take some time to finish
...
..............
scala> aml.leader
warning: there was one feature warning; re-run with -feature for details
res4: hex.Model[?0,?1,?2] forSome { type ?0 <: hex.Model[?0,?1,?2]; type ?1 <: hex.Model.Parameters; type ?2 <: hex.Model.Output } =
Model Metrics Type: RegressionGLM
Description: N/A
model id: GLM_grid_0_AutoML_20170915_202514_model_1
frame id: automl_training_prostate.hex
MSE: 0.16562025
RMSE: 0.4069647
mean residual deviance: 0.16562025
mean absolute error: 0.35771698
root mean squared log error: 0.28812236
null DOF: 265.0
residual DOF: 257.0
null deviance: 64.8421
residual deviance: 44.05499
AIC: 296.59195
Model Metrics Type: RegressionGLM
Description: N/A
model id: GLM_grid_0_AutoML_20170915_202514_model_1
frame id: automl_validation_prostate.hex
MSE: 0.21967092
RMSE: 0.46869063
mean residual deviance: 0.21967092
mean absolute error: 0.4152546
root mean squared...
scala> aml.leader
leader leaderboard
scala> aml.leaderboard
res5: ai.h2o.automl.Leaderboard = Leaderboard for project_name "automl_prostate": | model_id ; [Ljava.lang.String;@5dde2d12 | GLM_grid_0_AutoML_20170915_202514_model_1 ; [D@71c8e40e | GLM_grid_0_AutoML_20170915_202514_model_0 ; [D@65a44d8a | StackedEnsemble_0_AutoML_20170915_202514 ; [D@5e24311b | DRF_0_AutoML_20170915_202514 ; [D@72be2170 | XRT_0_AutoML_20170915_202514 ; [D@446043bd | GBM_grid_0_AutoML_20170915_202514_model_0 ; [D@36f8ea6 |