In [1]:
###: Loading important libraries 

In [2]:
from __future__ import print_function

import os

import h2o

In [3]:
###: Loading H2O Ensemble, Grid Search, GBM, Random Forest and Deep Learning specific libraries

In [4]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

In [5]:
###: Initializing H2O

In [6]:
# Connect to an H2O cluster; the cell output reports which instance was reached and its status
h2o.init()

ERROR:h2o:Key init.version_check is not a valid config key


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 mins 50 secs
H2O cluster version:,3.14.0.7
H2O cluster version age:,29 days
H2O cluster name:,H2O_from_python_avkashchauhan_q7n20e
H2O cluster total nodes:,1
H2O cluster free memory:,3.413 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [7]:
###: Importing training and test data
###: Local files. DATA_DIR defaults to the original author's checkout and can be
###: overridden with the HOUSE_PRICE_DATA_DIR environment variable, so the notebook
###: can run on another machine without editing this cell.
DATA_DIR = os.environ.get("HOUSE_PRICE_DATA_DIR",
                          "/Users/avkashchauhan/src/github.com/avkash/mldl/data")
train = h2o.import_file(os.path.join(DATA_DIR, "house_price_train.csv"))
test = h2o.import_file(os.path.join(DATA_DIR, "house_price_test.csv"))

###: From HTTP
#train = h2o.import_file("__")
#test = h2o.import_file("__")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
###: Understanding the size of training and test dataset

In [9]:
# Each shape prints as (rows, columns): training frame first, then test frame
print(train.shape)
print(test.shape)

(407, 14)
(99, 14)


In [10]:
###: Understanding the training dataset

In [11]:
# Per-column summary (type, min, mean, max, sigma, zero and missing counts) followed by the first rows
train.describe()

Rows:407
Cols:14




Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
type,real,real,real,int,real,real,real,real,int,int,real,real,real,real
mins,0.00906,0.0,0.46,0.0,0.385,3.863,2.9,1.1296,1.0,188.0,12.6,0.32,1.73,5.0
mean,3.56736771499,10.5872235872,11.4092628993,0.0687960687961,0.556763882064,6.28658230958,69.3889434889,3.71767051597,9.83783783784,412.378378378,18.4474201474,354.403218673,12.792039312,22.6248157248
maxs,73.5341,100.0,27.74,1.0,0.871,8.725,100.0,10.7103,24.0,711.0,22.0,396.9,37.97,50.0
sigma,7.94798961655,22.2597824603,6.8144822924,0.253418548596,0.115557348898,0.690897322921,27.8179045475,2.01524090631,8.78440418525,170.44740987,2.16176742548,94.175205015,7.09869539936,9.18501930903
zeros,0,301,0,379,0,0,0,0,0,0,0,0,0,0
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4


In [12]:
###: Listing columns in training dataset

In [13]:
# Column names of the training frame; the list still includes the response column
train.col_names

[u'crim',
 u'zn',
 u'indus',
 u'chas',
 u'nox',
 u'rm',
 u'age',
 u'dis',
 u'rad',
 u'tax',
 u'ptratio',
 u'b',
 u'lstat',
 u'medv']

In [14]:
###: Setting response column to "medv" <- Median house value

In [15]:
# Name of the target column; it is removed from the feature list and passed as `y` when training
response = "medv"

In [16]:
###: Setting all the columns as features which will be used in training

In [17]:
# Start from the frame's full column list and drop the response from it in place.
# list.remove raises ValueError if `response` is not one of the columns.
features = train.columns
features.remove(response)
print(features)

[u'crim', u'zn', u'indus', u'chas', u'nox', u'rm', u'age', u'dis', u'rad', u'tax', u'ptratio', u'b', u'lstat']


In [18]:
###: We will be using cross-validation in training, so we set the number of CV folds

In [19]:
# Shared by every base model and grid below: stacking needs all base models to use the same CV folds
nfolds = 5

In [20]:
###: You can create ensemble models in a few ways:
###: 1. First Train individual models and pass as list to ensemble
###: 2. Train a grid of models
###: 3. Train several grids of models

###: Note: All base models must have the same cross-validation folds and
###: the cross-validated predicted values must be kept using keep_cross_validation_predictions set to TRUE.

In [21]:
###: Ensemble 1. 
###: First Train individual models and pass as list to ensemble
###: We will generate a 3-model ensemble using GBM + RF + Deep Learning model types

In [22]:
###: Training and cross-validating a GBM Regression Model and R2 metrics will be used as main metric

In [23]:
# GBM base learner for the 3-model ensemble.
house_price_gbm = H2OGradientBoostingEstimator(
    model_id="House_Price_GBM_Regression_Model",
    # Boosting / tree settings
    distribution="AUTO",
    ntrees=10,
    max_depth=3,
    min_rows=2,
    learn_rate=0.2,
    # Cross-validation settings: identical across the base models, with the
    # cross-validated predictions kept, as the stacking note above requires
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    # Fixed seed
    seed=1,
)
house_price_gbm.train(training_frame=train, x=features, y=response)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [24]:
###: Training and cross-validating a Random Forest Regression Model and R2 metrics will be used as main metric

In [25]:
# Random Forest base learner for the 3-model ensemble.
house_price_rf = H2ORandomForestEstimator(
    model_id="House_Price_RF_Regression_Model",
    ntrees=10,
    # Cross-validation settings: identical across the base models, with the
    # cross-validated predictions kept, as the stacking note above requires
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    # Fixed seed
    seed=1,
)
house_price_rf.train(training_frame=train, x=features, y=response)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [26]:
###: Training and cross-validating a Deep Learning Regression Model and R2 metrics will be used as main metric

In [27]:
# Deep Learning base learner for the 3-model ensemble.
house_price_dl = H2ODeepLearningEstimator(
    model_id="House_Price_DL_Regression_Model",
    # Network and optimisation settings
    hidden=[10, 10],
    activation="Rectifier",
    epochs=25,
    adaptive_rate=False,
    train_samples_per_iteration=10,
    score_training_samples=20,
    # Cross-validation settings: identical across the base models, with the
    # cross-validated predictions kept, as the stacking note above requires
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    # Fixed seed
    seed=1,
)
house_price_dl.train(training_frame=train, x=features, y=response)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [28]:
###: Getting Model ID of each model we have built

In [29]:
# Print the id of each base model (GBM, RF, DL); these ids are what the ensemble receives as base_models
print(house_price_gbm.model_id)
print(house_price_rf.model_id)
print(house_price_dl.model_id)

House_Price_GBM_Regression_Model
House_Price_RF_Regression_Model
House_Price_DL_Regression_Model


In [30]:
###: Getting Model performance based cross validation using R2 metric for each model

In [31]:
# Cross-validated R2 (xval=True) of each base model, printed in GBM, RF, DL order
print(house_price_gbm.model_performance(xval=True).r2())
print(house_price_rf.model_performance(xval=True).r2())
print(house_price_dl.model_performance(xval=True).r2())

0.86590631245
0.875860965039
0.830021328457


In [32]:
###: Generating an ensemble model using all 3 previously created GBM, RF and DL models

In [33]:
# Train a stacked ensemble on top of the GBM, RF and Deep Learning models above
ensemble = H2OStackedEnsembleEstimator(
    model_id="house_price_ensemble_model",
    base_models=[
        house_price_gbm.model_id,
        house_price_rf.model_id,
        house_price_dl.model_id,
    ],
)
ensemble.train(training_frame=train, x=features, y=response)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [34]:
###: Evaluating ensemble performance using test data

In [35]:
# Score the 3-model ensemble on the test frame; its R2 is read from this object in a later cell
perf_stack_test = ensemble.model_performance(test)

In [36]:
###: Comparing Base Learner performance using test set for GBM, RF and Deep Learning Model:

In [37]:
# Score every base learner on the test frame
perf_gbm_test = house_price_gbm.model_performance(test)
perf_rf_test = house_price_rf.model_performance(test)
perf_dl_test = house_price_dl.model_performance(test)

# The strongest single model is the bar the ensemble has to beat
baselearner_best_r2_test = max(
    perf.r2() for perf in (perf_gbm_test, perf_rf_test, perf_dl_test)
)
print("Best Base-learner Test R2 (R^2):  {0}".format(baselearner_best_r2_test))

Best Base-learner Test R2 (R^2):  0.810986942322


In [38]:
###: Getting Stack Ensemble Models performance using test set:

In [39]:
# R2 of the stacked ensemble on the test frame, read from the performance object computed earlier
stack_r2_test = perf_stack_test.r2()
print("Ensemble Test R2 (R^2):  {0}".format(stack_r2_test))

Ensemble Test R2 (R^2):  0.828352360218


In [40]:
###: Printing original model performance on the test data for comparison

In [41]:
# Test-set R2 of each individual base model, for comparison with the ensemble.
# The value goes through str.format so it replaces the {0} placeholder; passing it
# as a second print() argument left a literal "{0}" in the output.
print("Original GBM Model Test R2 (r^2) {0}".format(perf_gbm_test.r2()))
print("Original RF Model Test R2 (r^2) {0}".format(perf_rf_test.r2()))
print("Original Deep Learning Model Test R2 (r^2) {0}".format(perf_dl_test.r2()))

Original GBM Model Test R2 (r^2) {0} 0.784615907643
Original RF Model Test R2 (r^2) {0} 0.810986942322
Original Deep Learning Model Test R2 (r^2) {0} 0.778774237379


In [42]:
###: Stack Ensemble model is like any other model in H2O 
###: So you sure can perform predictions using test set as needed

In [43]:
# Predictions of the 3-model stacked ensemble for the test frame
pred = ensemble.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [44]:
###: Ensemble 2. 
###: First, we will generate a random grid of models using GBM as the main algorithm
###: Secondly, we will stack all of the grid models together

In [45]:
###: Let's specify GBM hyperparameters for the grid search

In [46]:
# Candidate values per GBM hyperparameter: 6 x 3 x 3 x 4 = 216 possible combinations
gbm_hyper_params = dict(
    learn_rate=[0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
    max_depth=[5, 7, 10],
    sample_rate=[0.5, 0.75, 1.0],
    col_sample_rate=[0.5, 0.6, 0.7, 0.8],
)

In [47]:
###: Now we will setup the Grid Search criteria and other parameter to fine tune it:

In [48]:
# Random search over the hyperparameter combinations, limited to 100 models, with a fixed seed
grid_search_criteria = dict(
    strategy="RandomDiscrete",
    max_models=100,
    seed=12345,
)

In [49]:
###: Now we will build the H2O GBM models based on the Grid Search criteria and GBM hyperparameter settings:

In [50]:
# Every grid model starts from this GBM template, which uses the same cross-validation
# settings as the individually trained base models; the searched hyperparameters come
# from gbm_hyper_params.
house_price_gbm_grid = H2OGridSearch(
    model=H2OGradientBoostingEstimator(
        ntrees=50,
        seed=1,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    ),
    grid_id="house_price_gbm_grid",
    hyper_params=gbm_hyper_params,
    search_criteria=grid_search_criteria,
)

house_price_gbm_grid.train(training_frame=train, x=features, y=response)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [51]:
###: Lets see the count of all models we built during grid search

In [52]:
# Number of models the GBM grid search produced
len(house_price_gbm_grid.model_ids)

100

In [53]:
###: Now we will train a stacked ensemble model by passing the GBM grid models

In [54]:
# Stack every model from the GBM grid into a single ensemble
house_price_grid_ensemble = H2OStackedEnsembleEstimator(
    base_models=house_price_gbm_grid.model_ids,
    model_id="house_price_gbm_grid_ensemble",
)
house_price_grid_ensemble.train(training_frame=train, x=features, y=response)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [55]:
###: Let's evaluate the stacked ensemble model performance based on test data

In [56]:
# NOTE: this rebinds perf_stack_test, which earlier held the test performance of the 3-model ensemble
perf_stack_test = house_price_grid_ensemble.model_performance(test)

In [57]:
###: Now we can compare base learner performance with stacked ensemble model using test data

In [58]:
# Best test-set R2 among the individual grid models: each model is fetched by id and
# scored on the test frame. This rebinds the names used for the 3-model comparison.
baselearner_best_r2_test = max([
    h2o.get_model(model).model_performance(test_data=test).r2()
    for model in house_price_gbm_grid.model_ids
])

stack_r2_test = perf_stack_test.r2()
print("Best Base-learner model R2:  {0}".format(baselearner_best_r2_test))
print("Ensemble Model R2:  {0}".format(stack_r2_test))

Best Base-learner model R2:  0.870147909488
Ensemble Model R2:  0.841648272115


In [59]:
###:  Stack Ensemble model is like any other model in H2O 
###: So you sure can perform predictions using test set as needed

In [60]:
# Rebinds pred (previously the 3-model ensemble's predictions) to the GBM grid ensemble's predictions
pred = house_price_grid_ensemble.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [61]:
###: Ensemble 3. 
###: First, we will generate a random grid of models using Deep Learning as the main algorithm
###: Secondly, we will stack the previously created GBM grid models and the newly created Deep Learning grid models together

In [62]:
###: Let's specify Deep Learning hyperparameters for the grid search

In [63]:
# Candidate values per Deep Learning hyperparameter: 3 x 2 x 2 = 12 combinations.
# NOTE(review): each `hidden` candidate is a single int rather than a list of layer
# sizes -- confirm H2O interprets these as intended.
dl_hyper_params = dict(
    hidden=[1, 5, 10],
    train_samples_per_iteration=[5, 10],
    score_training_samples=[10, 20],
)

In [64]:
###: Now we will setup the Grid Search criteria and other parameter to fine tune it:

In [65]:
# Same random-search settings as the GBM grid: limited to 100 models, with a fixed seed
grid_search_criteria = dict(
    strategy="RandomDiscrete",
    max_models=100,
    seed=12345,
)

In [66]:
###: Now we will build H2O Deep Learning models based on the Grid Search criteria and Deep Learning hyperparameter settings:

In [67]:
# Every grid model starts from this Deep Learning template, which uses the same
# cross-validation settings as the GBM grid; the searched hyperparameters come from
# dl_hyper_params.
house_price_dl_grid = H2OGridSearch(
    model=H2ODeepLearningEstimator(
        epochs=10,
        seed=1,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    ),
    grid_id="house_price_dl_grid",
    hyper_params=dl_hyper_params,
    search_criteria=grid_search_criteria,
)

house_price_dl_grid.train(training_frame=train, x=features, y=response)

deeplearning Grid Build progress: |███████████████████████████████████████| 100%


In [68]:
###: Lets see the count of all models we built during grid search

In [69]:
# Number of models the Deep Learning grid search produced
len(house_price_dl_grid.model_ids)

12

In [70]:
###: Now we will train a stacked ensemble model by passing the GBM and Deep Learning grid models

In [71]:
# Holds the ids of every base model (Deep Learning grid + GBM grid) for the combined ensemble
all_ids = []

In [72]:
# Collect the ids of every Deep Learning grid model followed by every GBM grid model.
# The list is built in one expression so the cell is idempotent: re-running it no
# longer appends a second copy of each id to an existing `all_ids`.
all_ids = list(house_price_dl_grid.model_ids) + list(house_price_gbm_grid.model_ids)
print(len(all_ids))

112


In [73]:
# Stacked ensemble over every model id collected in all_ids (Deep Learning grid + GBM grid)
house_price_gbm_dl_grid_ensemble = H2OStackedEnsembleEstimator(
    base_models=all_ids,
    model_id="house_price_gbm_dl_grid_ensemble_3",
)

In [74]:
# Fit the combined ensemble on the training frame; its base models were supplied at construction
house_price_gbm_dl_grid_ensemble.train(x=features, y=response, training_frame=train)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [75]:
###: Let's evaluate the stacked ensemble model performance based on test data

In [76]:
# Test-frame performance of the combined GBM + Deep Learning grid ensemble
perf_gbm_dl_stack_test = house_price_gbm_dl_grid_ensemble.model_performance(test)

In [77]:
###: Now we can compare base learner performance with stacked ensemble model using test data

In [78]:
# R2 of the combined ensemble on the test frame
stack_gbm_dl_r2_test = perf_gbm_dl_stack_test.r2()

# Best test-frame R2 among all individual base models (both grids), fetched by id
baselearner_gbm_dl_best_r2_test = max([
    h2o.get_model(model).model_performance(test_data=test).r2()
    for model in all_ids
])

In [79]:
# Report the best single base model next to the combined ensemble, both measured on the test frame
print("Best Base-learner model R2:  {0}".format(baselearner_gbm_dl_best_r2_test))
print("Ensemble Model R2:  {0}".format(stack_gbm_dl_r2_test))

Best Base-learner model R2:  0.870147909488
Ensemble Model R2:  0.841664055295
