In [None]:
Build base learners with hyperparameter search. Base algorithms: GLM, GBM, and Deep Learning
    
Hyperparameter Search
    Random discrete
    Five maximum models
Common Algorithm Parameters
    5-fold cross-validation
    Modulo fold assignment
GLM Search Space
    Alpha (Regularization distribution): 0 to 1 by 0.1
    Lambda (Regularization strength): 0, 1e-7, 1e-5, 1e-3, 1e-1
GBM Search Space
    Learning Rate: 0.01, 0.03
    Maximum Tree Depth: 3, 4, 5, 6, 9
    Row Sample Rate: 0.7, 0.8, 0.9, 1
    Column Sample Rate: 0.2 0.3, 0.4, 0.5, 0.6, 0.7, 0.8
Deep Learning Search Space
    Activations: Rectifier, Rectifier with Dropout
    Hidden Layers and Units: (10,10), (20,15), (50,50,50)
    L1 Regularization: 0, 1e-3, 1e-5
    L2 Regularization: 0, 1e-3, 1e-5

Ensemble all the models
Evaluate the ensemble performance on the test set

In [1]:
import h2o
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,35 mins 00 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.2
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_praveen_lv5xcq
H2O cluster total nodes:,1
H2O cluster free memory:,1.374 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [2]:
data_path = "../data/airlines.csv"
airlines_df = h2o.import_file(data_path)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
## New time parameters - to convert non-continuous clock-time to elapsed minutes since midnight
#CRS times
hours = airlines_df["CRSArrTime"] // 100
mins = airlines_df["CRSArrTime"] % 100
CRSarrTimeMins = hours*60+mins
hours = airlines_df["CRSDepTime"] // 100
mins = airlines_df["CRSDepTime"] % 100
CRSdepTimeMins = hours*60+mins
airlines_df["CRSarrTimeMins"]  = CRSarrTimeMins
airlines_df["CRSdepTimeMins"]  = CRSdepTimeMins

#Actual times
hours = airlines_df["ArrTime"] // 100
mins = airlines_df["ArrTime"] % 100
arrTimeMins = hours*60+mins
hours = airlines_df["DepTime"] // 100
mins = airlines_df["DepTime"] % 100
depTimeMins = hours*60+mins
airlines_df["ArrTimeMins"]  = arrTimeMins
airlines_df["DepTimeMins"]  = depTimeMins

travelTime = (airlines_df["DepTimeMins"] - airlines_df["ArrTimeMins"] > 0).ifelse(airlines_df["DepTimeMins"] - airlines_df["ArrTimeMins"], airlines_df["ArrTimeMins"] - airlines_df["DepTimeMins"])
airlines_df["TravelTime"]  = travelTime

In [4]:
data_split = airlines_df.split_frame(ratios=[0.8], seed=12345)
Yvar = "IsDepDelayed"
Xvar = ["Year", "Month", "DayofMonth", "DayOfWeek", "CRSarrTimeMins", "CRSdepTimeMins", "ArrTimeMins", "DepTimeMins", "Dest"]

In [5]:
#airlines_df.describe()

In [6]:
# Import H2O Grid Search:
from h2o.grid.grid_search import H2OGridSearch

# Import estimators
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [7]:
# Search criteria
search_criteria = {'strategy': 'RandomDiscrete', 'max_models': 5, 'seed': 1618}

#Hyperparameters
common_params = {'nfolds': 5, 'fold_assignment': 'Modulo', 'keep_cross_validation_predictions': True} 
#last param needed for Ensemble to work

glm_params = {'alpha': [i * 0.1 for i in range(0, 10)], 
                'lambda': [0, 1e-7, 1e-5, 1e-1]}

gbm_params = {'learn_rate': [0.01, 0.03],
                'max_depth': [3, 4, 5, 6, 9],
                'sample_rate': [0.7, 0.8, 0.9, 1],
                'col_sample_rate': [i * 0.1 for i in range(2, 8)]}

dl_params = {'activation': ['Rectifier', 'RectifierWithDropout'],
                'hidden': [[10,10], [20,15], [50,50,50]],
                'l1': [0, 1e-3, 1e-5],
                'l2': [0, 1e-3, 1e-5]}

#{**glm_params, **common_params}

<div align="center"><b>   Generalized Linear Model   </b></div>

In [8]:
#GLM
glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family = 'binomial', 
                                                       lambda_search = True, 
                                                       standardize = True, 
                                                       **common_params),
                        glm_params, 
                        grid_id="GLM", 
                        search_criteria=search_criteria)

glm_grid.train(x=Xvar, y=Yvar, training_frame=data_split[0])
#glm_grid.hyper_params
#validation_frame=data_split[1]

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [9]:
glm_grid_perf = glm_grid.get_grid(sort_by='auc', decreasing=True)
print (glm_grid_perf)

                     alpha    lambda    model_ids                 auc
0    [0.30000000000000004]  [1.0E-5]  GLM_model_3  0.6834277255289434
1                    [0.4]  [1.0E-5]  GLM_model_4  0.6834228481792217
2     [0.6000000000000001]  [1.0E-7]  GLM_model_5  0.6831933838121061
3    [0.30000000000000004]     [0.0]  GLM_model_2  0.6831794093283996
4                    [0.2]     [0.1]  GLM_model_1   0.628758819511472



In [10]:
glm_best_perf = glm_grid_perf.models[0]
glm_best_res = glm_best_perf.model_performance(data_split[1])
print (glm_best_res.auc())

0.6852129385214988


<div align="center"><b>   Gradient Boosting Machine   </b></div>

In [11]:
#GBM
gbm_grid = H2OGridSearch(model=H2OGradientBoostingEstimator,
                        grid_id='gbm_grid',
                        hyper_params=gbm_params,
                        search_criteria=search_criteria
                        )

gbm_grid.train(x=Xvar, y=Yvar, training_frame=data_split[0], seed=1618, **common_params)

#

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [12]:
gbm_grid_perf = gbm_grid.get_grid(sort_by='auc', decreasing=True)
print (gbm_grid_perf)

        col_sample_rate learn_rate max_depth sample_rate         model_ids  \
0                   0.5       0.03         6         0.8  gbm_grid_model_2   
1                   0.4       0.03         6         1.0  gbm_grid_model_3   
2    0.7000000000000001       0.03         3         1.0  gbm_grid_model_5   
3    0.6000000000000001       0.01         5         0.8  gbm_grid_model_1   
4    0.7000000000000001       0.01         4         1.0  gbm_grid_model_4   

                  auc  
0  0.7716159057739157  
1  0.7696751313601468  
2  0.7405449896674873  
3  0.7349932408115895  
4  0.7237610486096676  



In [13]:
gbm_best_perf = gbm_grid_perf.models[0]
gbm_best_res = gbm_best_perf.model_performance(data_split[1])
print (gbm_best_res.auc())

0.7717567999421463


<div align="center"><b>   Deep Learning   </b></div>

In [14]:
dl_grid = H2OGridSearch(model=H2ODeepLearningEstimator,
                        grid_id='dl_grid',
                        hyper_params=dl_params,
                        search_criteria=search_criteria)

dl_grid.train(x=Xvar, y=Yvar, training_frame=data_split[0], seed=1618, **common_params)


deeplearning Grid Build progress: |███████████████████████████████████████| 100%


In [15]:
dl_grid_perf = dl_grid.get_grid(sort_by='auc', decreasing=True)
print (dl_grid_perf)

               activation        hidden      l1      l2        model_ids  \
0               Rectifier  [50, 50, 50]     0.0  1.0E-5  dl_grid_model_3   
1               Rectifier      [20, 15]  1.0E-5     0.0  dl_grid_model_2   
2    RectifierWithDropout  [50, 50, 50]   0.001     0.0  dl_grid_model_5   
3    RectifierWithDropout      [10, 10]  1.0E-5   0.001  dl_grid_model_4   
4    RectifierWithDropout      [20, 15]   0.001  1.0E-5  dl_grid_model_1   

                  auc  
0  0.9186812924464998  
1  0.9175529052488895  
2  0.8357418110564595  
3  0.8101480466578056  
4  0.7903151823921623  



In [16]:
dl_best_perf = dl_grid_perf.models[0]
dl_best_res = dl_best_perf.model_performance(data_split[1])
print (dl_best_res.auc())

0.9449746217192392


<div align="center"><b>   Ensembling   </b></div>

In [17]:
all_models = glm_grid.model_ids + gbm_grid.model_ids + dl_grid.model_ids
print(all_models)

['GLM_model_3', 'GLM_model_4', 'GLM_model_5', 'GLM_model_2', 'GLM_model_1', 'gbm_grid_model_2', 'gbm_grid_model_3', 'gbm_grid_model_5', 'gbm_grid_model_1', 'gbm_grid_model_4', 'dl_grid_model_3', 'dl_grid_model_2', 'dl_grid_model_5', 'dl_grid_model_1', 'dl_grid_model_4']


In [18]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

ensemble = H2OStackedEnsembleEstimator(model_id="ensemble_glm_gbm_dl", 
                                       base_models=glm_grid.model_ids + gbm_grid.model_ids + dl_grid.model_ids,
                                      )

ensemble.train(x=Xvar, y=Yvar, training_frame=data_split[0])

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [19]:
# Eval ensemble performance on the test data
ensemble_perf = ensemble.model_performance(data_split[1])

In [21]:
print('Ensemble AUC on Validation data : ' + str(ensemble_perf.auc()))
print('GLM AUC on Validation data : ' + str(glm_best_res.auc()))
print('GBM AUC on Validation data : ' + str(gbm_best_res.auc()))
print('DL AUC on Validation data : ' + str(dl_best_res.auc()))

Ensemble AUC on Validation data : 0.9483537528227052
GLM AUC on Validation data : 0.6852129385214988
GBM AUC on Validation data : 0.7717567999421463
DL AUC on Validation data : 0.9449746217192392


In [166]:
#h2o.cluster().shutdown()

H2O session _sid_94c5 closed.
