In [1]:
import numpy as np
import pandas as pd
import h2o
import math
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn.model_selection import train_test_split

In [2]:
RANDOM_SEED = 17
FILE = "NO"

In [3]:
h2o.init(nthreads = -1, min_mem_size = 20)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_131"; OpenJDK Runtime Environment (build 1.8.0_131-b12); OpenJDK 64-Bit Server VM (build 25.131-b12, mixed mode)
  Starting server from /home/u20104/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /home/u20104/tmp/tmpo1b4k9hi
  JVM stdout: /home/u20104/tmp/tmpo1b4k9hi/h2o_u20104_started_from_python.out
  JVM stderr: /home/u20104/tmp/tmpo1b4k9hi/h2o_u20104_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.9
H2O cluster version age:,"14 days, 10 hours and 42 minutes"
H2O cluster name:,H2O_from_python_u20104_zu3v0v
H2O cluster total nodes:,1
H2O cluster free memory:,20.69 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [4]:
df = pd.read_csv('data/brfss2017.csv')

In [5]:
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = RANDOM_SEED, stratify = df["cvdinfr4"])
df_test, df_val = train_test_split(df_test, test_size = 0.3, random_state = RANDOM_SEED, stratify = df_test["cvdinfr4"])

In [6]:
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)
val = h2o.H2OFrame(df_val)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
train = train.asfactor()
test = test.asfactor()
val = val.asfactor()

In [28]:
print(train["cvdinfr4"].table())

cvdinfr4,Count
1,12426
2,189311





In [8]:
training_cols = ["deaf","blind","x.rfhlth","x.rfhype5","x.rfchol1","x.asthms1","x.drdxar1","x.race","x.age.g","x.bmi5cat","x.chldcnt","x.educag","x.incomg","x.smoker3","x.ecigsts","x.rfdrhv5","x.totinda"]

In [9]:
model = H2ORandomForestEstimator(ntrees=50, max_depth=20)

In [10]:
model.train(x = training_cols, y = "cvdinfr4", training_frame = train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
params21 = {'sample_rate': [0.6, 0.7, 0.8, 0.9, 1],
            'col_sample_rate_per_tree': [.3, .7, .8, 1],
            'col_sample_rate_change_per_level': [.3, .7, .8, 2],
            'min_rows': [2**x for x in range(0,int(math.log(train.nrow,2)-1)+1)],
            'nbins': [2**x for x in range(4,11)],
            'nbins_cats': [2**x for x in range(4,13)],
            'min_split_improvement': [0,1e-8,1e-6,1e-4],
             'max_depth' : [4,6,8,12,16,20]
            }

params1 = {'max_depth' : [4,16]}

In [12]:
search_criteria_tune = {
                   'strategy': "RandomDiscrete",
                   'max_runtime_secs': 84000,
                   'seed' : RANDOM_SEED
                   }

In [16]:
drf = H2ORandomForestEstimator(ntrees = 10, sample_rate = 0.8, stopping_rounds = 2, stopping_metric = "AUC", stopping_tolerance = 1e-3, seed = RANDOM_SEED, score_tree_interval = 200, balance_classes = True)

In [19]:
final_grid = H2OGridSearch(drf, hyper_params = params1, grid_id = 'final_grid2', search_criteria = search_criteria_tune)

In [20]:
final_grid.train(x = training_cols, y = "cvdinfr4", training_frame = train, validation_frame = val)

drf Grid Build progress: |████████████████████████████████████████████████| 100%


In [21]:
sorted_final_grid = final_grid.get_grid(sort_by='auc',decreasing=True)

print(sorted_final_grid)

    max_depth            model_ids                 auc
0           4  final_grid2_model_1  0.8040722641053615
1          16  final_grid2_model_0  0.7766836631386698



In [27]:
for i in range(10):
    curr_model = h2o.get_model(sorted_final_grid.sorted_metric_table()['model_ids'][i])
   # h2o.save_model(curr_model, FILE + "-DRF_Model"+str(i)+".csv", force=True)
    print("\n\n\n\nMODEL "+str(i)+"\n")
    perf = curr_model.model_performance(test)
    print(perf)
    #print(curr_model)





MODEL 0


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.05333097118670925
RMSE: 0.230934993421762
LogLoss: 0.19620138604939566
Mean Per-Class Error: 0.26462873843743084
AUC: 0.8093562311106732
Gini: 0.6187124622213465
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7590435786110714: 


0,1,2,3,4
,1.0,2.0,Error,Rate
1,1.0,3727.0,0.9997,(3727.0/3728.0)
2,0.0,56794.0,0.0,(0.0/56794.0)
Total,1.0,60521.0,0.0616,(3727.0/60522.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.7590436,0.9682308,398.0
max f2,0.7590436,0.9870454,398.0
max f0point5,0.8268157,0.9519864,348.0
max accuracy,0.7590436,0.9384191,398.0
max precision,0.9938506,1.0,0.0
max recall,0.7590436,1.0,398.0
max specificity,0.9938506,1.0,0.0
max absolute_mcc,0.9105607,0.2536658,220.0
max min_per_class_accuracy,0.9272640,0.7323837,178.0


Gains/Lift Table: Avg response rate: 93.84 %, avg score: 94.36 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0427117,0.9932716,1.0631673,1.0631673,0.9976789,0.9933419,0.9976789,0.9933419,0.0454097,0.0454097,6.3167296,6.3167296
,2,0.0580285,0.9927256,1.0621921,1.0629099,0.9967638,0.9927256,0.9974374,0.9931792,0.0162693,0.0616791,6.2192061,6.2909881
,3,0.1015994,0.9907152,1.0607914,1.0620014,0.9954494,0.9919750,0.9965848,0.9926628,0.0462197,0.1078987,6.0791405,6.2001372
,4,0.1501107,0.9886412,1.0634630,1.0624737,0.9979564,0.9892081,0.9970281,0.9915463,0.0515900,0.1594887,6.3462997,6.2473725
,5,0.2003569,0.9864371,1.0610852,1.0621255,0.9957251,0.9879827,0.9967013,0.9906526,0.0533155,0.2128042,6.1085219,6.2125511
,6,0.3002710,0.9809680,1.0575343,1.0605978,0.9923929,0.9835541,0.9952677,0.9882906,0.1056626,0.3184667,5.7534325,6.0597810
,7,0.4000364,0.9686103,1.0494037,1.0578061,0.9847632,0.9746945,0.9926480,0.9848999,0.1046942,0.4231609,4.9403746,5.7806114
,8,0.5014540,0.9559826,1.0368209,1.0535619,0.9729554,0.9612564,0.9886652,0.9801180,0.1051520,0.5283128,3.6820867,5.3561907
,9,0.6010046,0.9455817,1.0191240,1.0478576,0.9563485,0.9513199,0.9833123,0.9753479,0.1014544,0.6297672,1.9123971,4.7857597








MODEL 1


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.05389212769069225
RMSE: 0.23214678048745852
LogLoss: 0.2175975995753514
Mean Per-Class Error: 0.28330497588529036
AUC: 0.7870154056879912
Gini: 0.5740308113759824
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.46706999870026106: 


0,1,2,3,4
,1.0,2.0,Error,Rate
1,3.0,3725.0,0.9992,(3725.0/3728.0)
2,1.0,56793.0,0.0,(1.0/56794.0)
Total,4.0,60518.0,0.0616,(3726.0/60522.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4670700,0.9682385,397.0
max f2,0.4384469,0.9870420,399.0
max f0point5,0.7800516,0.9512680,297.0
max accuracy,0.4878491,0.9384356,395.0
max precision,0.9982709,0.9950597,9.0
max recall,0.4384469,1.0,399.0
max specificity,0.9999733,0.9914163,0.0
max absolute_mcc,0.9345972,0.2314173,149.0
max min_per_class_accuracy,0.9512687,0.7127146,121.0


Gains/Lift Table: Avg response rate: 93.84 %, avg score: 95.38 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0248505,1.0,1.0443846,1.0443846,0.9800532,1.0,0.9800532,1.0,0.0259534,0.0259534,4.4384605,4.4384605
,2,0.0300717,0.9998602,1.0656407,1.0480752,1.0,0.9999070,0.9835165,0.9999839,0.0055640,0.0315174,6.5640737,4.8075230
,3,0.0400020,0.9997023,1.0620945,1.0515554,0.9966722,0.9997868,0.9867823,0.9999349,0.0105469,0.0420643,6.2094511,5.1555440
,4,0.0500149,0.9995681,1.0656407,1.0543753,1.0,0.9996238,0.9894285,0.9998726,0.0106701,0.0527344,6.5640737,5.4375291
,5,0.1000132,0.9990652,1.0635278,1.0589508,0.9980172,0.9992997,0.9937221,0.9995862,0.0531746,0.1059091,6.3527768,5.8950773
,6,0.1500446,0.9983857,1.0628253,1.0602427,0.9973580,0.9987545,0.9949345,0.9993089,0.0531746,0.1590837,6.2825305,6.0242711
,7,0.2000595,0.9973395,1.0578957,1.0596560,0.9927321,0.9978806,0.9943839,0.9989518,0.0529105,0.2119942,5.7895743,5.9655969
,8,0.3002214,0.9941012,1.0514017,1.0569021,0.9866381,0.9959629,0.9917997,0.9979547,0.1053104,0.3173046,5.1401723,5.6902131
,9,0.4011434,0.9891975,1.0488920,1.0548869,0.9842829,0.9919368,0.9899086,0.9964407,0.1058563,0.4231609,4.8891963,5.4886886






KeyError: 2