In [2]:
#----------------------------------------------------------------------
# Purpose:  Condition an Airline dataset by filtering out NAs where the
#           departure delay in the input dataset is unknown.
#
#           Then treat anything longer than minutesOfDelayWeTolerate
#           as delayed.
#----------------------------------------------------------------------

In [3]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [4]:
h2o.init()



0,1
H2O cluster uptime:,36 minutes 47 seconds 274 milliseconds
H2O cluster version:,3.5.0.99999
H2O cluster name:,ludirehak
H2O cluster total nodes:,1
H2O cluster total memory:,3.56 GB
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


In [5]:
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

air = h2o.import_file(_locate("smalldata/airlines/allyears2k_headers.zip"))


Parse Progress: [##################################################] 100%
Imported /Users/ludirehak/h2o-3/smalldata/airlines/allyears2k_headers.zip. Parsed 43,978 rows and 31 cols


In [6]:
numRows, numCols = air.dim
print "Original dataset rows: {0}, columns: {1}".format(numRows, numCols)

x_cols = ["Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime", "UniqueCarrier", "CRSElapsedTime", "Origin", "Dest", "Distance"]
y_col = "SynthDepDelayed"

noDepDelayedNAs = air[air["DepDelay"].isna() == 0]
rows, cols = noDepDelayedNAs.dim
print "New dataset rows: {0}, columns: {1}".format(rows, cols)

Original dataset rows: 43978, columns: 31
New dataset rows: 42892, columns: 31


In [7]:
minutesOfDelayWeTolerate = 15
noDepDelayedNAs.cbind(noDepDelayedNAs["DepDelay"] > minutesOfDelayWeTolerate)
noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols-1].asfactor()
noDepDelayedNAs.set_name(numCols,y_col)

H2OFrame with 42892 rows and 32 columns: 


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed,SynthDepDelayed
0,1987,10,14,3,741,730,912,849,PS,1451,...,,0,,,,,,YES,YES,YES
1,1987,10,15,4,729,730,903,849,PS,1451,...,,0,,,,,,YES,NO,NO
2,1987,10,17,6,741,730,918,849,PS,1451,...,,0,,,,,,YES,YES,YES
3,1987,10,18,7,729,730,847,849,PS,1451,...,,0,,,,,,NO,NO,NO
4,1987,10,19,1,749,730,922,849,PS,1451,...,,0,,,,,,YES,YES,YES
5,1987,10,21,3,728,730,848,849,PS,1451,...,,0,,,,,,NO,NO,NO
6,1987,10,22,4,728,730,852,849,PS,1451,...,,0,,,,,,YES,NO,NO
7,1987,10,23,5,731,730,902,849,PS,1451,...,,0,,,,,,YES,YES,YES
8,1987,10,24,6,744,730,908,849,PS,1451,...,,0,,,,,,YES,YES,YES
9,1987,10,25,7,729,730,851,849,PS,1451,...,,0,,,,,,YES,NO,NO




In [8]:
gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
gbm.train(x=x_cols, y=y_col, training_frame = noDepDelayedNAs)
gbm.show()


gbm Model Build Progress: [##################################################] 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1445557087082_2738

Model Summary:


0,1,2,3,4,5,6,7,8
,number_of_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,34343.0,5.0,5.0,5.0,18.0,32.0,28.62




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.191672191035
R^2: 0.232789986813
LogLoss: 0.565710073073
AUC: 0.785428554449
Gini: 0.570857108897

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.412557029006:


0,1,2,3,4
,NO,YES,Error,Rate
NO,11180.0,9707.0,0.4647,(9707.0/20887.0)
YES,3402.0,18603.0,0.1546,(3402.0/22005.0)
Total,14582.0,28310.0,0.3056,(13109.0/42892.0)



Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4,0.7,259.0
max f2,0.2,0.8,347.0
max f0point5,0.6,0.7,180.0
max accuracy,0.5,0.7,213.0
max precision,1.0,1.0,0.0
max absolute_MCC,0.5,0.4,213.0
max min_per_class_accuracy,0.5,0.7,209.0



Scoring History:


0,1,2,3,4,5,6,7
,timestamp,duration,number_of_trees,training_MSE,training_logloss,training_AUC,training_classification_error
,2015-10-22 17:14:56,0.174 sec,1.0,0.2,0.7,0.7,0.4
,2015-10-22 17:14:56,0.230 sec,2.0,0.2,0.7,0.7,0.4
,2015-10-22 17:14:56,0.331 sec,3.0,0.2,0.7,0.7,0.4
,2015-10-22 17:14:56,0.425 sec,4.0,0.2,0.7,0.7,0.4
,2015-10-22 17:14:56,0.454 sec,5.0,0.2,0.7,0.7,0.4
---,---,---,---,---,---,---,---
,2015-10-22 17:14:58,2.245 sec,46.0,0.2,0.6,0.8,0.3
,2015-10-22 17:14:58,2.305 sec,47.0,0.2,0.6,0.8,0.3
,2015-10-22 17:14:58,2.352 sec,48.0,0.2,0.6,0.8,0.3



Variable Importances:


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Origin,6877.3,1.0,0.4
Dest,4551.0,0.7,0.3
DayofMonth,2025.6,0.3,0.1
UniqueCarrier,1279.5,0.2,0.1
CRSArrTime,724.8,0.1,0.0
CRSDepTime,636.9,0.1,0.0
DayOfWeek,408.2,0.1,0.0
CRSElapsedTime,118.8,0.0,0.0
Month,73.3,0.0,0.0
