In [1]:
import h2o


You can upgrade to the newest version of the module running from the command line
    $ pip2 install --upgrade requests


In [3]:
h2o.connect(ip="35.196.153.55")

Connecting to H2O server at http://35.196.153.55:54321... successful.


0,1
H2O cluster uptime:,7 mins 14 secs
H2O cluster version:,3.14.0.7
H2O cluster version age:,10 days
H2O cluster name:,sparkling-water-olmsteadbrett_local-1509393572563
H2O cluster total nodes:,1
H2O cluster free memory:,2.322 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://35.196.153.55:54321


<H2OConnection to http://35.196.153.55:54321, session _sid_bae2>

In [5]:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [7]:
air = h2o.import_file("https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:

# set this to True if interactive (matplotlib) plots are desired
interactive = False
if not interactive: matplotlib.use('Agg', warn=False)
import matplotlib.pyplot as plt

In [9]:
air_path = "https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip"

# ----------

# 1- Load data - 1 row per flight.  Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()

Import and Parse airlines data
Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:43978
Cols:31




Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed
type,int,int,int,int,int,int,int,int,enum,int,enum,int,int,int,int,int,enum,enum,int,int,int,int,enum,int,int,int,int,int,int,enum,enum
mins,1987.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,,1.0,,16.0,17.0,14.0,-63.0,-16.0,,,11.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,
mean,1997.5,1.40909090909,14.6010732639,3.82061485288,1345.84666138,1313.22286143,1504.63413038,1485.28916731,,818.842989677,,124.814529135,125.021562607,114.316111091,9.31711193698,10.0073906556,,,730.182190565,5.38136805953,14.1686341847,0.0246941652645,,0.00247851198326,4.04780029106,0.289376469271,4.85503190418,0.0170155602821,7.62006045002,,
maxs,2008.0,10.0,31.0,7.0,2400.0,2359.0,2400.0,2359.0,,3949.0,,475.0,437.0,402.0,475.0,473.0,,,3365.0,128.0,254.0,1.0,,1.0,369.0,201.0,323.0,14.0,373.0,,
sigma,6.34436090171,1.87471137134,9.17579042586,1.90501311913,465.340899124,476.251139993,484.347487904,492.750434123,,777.404369164,,73.9744416606,73.40159463,69.6363295151,29.8402219624,26.4388090429,,,578.43800823,4.20197993986,9.9050857472,0.155193141358,,0.0497234872189,16.2057299045,4.41677989873,18.6197762215,0.403940182102,23.4875658741,,
zeros,0,0,0,0,0,569,0,569,,0,,0,0,0,1514,6393,,,0,623,557,42892,,43869,7344,8840,7388,8914,7140,,
missing,0,0,0,0,1086,0,1195,0,0,0,32,1195,13,16649,1195,1086,0,0,35,16026,16024,0,9774,0,35045,35045,35045,35045,35045,0,0
0,1987.0,10.0,14.0,3.0,741.0,730.0,912.0,849.0,PS,1451.0,,91.0,79.0,,23.0,11.0,SAN,SFO,447.0,,,0.0,,0.0,,,,,,YES,YES
1,1987.0,10.0,15.0,4.0,729.0,730.0,903.0,849.0,PS,1451.0,,94.0,79.0,,14.0,-1.0,SAN,SFO,447.0,,,0.0,,0.0,,,,,,YES,NO
2,1987.0,10.0,17.0,6.0,741.0,730.0,918.0,849.0,PS,1451.0,,97.0,79.0,,29.0,11.0,SAN,SFO,447.0,,,0.0,,0.0,,,,,,YES,YES


In [10]:
# ----------

# 2- Data exploration and munging. Generate scatter plots 
# of various columns and plot fitted GLM model.

# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    if(fit):
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x,y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)
    
    if(fit): h2o.remove(lr._id)

    # If x variable is string, generate box-and-whisker plot
    if(df_py[x].dtype == "object"):
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")
    
    if(fit):
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()

scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
# Group flights by month
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim

# Convert columns to factors
data["Year"]      = data["Year"]     .asfactor()
data["Month"]     = data["Month"]    .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()

Month,sum_Cancelled,nrow
1,1067,41979
10,19,1999


Rows:2
Cols:3




Unnamed: 0,Month,sum_Cancelled,nrow
type,int,int,int
mins,1.0,19.0,1999.0
mean,5.5,543.0,21989.0
maxs,10.0,1067.0,41979.0
sigma,6.36396103068,741.047906684,28270.1291118
zeros,0,0,0
missing,0,0,0
0,1.0,1067.0,41979.0
1,10.0,19.0,1999.0


In [12]:
# Calculate and plot travel time
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1

hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2

# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%


In [13]:
# Impute missing travel times and re-plot
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
# ----------
# 3- Fit a model on train; using test as validation

# Create test/train split
s = data["Year"].runif()
train = data[s <= 0.75]
test  = data[s > 0.75]

# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays
data_glm = H2OGeneralizedLinearEstimator(family="binomial", standardize=True)
data_glm.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Simple GBM
data_gbm = H2OGradientBoostingEstimator(balance_classes=True,
                                        ntrees         =3,
                                        max_depth      =1,
                                        distribution   ="bernoulli",
                                        learn_rate     =0.1,
                                        min_rows       =2)

data_gbm.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Complex GBM
data_gbm2 = H2OGradientBoostingEstimator(balance_classes=True,
                                         ntrees         =50,
                                         max_depth      =5,
                                         distribution   ="bernoulli",
                                         learn_rate     =0.1,
                                         min_rows       =2)

data_gbm2.train(x               =myX,
                y               =myY,
                training_frame  =train,
                validation_frame=test)

# Simple Random Forest
data_rf = H2ORandomForestEstimator(ntrees         =5,
                                   max_depth      =2,
                                   balance_classes=True)

data_rf.train(x               =myX,
              y               =myY,
              training_frame  =train,
              validation_frame=test)

# Complex Random Forest
data_rf2 = H2ORandomForestEstimator(ntrees         =10,
                                    max_depth      =5,
                                    balance_classes=True)

data_rf2.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Deep Learning with 5 epochs
data_dl = H2ODeepLearningEstimator(hidden              =[10,10],
                                   epochs              =5,
                                   variable_importances=True,
                                   balance_classes     =True,
                                   loss                ="Automatic")

data_dl.train(x               =myX,
              y               =myY,
              training_frame  =train,
              validation_frame=test)

glm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [15]:

# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
from six import iteritems
glm_varimp = data_glm.coef_norm()
for k,v in iteritems(glm_varimp):
    glm_varimp[k] = abs(glm_varimp[k])
    
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)

data_gbm.varimp()
data_rf.varimp()

Variable Importances:

| Predictor        |   Normalized Coefficient |
|------------------+--------------------------|
| Year.2008        |               2.12834    |
| Origin.HPN       |               1.78092    |
| Origin.LIH       |               1.72329    |
| Year.2003        |               1.66481    |
| Dest.LYH         |               1.48352    |
| Origin.TLH       |               1.4438     |
| Origin.MDW       |               1.42237    |
| Origin.LEX       |               1.37468    |
| Origin.CHO       |               1.37442    |
| Year.2007        |               1.33457    |
| Origin.TRI       |               1.1948     |
| Origin.LBB       |               1.11611    |
| Dest.PNS         |               1.09135    |
| Dest.HTS         |               1.03871    |
| UniqueCarrier.HP |               1.00414    |
| Origin.ERI       |               1.00131    |
| Year.2002        |               1.00045    |
| Year.2001        |               0.989799   |
| Origin.SRQ     

[(u'Year', 1041.415283203125, 1.0, 0.547312376507354),
 (u'Origin', 523.938720703125, 0.5031025846784432, 0.27535427124735096),
 (u'FlightNum', 101.34282684326172, 0.09731259803635424, 0.053260389295381905),
 (u'DayOfWeek', 93.48446655273438, 0.08976675113236311, 0.04913045389359786),
 (u'Distance', 64.25371551513672, 0.06169845646734591, 0.03376832883597861),
 (u'Dest', 55.47443771362305, 0.05326831534774291, 0.029154408265516324),
 (u'UniqueCarrier',
  22.870986938476562,
  0.021961447375855002,
  0.012019771954820393),
 (u'Month', 0.0, 0.0, 0.0)]

In [16]:
# Model performance of GBM model on test data
data_gbm2.model_performance(test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.204516957162
RMSE: 0.452235510726
LogLoss: 0.595705258221
Mean Per-Class Error: 0.314546559781
AUC: 0.745583363998
Gini: 0.491166727997
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.378769592765: 


0,1,2,3,4
,NO,YES,Error,Rate
NO,2300.0,2967.0,0.5633,(2967.0/5267.0)
YES,814.0,4942.0,0.1414,(814.0/5756.0)
Total,3114.0,7909.0,0.343,(3781.0/11023.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3787696,0.7233077,284.0
max f2,0.1839260,0.8460443,372.0
max f0point5,0.5536517,0.7028692,193.0
max accuracy,0.5048457,0.6860201,220.0
max precision,0.9688600,1.0,0.0
max recall,0.0689509,1.0,398.0
max specificity,0.9688600,1.0,0.0
max absolute_mcc,0.5048457,0.3708690,220.0
max min_per_class_accuracy,0.5163738,0.6829315,214.0


Gains/Lift Table: Avg response rate: 52.22 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100699,0.9336374,1.7597712,1.7597712,0.9189189,0.9189189,0.0177206,0.0177206,75.9771238,75.9771238
,2,0.0200490,0.9086334,1.6887217,1.7244072,0.8818182,0.9004525,0.0168520,0.0345726,68.8721650,72.4407190
,3,0.0301188,0.8954846,1.7080133,1.7189261,0.8918919,0.8975904,0.0171994,0.0517721,70.8013260,71.8926087
,4,0.0400980,0.8821818,1.7757692,1.7330726,0.9272727,0.9049774,0.0177206,0.0694927,77.5769158,73.3072552
,5,0.0500771,0.8675461,1.6190836,1.7103574,0.8454545,0.8931159,0.0161571,0.0856498,61.9083644,71.0357371
,6,0.1002449,0.8094473,1.6068372,1.6585504,0.8390597,0.8660633,0.0806115,0.1662613,60.6837177,65.8550433
,7,0.1505035,0.7670999,1.5209745,1.6126089,0.7942238,0.8420735,0.0764420,0.2427033,52.0974503,61.2608862
,8,0.2001270,0.7295836,1.4494126,1.5721427,0.7568556,0.8209429,0.0719249,0.3146282,44.9412615,57.2142703
,9,0.3000091,0.6659620,1.3775802,1.5073670,0.7193460,0.7871182,0.1375956,0.4522238,37.7580177,50.7366973





