In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [2]:
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day.  Use 10 million rows of historical
# data, and eventually add weather data.


# Connect to a cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_91"; Java(TM) SE Runtime Environment (build 1.8.0_91-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.91-b14, mixed mode)
  Starting server from /Users/pasha/github/h2o-3/build/h2o.jar
  Ice root: /var/folders/k9/mxm_jrl50m75kmgvdk9_wlh00000gp/T/tmpNO2RkW
  JVM stdout: /var/folders/k9/mxm_jrl50m75kmgvdk9_wlh00000gp/T/tmpNO2RkW/h2o_pasha_started_from_python.out
  JVM stderr: /var/folders/k9/mxm_jrl50m75kmgvdk9_wlh00000gp/T/tmpNO2RkW/h2o_pasha_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False

def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return _locate(s)

In [4]:
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test =   [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]

# ----------

# 1- Load data - 1 row per bicycle trip.  Has columns showing the start and end
# station, trip duration and trip start time and day.  The larger dataset
# totals about 10 million rows
print("Import and Parse bike data")
data = h2o.import_file(path=small_test)

Import and Parse bike data
Parse progress: |█████████████████████████████████████████████████████████████████████████████| 100%


In [5]:
# ----------

# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.

# Convert start time to: Day since the Epoch
startime = data["starttime"]
secsPerDay = 1000 * 60 * 60 * 24
data["Days"] = (startime / secsPerDay).floor()
data.describe()

Rows:1,037,712 Cols:16
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,3,0.5859375,240 B,0.0003780
CBS,Bits,32,6.25,128.9 KB,0.2078283
C1N,1-Byte Integers (w/o NAs),32,6.25,1015.5 KB,1.6376555
C1S,1-Byte Fractions,61,11.9140625,1.9 MB,3.1233454
C2,2-Byte Integers,160,31.25,9.9 MB,16.3594216
C2S,2-Byte Fractions,10,1.953125,634.3 KB,1.0228260
C4,4-Byte Integers,25,4.8828125,3.1 MB,5.1095314
C4S,4-Byte Fractions,23,4.4921875,2.8 MB,4.7019318
C8,64-bit Integers,61,11.9140625,15.1 MB,24.9282986


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,60.6 MB,1037712.0,32.0,512.0
mean,60.6 MB,1037712.0,32.0,512.0
min,60.6 MB,1037712.0,32.0,512.0
max,60.6 MB,1037712.0,32.0,512.0
stddev,0 B,0.0,0.0,0.0
total,60.6 MB,1037712.0,32.0,512.0






Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,Days
type,int,time,time,int,enum,real,real,int,enum,real,real,int,enum,int,int,int
mins,60.0,1.380610868e+12,1.380611083e+12,72.0,0.0,40.680342423,-74.01713445,72.0,0.0,40.680342423,-74.01713445,14529.0,0.0,1899.0,0.0,15979.0
mean,825.614754383,1.38191371692e+12,1.38191454253e+12,443.714212614,,40.7345188586,-73.9911328848,443.207421712,,40.7342847885,-73.9912702982,17644.0716451,0.906095332809,1975.77839486,1.12375591686,15993.8523906
maxs,1259480.0,1.383289197e+12,1.38341851e+12,3002.0,329.0,40.770513,-73.9500479759,3002.0,329.0,40.770513,-73.9500479759,20757.0,1.0,1997.0,2.0,16010.0
sigma,2000.3732323,778871729.132,778847387.504,354.434325075,,0.0195734073053,0.0123161234106,357.398217058,,0.0195578458116,0.0123855811965,1717.68112134,0.291696182123,11.1314906238,0.544380593291,9.02215033586
zeros,0,0,0,0,5239,0,0,0,5449,0,0,0,97446,0,97498,0
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,97445,0,0
0,326.0,1.380610868e+12,1.380611194e+12,239.0,Willoughby St & Fleet St,40.69196566,-73.9813018,366.0,Clinton Ave & Myrtle Ave,40.693261,-73.968896,16052.0,Subscriber,1982.0,1.0,15979.0
1,729.0,1.380610881e+12,1.38061161e+12,322.0,Clinton St & Tillary St,40.696192,-73.991218,398.0,Atlantic Ave & Furman St,40.69165183,-73.9999786,19412.0,Customer,,0.0,15979.0
2,520.0,1.380610884e+12,1.380611404e+12,174.0,E 25 St & 1 Ave,40.7381765,-73.97738662,403.0,E 2 St & 2 Ave,40.72502876,-73.99069656,19645.0,Subscriber,1984.0,1.0,15979.0


In [6]:
# Now do a monster Group-By.  Count bike starts per-station per-day.  Ends up
# with about 340 stations times 400 days (140,000 rows).  This is what we want
# to predict.
grouped = data.group_by(["Days", "start station name"])
bpd = grouped.count().get_frame()  # Compute bikes-per-day
bpd.set_name(2, "bikes")
bpd.show()
bpd.describe()
bpd.dim

Days,start station name,bikes
15979,1 Ave & E 15 St,97
15979,1 Ave & E 18 St,75
15979,1 Ave & E 30 St,113
15979,10 Ave & W 28 St,74
15979,11 Ave & W 27 St,139
15979,11 Ave & W 41 St,60
15979,12 Ave & W 40 St,90
15979,2 Ave & E 31 St,88
15979,2 Ave & E 58 St,55
15979,3 Ave & Schermerhorn St,8


Rows:10,450 Cols:3
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,1,1.0416667,80 B,0.1364815
C1N,1-Byte Integers (w/o NAs),1,1.0416667,412 B,0.7028798
C1S,1-Byte Fractions,31,32.2916657,12.4 KB,21.7142075
C2,2-Byte Integers,63,65.625,44.3 KB,77.4464309


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,57.2 KB,10450.0,32.0,96.0
mean,57.2 KB,10450.0,32.0,96.0
min,57.2 KB,10450.0,32.0,96.0
max,57.2 KB,10450.0,32.0,96.0
stddev,0 B,0.0,0.0,0.0
total,57.2 KB,10450.0,32.0,96.0






Unnamed: 0,Days,start station name,bikes
type,int,enum,int
mins,15979.0,0.0,1.0
mean,15994.4415311,,99.3025837321
maxs,16010.0,329.0,553.0
sigma,9.23370172444,,72.9721964301
zeros,0,32,0
missing,0,0,0
0,15979.0,1 Ave & E 15 St,97.0
1,15979.0,1 Ave & E 18 St,75.0
2,15979.0,1 Ave & E 30 St,113.0


[10450, 3]

In [7]:
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
bpd["bikes"].quantile().show()

Quantiles of bikes-per-day


Probs,bikesQuantiles
0.01,4.49
0.1,19.0
0.25,43.0
0.333,57.0
0.5,87.0
0.667,118.0
0.75,137.0
0.9,192.0
0.99,334.51


In [8]:
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
secs = bpd["Days"] * secsPerDay
bpd["Month"] = secs.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = secs.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()

Bikes-Per-Day
Rows:10,450 Cols:5
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,33,20.6249997,2.6 KB,3.7012111
CBS,Bits,6,3.7500001,666 B,0.9337147
CXI,Zero Sparse Integers,4,2.5000000,800 B,1.1215792
C1N,1-Byte Integers (w/o NAs),23,14.3749997,8.9 KB,12.7551585
C1S,1-Byte Fractions,31,19.3749994,12.4 KB,17.8443253
C2,2-Byte Integers,63,39.3750012,44.3 KB,63.6440098


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,69.7 KB,10450.0,32.0,160.0
mean,69.7 KB,10450.0,32.0,160.0
min,69.7 KB,10450.0,32.0,160.0
max,69.7 KB,10450.0,32.0,160.0
stddev,0 B,0.0,0.0,0.0
total,69.7 KB,10450.0,32.0,160.0






Unnamed: 0,Days,start station name,bikes,Month,DayOfWeek
type,int,enum,int,enum,enum
mins,15979.0,0.0,1.0,0.0,0.0
mean,15994.4415311,,99.3025837321,0.968612440191,
maxs,16010.0,329.0,553.0,1.0,6.0
sigma,9.23370172444,,72.9721964301,0.174371128617,
zeros,0,32,0,328,1635
missing,0,0,0,0,0
0,15979.0,1 Ave & E 15 St,97.0,9,Mon
1,15979.0,1 Ave & E 18 St,75.0,9,Mon
2,15979.0,1 Ave & E 30 St,113.0,9,Mon


In [9]:
# ----------
# 3- Fit a model on train; using test as validation

# Function for doing class test/train/holdout split
def split_fit_predict(data):
    # Classic Test/Train split
    r = data["Days"].runif()   # Random uniform numbers, one per row
    train = data[r < 0.6]
    test = data[(0.6 <= r) & (r < 0.9)]
    hold = data[0.9 <= r]
    print("Training data has %d columns and %d rows, test has %d rows, holdout has %d"
          % (train.ncol, train.nrow, test.nrow, hold.nrow))
    bike_names_x = list(data.names)  # make a copy of the array
    bike_names_x.remove("bikes")

    # Run GBM
    s = time.time()
    gbm0 = H2OGradientBoostingEstimator(ntrees=500, max_depth=6, learn_rate=0.1)
    gbm0.train(x=bike_names_x, y="bikes", training_frame=train, validation_frame=test)
    gbm0_elapsed = time.time() - s

    # Run DRF
    s = time.time()
    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
    drf0.train(x=bike_names_x, y="bikes", training_frame=train, validation_frame=test)
    drf0_elapsed = time.time() - s

    # Run GLM
    if "WC1" in bike_names_x: bike_names_x.remove("WC1")
    s = time.time()
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    glm0.train(x=bike_names_x, y="bikes", training_frame=train, validation_frame=test)
    glm0_elapsed = time.time() - s

    # Run DL
    s = time.time()
    dl0 = H2ODeepLearningEstimator(hidden=[50, 50, 50, 50], epochs=50)
    dl0.train(x=bike_names_x, y="bikes", training_frame=train, validation_frame=test)
    dl0_elapsed = time.time() - s

    # ----------
    # 4- Score on holdout set & report
    train_mse_gbm = gbm0.model_performance(train).mse()
    test_mse_gbm = gbm0.model_performance(test).mse()
    hold_mse_gbm = gbm0.model_performance(hold).mse()

    train_mse_drf = drf0.model_performance(train).mse()
    test_mse_drf = drf0.model_performance(test).mse()
    hold_mse_drf = drf0.model_performance(hold).mse()

    train_mse_glm = glm0.model_performance(train).mse()
    test_mse_glm = glm0.model_performance(test).mse()
    hold_mse_glm = glm0.model_performance(hold).mse()

    train_mse_dl = dl0.model_performance(train).mse()
    test_mse_dl = dl0.model_performance(test).mse()
    hold_mse_dl = dl0.model_performance(hold).mse()

    # make a pretty HTML table printout of the results
    header = ["Model", "mse TRAIN", "mse TEST", "mse HOLDOUT", "Model Training Time (s)"]
    table = [["GBM", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm0_elapsed, 3)],
             ["DRF", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf0_elapsed, 3)],
             ["GLM", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm0_elapsed, 3)],
             ["DL",  train_mse_dl,  test_mse_dl,  hold_mse_dl,  round(dl0_elapsed, 3)]]
    h2o.display.H2ODisplay(table, header)
    # --------------

In [10]:
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM.  This means given just
# the station, the month, and the day-of-week we can predict 90% of the
# variance of the bike-trip-starts.

Training data has 5 columns and 6312 rows, test has 3147 rows, holdout has 991
gbm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████████████████████████| 100%


0,1,2,3,4
Model,mse TRAIN,mse TEST,mse HOLDOUT,Model Training Time (s)
GBM,18.2451569,440.5341361,475.3214778,9.546
DRF,1605.0238113,1756.6302562,1683.2403756,4.68
GLM,742.5558651,838.2359921,855.4683499,1.309
DL,265.7244577,572.4409667,540.2203839,7.511


In [11]:
# ----------
# 5- Now lets add some weather
# Load weather data
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                              mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()

Parse progress: |█████████████████████████████████████████████████████████████████████████████| 100%
Rows:17,520 Cols:50
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,104,6.3030303,8.1 KB,0.9701639
C0D,Constant Reals,423,25.6363630,33.0 KB,3.9459553
CXI,Zero Sparse Integers,28,1.6969698,4.9 KB,0.5881619
C1,1-Byte Integers,76,4.6060607,44.8 KB,5.3543255
C1N,1-Byte Integers (w/o NAs),196,11.8787877,114.9 KB,13.7193084
C1S,1-Byte Fractions,169,10.2424242,100.5 KB,11.9952843
C2S,2-Byte Fractions,156,9.4545454,173.9 KB,20.7652405
C4S,4-Byte Fractions,124,7.5151518,268.6 KB,32.0695162
CNAXI,NA Sparse Integers,261,15.8181816,40.0 KB,4.7731601


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,837.5 KB,17520.0,33.0,1650.0
mean,837.5 KB,17520.0,33.0,1650.0
min,837.5 KB,17520.0,33.0,1650.0
max,837.5 KB,17520.0,33.0,1650.0
stddev,0 B,0.0,0.0,0.0
total,837.5 KB,17520.0,33.0,1650.0






Unnamed: 0,Year Local,Month Local,Day Local,Hour Local,Year UTC,Month UTC,Day UTC,Hour UTC,Cavok Reported,Cloud Ceiling (m),Cloud Cover Fraction,Cloud Cover Fraction 1,Cloud Cover Fraction 2,Cloud Cover Fraction 3,Cloud Cover Fraction 4,Cloud Cover Fraction 5,Cloud Cover Fraction 6,Cloud Height (m) 1,Cloud Height (m) 2,Cloud Height (m) 3,Cloud Height (m) 4,Cloud Height (m) 5,Cloud Height (m) 6,Dew Point (C),Humidity Fraction,Precipitation One Hour (mm),Pressure Altimeter (mbar),Pressure Sea Level (mbar),Pressure Station (mbar),Snow Depth (cm),Temperature (C),Visibility (km),Weather Code 1,Weather Code 1/ Description,Weather Code 2,Weather Code 2/ Description,Weather Code 3,Weather Code 3/ Description,Weather Code 4,Weather Code 4/ Description,Weather Code 5,Weather Code 5/ Description,Weather Code 6,Weather Code 6/ Description,Weather Code Most Severe / Icon Code,Weather Code Most Severe,Weather Code Most Severe / Description,Wind Direction (degrees),Wind Gust (m/s),Wind Speed (m/s)
type,int,int,int,int,int,int,int,int,int,real,real,real,real,real,int,int,int,real,real,real,int,int,int,real,real,real,real,int,int,int,real,real,int,enum,int,enum,int,enum,int,enum,int,enum,int,enum,int,int,enum,int,real,real
mins,2013.0,1.0,1.0,0.0,2013.0,1.0,1.0,0.0,0.0,61.0,0.0,0.0,0.25,0.5,,,,60.96,213.36,365.76,,,,-26.7,0.1251,0.0,983.2949,,,,-15.6,0.001,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,10.0,7.2,0.0
mean,2013.5,6.52602739726,15.7205479452,11.5,2013.50057078,6.52511415525,15.721347032,11.5001141553,0.0,1306.31195846,0.416742490522,0.361207349081,0.872445384073,0.963045685279,0.0,0.0,0.0,1293.9822682,1643.73900166,2084.89386376,0.0,0.0,0.0,4.31304646766,0.596736389159,1.37993010753,1017.82581441,0.0,0.0,0.0,12.5789090701,14.3914429682,4.84251968504,,3.65867689358,,2.84660766962,,2.01149425287,,4.125,,3.0,0.0,1.37848173516,4.84251968504,,194.69525682,9.42216948073,2.41032887849
maxs,2014.0,12.0,31.0,23.0,2015.0,12.0,31.0,23.0,0.0,3657.6,1.0,1.0,1.0,1.0,,,,3657.5999,3657.5999,3657.5999,,,,24.4,1.0,26.924,1042.2113,,,,36.1,16.0934,60.0,11.0,60.0,10.0,36.0,7.0,27.0,4.0,27.0,2.0,3.0,0.0,16.0,60.0,11.0,360.0,20.58,10.8
sigma,0.500014270017,3.44794972385,8.79649804852,6.92238411188,0.500584411716,3.44782405458,8.79561488868,6.92230165203,0.0,995.339856966,0.462720830993,0.42770569708,0.197155690367,0.0861015598104,-0.0,-0.0,-0.0,962.743095854,916.73861349,887.215847511,-0.0,-0.0,-0.0,10.9731282097,0.185792011866,2.56215129179,7.46451697179,-0.0,-0.0,-0.0,10.0396739531,3.69893623033,5.70486576983,,6.13386253912,,5.80553286364,,3.12340844261,,6.15223536611,,0.0,0.0,4.07386062702,5.70486576983,,106.350000031,1.81511871115,1.61469790524
zeros,0,0,0,730,0,0,0,730,17455,0,8758,8758,0,0,0,0,0,0,0,0,0,0,0,268,0,501,0,0,0,0,269,0,0,17,0,30,0,13,0,20,0,12,0,2,14980,0,17,0,0,2768
missing,0,0,0,0,0,0,0,0,65,10780,375,375,14682,16535,17520,17520,17520,9103,14683,16535,17520,17520,17520,67,67,15660,360,17520,17520,17520,67,412,14980,14980,16477,16477,17181,17181,17433,17433,17504,17504,17518,17518,0,14980,14980,9382,14381,1283
0,2013.0,1.0,1.0,0.0,2013.0,1.0,1.0,5.0,0.0,2895.6,1.0,0.9,1.0,,,,,2895.5999,3352.8,,,,,-5.0,0.5447,,1013.0917,,,,3.3,16.0934,,,,,,,,,,,,,0.0,,,,,2.57
1,2013.0,1.0,1.0,1.0,2013.0,1.0,1.0,6.0,0.0,3048.0,1.0,1.0,,,,,,3048.0,,,,,,-4.4,0.5463,,1012.0759,,,,3.9,16.0934,,,,,,,,,,,,,0.0,,,260.0,9.77,4.63
2,2013.0,1.0,1.0,2.0,2013.0,1.0,1.0,7.0,0.0,1828.8,1.0,1.0,,,,,,1828.7999,,,,,,-3.3,0.619,,1012.4145,,,,3.3,16.0934,,,,,,,,,,,,,0.0,,,,7.72,1.54


In [12]:
# Lots of columns in there!  Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature.  Alas, drop the "snow" column since it's
# all NA's.  Also add in dew point and humidity just in case.  Slice out just
# the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local", "Month Local", "Day Local", "Hour Local", "Dew Point (C)", "Humidity Fraction",
               "Precipitation One Hour (mm)", "Temperature (C)", "Weather Code 1/ Description"]]

wthr2.set_name(wthr2.names.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.set_name(wthr2.names.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!  

Rows:17,520 Cols:9
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,44,14.8148149,3.4 KB,1.8906130
C0D,Constant Reals,1,0.3367003,80 B,0.0429685
C1,1-Byte Integers,18,6.0606062,10.6 KB,5.8469355
C1N,1-Byte Integers (w/o NAs),88,29.6296299,51.8 KB,28.4929335
C1S,1-Byte Fractions,42,14.1414136,24.9 KB,13.6768669
C2S,2-Byte Fractions,68,22.8956223,76.9 KB,42.3207283
CNAXI,NA Sparse Integers,15,5.0505050,3.7 KB,2.0549674
CNAXD,NA Sparse Reals,21,7.0707068,10.3 KB,5.6739874


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,181.8 KB,17520.0,33.0,297.0
mean,181.8 KB,17520.0,33.0,297.0
min,181.8 KB,17520.0,33.0,297.0
max,181.8 KB,17520.0,33.0,297.0
stddev,0 B,0.0,0.0,0.0
total,181.8 KB,17520.0,33.0,297.0






Unnamed: 0,Year Local,Month Local,Day Local,Hour Local,Dew Point (C),Humidity Fraction,Rain (mm),Temperature (C),WC1
type,int,int,int,int,real,real,real,real,enum
mins,2013.0,1.0,1.0,0.0,-26.7,0.1251,0.0,-15.6,0.0
mean,2013.5,6.52602739726,15.7205479452,11.5,4.31304646766,0.596736389159,1.37993010753,12.5789090701,
maxs,2014.0,12.0,31.0,23.0,24.4,1.0,26.924,36.1,11.0
sigma,0.500014270017,3.44794972385,8.79649804852,6.92238411188,10.9731282097,0.185792011866,2.56215129179,10.0396739531,
zeros,0,0,0,730,268,0,501,269,17
missing,0,0,0,0,67,67,15660,67,14980
0,2013.0,1.0,1.0,0.0,-5.0,0.5447,,3.3,
1,2013.0,1.0,1.0,1.0,-4.4,0.5463,,3.9,
2,2013.0,1.0,1.0,2.0,-3.3,0.619,,3.3,


In [13]:
# Filter down to the weather at Noon
wthr3 = wthr2[wthr2["Hour Local"] == 12]

In [14]:
# Lets now get Days since the epoch... we'll convert year/month/day into Epoch
# time, and then back to Epoch days.  Need zero-based month and days, but have
# 1-based.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"] - 1, 
                                    day=wthr3["Day Local"] - 1, hour=wthr3["Hour Local"])
secsPerDay = 1000 * 60 * 60 * 24
wthr3["Days"] = (wthr3["msec"] / secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970).  Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.  

Rows:730 Cols:11
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,78,21.4876026,6.1 KB,16.9985563
C0D,Constant Reals,7,1.9283747,560 B,1.5255115
C1,1-Byte Integers,21,5.7851240,1.9 KB,5.1921871
C1N,1-Byte Integers (w/o NAs),54,14.8760334,4.8 KB,13.2910192
C1S,1-Byte Fractions,87,23.9669427,9.0 KB,25.1273543
C2S,2-Byte Fractions,88,24.2424250,11.0 KB,30.8098823
CNAXI,NA Sparse Integers,11,3.0303031,882 B,2.4026806
CNAXD,NA Sparse Reals,14,3.8567495,1.2 KB,3.3942629
CUD,Unique Reals,3,0.8264462,462 B,1.2585470


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,35.8 KB,730.0,33.0,363.0
mean,35.8 KB,730.0,33.0,363.0
min,35.8 KB,730.0,33.0,363.0
max,35.8 KB,730.0,33.0,363.0
stddev,0 B,0.0,0.0,0.0
total,35.8 KB,730.0,33.0,363.0






Unnamed: 0,Year Local,Month Local,Day Local,Hour Local,Dew Point (C),Humidity Fraction,Rain (mm),Temperature (C),WC1,msec,Days
type,int,int,int,int,real,real,real,real,enum,int,int
mins,2013.0,1.0,1.0,12.0,-26.7,0.1723,0.0,-13.9,0.0,1.3570704e+12,15706.0
mean,2013.5,6.52602739726,15.7205479452,12.0,4.23012379642,0.539728198074,1.53125714286,14.0687757909,,1.3885608526e+12,16070.5
maxs,2014.0,12.0,31.0,12.0,23.3,1.0,12.446,34.4,10.0,1.420056e+12,16435.0
sigma,0.500342818004,3.45021529307,8.80227802701,0.0,11.1062964725,0.179945027923,2.36064248615,10.3989855149,,18219740080.4,210.877136425
zeros,0,0,0,0,14,0,15,7,1,0,0
missing,0,0,0,0,3,3,660,3,620,0,0
0,2013.0,1.0,1.0,12.0,-3.3,0.5934,,3.9,,1.3570704e+12,15706.0
1,2013.0,1.0,2.0,12.0,-11.7,0.4806,,-2.2,,1.3571568e+12,15707.0
2,2013.0,1.0,3.0,12.0,-10.6,0.5248,,-2.2,,1.3572432e+12,15708.0


In [15]:
# Lets drop off the extra time columns to make a easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")

In [16]:
# Also, most rain numbers are missing - lets assume those are zero rain days
rain = wthr4["Rain (mm)"]
rain[rain.isna()] = 0
wthr4["Rain (mm)"] = rain

In [17]:
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4, all_x=True, all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()

Merge Daily Weather with Bikes-Per-Day
Rows:10,450 Cols:10
Chunk compression summary: 


0,1,2,3,4,5
chunk_type,chunk_name,count,count_percentage,size,size_percentage
C0L,Constant Integers,64,20.0000003,5.0 KB,3.5199165
C0D,Constant Reals,33,10.3124999,2.6 KB,1.8149570
CBS,Bits,6,1.8750001,666 B,0.4578641
CXI,Zero Sparse Integers,4,1.2500000,800 B,0.5499870
C1,1-Byte Integers,2,0.6250000,788 B,0.5417371
C1N,1-Byte Integers (w/o NAs),24,7.5000003,9.3 KB,6.5255947
C1S,1-Byte Fractions,31,9.6874997,12.4 KB,8.7502919
C2,2-Byte Integers,63,19.6875006,44.3 KB,31.2090099
CNAXI,NA Sparse Integers,2,0.6250000,372 B,0.2557439


Frame distribution summary: 


0,1,2,3,4
,size,number_of_rows,number_of_chunks_per_column,number_of_chunks
127.0.0.1:54321,142.0 KB,10450.0,32.0,320.0
mean,142.0 KB,10450.0,32.0,320.0
min,142.0 KB,10450.0,32.0,320.0
max,142.0 KB,10450.0,32.0,320.0
stddev,0 B,0.0,0.0,0.0
total,142.0 KB,10450.0,32.0,320.0






Unnamed: 0,Days,start station name,bikes,Month,DayOfWeek,Dew Point (C),Humidity Fraction,Rain (mm),Temperature (C),WC1
type,int,enum,int,enum,enum,real,real,real,real,enum
mins,15979.0,0.0,1.0,0.0,0.0,-2.2,0.3485,0.0,9.4,2.0
mean,15994.4415311,,99.3025837321,0.968612440191,,7.77999043062,0.562374191388,0.00794813397129,16.9630717703,
maxs,16010.0,329.0,553.0,1.0,6.0,19.4,0.8718,0.254,26.1,8.0
sigma,9.23370172444,,72.9721964301,0.174371128617,,6.49151146664,0.149631413472,0.0442248839098,4.29746634617,
zeros,0,32,0,328,1635,0,0,10123,0,0
missing,0,0,0,0,0,0,0,0,0,9134
0,15979.0,1 Ave & E 15 St,97.0,9,Mon,10.6,0.4315,0.0,23.9,
1,15979.0,1 Ave & E 18 St,75.0,9,Mon,10.6,0.4315,0.0,23.9,
2,15979.0,1 Ave & E 30 St,113.0,9,Mon,10.6,0.4315,0.0,23.9,


Days,start station name,bikes,Month,DayOfWeek,Dew Point (C),Humidity Fraction,Rain (mm),Temperature (C),WC1
15979,1 Ave & E 15 St,97,9,Mon,10.6,0.4315,0,23.9,
15979,1 Ave & E 18 St,75,9,Mon,10.6,0.4315,0,23.9,
15979,1 Ave & E 30 St,113,9,Mon,10.6,0.4315,0,23.9,
15979,10 Ave & W 28 St,74,9,Mon,10.6,0.4315,0,23.9,
15979,11 Ave & W 27 St,139,9,Mon,10.6,0.4315,0,23.9,
15979,11 Ave & W 41 St,60,9,Mon,10.6,0.4315,0,23.9,
15979,12 Ave & W 40 St,90,9,Mon,10.6,0.4315,0,23.9,
15979,2 Ave & E 31 St,88,9,Mon,10.6,0.4315,0,23.9,
15979,2 Ave & E 58 St,55,9,Mon,10.6,0.4315,0,23.9,
15979,3 Ave & Schermerhorn St,8,9,Mon,10.6,0.4315,0,23.9,


In [18]:
# 7 - Test/Train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)

Training data has 10 columns and 6200 rows, test has 3169 rows, holdout has 1081
gbm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████████████████████████| 100%


0,1,2,3,4
Model,mse TRAIN,mse TEST,mse HOLDOUT,Model Training Time (s)
GBM,49.7116910,419.8720174,410.5695314,7.269
DRF,650.3114403,1149.0162553,1252.8437471,17.941
GLM,691.6546957,793.8077896,795.1698966,7.959
DL,249.9157275,503.6839407,509.1793580,7.958
