In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

## Grab the data
We'll pull in some moderately pre-processed data from our forest inventory plots.

In [2]:
# tabular training records
DATA = '../data/processed/training_data_annual.csv'
df = pd.read_csv(DATA)

# plot footprints, which carry the `source` each plot came from
PLOTS = '../data/processed/blm_usfs_wadnr_plot_footprints.shp'
gdf = gpd.read_file(PLOTS)
# agency = the part of `source` before the first '-', with 'WA' expanded to 'WADNR'
gdf['agency'] = [src.split('-')[0].replace('WA', 'WADNR') for src in gdf['source']]

In [3]:
# never truncate columns when displaying a frame, and let .info() list up to 999 columns
pd.options.display.max_columns = None
pd.options.display.max_info_columns = 999

## Splitting the data into training and testing sets
Because our data include several records from the same plots at different years, both observed (lidar) and simulated (growth-and-yield), we want to make sure that all records for any given plot are assigned to either the training set or the testing set. Similarly, we will assign all records of a plot to one of five folds that will be used for cross-validation. This ensures that the testing datasets do not include plot records that are very closely related to those the model has already been trained on.

In [4]:
# one row per plot id seen in the training data, tagged with its agency
# (the default inner merge keeps only plot ids that also appear in gdf)
plot_ids = pd.DataFrame({'uuid': pd.unique(df['uuid'])})
splits = plot_ids.merge(gdf[['uuid', 'agency']], on='uuid').set_index('uuid')

# randomly assign each plot id to the train (0) or test (1) set with
# probabilities 0.8 / 0.2; the seed keeps the draw repeatable
np.random.seed(127)
splits['train0_test1'] = np.random.choice(2, p=[0.8, 0.2], size=len(splits))

In [5]:
# share of each agency's plots that landed in the train (0) and test (1) sets
plots_per_agency_set = splits.reset_index().groupby(by=['agency', 'train0_test1'])['uuid'].count()
plots_per_agency = splits.reset_index().groupby(by=['agency'])['uuid'].count()
(plots_per_agency_set / plots_per_agency).round(2)

agency  train0_test1
BLM     0               0.81
        1               0.19
USFS    0               0.81
        1               0.19
WADNR   0               0.79
        1               0.21
Name: uuid, dtype: float64

In [6]:
# look up every record's plot id in `splits` and copy the plot-level
# agency and train/test flag onto the record
df['agency'] = splits.loc[df['uuid'], 'agency'].values
df['train0_test1'] = splits.loc[df['uuid'], 'train0_test1'].values

In [7]:
# Assign every training plot to one of five cross-validation folds.
# Plots are listed by forest type (taken from each plot's record with the
# smallest year_diff) and dealt out to folds 0-4 in turn, which spreads each
# forest type across the folds. Agency is not used in the assignment itself.
min_yr_idx = (
    df.loc[df['train0_test1'] == 0]
    .groupby(by=['uuid'])['year_diff']
    .idxmin()
)
min_yrs = df.loc[min_yr_idx]
grouped = (
    min_yrs.loc[min_yrs['train0_test1'] == 0]
    .groupby(by=['sppgrp_forest_type_name', 'uuid'])['pot_veg_type']
    .count()
    .reset_index()
    .sort_values(by='sppgrp_forest_type_name')
)
# the index is the row position produced by reset_index(), i.e. the
# (forest type, uuid) group order, so `% 5` cycles through the five folds
grouped['fold_assignment'] = grouped.index % 5
grouped = grouped.set_index('uuid')

# copy each plot's fold onto all of its training records
is_train_row = df['train0_test1'] == 0
df.loc[is_train_row, 'fold_assignment'] = grouped.loc[df.loc[is_train_row, 'uuid'], 'fold_assignment'].values

# keep the fold alongside the train/test split for each plot
# (plots absent from `grouped`, i.e. test plots, get NaN from the left join)
splits = splits.join(grouped['fold_assignment'])

In [11]:
# share of each agency's training records in each fold
# (this counts rows of df, not unique plots)
train_plots = df.loc[df['train0_test1'] == 0]
records_per_agency_fold = train_plots.groupby(by=['agency', 'fold_assignment'])['uuid'].count()
records_per_agency = train_plots.groupby(by=['agency'])['uuid'].count()
(records_per_agency_fold / records_per_agency).round(2)

agency  fold_assignment
BLM     0.0                0.21
        1.0                0.19
        2.0                0.20
        3.0                0.20
        4.0                0.21
USFS    0.0                0.21
        1.0                0.21
        2.0                0.20
        3.0                0.21
        4.0                0.17
WADNR   0.0                0.20
        1.0                0.20
        2.0                0.20
        3.0                0.21
        4.0                0.19
Name: uuid, dtype: float64

In [12]:
# share of each forest type's unique training plots that fall in each fold
train_plots = df.loc[df['train0_test1'] == 0]
plots_per_type_fold = train_plots.groupby(by=['sppgrp_forest_type_name', 'fold_assignment'])['uuid'].nunique()
plots_per_type = train_plots.groupby(by=['sppgrp_forest_type_name'])['uuid'].nunique()
(plots_per_type_fold / plots_per_type).round(2)

sppgrp_forest_type_name                  fold_assignment
COTTONWOOD-AND-ASPEN                     0.0                0.25
                                         1.0                0.25
                                         2.0                0.25
                                         3.0                0.25
COTTONWOOD-AND-ASPEN_LODGEPOLE-PINE      0.0                0.50
                                         3.0                0.50
DOUGLAS-FIR                              0.0                0.20
                                         1.0                0.20
                                         2.0                0.20
                                         3.0                0.20
                                         4.0                0.20
DOUGLAS-FIR_ENGELMANN-AND-OTHER-SPRUCES  4.0                1.00
DOUGLAS-FIR_INCENSE-CEDAR                0.0                0.50
                                         1.0                0.50
DOUGLAS-FIR_LODGEPOLE-PINE       

## Weighting inventory plots by their area
This will intensify the model's attention on larger plots, where the inventory attributes are presumably easier to distinguish from remote sensing. This is partly because inventory attributes should be lower in variance at this slightly larger scale, and partly because edge effects associated with co-registration errors between field-based and remotely-sensed data are likely to be larger on smaller plots.

In [8]:
# per-sample weights scale with the size of the inventory plot:
#   WADNR plots are 1/10th acre -> weight 1.0
#   BLM plots are 1/8th acre    -> weight 1.25
#   USFS plots are 1/4 acre     -> weight 2.5
# any other agency value is left without a weight (NaN)
plot_size_weight = {'WADNR': 1.0, 'BLM': 1.25, 'USFS': 2.5}
df['weights'] = df['agency'].map(plot_size_weight)

## Generate categories for diameter class and canopy cover

In [9]:
# SIZE CLASS, binned from qmd (inches); each bin includes its upper edge
#   Nonstocked        0-1
#   Seedling-Sapling  1-5
#   Small             5-10
#   Medium            10-15
#   Large             15-20
#   Very Large        20+   (999 acts as an open upper bound)
qmd_breaks = [0, 1, 5, 10, 15, 20, 999]
size_class_names = ['Nonstocked', 'Seedling-Sapling', 'Small', 'Medium', 'Large', 'Very Large']
qmd_binned = pd.cut(df['qmd'],
                    bins=qmd_breaks,
                    labels=size_class_names,
                    right=True,
                    include_lowest=True)
# stored as plain strings; note that a missing bin would become the string 'nan'
df['size_class'] = qmd_binned.astype(str)

In [10]:
# CANOPY COVER, binned from total_cover (percent); each bin includes its upper edge
#   Sparse    0-10
#   Open      10-40
#   Moderate  40-70
#   Closed    70-100
cover_breaks = [0, 10, 40, 70, 100]
cover_class_names = ['Sparse', 'Open', 'Moderate', 'Closed']
cover_binned = pd.cut(df['total_cover'],
                      bins=cover_breaks,
                      labels=cover_class_names,
                      right=True,
                      include_lowest=True)
# stored as plain strings; note that a missing bin would become the string 'nan'
df['cover_class'] = cover_binned.astype(str)

## Getting the data into h2o
We'll migrate over from Pandas into h2o now.

In [13]:
# connect to a local H2O instance, starting a new server if none is already running
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_45"; Java(TM) SE Runtime Environment (build 1.8.0_45-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.45-b02, mixed mode)
  Starting server from /storage/anaconda3/envs/forest_mapping/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp2ggkqp9y
  JVM stdout: /tmp/tmp2ggkqp9y/h2o_ubuntu_started_from_python.out
  JVM stderr: /tmp/tmp2ggkqp9y/h2o_ubuntu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_ubuntu_3i8jzv
H2O cluster total nodes:,1
H2O cluster free memory:,13.98 Gb
H2O cluster total cores:,32
H2O cluster allowed cores:,32


In [14]:
# upload the pandas frame to the H2O cluster
hf = h2o.H2OFrame(python_obj=df)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [15]:
# per-column summary (type, min/mean/max, sigma, zeros, missing) to sanity-check the upload
# NOTE(review): the summary shows sentinel-like values (e.g. -9999 minima on the
# intensity and soil columns, ~-3.4e38 on soil_depth) that are not counted as
# missing -- confirm these are meant to reach the models as real numbers
hf.describe()

Rows:37911
Cols:89




Unnamed: 0,uuid,year,mat,mwmt,mcmt,td,map,msp,ahm,shm,dd_0,dd5,dd_18,dd18,nffd,bffp,effp,ffp,pas,emt,ext,eref,cmd,rh,distance_to_water_m,strat0_return-proportion,strat1_return-proportion,strat2_return-proportion,strat3_return-proportion,strat4_return-proportion,strat5_return-proportion,strat0_intensity-median,strat1_intensity-median,strat2_intensity-median,strat3_intensity-median,strat4_intensity-median,strat5_intensity-median,height_05-percentile,height_25-percentile,height_50-percentile,height_75-percentile,height_95_percentile,height_max,cover,potential_volume,stddev_height,surface_area_ratio,surface_volume,surface_volume_ratio,aspect,elevation,overall_curvature,plan_curvature,profile_curvature,slope,solar_radiation_index,bulk_dens,soil_depth,pct_clay_surf,pct_rock_surf,pct_sand_surf,green,blue,red,nir,endvi,ndvi,savi,swir1,swir2,wetness,greenness,brightness,pot_veg_type,tpa,ba,qmd,total_cover,sdi,structure_class,year_diff,spp_forest_type_name,sppgrp_forest_type_name,agency,train0_test1,fold_assignment,weights,size_class,cover_class
type,uuid,int,real,real,real,real,int,int,real,real,int,int,int,int,int,int,int,int,int,real,real,int,int,int,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,int,real,int,int,int,real,real,real,real,real,real,real,real,real,real,real,real,int,int,int,real,int,int,enum,int,enum,enum,enum,int,int,real,enum,enum
mins,,2003.0,2.2,12.6,-10.4,9.8,290.0,41.0,3.8,13.4,37.0,740.0,2075.0,6.0,125.0,55.0,243.0,58.0,6.0,-39.6,29.1,434.0,18.0,46.0,0.042366231,0.0,0.0,0.0,0.0,0.0,0.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.014305000193417072,1.8655999898910516,4.382550239562988,-21.215700149536133,-12.874150276184082,-11.851350784301758,0.4023000001907349,0.7843999862670898,0.0,-3.4028199999999998e+38,-9999.0,-9999.0,-9999.0,107.8387087,25.24,72.0,636.0971429,-0.396288015,-0.327946509,-0.491880839,265.791445,89.92465753,-370.1401315,-317.0603333,69.42755593,11.0,4.0,10.0,0.64,3.0,14.0,,0.0,,,,0.0,0.0,1.0,,
mean,,2012.7337712009692,9.735092717153325,18.569140882593434,1.6232940307562456,16.945461739336853,1843.6942048481963,233.70341061960917,13.805721294611066,118.40592176413189,172.22687346680382,2153.6994803619004,3146.43483421698,137.3364722639869,274.7484107514969,110.82313840310192,304.06233019440236,193.23987760808203,121.99221861728769,-21.11207828862334,36.52884914668567,805.0048798501753,396.07193162934186,66.41705573580221,248.95836077744565,0.14454461414892772,0.09002573279523092,0.08004946886128034,0.1434881144258921,0.12412265558808808,0.3426702104929965,2800.6180528078903,2437.1609823006515,1823.6223127852086,1251.8632586848153,976.8962702118115,-505.7246313734795,1.3467843224921514,13.860238678826736,26.866026906016717,38.771367558043835,51.96224084505816,62.71825931159827,76.54296530452893,3135.4257045122445,6.1444030599740875,4.439864150432785,1878.7819624237627,2.2444880655277377e+34,182.61962846933264,566.8458773029316,-0.4407490372181566,-0.05267046275679865,-0.3387914803773819,21.334789538756862,1.638848224937499,109.46442984885655,-6.103552003376337e+35,2.1109176756086616,1.7307114030228499,14.616180000527551,393.8855174269897,240.20695421417273,293.5538805945269,2512.657706881919,0.7307190518196305,0.7932382358799822,1.189630158673445,1051.7888962850952,474.12650080260556,-44.67920696473714,147.78473594741178,237.21283742502615,1062.2558096594648,279.40996016987185,196.22869351903142,16.649762865659085,57.82371870960936,295.90786315317445,,5.7255677771623015,,,,0.21405396850518318,1.9968116525708153,1.2767600432592106,,
maxs,,2017.0,13.4,23.2,7.2,28.9,4789.0,1240.0,62.0,553.9,993.0,3189.0,5746.0,465.0,336.0,193.0,344.0,279.0,1432.0,-11.1,41.8,1197.0,880.0,80.0,34254.80025,1.0,0.6256970000000001,0.710214,0.897681,0.8748600000000001,0.996817,57257.0,55521.5,60032.0,40633.0,36822.0,28704.0,73.25,93.225006,110.880005,118.277496,121.407501,122.0,100.0,10990.0,28.33873176574707,18.446910858154297,10787.2880859375,1.7014117331926445e+38,357.6234130859375,2089.6982421875,25.27804946899414,11.132100105285645,13.916950225830078,61.56449890136719,1.9931000471115112,200.0,178.0,41.0,95.0,83.0,5926.0,5844.196448,5881.7143799999985,5387.339391,0.942984214,0.929363581,1.393873647,3951.47098,3027.505696,79.87442131,359.788159,1030.508122,1178.0,25356.0,951.0,82.12,100.0,1578.0,,10.0,,,,1.0,4.0,2.5,,
sigma,,3.1337121353228174,1.8687817853774324,1.5062424219471016,3.404076330593434,3.1476135868830224,854.496358563407,170.77281963430514,8.22820943269226,73.93047749420269,156.10874011642088,396.2184225050115,626.8981306635368,75.55041604305423,41.2617959415169,21.39194050378206,19.16778277298544,39.30073920632875,126.21756857234564,6.270073497641846,1.8889052409929208,103.14778488253269,140.53115043556912,4.485455059350698,1019.7007894314727,0.17686342057249968,0.09168590615081892,0.1057148483924281,0.16601869167059877,0.14917350149039466,0.31098788358468527,8165.955210008992,7454.824374939028,6141.135607992979,4733.317344180611,4441.204405264867,6682.674200084088,4.690107599318413,17.99683784703265,25.220806991825576,29.583999519544147,33.098768967240396,35.56202288638247,22.684309205598563,1695.2591319663397,4.07382074667197,2.2063454239766553,1355.338287279818,1.9540714677135495e+36,90.15117513121143,416.0821793884306,3.9506864646219415,1.9997322757221205,2.174785113229838,11.676451717025188,0.1940606035597847,16.611986804996235,1.4398813269218666e+37,424.0056765330994,424.0921041342856,424.6628034698694,236.05793783790588,280.5559422032443,294.95850377533935,658.1583082900864,0.12428409818712903,0.1353337756702627,0.2029682497043859,441.561226146426,331.0283024308225,46.307496862583875,53.458565675476095,67.11607938040802,55.498413873196235,1020.8138595545155,127.96508929703012,8.210327081019305,21.55444782040182,175.1665210046589,,2.838113472198769,,,,0.4101698487409271,1.4079425778896228,0.4922031751276477,,
zeros,0,0,0,0,519,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,34,67,739,1295,4251,7918,125,186,99,109,35,15,21859,5365,1936,481,37,21,179,36,36,0,31,0,0,0,24,0,0,0,0,68,0,0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,892,,,,29796,5973,0,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,428,0,428,428,428,428,428,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8115,0,0,0
0,00027724-39D2-42B3-954D-CF2DBE527A44,2015.0,8.2,19.8,-2.5,22.4,435.0,104.0,41.8,191.1,357.0,2029.0,3744.0,182.0,192.0,140.0,259.0,118.0,98.0,-34.5,38.1,936.0,700.0,53.0,89.72353573,0.911437,0.015141,0.038568,0.01038,0.0,0.0,91.0,99.0,55.0,61.0,86.0,-9999.0,0.0,0.0,0.0,0.0,4.83,12.91,7.342158,94.0,0.0795459970831871,1.0243749618530271,1.406838059425354,0.0256744995713234,131.97630310058594,1328.897216796875,0.4869999885559082,-0.07015000283718109,0.10244999825954436,11.935199737548828,1.7980999946594238,106.0,118.0,18.0,22.0,16.0,812.2624377999998,549.7394279,978.8905473,2124.200871,0.458227581,0.370752561,0.556038605,2304.209577,1702.5907960000004,-153.2668939,64.11187174,345.6270828000001,1159.0,8.0,10.0,15.31,5.0,16.0,0=BG,3.0,PP,PONDEROSA-AND-JEFFREY-PINES,USFS,0.0,0.0,2.5,Large,Sparse
1,00027724-39D2-42B3-954D-CF2DBE527A44,2015.0,8.2,19.8,-2.5,22.4,435.0,104.0,41.8,191.1,357.0,2029.0,3744.0,182.0,192.0,140.0,259.0,118.0,98.0,-34.5,38.1,936.0,700.0,53.0,89.72353573,0.911437,0.015141,0.038568,0.01038,0.0,0.0,91.0,99.0,55.0,61.0,86.0,-9999.0,0.0,0.0,0.0,0.0,4.83,12.91,7.342158,94.0,0.0795459970831871,1.0243749618530271,1.406838059425354,0.0256744995713234,131.97630310058594,1328.897216796875,0.4869999885559082,-0.07015000283718109,0.10244999825954436,11.935199737548828,1.7980999946594238,106.0,118.0,18.0,22.0,16.0,812.2624377999998,549.7394279,978.8905473,2124.200871,0.458227581,0.370752561,0.556038605,2304.209577,1702.5907960000004,-153.2668939,64.11187174,345.6270828000001,1159.0,8.0,10.0,15.57,5.0,16.0,0=BG,4.0,PP,PONDEROSA-AND-JEFFREY-PINES,USFS,0.0,0.0,2.5,Large,Sparse
2,00027724-39D2-42B3-954D-CF2DBE527A44,2015.0,8.2,19.8,-2.5,22.4,435.0,104.0,41.8,191.1,357.0,2029.0,3744.0,182.0,192.0,140.0,259.0,118.0,98.0,-34.5,38.1,936.0,700.0,53.0,89.72353573,0.911437,0.015141,0.038568,0.01038,0.0,0.0,91.0,99.0,55.0,61.0,86.0,-9999.0,0.0,0.0,0.0,0.0,4.83,12.91,7.342158,94.0,0.0795459970831871,1.0243749618530271,1.406838059425354,0.0256744995713234,131.97630310058594,1328.897216796875,0.4869999885559082,-0.07015000283718109,0.10244999825954436,11.935199737548828,1.7980999946594238,106.0,118.0,18.0,22.0,16.0,812.2624377999998,549.7394279,978.8905473,2124.200871,0.458227581,0.370752561,0.556038605,2304.209577,1702.5907960000004,-153.2668939,64.11187174,345.6270828000001,1159.0,8.0,11.0,15.82,5.0,17.0,0=BG,5.0,PP,PONDEROSA-AND-JEFFREY-PINES,USFS,0.0,0.0,2.5,Large,Sparse


Identify our categorical variables.

In [16]:
# potential vegetation type is parsed as an integer code; make it a factor
hf['pot_veg_type'] = hf['pot_veg_type'].asfactor()
# plot ids: convert to character first, then to a factor
hf['uuid'] = hf['uuid'].ascharacter().asfactor()

We'll create a couple boolean masks that we can use to separate our data into train and test sets.

In [17]:
# row masks on the train (0) / test (1) flag
train_mask, test_mask = (hf['train0_test1'] == 0), (hf['train0_test1'] == 1)

In [18]:
# predictor groups, by data source; names must match the frame's columns exactly
# (note 'height_95_percentile' uses an underscore where its siblings use a hyphen)
LIDAR_COLS = [
    # proportion of returns per height stratum
    'strat0_return-proportion', 'strat1_return-proportion', 'strat2_return-proportion',
    'strat3_return-proportion', 'strat4_return-proportion', 'strat5_return-proportion',
    # median return intensity per height stratum
    'strat0_intensity-median', 'strat1_intensity-median', 'strat2_intensity-median',
    'strat3_intensity-median', 'strat4_intensity-median', 'strat5_intensity-median',
    # height distribution
    'height_05-percentile', 'height_25-percentile', 'height_50-percentile',
    'height_75-percentile', 'height_95_percentile', 'height_max',
    # canopy metrics
    'cover', 'potential_volume', 'stddev_height', 'surface_area_ratio',
    'surface_volume', 'surface_volume_ratio',
    # terrain
    'aspect', 'elevation', 'overall_curvature', 'plan_curvature',
    'profile_curvature', 'slope', 'solar_radiation_index',
]
LANDSAT_COLS = [
    'green', 'blue', 'red', 'nir',
    'endvi', 'ndvi', 'savi',
    'swir1', 'swir2',
    'wetness', 'greenness', 'brightness',
]
CLIMATE_COLS = [
    'mat', 'mwmt', 'mcmt', 'td', 'map', 'msp',
    'ahm', 'shm', 'dd_0', 'dd5', 'dd_18', 'dd18',
    'nffd', 'bffp', 'effp', 'ffp', 'pas', 'emt',
    'ext', 'eref', 'cmd', 'rh',
]
SOIL_COLS = [
    'bulk_dens', 'soil_depth',
    'pct_clay_surf', 'pct_rock_surf', 'pct_sand_surf',
]

## Fitting models in h2o
First, split into train and test h2o dataframes.

In [19]:
# row-slice the H2O frame with the boolean masks, keeping all columns
train_hf = hf[train_mask, :]
test_hf = hf[test_mask, :]

Then, define several models to train.

In [20]:
# configure (but do not yet train) the random forest used for `qmd`
rf_qmd = H2ORandomForestEstimator(
    model_id="rf_qmd",
    seed=127,
    ntrees=200,
    # early stopping: score every iteration, stop after 2 rounds without improvement
    score_each_iteration=True,
    stopping_rounds=2,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [21]:
# configure (but do not yet train) the random forest used for `size_class`
rf_size_class = H2ORandomForestEstimator(
    model_id="rf_size_class",
    seed=127,
    ntrees=200,
    # early stopping: score every iteration, stop after 2 rounds without improvement
    score_each_iteration=True,
    stopping_rounds=2,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [22]:
# configure (but do not yet train) the random forest used for `total_cover`
rf_cov = H2ORandomForestEstimator(
    model_id="rf_cov",
    seed=127,
    ntrees=200,
    # early stopping: score every iteration, stop after 2 rounds without improvement
    score_each_iteration=True,
    stopping_rounds=2,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [23]:
# configure (but do not yet train) the random forest used for `cover_class`
rf_cov_class = H2ORandomForestEstimator(
    model_id="rf_cov_class",
    seed=127,
    ntrees=200,
    # early stopping: score every iteration, stop after 2 rounds without improvement
    score_each_iteration=True,
    stopping_rounds=2,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [24]:
# configure (but do not yet train) the random forest used for forest type
rf_type = H2ORandomForestEstimator(
    model_id="rf_type",
    seed=127,
    ntrees=200,
    # early stopping: score every iteration, stop after 2 rounds without improvement
    score_each_iteration=True,
    stopping_rounds=2,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [25]:
# configure (but do not yet train) the gradient boosting model used for `size_class`
gbm_size_class = H2OGradientBoostingEstimator(
    model_id="gbm_size_class",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    # row and column subsampling rates
    sample_rate=0.7,
    col_sample_rate=0.7,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [26]:
# configure (but do not yet train) the gradient boosting model used for `cover_class`
gbm_cov_class = H2OGradientBoostingEstimator(
    model_id="gbm_cov_class",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    # row and column subsampling rates
    sample_rate=0.7,
    col_sample_rate=0.7,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [27]:
# configure (but do not yet train) the gradient boosting model used for forest type
gbm_type = H2OGradientBoostingEstimator(
    model_id="gbm_type",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    # row and column subsampling rates
    sample_rate=0.7,
    col_sample_rate=0.7,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

Gradient Boosting Machines will accept a per-category sampling rate to help balance out the fact that classes are highly imbalanced (e.g., we have a lot more Douglas-fir forest types than other forest types). We'll calculate the sampling rate as the inverse of each category's share of the records, rescaled so that the rarest category is sampled at 100%, with a floor of 25% of available samples.

In [28]:
# record counts per forest-type level
# NOTE(review): counted on `hf`, i.e. training and test rows together
table = hf['sppgrp_forest_type_name'].table(dense=False).as_data_frame()
# inverse class frequency, rescaled so the rarest class has a rate of 1.0
inverse_freq = 1 / (table.Count / table.Count.sum())
sample_rate = inverse_freq / inverse_freq.max()
# never sample a class at less than 25%
sample_rate[sample_rate < 0.25] = 0.25

In [62]:
# check the column names of the class-count table before exporting it
table.columns

Index(['sppgrp_forest_type_name', 'Count'], dtype='object')

In [63]:
# write the forest-type names, with their row index, as a crosswalk file
table.to_csv('../data/processed/sppgrp_forest_type_crosswalk.csv',
             columns=['sppgrp_forest_type_name'])

In [29]:
# configure (but do not yet train) a forest-type GBM that samples rows at a
# different rate for each class, using the rates in `sample_rate`
gbm_type_rebalance = H2OGradientBoostingEstimator(
    model_id="gbm_type_rebalance",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    sample_rate_per_class=list(sample_rate),
    col_sample_rate=0.6,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [30]:
# configure (but do not yet train) the gradient boosting model used for `qmd`
gbm_qmd = H2OGradientBoostingEstimator(
    model_id="gbm_qmd",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    col_sample_rate=0.6,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

In [31]:
# configure (but do not yet train) the gradient boosting model used for `total_cover`
gbm_cov = H2OGradientBoostingEstimator(
    model_id="gbm_cov",
    seed=127,
    ntrees=200,
    learn_rate=0.05,
    col_sample_rate=0.6,
    # early stopping: score every iteration, stop after 2 rounds
    # without at least a 0.01 relative improvement
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    # cross-validate on the pre-assigned folds and weight rows by `weights`
    fold_column='fold_assignment',
    weights_column='weights',
)

### Fit the models

In [32]:
# predictors: lidar and Landsat metrics plus year_diff; response: qmd
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
rf_qmd.train(x=X_COLS, y='qmd', training_frame=train_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [33]:
# predictors: lidar and Landsat metrics plus year_diff; response: size_class
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
rf_size_class.train(x=X_COLS, y='size_class', training_frame=train_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [34]:
# predictors: lidar and Landsat metrics plus year_diff; response: qmd
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
gbm_qmd.train(x=X_COLS, y='qmd', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [35]:
# predictors: lidar and Landsat metrics plus year_diff; response: size_class
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
gbm_size_class.train(x=X_COLS, y='size_class', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [36]:
# predictors: lidar and Landsat metrics plus year_diff; response: total_cover
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
rf_cov.train(x=X_COLS, y='total_cover', training_frame=train_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [37]:
# predictors: lidar and Landsat metrics plus year_diff; response: cover_class
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
rf_cov_class.train(x=X_COLS, y='cover_class', training_frame=train_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [38]:
# predictors: lidar and Landsat metrics plus year_diff; response: total_cover
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
gbm_cov.train(x=X_COLS, y='total_cover', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [39]:
# predictors: lidar and Landsat metrics plus year_diff; response: cover_class
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, 'year_diff']
gbm_cov_class.train(x=X_COLS, y='cover_class', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [40]:
# forest type uses the widest predictor set: lidar, Landsat, climate and soil
# metrics plus potential vegetation type, distance to water and year_diff
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, *CLIMATE_COLS, *SOIL_COLS,
          'pot_veg_type', 'distance_to_water_m', 'year_diff']
rf_type.train(x=X_COLS, y='sppgrp_forest_type_name', training_frame=train_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [41]:
# forest type uses the widest predictor set: lidar, Landsat, climate and soil
# metrics plus potential vegetation type, distance to water and year_diff
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, *CLIMATE_COLS, *SOIL_COLS,
          'pot_veg_type', 'distance_to_water_m', 'year_diff']
gbm_type.train(x=X_COLS, y='sppgrp_forest_type_name', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [42]:
# forest type uses the widest predictor set: lidar, Landsat, climate and soil
# metrics plus potential vegetation type, distance to water and year_diff
X_COLS = [*LIDAR_COLS, *LANDSAT_COLS, *CLIMATE_COLS, *SOIL_COLS,
          'pot_veg_type', 'distance_to_water_m', 'year_diff']
gbm_type_rebalance.train(x=X_COLS, y='sppgrp_forest_type_name', training_frame=train_hf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


## Inspect model performance
Also, check out [Flow](http://localhost:54321) for more model statistics and visualizations.

In [43]:
# cross-validated mean absolute error of the random forests: (qmd, total_cover)
tuple(model.mae(xval=True) for model in (rf_qmd, rf_cov))

(3.90304167441095, 11.09470763866289)

In [44]:
# cross-validated mean absolute error of the GBMs: (qmd, total_cover)
tuple(model.mae(xval=True) for model in (gbm_qmd, gbm_cov))

(4.054358450491799, 11.699167114601577)

In [45]:
# mean absolute error of the random-forest qmd model on the held-out test plots
perf_qmd = rf_qmd.model_performance(test_data=test_hf)
perf_qmd.mae()

3.537234896694631

In [46]:
# mean absolute error of the random-forest cover model on the held-out test plots
perf_cov = rf_cov.model_performance(test_data=test_hf)
perf_cov.mae()

10.376527119098988

In [47]:
# log loss of the random-forest forest-type model on the held-out test plots
perf_rf_type = rf_type.model_performance(test_data=test_hf)
perf_rf_type.logloss()

3.2715075572139423

In [48]:
# log loss of the GBM forest-type model on the held-out test plots
perf_gbm_type = gbm_type.model_performance(test_data=test_hf)
perf_gbm_type.logloss()

1.8777599755971215

In [49]:
# log loss of the class-rebalanced GBM forest-type model on the held-out test plots
# (per-class sampling rates floored at 25%)
perf_gbm_type_rebalance = gbm_type_rebalance.model_performance(test_data=test_hf)
perf_gbm_type_rebalance.logloss()

1.8489639468522543

In [50]:
# log loss of the GBM size-class model on the held-out test plots
perf = gbm_size_class.model_performance(test_data=test_hf)
perf.logloss()

1.0941957092034764

In [51]:
# log loss of the random-forest size-class model on the held-out test plots
perf = rf_size_class.model_performance(test_data=test_hf)
perf.logloss()

1.2937358135120287

In [52]:
# log loss of the GBM cover-class model on the held-out test plots
perf = gbm_cov_class.model_performance(test_data=test_hf)
perf.logloss()

0.9392128510406111

In [53]:
# log loss of the random-forest cover-class model on the held-out test plots
perf = rf_cov_class.model_performance(test_data=test_hf)
perf.logloss()

1.0829109758829785

## Save our favorite models to disk

In [54]:
# save the class-rebalanced forest-type GBM, overwriting any earlier copy,
# and show where it was written
gbm_type_rebalance_path = h2o.save_model(
    gbm_type_rebalance,
    path="../models/gbm_forest_type_rebalance",
    force=True,
)
print(gbm_type_rebalance_path)

/storage/forestmapping/models/gbm_forest_type_rebalance/gbm_type_rebalance


In [55]:
# save the random-forest qmd model, overwriting any earlier copy,
# and show where it was written
rf_qmd_path = h2o.save_model(
    rf_qmd,
    path="../models/rf_qmd",
    force=True,
)
print(rf_qmd_path)

/storage/forestmapping/models/rf_qmd/rf_qmd


In [56]:
# save the random-forest cover model, overwriting any earlier copy,
# and show where it was written
rf_cov_path = h2o.save_model(
    rf_cov,
    path="../models/rf_cov",
    force=True,
)
print(rf_cov_path)

/storage/forestmapping/models/rf_cov/rf_cov


In [57]:
# save the GBM cover-class model, overwriting any earlier copy,
# and show where it was written
gbm_cov_class_path = h2o.save_model(
    gbm_cov_class,
    path="../models/gbm_cov_class",
    force=True,
)
print(gbm_cov_class_path)

/storage/forestmapping/models/gbm_cov_class/gbm_cov_class


In [58]:
# save the GBM size-class model, overwriting any earlier copy,
# and show where it was written
gbm_size_class_path = h2o.save_model(
    gbm_size_class,
    path="../models/gbm_size_class",
    force=True,
)
print(gbm_size_class_path)

/storage/forestmapping/models/gbm_size_class/gbm_size_class


## Shutdown the h2o cluster

In [59]:
# Left disabled; uncomment the line below to shut the H2O cluster down when finished.
# h2o.cluster().shutdown()

H2O session _sid_9480 closed.
