In [16]:
import pickle
from copy import deepcopy
import time

# data prep and model-tuning
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# types of models we'll fit
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.multioutput import RegressorChain

In [17]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [18]:
from warnings import simplefilter
# Ignore every UserWarning raised from here on.
# NOTE(review): this is a blanket filter, so it can also hide warnings worth reading — confirm it is intentional.
simplefilter(action='ignore', category=UserWarning)

In [19]:
# Use seaborn's 'darkgrid' style for the figures in this notebook.
sns.set_style('darkgrid')
# IPython magic: select matplotlib's inline backend.
%matplotlib inline

## Our Data

In [20]:
# Plot-level attributes: read the CSV and keep only the columns listed in KEEP_PLOT_COLS.
PLOT_DATA = '../data/processed/plot_features.csv'
KEEP_PLOT_COLS = [
    'uuid', 'lat', 'lon', 'ecoregion3', 'agency',
    'distance_to_water_m', 'plot_size_ac', 'meas_yr',
]
plot_data = pd.read_csv(PLOT_DATA).loc[:, KEEP_PLOT_COLS]
plot_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5089 entries, 0 to 5088
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   uuid                 5089 non-null   object 
 1   lat                  5089 non-null   float64
 2   lon                  5089 non-null   float64
 3   ecoregion3           5089 non-null   object 
 4   agency               5089 non-null   object 
 5   distance_to_water_m  5089 non-null   float64
 6   plot_size_ac         5089 non-null   float64
 7   meas_yr              5089 non-null   int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 318.2+ KB


In [21]:
# Lidar-derived attributes: only the plot identifier and elevation are carried forward.
LIDAR_DATA = '../data/processed/lidar_features.csv'
lidar_data = pd.read_csv(LIDAR_DATA).loc[:, ['uuid', 'elevation']]
lidar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   uuid       5411 non-null   object 
 1   elevation  5411 non-null   float64
dtypes: float64(1), object(1)
memory usage: 84.7+ KB


In [22]:
# Attach elevation to each plot, then keep a single row per uuid.
plot_data = (
    plot_data
    .merge(lidar_data, on='uuid', how='inner')
    .drop_duplicates(subset=['uuid'])
)
plot_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4272 entries, 0 to 5410
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   uuid                 4272 non-null   object 
 1   lat                  4272 non-null   float64
 2   lon                  4272 non-null   float64
 3   ecoregion3           4272 non-null   object 
 4   agency               4272 non-null   object 
 5   distance_to_water_m  4272 non-null   float64
 6   plot_size_ac         4272 non-null   float64
 7   meas_yr              4272 non-null   int64  
 8   elevation            4272 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 333.8+ KB


In [23]:
# Inventory attributes, indexed by (uuid, year) so they line up with the satellite table.
INVENTORY = '../data/processed/inventory_features.csv'
inv_data = pd.read_csv(
    INVENTORY,
    index_col=['uuid', 'year'],
)
inv_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49410 entries, ('ba510248', 2010) to ('c4f7f099', 2025)
Data columns (total 75 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tpa                  49410 non-null  int64  
 1   ba                   49410 non-null  int64  
 2   sdi                  49410 non-null  int64  
 3   ccf                  49410 non-null  int64  
 4   qmd                  49410 non-null  float64
 5   tcuft                49410 non-null  int64  
 6   topht                49410 non-null  int64  
 7   number_of_strata     49410 non-null  int64  
 8   total_cover          49410 non-null  int64  
 9   structure_class      49410 non-null  object 
 10  canopy_baseheight    49410 non-null  int64  
 11  canopy_bulkdensity   49410 non-null  float64
 12  aboveground_biomass  49410 non-null  int64  
 13  aboveground_carbon   49410 non-null  int64  
 14  gs_tpa               49410 non-null  int64  
 15  AF    

In [24]:
# Satellite predictors, indexed by (uuid, year).
SATELLITE = '../data/processed/satellite_features.csv'
sat = pd.read_csv(SATELLITE, index_col=['uuid', 'year'])
# Column groups are selected by their name prefix: 'S2' and 'LT'.
S2_COLS = sat.columns[sat.columns.str.startswith('S2')].tolist()
LANDTRENDR_COLS = sat.columns[sat.columns.str.startswith('LT')].tolist()
# Keep just those predictors and discard any row with a missing value.
sat = sat.loc[:, S2_COLS + LANDTRENDR_COLS].dropna()
sat.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10178 entries, ('00027724', 2019) to ('fff7e1c3', 2020)
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   S2_R_LEAFOFF           10178 non-null  float64
 1   S2_G_LEAFOFF           10178 non-null  float64
 2   S2_B_LEAFOFF           10178 non-null  float64
 3   S2_NIR_LEAFOFF         10178 non-null  float64
 4   S2_SWIR1_LEAFOFF       10178 non-null  float64
 5   S2_SWIR2_LEAFOFF       10178 non-null  float64
 6   S2_RE1_LEAFOFF         10178 non-null  float64
 7   S2_RE2_LEAFOFF         10178 non-null  float64
 8   S2_RE3_LEAFOFF         10178 non-null  float64
 9   S2_RE4_LEAFOFF         10178 non-null  float64
 10  S2_R_LEAFON            10178 non-null  float64
 11  S2_G_LEAFON            10178 non-null  float64
 12  S2_B_LEAFON            10178 non-null  float64
 13  S2_NIR_LEAFON          10178 non-null  float64
 14  S2_SWIR1_LEAFON        1

## Filter out some of the training data
We can exclude some of the training data based on how far separated the inventory data (interpolated using FVS simulations) is from the year the imagery was collected. Similarly, we can screen out training examples that had relatively low density of lidar returns.

In [25]:
# Pair satellite predictors with inventory attributes on their shared (uuid, year) index,
# then turn the index levels back into ordinary columns.
sat_and_inv = (
    sat
    .merge(inv_data, left_index=True, right_index=True, how='inner')
    .reset_index()
)
sat_and_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9385 entries, 0 to 9384
Columns: 130 entries, uuid to LARCH
dtypes: float64(47), int64(81), object(2)
memory usage: 9.3+ MB


In [26]:
# Add the plot-level attributes and drop any row that still has a missing value.
df = sat_and_inv.merge(plot_data, on='uuid', how='inner').dropna()
print(f'{len(df):,d} samples')
print('Columns:', df.columns.values)

7,952 samples
Columns: ['uuid' 'year' 'S2_R_LEAFOFF' 'S2_G_LEAFOFF' 'S2_B_LEAFOFF'
 'S2_NIR_LEAFOFF' 'S2_SWIR1_LEAFOFF' 'S2_SWIR2_LEAFOFF' 'S2_RE1_LEAFOFF'
 'S2_RE2_LEAFOFF' 'S2_RE3_LEAFOFF' 'S2_RE4_LEAFOFF' 'S2_R_LEAFON'
 'S2_G_LEAFON' 'S2_B_LEAFON' 'S2_NIR_LEAFON' 'S2_SWIR1_LEAFON'
 'S2_SWIR2_LEAFON' 'S2_RE1_LEAFON' 'S2_RE2_LEAFON' 'S2_RE3_LEAFON'
 'S2_RE4_LEAFON' 'S2_NDVI_LEAFON' 'S2_SAVI_LEAFON' 'S2_BRIGHTNESS_LEAFON'
 'S2_GREENNESS_LEAFON' 'S2_WETNESS_LEAFON' 'S2_NDVI_LEAFOFF'
 'S2_SAVI_LEAFOFF' 'S2_BRIGHTNESS_LEAFOFF' 'S2_GREENNESS_LEAFOFF'
 'S2_WETNESS_LEAFOFF' 'S2_dR' 'S2_dG' 'S2_dB' 'S2_dNIR' 'S2_dSWIR1'
 'S2_dSWIR2' 'S2_dRE1' 'S2_dRE2' 'S2_dNDVI' 'S2_dSAVI' 'S2_dBRIGHTNESS'
 'S2_dGREENNESS' 'S2_dWETNESS' 'S2_dRE3' 'S2_dRE4' 'LT_DUR_NBR'
 'LT_DUR_SWIR1' 'LT_MAG_NBR' 'LT_MAG_SWIR1' 'LT_RATE_NBR' 'LT_RATE_SWIR1'
 'LT_YSD_NBR' 'LT_YSD_SWIR1' 'tpa' 'ba' 'sdi' 'ccf' 'qmd' 'tcuft' 'topht'
 'number_of_strata' 'total_cover' 'structure_class' 'canopy_baseheight'
 'canopy_bulkdensity' '

In [27]:
OUTLIERS = '../data/interim/outlier_uuids.csv'
outliers = pd.read_csv(OUTLIERS)
# drop every sample whose uuid is listed as a height outlier
df = df.loc[~df['uuid'].isin(outliers['outlier_uuid'])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7952 entries, 0 to 7951
Columns: 138 entries, uuid to elevation
dtypes: float64(52), int64(82), object(4)
memory usage: 8.4+ MB


In [28]:
# Keep samples with positive top height and QMD and a total_cover of at least 10.
# The explicit copy makes the filtered frame independent of the one it was cut from,
# so the assignment below cannot trigger pandas' chained-assignment / SettingWithCopy handling.
df = df.loc[(df.topht > 0) & (df.total_cover >= 10) & (df.qmd > 0)].copy()
# Cap QMD at 50; values above that are set to exactly 50.
df['qmd'] = df['qmd'].clip(upper=50)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7477 entries, 0 to 7951
Columns: 138 entries, uuid to elevation
dtypes: float64(52), int64(82), object(4)
memory usage: 7.9+ MB


In [29]:
# Write the filtered training table to disk as CSV (header row included, index not written).
df.to_csv('../data/processed/sentinel_structure_training_data.csv', index=False, header=True)

## Inspect how many samples we have for different years, regions, etc.

In [30]:
# Number of samples available for each imagery year.
df.groupby('year')['uuid'].count().to_frame('count')

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2019,3979
2020,3498


In [31]:
# Sample counts cross-tabulated by measurement year (rows) and imagery year (columns).
df.pivot_table(
    values='uuid',
    index=['meas_yr'],
    columns=['year'],
    aggfunc='count',
    fill_value=0,
)

year,2019,2020
meas_yr,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,487,0
2011,392,392
2013,739,741
2014,451,452
2015,444,445
2016,650,651
2017,483,484
2018,333,333


In [32]:
# Distinct plots, imagery years, and plot sizes found in each ecoregion.
ecoreg_counts = (
    df
    .groupby(['ecoregion3'])[['uuid', 'year', 'plot_size_ac']]
    .nunique()
)
ecoreg_counts

Unnamed: 0_level_0,uuid,year,plot_size_ac
ecoregion3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
blue_mountains,87,2,1
cascades,704,2,3
coast_range,1760,2,2
columbia_plateau,10,2,1
eastern_cascades_slopes_and_foothills,415,2,2
klamath_mountains_california_high_north_coast_range,291,2,2
north_cascades,411,2,2
northern_rockies,103,2,1
puget_lowland,159,2,1
willamette_valley,45,2,3


## Available features
The different types of predictor variables we can use to predict a forest attribute, including climate, lidar-derived, soil, and satellite imagery.

In [33]:
# Summary statistics for every satellite-derived predictor.
df.loc[:, S2_COLS + LANDTRENDR_COLS].describe()

Unnamed: 0,S2_R_LEAFOFF,S2_G_LEAFOFF,S2_B_LEAFOFF,S2_NIR_LEAFOFF,S2_SWIR1_LEAFOFF,S2_SWIR2_LEAFOFF,S2_RE1_LEAFOFF,S2_RE2_LEAFOFF,S2_RE3_LEAFOFF,S2_RE4_LEAFOFF,...,S2_dRE3,S2_dRE4,LT_DUR_NBR,LT_DUR_SWIR1,LT_MAG_NBR,LT_MAG_SWIR1,LT_RATE_NBR,LT_RATE_SWIR1,LT_YSD_NBR,LT_YSD_SWIR1
count,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,...,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0,7477.0
mean,531.694289,605.416149,505.358747,2237.596022,707.922248,375.12432,857.882566,1819.585928,2082.188337,2248.629412,...,411.260546,461.857058,18.19901,14.429183,76.050823,279.592751,16.830814,311.751103,28.655343,26.740671
std,871.376845,871.553734,890.183177,867.85963,309.202564,224.237164,881.365459,863.581571,837.038646,805.090546,...,916.861539,867.560133,12.229163,14.951374,191.08372,636.569985,35.910229,525.947378,10.411304,9.833718
min,9.79719,26.922216,1.0,348.355904,91.010674,42.869246,70.049804,257.344712,311.294348,379.008394,...,-5838.540354,-5129.971255,1.0,1.0,-734.0,-3098.0,-116.0,-390.0,0.0,0.0
25%,137.034631,224.795087,120.027203,1692.104687,524.976136,247.182184,433.9729,1328.277211,1572.159461,1742.03119,...,168.669551,183.927594,6.0,1.0,-5.0,-134.0,0.0,-5.0,28.0,22.0
50%,211.648345,307.034381,188.857381,2100.699373,649.669441,309.616828,564.841457,1624.619934,1924.779642,2118.979181,...,482.779503,503.855434,19.0,4.0,40.0,151.0,3.0,30.0,34.0,31.0
75%,497.018692,532.594164,418.635884,2588.109086,809.862578,421.175325,863.546154,2041.937584,2380.962769,2586.79336,...,830.845174,867.817089,27.0,35.0,126.0,521.0,17.0,397.0,35.0,34.0
max,8672.33209,9067.047264,9251.70246,7712.45783,3020.551859,2062.405424,8807.240741,8250.834024,7530.579326,7091.325768,...,4035.860442,4245.146064,36.0,36.0,1071.0,3133.0,268.0,3133.0,35.0,35.0


## Selecting features and targets
This is the first step in determining what features we want to use, and what we want to predict.

In [34]:
# Predictors: satellite features plus elevation and location; 'ecoregion3' is included
# only so samples can be split by region later (it is dropped before any model is fit).
X_COLS = [*S2_COLS, *LANDTRENDR_COLS, 'elevation', 'lat', 'lon', 'ecoregion3']
# Targets predicted jointly.
Y_COLS = ['total_cover', 'topht', 'qmd', 'tcuft']

Y_NAMES = list(map(str.upper, Y_COLS))

In [35]:
# Display the full list of predictor columns.
X_COLS

['S2_R_LEAFOFF',
 'S2_G_LEAFOFF',
 'S2_B_LEAFOFF',
 'S2_NIR_LEAFOFF',
 'S2_SWIR1_LEAFOFF',
 'S2_SWIR2_LEAFOFF',
 'S2_RE1_LEAFOFF',
 'S2_RE2_LEAFOFF',
 'S2_RE3_LEAFOFF',
 'S2_RE4_LEAFOFF',
 'S2_R_LEAFON',
 'S2_G_LEAFON',
 'S2_B_LEAFON',
 'S2_NIR_LEAFON',
 'S2_SWIR1_LEAFON',
 'S2_SWIR2_LEAFON',
 'S2_RE1_LEAFON',
 'S2_RE2_LEAFON',
 'S2_RE3_LEAFON',
 'S2_RE4_LEAFON',
 'S2_NDVI_LEAFON',
 'S2_SAVI_LEAFON',
 'S2_BRIGHTNESS_LEAFON',
 'S2_GREENNESS_LEAFON',
 'S2_WETNESS_LEAFON',
 'S2_NDVI_LEAFOFF',
 'S2_SAVI_LEAFOFF',
 'S2_BRIGHTNESS_LEAFOFF',
 'S2_GREENNESS_LEAFOFF',
 'S2_WETNESS_LEAFOFF',
 'S2_dR',
 'S2_dG',
 'S2_dB',
 'S2_dNIR',
 'S2_dSWIR1',
 'S2_dSWIR2',
 'S2_dRE1',
 'S2_dRE2',
 'S2_dNDVI',
 'S2_dSAVI',
 'S2_dBRIGHTNESS',
 'S2_dGREENNESS',
 'S2_dWETNESS',
 'S2_dRE3',
 'S2_dRE4',
 'LT_DUR_NBR',
 'LT_DUR_SWIR1',
 'LT_MAG_NBR',
 'LT_MAG_SWIR1',
 'LT_RATE_NBR',
 'LT_RATE_SWIR1',
 'LT_YSD_NBR',
 'LT_YSD_SWIR1',
 'elevation',
 'lat',
 'lon',
 'ecoregion3']

In [19]:
USE_REGIONS = [
    'blue_mountains',
    'coast_range',
    'north_cascades',
    'cascades',
    'klamath_mountains_california_high_north_coast_range',
    'eastern_cascades_slopes_and_foothills',
    'northern_rockies',
    'puget_lowland',
    'willamette_valley',
]
# Mean of each target per ecoregion, shown in the order given by USE_REGIONS.
display(
    df
    .groupby('ecoregion3')[Y_COLS]
    .mean()
    .round(1)
    .loc[USE_REGIONS]
)
# Overall distribution of each target.
display(df[Y_COLS].describe())

Unnamed: 0_level_0,total_cover,topht,qmd,tcuft
ecoregion3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blue_mountains,33.2,61.5,13.2,2016.9
coast_range,70.6,110.7,16.6,9783.9
north_cascades,63.0,87.5,12.3,6119.7
cascades,65.4,103.5,13.8,8039.2
klamath_mountains_california_high_north_coast_range,60.8,93.9,11.2,7284.3
eastern_cascades_slopes_and_foothills,43.0,72.9,12.3,3542.5
northern_rockies,42.4,75.0,11.7,2741.1
puget_lowland,68.7,92.1,12.8,6325.9
willamette_valley,64.2,113.0,18.8,9516.5


Unnamed: 0,total_cover,topht,qmd,tcuft
count,7477.0,7477.0,7477.0,7477.0
mean,63.105256,98.35014,14.383718,7631.583523
std,21.035939,38.580756,7.930582,6280.335812
min,10.0,11.0,2.045915,17.0
25%,50.0,70.0,8.842887,2849.0
50%,67.0,96.0,12.913312,6076.0
75%,79.0,124.0,17.886864,10763.0
max,100.0,267.0,50.0,51330.0


In [20]:
# Renumber the rows 0..n-1 so that index labels and row positions coincide.
df = df.reset_index(drop=True)
X = df[X_COLS]
Y = df[Y_COLS]

In [21]:
# Confirm the dtype and non-null count of every predictor column.
df.loc[:, X_COLS].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7477 entries, 0 to 7476
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   S2_R_LEAFOFF           7477 non-null   float64
 1   S2_G_LEAFOFF           7477 non-null   float64
 2   S2_B_LEAFOFF           7477 non-null   float64
 3   S2_NIR_LEAFOFF         7477 non-null   float64
 4   S2_SWIR1_LEAFOFF       7477 non-null   float64
 5   S2_SWIR2_LEAFOFF       7477 non-null   float64
 6   S2_RE1_LEAFOFF         7477 non-null   float64
 7   S2_RE2_LEAFOFF         7477 non-null   float64
 8   S2_RE3_LEAFOFF         7477 non-null   float64
 9   S2_RE4_LEAFOFF         7477 non-null   float64
 10  S2_R_LEAFON            7477 non-null   float64
 11  S2_G_LEAFON            7477 non-null   float64
 12  S2_B_LEAFON            7477 non-null   float64
 13  S2_NIR_LEAFON          7477 non-null   float64
 14  S2_SWIR1_LEAFON        7477 non-null   float64
 15  S2_S

## Split datasets by ecoregion
We want to explore model transferability between regions, so we'll train models independently on subsets of the data within a single ecoregion, as well as a model that is trained on all available ecoregions. 

In [22]:
# Ecoregions with more than 20 distinct plots, in alphabetical order.
ecoregions = list(np.sort([
    region for region in pd.unique(df.ecoregion3)
    if ecoreg_counts.loc[region, 'uuid'] > 20
]))

# Row labels belonging to each ecoregion, and the matching predictor / target subsets.
eco_X_idx = [X.index[X.ecoregion3 == eco].values for eco in ecoregions]
eco_X_dfs = [X.loc[idx].drop(columns='ecoregion3') for idx in eco_X_idx]
eco_Y_dfs = [Y.loc[idx] for idx in eco_X_idx]

# The last entry is a "global" dataset holding the samples from every ecoregion.
ecoregions.append('all')
eco_X_dfs.append(X.drop(columns='ecoregion3'))
eco_Y_dfs.append(Y)

# Short names keep only the first two underscore-separated words of each region name.
ecoregion_names = ['_'.join(name.split('_')[:2]) for name in ecoregions]
ecoregion_display_names = [short.replace('_', ' ').upper() for short in ecoregion_names]

In [23]:
# Bin edges and class labels for canopy cover.
cover_class_bins = [10, 40, 70, 100]
cover_class_labels = ['OPEN', 'MODERATE', 'CLOSED']

# Height classes are 20 units wide, from 0 up to 280; each label reads 'lower-upper'.
height_class_bins = np.arange(0, 300, 20)
height_class_labels = [
    f'{lower}-{upper}'
    for lower, upper in zip(height_class_bins[:-1], height_class_bins[1:])
]

# Diameter classes; 999 is an open-ended upper bound for the largest class.
diameter_class_bins = [1, 5, 10, 15, 20, 999]
diameter_class_labels = ['SEED/SAP', 'SMALL', 'MEDIUM', 'LARGE', 'VERY_LARGE']

## Scoring
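We'll use Root Mean Square Error (RMSE) as the main measure of model performance. Alongside it we report RMSE and mean absolute error (MAE) each normalized by the mean observed value, plain MAE, bias, and relative bias, plus helpers for class-bin accuracy and confidence-interval half-widths.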

In [24]:
def rmse(obs, pred):
    """Root mean square error between observed and predicted values."""
    residuals = obs - pred
    squared = np.square(residuals)
    return np.sqrt(squared.mean())

def nrmse(obs, pred):
    """RMSE expressed as a fraction of the mean observed value."""
    error = rmse(obs, pred)  # the squared difference is the same in either argument order
    return error / obs.mean()

def mae(obs, pred):
    """Mean absolute error between observed and predicted values."""
    errors = pred - obs
    return abs(errors).mean()

def mape(obs, pred):
    """Mean absolute error divided by the mean observed value.

    Note that this normalizes by the overall mean of ``obs``; it is not a
    per-sample percentage error.
    """
    mean_abs_error = abs(pred - obs).mean()
    return mean_abs_error / obs.mean()

def bias(obs, pred):
    """Mean signed error (prediction minus observation); positive means over-prediction."""
    signed_errors = pred - obs
    return signed_errors.mean()

def rel_bias(obs, pred):
    """Mean signed error (prediction minus observation) relative to the mean observation.

    Positive values mean the predictions are too high on average, matching the
    sign convention of ``bias``. The earlier version passed its arguments to
    ``bias`` in swapped order, which reported the opposite sign.

    Parameters
    ----------
    obs : array-like with a ``mean`` method (numpy array, pandas Series/DataFrame)
        Observed values.
    pred : array-like broadcastable against ``obs``
        Predicted values.

    Returns
    -------
    Scalar for 1-D numpy input; one value per column when ``obs`` is a DataFrame.
    """
    return (pred - obs).mean() / obs.mean()

def bin_accuracy(obs, pred, bins, fuzzy_tol=0):
    """Fraction of predictions that fall in the same class bin as the observation.

    Parameters
    ----------
    obs, pred : array-like
        Observed and predicted values, of equal shape.
    bins : array-like
        Monotonic bin edges, as accepted by ``np.digitize``.
    fuzzy_tol : int, optional
        Number of bins a prediction may be off by and still count as correct;
        0 (the default) requires an exact bin match.

    Returns
    -------
    float
        Share of samples whose predicted bin is within ``fuzzy_tol`` bins of
        the observed bin.
    """
    pred_binned = np.digitize(pred, bins)
    # was misspelled `np.digitisze`, which raised AttributeError on every call
    obs_binned = np.digitize(obs, bins)
    diff = abs(pred_binned - obs_binned)

    return (diff <= fuzzy_tol).sum() / len(diff)

def confidence_interval_half(X, confidence=0.95):
    """Half-width of the two-sided Student-t confidence interval for the mean of X."""
    dof = len(X) - 1
    t_critical = stats.t.ppf((1 + confidence) / 2., dof)
    return stats.sem(X) * t_critical

This helper function will calculate RMSE scores for each regionally-trained model and the global model on each ecoregion.

## Fit some models
For each type of model, we'll employ cross-validation to tune model hyperparameters, generating a tuned model for each ecoregion as well as a tuned model using all training data. 

In [25]:
# Candidate regressors, keyed by the name that is also used to look up FIT_PARAMS
# and to label results and saved model files.
# NOTE(review): RandomForestRegressor is created without a random_state, so its fits differ from run to run.
MODELS = {
    'ElasticNet': ElasticNet(),
    'Lasso': Lasso(), 
    'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    'RandomForestRegressor': RandomForestRegressor(n_jobs=-1), 
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(), 
}

# Hyperparameter grids searched for each model. Keys are the estimator's own
# parameter names; the tuning functions add the 'model__base_estimator__' prefix
# needed to reach them through the pipeline.
FIT_PARAMS = {
    'ElasticNet': {
        'alpha': np.logspace(-4,2,7),
        # 0.0, 0.1, ..., 0.9 — the stop value 1.0 is excluded by np.arange
        'l1_ratio': np.arange(0.0, 1.0, 0.1),
    },
    'Lasso': {
        'alpha': np.logspace(-4,2,7),
    },
    'KNeighborsRegressor': {
        'n_neighbors': [1,2,3,4,5,10,20],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'manhattan']
    },
    'RandomForestRegressor': {
        'n_estimators': [100, 500, 1000],
        'max_features': ['sqrt', None],
        'max_depth': [5, 20, None],
        'max_samples': [0.5, None]
    },
    'HistGradientBoostingRegressor': {
        'max_iter': [50, 100, 200],
        'min_samples_leaf': [5, 10, 20],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.01, 0.1],
    },
}

In [26]:
# Nested cross-validation: outer folds are used for scoring, inner folds for hyperparameter selection.
NUM_OUTER_FOLDS = 5
NUM_INNER_FOLDS = 3
# Metrics computed on each held-out set.
SCORE_FUNCS = [rmse, nrmse, mae, mape, bias, rel_bias]
# Metric names; these are the keys used in the results dictionaries.
score_names = [func.__name__ for func in SCORE_FUNCS]

In [27]:
def build_insider_results_dictionary(regions, model_names, num_outer_folds, score_funcs, target_vars):
    """Create the empty results container for within-region ("insider") models.

    Layout: results[region][model_name] holds 'fitted_model', 'best_params' and
    'cv_results'; cv_results[fold_num] (fold numbers start at 1) holds
    'best_params', 'predict_time' and one {target: None} dictionary per score
    function, keyed by the function's ``__name__``. Every leaf starts as None.
    """
    return {
        region: {
            model_name: {
                'fitted_model': None,
                'best_params': None,
                'cv_results': {
                    fold_num: {
                        'best_params': None,
                        'predict_time': None,
                        **{
                            score_func.__name__: {y: None for y in target_vars}
                            for score_func in score_funcs
                        },
                    }
                    for fold_num in range(1, num_outer_folds + 1)
                },
            }
            for model_name in model_names
        }
        for region in regions
    }

def build_global_results_dictionary(regions, model_names, num_outer_folds, score_funcs, target_vars):
    """Create the empty results container for models trained on all regions.

    Layout: results[model_name] holds 'fitted_model', 'best_params' and
    'cv_results'; cv_results[fold_num] (fold numbers start at 1) holds
    'best_params', 'predict_time' and, for every region, one {target: None}
    dictionary per score function keyed by the function's ``__name__``.
    """
    return {
        model_name: {
            'fitted_model': None,
            'best_params': None,
            'cv_results': {
                fold_num: {
                    'best_params': None,
                    'predict_time': None,
                    **{
                        region: {
                            score_func.__name__: {y: None for y in target_vars}
                            for score_func in score_funcs
                        }
                        for region in regions
                    },
                }
                for fold_num in range(1, num_outer_folds + 1)
            },
        }
        for model_name in model_names
    }

def build_outsider_results_dictionary(regions, model_names, score_funcs, target_vars):
    """Create the empty results container for the "outsider" models.

    Layout: results[region][model_name] holds 'fitted_model', 'best_params',
    'predict_time' and one {target: None} dictionary per score function, keyed
    by the function's ``__name__``. There is no per-fold level.
    """
    return {
        region: {
            model_name: {
                'fitted_model': None,
                'best_params': None,
                'predict_time': None,
                **{
                    score_func.__name__: {y: None for y in target_vars}
                    for score_func in score_funcs
                },
            }
            for model_name in model_names
        }
        for region in regions
    }

def build_visiting_insider_results_dictionary(regions, model_names, score_funcs, target_vars):
    """Create the empty results container for models scored outside their training region.

    Layout: results[target_region][train_region][model_name] holds one
    {target: None} dictionary per score function (keyed by ``__name__``), for
    every pair of regions where the training region differs from the target.
    """
    return {
        target_region: {
            train_region: {
                model_name: {
                    score_func.__name__: {y: None for y in target_vars}
                    for score_func in score_funcs
                }
                for model_name in model_names
            }
            for train_region in regions
            if train_region != target_region
        }
        for target_region in regions
    }

In [28]:
def tune_insider_model(model_name, num_outer_folds=NUM_OUTER_FOLDS, num_inner_folds=NUM_INNER_FOLDS):
    """Tune, score, and save one within-region ("insider") model per ecoregion.

    For every entry of ``ecoregions`` except 'all', nested cross-validation is
    run with folds grouped by plot uuid: an inner grid search picks the
    hyperparameters, and the held-out outer fold is scored with every function
    in SCORE_FUNCS. A final grid search over all of the region's samples then
    yields the model that is pickled to ../models/structure_models.

    Reads the module-level MODELS, FIT_PARAMS, SCORE_FUNCS, ecoregions,
    ecoregion_display_names, eco_X_dfs, eco_Y_dfs and df, and writes its
    results into the module-level ``insider_results`` dictionary.

    Parameters
    ----------
    model_name : str
        Key into MODELS and FIT_PARAMS.
    num_outer_folds : int
        Number of outer (scoring) folds.
    num_inner_folds : int
        Number of inner (hyperparameter selection) folds.

    Returns
    -------
    dict
        ``{ecoregion: cv_results}`` for this model, taken from ``insider_results``.
    """
    print(model_name)
    print('-'*len(model_name))
    base_model = MODELS[model_name]
    fit_params = FIT_PARAMS[model_name]
    train_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RegressorChain(base_estimator=base_model)),
    ])
    # prefix the grid keys so GridSearchCV can reach the estimator wrapped by the chain
    search_params = {f'model__base_estimator__{key}': value for key, value in fit_params.items()}

    cv_outer = GroupKFold(num_outer_folds)
    cv_inner = GroupKFold(num_inner_folds)

    for i, ecoregion in enumerate(train_regions):
        ecoregion_name = ecoregion_display_names[i]
        print(f'Starting on {ecoregion_name}', end='... ')
        X = eco_X_dfs[i]
        Y = eco_Y_dfs[i]
        # grouping by uuid keeps all samples of one plot on the same side of each split
        outer_groups = df.loc[X.index, 'uuid'].values
        region_results = insider_results[ecoregion][model_name]

        outer_splits = cv_outer.split(X, groups=outer_groups)
        for outer_fold_num, (train_ix, test_ix) in enumerate(outer_splits, start=1):
            # split() yields row positions, hence iloc
            X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
            Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
            inner_groups = df.loc[X_train.index, 'uuid'].values
            fold_results = region_results['cv_results'][outer_fold_num]

            inner_search = GridSearchCV(pipe, search_params,
                                        scoring='neg_mean_squared_error',
                                        n_jobs=-1, cv=cv_inner, refit=True)

            inner_result = inner_search.fit(X_train, Y_train, groups=inner_groups)
            fold_results['best_params'] = inner_result.best_params_

            inner_best_model = inner_result.best_estimator_
            # perf_counter is a monotonic, high-resolution clock meant for timing short intervals
            start_time = time.perf_counter()
            Y_pred = inner_best_model.predict(X_test)
            total_predict_time = time.perf_counter() - start_time
            # stored value is the average prediction time per sample
            fold_results['predict_time'] = total_predict_time / len(X_test)

            for score_func in SCORE_FUNCS:
                scores = score_func(Y_test, Y_pred)  # one value per target column
                for y_var in scores.index:
                    fold_results[score_func.__name__][y_var] = scores.loc[y_var]

            print(outer_fold_num, end='... ')
        print('Done scoring. Now fitting a final model', end='... ')

        # done with scoring of models, now time to tune a model using the whole dataset
        outer_search = GridSearchCV(pipe, search_params,
                                    scoring='neg_mean_squared_error',
                                    n_jobs=-1, cv=cv_outer, refit=True)
        outer_result = outer_search.fit(X, Y, groups=outer_groups)

        # refit=True has already retrained the best configuration on every sample
        # of this region, so best_estimator_ is the final model; fitting it a
        # second time on the same data would only repeat that work.
        model = outer_result.best_estimator_

        eco_name = '_'.join(ecoregion.split('_')[:2])
        outfile = f'{eco_name}-sentinel-{model_name}-chained.pkl'
        outpath = os.path.join('../models/structure_models', outfile)
        with open(outpath, 'wb') as file:
            pickle.dump(model, file)

        region_results['fitted_model'] = model
        region_results['best_params'] = outer_result.best_params_

        print('All done.')

    # built once, after the loop, so it is defined even when there are no regions to train
    cv_results_dict = {region: insider_results[region][model_name]['cv_results'] for region in train_regions}

    return cv_results_dict

def tune_outsider_model(model_name, num_folds=5):
    """Tune a model on every other ecoregion and score it on the held-out one.

    For each entry of ``ecoregions`` except 'all', the model is trained on all
    samples from the remaining ecoregions and evaluated on the held-out region
    with every function in SCORE_FUNCS. Hyperparameters are chosen by a grid
    search whose folds are grouped by ecoregion, so each validation fold
    consists of regions the candidate model was not fit on.

    Reads the module-level MODELS, FIT_PARAMS, SCORE_FUNCS, ecoregions,
    ecoregion_display_names, X, Y and df, and writes its results into the
    module-level ``outsider_results`` dictionary.

    Parameters
    ----------
    model_name : str
        Key into MODELS and FIT_PARAMS.
    num_folds : int
        Number of grouped folds used by the grid search.

    Returns
    -------
    dict
        ``{ecoregion: results}`` for this model, taken from ``outsider_results``.
    """
    print(model_name)
    print('-'*len(model_name))
    base_model = MODELS[model_name]
    fit_params = FIT_PARAMS[model_name]
    train_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RegressorChain(base_estimator=base_model)),
    ])
    # prefix the grid keys so GridSearchCV can reach the estimator wrapped by the chain
    search_params = {f'model__base_estimator__{key}': value for key, value in fit_params.items()}

    groupkfold = GroupKFold(num_folds)

    for i, ecoregion in enumerate(train_regions):
        ecoregion_name = ecoregion_display_names[i]
        print(f'Starting on {ecoregion_name}', end='... ')
        in_region = X.ecoregion3 == ecoregion
        X_train = X.loc[~in_region].drop('ecoregion3', axis=1)
        Y_train = Y.loc[X_train.index]
        X_test = X.loc[in_region].drop('ecoregion3', axis=1)
        Y_test = Y.loc[X_test.index]
        # single .loc lookup instead of chaining two indexing operations
        groups = df.loc[X_train.index, 'ecoregion3'].values
        region_results = outsider_results[ecoregion][model_name]

        search = GridSearchCV(pipe, search_params,
                              scoring='neg_mean_squared_error',
                              n_jobs=-1, cv=groupkfold, refit=True)

        result = search.fit(X_train, Y_train, groups=groups)
        print('Done fitting, now scoring', end='... ')
        region_results['best_params'] = result.best_params_
        region_results['fitted_model'] = result.best_estimator_

        best_model = result.best_estimator_
        # perf_counter is a monotonic, high-resolution clock meant for timing short intervals
        start_time = time.perf_counter()
        Y_pred = best_model.predict(X_test)
        total_predict_time = time.perf_counter() - start_time
        # stored value is the average prediction time per sample
        region_results['predict_time'] = total_predict_time / len(X_test)

        for score_func in SCORE_FUNCS:
            scores = score_func(Y_test, Y_pred)  # one value per target column
            for y_var in scores.index:
                region_results[score_func.__name__][y_var] = scores.loc[y_var]

        print('All done.')

    # built once, after the loop, so it is defined even when there are no regions to train
    results_dict = {region: outsider_results[region][model_name] for region in train_regions}

    return results_dict

def tune_global_model(model_name, num_outer_folds=NUM_OUTER_FOLDS, num_inner_folds=NUM_INNER_FOLDS):
    """Tune, score, and save a model trained on samples from every ecoregion.

    Nested cross-validation is run over all of ``df`` with folds grouped by
    plot uuid: an inner grid search picks the hyperparameters, and the
    predictions for the held-out outer fold are scored separately for each
    ecoregion with every function in SCORE_FUNCS. A final grid search over the
    whole dataset then yields the model that is pickled to
    ../models/structure_models.

    Reads the module-level MODELS, FIT_PARAMS, SCORE_FUNCS, ecoregions, X_COLS,
    Y_COLS and df, and writes its results into the module-level
    ``global_results`` dictionary.

    Parameters
    ----------
    model_name : str
        Key into MODELS and FIT_PARAMS.
    num_outer_folds : int
        Number of outer (scoring) folds.
    num_inner_folds : int
        Number of inner (hyperparameter selection) folds.

    Returns
    -------
    dict
        ``global_results[model_name]``.
    """
    print(model_name)
    print('-'*len(model_name))
    # report the fold count actually in use, not the module-level default
    print(f'Scoring with {num_outer_folds} folds... ', end='')
    base_model = MODELS[model_name]
    fit_params = FIT_PARAMS[model_name]
    test_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RegressorChain(base_estimator=base_model)),
    ])
    # prefix the grid keys so GridSearchCV can reach the estimator wrapped by the chain
    search_params = {f'model__base_estimator__{key}': value for key, value in fit_params.items()}

    cv_outer = GroupKFold(num_outer_folds)
    cv_inner = GroupKFold(num_inner_folds)

    X = df[X_COLS].drop('ecoregion3', axis=1)
    Y = df[Y_COLS]
    # grouping by uuid keeps all samples of one plot on the same side of each split
    outer_groups = df['uuid'].values
    region_labels = df['ecoregion3'].values
    model_results = global_results[model_name]

    outer_splits = cv_outer.split(X, groups=outer_groups)
    for outer_fold_num, (train_ix, test_ix) in enumerate(outer_splits, start=1):
        # split() yields row positions, so index positionally; label-based .loc
        # is only correct while df carries a default 0..n-1 index
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
        inner_groups = outer_groups[train_ix]
        fold_results = model_results['cv_results'][outer_fold_num]

        inner_search = GridSearchCV(pipe, search_params,
                                    scoring='neg_mean_squared_error',
                                    n_jobs=-1, cv=cv_inner, refit=True)

        inner_result = inner_search.fit(X_train, Y_train, groups=inner_groups)
        fold_results['best_params'] = inner_result.best_params_

        inner_best_model = inner_result.best_estimator_
        # perf_counter is a monotonic, high-resolution clock meant for timing short intervals
        start_time = time.perf_counter()
        Y_pred = inner_best_model.predict(X_test)
        total_predict_time = time.perf_counter() - start_time
        # stored value is the average prediction time per sample
        fold_results['predict_time'] = total_predict_time / len(X_test)

        test_region_labels = region_labels[test_ix]
        for ecoregion in test_regions:
            region_mask = test_region_labels == ecoregion
            if not region_mask.any():
                # no test sample from this region in this fold; its scores stay None
                continue
            # reuse the fold's predictions instead of predicting the same rows again
            regional_Y_test = Y_test[region_mask]
            regional_Y_pred = Y_pred[region_mask]

            for score_func in SCORE_FUNCS:
                scores = score_func(regional_Y_test, regional_Y_pred)  # one value per target column
                for y_var in scores.index:
                    fold_results[ecoregion][score_func.__name__][y_var] = scores.loc[y_var]

        print(outer_fold_num, end='... ')

    print('Done scoring. Now fitting a final model', end='... ')

    # done with scoring of models, now time to tune a model using the whole dataset
    outer_search = GridSearchCV(pipe, search_params,
                                scoring='neg_mean_squared_error',
                                n_jobs=-1, cv=cv_outer, refit=True)
    outer_result = outer_search.fit(X, Y, groups=outer_groups)

    # refit=True has already retrained the best configuration on the entire
    # dataset, so best_estimator_ is the final model; fitting it a second time
    # on the same data would only repeat that work.
    model = outer_result.best_estimator_

    outfile = f'global-sentinel-{model_name}-chained.pkl'
    outpath = os.path.join('../models/structure_models', outfile)
    with open(outpath, 'wb') as file:
        pickle.dump(model, file)
    print('All done.')

    model_results['fitted_model'] = model
    model_results['best_params'] = outer_result.best_params_

    results_dict = global_results[model_name]

    return results_dict

## Fit Global Models
These models get to see data from every ecoregion during training and tuning.

In [29]:
# Empty results container for the global models. ecoregions[:-1] leaves out the
# trailing 'all' entry, so scores are stored per individual ecoregion.
global_results = build_global_results_dictionary(ecoregions[:-1], MODELS.keys(), NUM_OUTER_FOLDS, SCORE_FUNCS, Y_COLS)

In [30]:
def parse_global_results(results):
    """Flatten one global model's nested cross-validation scores into a long table.

    Parameters
    ----------
    results : dict
        Results for a single model, as returned by ``tune_global_model``.
        Scores are read from
        ``results['cv_results'][fold][ecoregion][metric][target]`` with folds
        numbered from 1 to ``NUM_OUTER_FOLDS``.

    Returns
    -------
    pandas.DataFrame
        One row per (fold, ecoregion, target, metric) combination, with the
        columns ``cv_fold``, ``ecoregion``, ``target``, ``metric``, ``score``.
    """
    cv_scores = results['cv_results']
    rows = [
        (fold_num, region, y_var, metric, cv_scores[fold_num][region][metric][y_var])
        for fold_num in range(1, NUM_OUTER_FOLDS + 1)  # folds are keyed starting at 1
        for region in ecoregions[:-1]
        for y_var in Y_COLS
        for metric in score_names
    ]
    return pd.DataFrame(rows, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

In [31]:
# Tune, score and fit one global model per estimator type. Each call returns that
# model's entry of `global_results` and pickles the final fitted model under
# ../models/structure_models. The calls are kept as separate statements so results
# that have already been bound survive if a later (slow) fit is interrupted.
elastic_global = tune_global_model('ElasticNet')
lasso_global = tune_global_model('Lasso')
knn_global = tune_global_model('KNeighborsRegressor')
rf_global = tune_global_model('RandomForestRegressor')
gbm_global = tune_global_model('HistGradientBoostingRegressor')

ElasticNet
----------
Scoring with 5 folds... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Lasso
-----
Scoring with 5 folds... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
KNeighborsRegressor
-------------------
Scoring with 5 folds... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
RandomForestRegressor
---------------------
Scoring with 5 folds... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
HistGradientBoostingRegressor
-----------------------------
Scoring with 5 folds... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.


In [32]:
RESULTS_TO_CONCAT = [elastic_global, lasso_global, knn_global, rf_global, gbm_global]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']

# Parse each model's scores into a long table and tag its rows with a short model label.
dfs_to_concat = [
    parse_global_results(model_results).assign(model=short_name)
    for model_results, short_name in zip(RESULTS_TO_CONCAT, NAMES)
]
all_global_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)

# Tidy the ecoregion labels: underscores become spaces, words are title-cased,
# and only the first two words are kept.
all_global_results['ecoregion'] = all_global_results['ecoregion'].apply(
    lambda region: ' '.join(region.title().replace('_',' ').split()[:2])
)
all_global_results.columns = list(map(str.upper, all_global_results.columns))
all_global_results.head()

Unnamed: 0,CV_FOLD,ECOREGION,TARGET,METRIC,SCORE,MODEL
0,1,Blue Mountains,total_cover,rmse,10.313029,ElasticNet
1,1,Blue Mountains,total_cover,nrmse,0.299894,ElasticNet
2,1,Blue Mountains,total_cover,mae,8.691027,ElasticNet
3,1,Blue Mountains,total_cover,mape,0.252728,ElasticNet
4,1,Blue Mountains,total_cover,bias,-1.658816,ElasticNet


In [33]:
# Save the long-format global-model scores; the header row is written by default.
all_global_results.to_csv(
    '../data/processed/nestedcv_chained_global_results_satellite_structure.csv',
    index=False,
)

## Fit Outsider Models
Each of these models is trained with the ecoregion it is tested on held out: it is fit on data from all the other ecoregions and then scored on the held-out one.

In [34]:
# Pre-build the results container for the outsider (leave-one-ecoregion-out) models.
# Presumably filled in by tune_outsider_model, which is defined outside this section — confirm.
# NOTE(review): ecoregions[:-1] drops the last entry of `ecoregions` — presumably deliberate; confirm.
outsider_results = build_outsider_results_dictionary(ecoregions[:-1], MODELS.keys(), SCORE_FUNCS, Y_COLS)

In [35]:
def parse_outsider_results(results):
    """Flatten one outsider model's nested scores into a long table.

    Parameters
    ----------
    results : dict
        Results for a single model; scores are read from
        ``results[ecoregion][metric][target]``.

    Returns
    -------
    pandas.DataFrame
        One row per (ecoregion, target, metric) combination, with the columns
        ``cv_fold``, ``ecoregion``, ``target``, ``metric``, ``score``.
        ``cv_fold`` is always NaN because these scores are not indexed by fold.
    """
    rows = [
        (np.nan, region, y_var, metric, results[region][metric][y_var])
        for region in ecoregions[:-1]
        for y_var in Y_COLS
        for metric in score_names
    ]
    return pd.DataFrame(rows, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

In [None]:
# Fit and score one outsider model per estimator type; each return value is later
# flattened with parse_outsider_results. The calls are kept as separate statements so
# results that have already been bound survive if a later fit is interrupted.
elastic_outsider = tune_outsider_model('ElasticNet')
lasso_outsider = tune_outsider_model('Lasso')
knn_outsider = tune_outsider_model('KNeighborsRegressor')
rf_outsider = tune_outsider_model('RandomForestRegressor')
gbm_outsider = tune_outsider_model('HistGradientBoostingRegressor')

ElasticNet
----------
Starting on BLUE MOUNTAINS... Done fitting, now scoring... All done.
Starting on CASCADES... Done fitting, now scoring... All done.
Starting on COAST RANGE... Done fitting, now scoring... All done.
Starting on EASTERN CASCADES... Done fitting, now scoring... All done.
Starting on KLAMATH MOUNTAINS... Done fitting, now scoring... All done.
Starting on NORTH CASCADES... Done fitting, now scoring... All done.
Starting on NORTHERN ROCKIES... Done fitting, now scoring... All done.
Starting on PUGET LOWLAND... Done fitting, now scoring... All done.
Starting on WILLAMETTE VALLEY... Done fitting, now scoring... All done.
Lasso
-----
Starting on BLUE MOUNTAINS... Done fitting, now scoring... All done.
Starting on CASCADES... Done fitting, now scoring... All done.
Starting on COAST RANGE... Done fitting, now scoring... All done.
Starting on EASTERN CASCADES... Done fitting, now scoring... All done.
Starting on KLAMATH MOUNTAINS... Done fitting, now scoring... All done.
Star

In [None]:
RESULTS_TO_CONCAT = [elastic_outsider, lasso_outsider, knn_outsider, rf_outsider, gbm_outsider]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']

# Parse each model's scores into a long table and tag its rows with a short model label.
dfs_to_concat = [
    parse_outsider_results(model_results).assign(model=short_name)
    for model_results, short_name in zip(RESULTS_TO_CONCAT, NAMES)
]
all_outsider_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)

# Tidy the ecoregion labels: underscores become spaces, words are title-cased,
# and only the first two words are kept.
all_outsider_results['ecoregion'] = all_outsider_results['ecoregion'].apply(
    lambda region: ' '.join(region.title().replace('_',' ').split()[:2])
)
all_outsider_results.columns = list(map(str.upper, all_outsider_results.columns))
all_outsider_results.head()

In [None]:
# Save the long-format outsider-model scores; the header row is written by default.
all_outsider_results.to_csv(
    '../data/processed/nestedcv_chained_outsider_results_satellite_structure.csv',
    index=False,
)

## Fit Insider Models
These models are trained with observations from a single ecoregion.

In [29]:
# Pre-build the results container for the insider (single-ecoregion) models. It is read
# later as insider_results[train_region][model_name]['fitted_model'].
# NOTE(review): the fold count is hard-coded as 5 here, whereas the global results
# dictionary is built with NUM_OUTER_FOLDS — confirm the two are meant to match.
insider_results = build_insider_results_dictionary(ecoregions[:-1], MODELS.keys(), 5, SCORE_FUNCS, Y_COLS)

In [30]:
def parse_insider_results(results):
    """Flatten one insider model's nested cross-validation scores into a long table.

    Parameters
    ----------
    results : dict
        Results for a single model; scores are read from
        ``results[ecoregion][fold][metric][target]``. Every key found under
        ``results[ecoregion]`` is treated as a fold.

    Returns
    -------
    pandas.DataFrame
        One row per (ecoregion, fold, target, metric) combination, with the
        columns ``cv_fold``, ``ecoregion``, ``target``, ``metric``, ``score``.
    """
    rows = [
        (fold_num, region, y_var, metric, fold_scores[metric][y_var])
        for region in ecoregions[:-1]
        for fold_num, fold_scores in results[region].items()
        for y_var in Y_COLS
        for metric in score_names
    ]
    return pd.DataFrame(rows, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

In [31]:
# Fit and score one insider model per estimator type; each return value is later
# flattened with parse_insider_results. The calls are kept as separate statements so
# results that have already been bound survive if a later fit is interrupted.
elastic_insider = tune_insider_model('ElasticNet')
lasso_insider = tune_insider_model('Lasso')
knn_insider = tune_insider_model('KNeighborsRegressor')
rf_insider = tune_insider_model('RandomForestRegressor')
gbm_insider = tune_insider_model('HistGradientBoostingRegressor')

ElasticNet
----------
Starting on BLUE MOUNTAINS... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on CASCADES... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on COAST RANGE... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on EASTERN CASCADES... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on KLAMATH MOUNTAINS... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on NORTH CASCADES... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on NORTHERN ROCKIES... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on PUGET LOWLAND... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Starting on WILLAMETTE VALLEY... 1... 2... 3... 4... 5... Done scoring. Now fitting a final model... All done.
Lasso


In [32]:
RESULTS_TO_CONCAT = [elastic_insider, lasso_insider, knn_insider, rf_insider, gbm_insider]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']

# Parse each model's scores into a long table and tag its rows with a short model label.
dfs_to_concat = [
    parse_insider_results(model_results).assign(model=short_name)
    for model_results, short_name in zip(RESULTS_TO_CONCAT, NAMES)
]
all_insider_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)

# Tidy the ecoregion labels: underscores become spaces, words are title-cased,
# and only the first two words are kept.
all_insider_results['ecoregion'] = all_insider_results['ecoregion'].apply(
    lambda region: ' '.join(region.title().replace('_',' ').split()[:2])
)
all_insider_results.columns = list(map(str.upper, all_insider_results.columns))
all_insider_results.head()

Unnamed: 0,CV_FOLD,ECOREGION,TARGET,METRIC,SCORE,MODEL
0,1,Blue Mountains,total_cover,rmse,8.713998,ElasticNet
1,1,Blue Mountains,total_cover,nrmse,0.251769,ElasticNet
2,1,Blue Mountains,total_cover,mae,7.248796,ElasticNet
3,1,Blue Mountains,total_cover,mape,0.209436,ElasticNet
4,1,Blue Mountains,total_cover,bias,1.29172,ElasticNet


In [33]:
# Save the long-format insider-model scores; the header row is written by default.
all_insider_results.to_csv(
    '../data/processed/nestedcv_chained_insider_results_satellite_structure.csv',
    index=False,
)

## Use Trained Insider Models to Score Visiting Insider Models
These models are trained on a single region, and scored on other regions they've never seen before. 

In [34]:
# NOTE(review): `visitor_results` is rebound to an empty list at the top of the next cell
# before anything reads the dictionary built here — confirm whether this call is still needed.
visitor_results = build_visiting_insider_results_dictionary(ecoregions[:-1], MODELS.keys(), SCORE_FUNCS, Y_COLS)

In [35]:
# Score every fitted insider model on each ecoregion it was NOT trained on.
# Each record is (target region, training region, model, metric, response variable, score).
# NOTE(review): this rebinds `visitor_results`, replacing the dictionary created in the
# preceding cell.
# NOTE(review): unlike the global/outsider/insider tables, region labels here are not
# truncated to two words and MODEL holds the full MODELS key rather than the short
# names ('kNN', 'RF', 'GBM') — confirm before joining these tables downstream.
visitor_results = []
for target_region in ecoregions[:-1]:
    # The target-region observations depend only on the target region, so slice them
    # once here rather than again for every (training region, model) pair.
    targ_idx = X.loc[X.ecoregion3 == target_region].index.values
    targ_X = X.loc[targ_idx].drop(['ecoregion3'], axis=1)
    obs = Y.loc[targ_idx]
    target_label = ' '.join(target_region.title().replace('_',' ').split())

    for train_region in [r for r in ecoregions[:-1] if r != target_region]:
        train_label = ' '.join(train_region.title().replace('_',' ').split())

        for model_name in MODELS.keys():
            model = insider_results[train_region][model_name]['fitted_model']
            pred = model.predict(targ_X)

            for score_func in SCORE_FUNCS:
                score_func_name = score_func.__name__
                scores = score_func(obs, pred)
                # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
                for y_var, score in scores.items():
                    visitor_results.append(
                        (target_label, train_label,
                         model_name, score_func_name, y_var, score))

visitor_df = pd.DataFrame(visitor_results,
                          columns=['TARGET_ECOREGION', 'TRAIN_ECOREGION',
                                   'MODEL', 'METRIC', 'TARGET', 'SCORE'])
visitor_df.head()

Unnamed: 0,TARGET_ECOREGION,TRAIN_ECOREGION,MODEL,METRIC,TARGET,SCORE
0,Blue Mountains,Cascades,ElasticNet,rmse,total_cover,25.945215
1,Blue Mountains,Cascades,ElasticNet,rmse,topht,27.145965
2,Blue Mountains,Cascades,ElasticNet,rmse,qmd,15.284737
3,Blue Mountains,Cascades,ElasticNet,rmse,tcuft,3832.520344
4,Blue Mountains,Cascades,ElasticNet,nrmse,total_cover,0.781861


In [36]:
# Save the long-format visitor scores; the header row is written by default.
visitor_df.to_csv(
    '../data/processed/nestedcv_chained_visitor_results_satellite_structure.csv',
    index=False,
)