In [None]:
import pickle
from copy import deepcopy
import time

# data prep and model-tuning
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# types of models we'll fit
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
# from sklearn.multioutput import RegressorChain
from sklearn.base import clone

In [None]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
from warnings import simplefilter
# Silence every UserWarning for the remainder of the session.
# NOTE(review): scikit-learn's ConvergenceWarning derives from UserWarning, so
# non-converged ElasticNet/Lasso fits in the grid searches are hidden as well --
# confirm that is intended.
simplefilter(action='ignore', category=UserWarning)

In [None]:
# Plot styling for the notebook; figures are rendered inline in the cell output.
sns.set_style('darkgrid')
%matplotlib inline

## Our Data

In [None]:
# Plot-level attributes; only the columns listed in KEEP_PLOT_COLS are retained.
PLOT_DATA = '../data/processed/plot_features.csv'
KEEP_PLOT_COLS = ['uuid', 'lat', 'lon', 'ecoregion3', 'agency', 'distance_to_water_m', 'plot_size_ac', 'meas_yr']
plot_data = pd.read_csv(PLOT_DATA)[KEEP_PLOT_COLS]
plot_data.info()

In [None]:
# From the lidar feature table only the plot id and its elevation are used.
LIDAR_DATA = '../data/processed/lidar_features.csv'
lidar_data = pd.read_csv(LIDAR_DATA)[['uuid', 'elevation']]
lidar_data.info()

In [None]:
# Attach elevation to each plot (inner join on uuid) and keep a single row per plot.
plot_data = (
    plot_data
    .merge(lidar_data, on='uuid', how='inner')
    .drop_duplicates(subset='uuid')
)
plot_data.info()

In [None]:
# Inventory attributes, indexed by (uuid, year) so they can be joined to the imagery features.
INVENTORY = '../data/processed/inventory_features.csv'
inv_data = pd.read_csv(INVENTORY, index_col=['uuid', 'year'])
inv_data.info()

In [None]:
# Satellite features, indexed by (uuid, year).
SATELLITE = '../data/processed/satellite_features.csv'
sat = pd.read_csv(SATELLITE, index_col=['uuid', 'year'])
# Predictor columns are selected by prefix: 'S2*' and 'LT*' (named LANDTRENDR_COLS here).
S2_COLS = [col for col in sat.columns if col.startswith('S2')]
LANDTRENDR_COLS = [col for col in sat.columns if col.startswith('LT')]
# Drop every (uuid, year) row that is missing any of the selected features.
sat = sat[S2_COLS + LANDTRENDR_COLS].dropna()
sat.info()

## Filter out some of the training data
We can exclude some of the training data based on how many years separate the inventory data (interpolated using FVS simulations) from the year the imagery was collected. Similarly, we can screen out training examples that had a relatively low density of lidar returns.

In [None]:
# Inner join on the shared (uuid, year) index, then turn uuid/year back into ordinary columns.
sat_and_inv = sat.merge(inv_data, how='inner', left_index=True, right_index=True).reset_index()
sat_and_inv.info()

In [None]:
# Add the plot-level attributes to every (uuid, year) sample and discard incomplete rows.
df = sat_and_inv.merge(plot_data, how='inner', on='uuid').dropna()
print(f'{len(df):,d} samples')
print('Columns:', df.columns.values)

In [None]:
# Plots listed in this file (column `outlier_uuid`) are excluded from modelling.
OUTLIERS = '../data/interim/outlier_uuids.csv'
outliers = pd.read_csv(OUTLIERS)
# filter out the height outliers
df = df[~df.uuid.isin(outliers.outlier_uuid)]
df.info()

In [None]:
# Keep samples with positive top height and QMD and at least 10 (units as stored) total cover.
# .copy() makes `df` an independent frame so the assignment below neither triggers
# SettingWithCopyWarning nor depends on view-vs-copy behaviour of the filter.
df = df.loc[(df.topht > 0) & (df.total_cover >= 10) & (df.qmd > 0)].copy()
# Cap QMD at 50 so a few very large values do not dominate the fit.
df.loc[df.qmd > 50, 'qmd'] = 50
df.info()

## Inspect how many samples we have for different years, regions, etc.

In [None]:
# Number of samples per imagery year.
df.groupby(by=['year'])[['uuid']].count().rename({'uuid':'count'}, axis=1)

In [None]:
# Cross-tabulate sample counts: rows are `meas_yr`, columns are `year`; empty cells shown as 0.
pd.pivot_table(df, 
               values='uuid', 
               aggfunc='count', 
               index=['meas_yr'], 
               columns=['year'], 
               fill_value=0)

In [None]:
# Distinct plots (uuid), years and plot sizes per ecoregion. The 'uuid' column of this
# table is used later to decide which ecoregions have enough plots to model separately.
ecoreg_counts = df.groupby(by=['ecoregion3'])[['uuid', 'year', 'plot_size_ac']].nunique()
ecoreg_counts

## Available features
The different types of predictor variables we can use to predict a forest attribute, including climate, lidar-derived, soil, and satellite imagery.

In [None]:
# Summary statistics of the satellite-derived predictors.
df[S2_COLS + LANDTRENDR_COLS].describe()

## Selecting features and targets
This is the first step in determining what features we want to use, and what we want to predict.

In [None]:
# Predictors: satellite features plus elevation and location. 'ecoregion3' is carried along
# only so rows can be split by region; it is dropped before any model is fitted.
X_COLS = S2_COLS + LANDTRENDR_COLS + ['elevation', 'lat', 'lon'] + ['ecoregion3'] 
# Targets; a separate model is tuned for each one.
Y_COLS = ['total_cover', 'topht', 'qmd', 'tcuft']

Y_NAMES = [col.upper() for col in Y_COLS]

In [None]:
# Ecoregions shown in the summary table below.
# NOTE: `.loc[USE_REGIONS]` raises KeyError if any listed region is absent from `df`.
USE_REGIONS = ['blue_mountains', 'coast_range', 'north_cascades', 'cascades',
               'klamath_mountains_california_high_north_coast_range', 
               'eastern_cascades_slopes_and_foothills', 'northern_rockies',
               'puget_lowland', 'willamette_valley']
# Mean of each target per ecoregion, followed by overall target statistics.
display(df.groupby('ecoregion3')[Y_COLS].mean().round(1).loc[USE_REGIONS])
display(df[Y_COLS].describe())

In [None]:
# Give the filtered frame a fresh 0..n-1 index so row labels are unique and contiguous;
# later cells look rows up by these labels (e.g. Y.loc[idx], df.loc[X.index, 'uuid']).
df = df.reset_index(drop=True)
X, Y = df[X_COLS], df[Y_COLS]

In [None]:
# Column dtypes and non-null counts of the predictor set.
df[X_COLS].info()

## Split datasets by ecoregion
We want to explore model transferability between regions, so we'll train models independently on subsets of the data within a single ecoregion, as well as a model that is trained on all available ecoregions. 

In [None]:
# Keep only ecoregions represented by more than 20 distinct plots (uuids), sorted by name.
ecoregions = list(np.sort([reg for reg in pd.unique(df.ecoregion3) if ecoreg_counts.loc[reg]['uuid'] > 20]))

# Row labels of X that belong to each retained ecoregion (same order as `ecoregions`).
eco_X_idx = [X.loc[X.ecoregion3 == eco].index.values for eco in ecoregions]

# Per-ecoregion feature and target frames; the ecoregion label itself is not a predictor.
eco_X_dfs = [X.loc[X.ecoregion3 == eco].drop(['ecoregion3'], axis=1) for eco in ecoregions]
eco_Y_dfs = [Y.loc[idx] for idx in eco_X_idx]

# append a "global" model that contains data from all ecoregions
# NOTE: 'all' must remain the LAST entry -- other cells use ecoregions[:-1] to mean
# "the individual regions", and eco_X_dfs / eco_Y_dfs are indexed in parallel with this list.
ecoregions.append('all')
# Short identifiers built from the first two words, e.g. 'coast_range'.
ecoregion_names = ['_'.join(x.split('_')[0:2]) for x in ecoregions]
# The global frames use every row of X / Y, including rows from ecoregions that were
# too small to be retained above.
eco_X_dfs.append(X.drop(['ecoregion3'], axis=1))
eco_Y_dfs.append(Y)

# Upper-case labels for progress messages, e.g. 'COAST RANGE'.
ecoregion_display_names = [' '.join(x.upper().split('_')[:2]) for x in ecoregions]

In [None]:
# Bin edges and matching labels for turning continuous values into classes.
# Each label list has one entry fewer than its edge list (one label per interval).
cover_class_bins = [10,40,70,100]
cover_class_labels = ['OPEN', 'MODERATE', 'CLOSED']
# 20-unit-wide height classes: edges 0, 20, ..., 280.
height_class_bins = np.arange(0,300,20)
height_class_labels = [f'{x}-{x+20}' for x in height_class_bins[:-1]]
# 999 acts as an open-ended upper edge for the largest diameter class.
diameter_class_bins = [1, 5, 10, 15, 20, 999]
diameter_class_labels = ['SEED/SAP', 'SMALL', 'MEDIUM', 'LARGE', 'VERY_LARGE']

## Scoring
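We'll use Root Mean Square Error (RMSE) as the main measure of model performance, and also report normalized RMSE, mean absolute error, mean-normalized absolute error, bias, and relative bias.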

In [None]:
def rmse(obs, pred):
    """Root mean square error between observed and predicted values."""
    squared_error = np.square(obs - pred)
    return np.sqrt(squared_error.mean())

def nrmse(obs, pred):
    """RMSE expressed as a fraction of the mean observed value."""
    # Squared error is symmetric, so the order in which rmse receives its arguments is irrelevant.
    return rmse(obs, pred) / obs.mean()

def mae(obs, pred):
    """Mean absolute error between predicted and observed values."""
    absolute_error = np.abs(pred - obs)
    return absolute_error.mean()

def mape(obs, pred):
    """Mean absolute error divided by the mean observed value.

    Note: despite the name, this is not a per-sample percentage error; the
    absolute errors are averaged first and then normalised by mean(obs).
    """
    mean_abs_error = np.abs(pred - obs).mean()
    return mean_abs_error / obs.mean()

def bias(obs, pred):
    """Mean signed error, pred - obs; positive means over-prediction."""
    return (pred - obs).mean()

def rel_bias(obs, pred):
    """Bias expressed as a fraction of the mean observed value.

    Uses the same sign convention as `bias` (positive = over-prediction).
    The arguments are forwarded in (obs, pred) order; passing them swapped
    would flip the sign relative to `bias`.
    """
    return bias(obs, pred) / obs.mean()

def bin_accuracy(obs, pred, bins, fuzzy_tol=0):
    """Fraction of predictions that fall in (or near) the same bin as the observation.

    Parameters
    ----------
    obs, pred : array-like of numbers
    bins : monotonic sequence of bin edges, passed to ``np.digitize``
    fuzzy_tol : int, number of bins a prediction may be off and still count as correct
        (0 = exact bin match).

    Returns
    -------
    float in [0, 1]; NaN for empty input.
    """
    pred_binned = np.digitize(pred, bins)
    obs_binned = np.digitize(obs, bins)
    # atleast_1d lets scalar inputs work as a one-element sample.
    diff = np.atleast_1d(np.abs(pred_binned - obs_binned))

    return (diff <= fuzzy_tol).mean()

def confidence_interval_half(X, confidence=0.95):
    """Half-width of the two-sided Student-t confidence interval for the mean of X."""
    degrees_of_freedom = len(X) - 1
    t_critical = stats.t.ppf((1 + confidence) / 2., degrees_of_freedom)
    return stats.sem(X) * t_critical

## Fit some models
For each type of model, we'll employ cross-validation to tune model hyperparameters, generating a tuned model for each ecoregion as well as a tuned model using all training data. 

In [None]:
# Fixed seed for the stochastic estimators so that repeated runs of the notebook
# produce the same fitted models and scores.
RANDOM_STATE = 42

# Un-tuned estimator prototypes, keyed by the name used in result dictionaries and
# output file names. Each tuning function clones the prototype into a fresh pipeline.
MODELS = {
    'ElasticNet': ElasticNet(),
    'Lasso': Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    'RandomForestRegressor': RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Hyper-parameter grids searched for each estimator (same keys as MODELS). The tuning
# functions prefix every parameter with 'model__' to address the pipeline step.
FIT_PARAMS = {
    'ElasticNet': {
        'alpha': np.logspace(-4,2,7),            # 1e-4 ... 1e2
        'l1_ratio': np.arange(0.0, 1.0, 0.1),    # 0.0 ... 0.9 (1.0, i.e. pure L1, is covered by 'Lasso')
    },
    'Lasso': {
        'alpha': np.logspace(-4,2,7),
    },
    'KNeighborsRegressor': {
        'n_neighbors': [1,2,3,4,5,10,20],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'manhattan']
    },
    'RandomForestRegressor': {
        'n_estimators': [100, 500, 1000],
        'max_features': ['sqrt', None],
        'max_depth': [5, 20, None],
        'max_samples': [0.5, None]
    },
    'HistGradientBoostingRegressor': {
        'max_iter': [50, 100, 200],
        'min_samples_leaf': [5, 10, 20],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.01, 0.1],
    },
}

In [None]:
# Nested cross-validation layout: the outer folds estimate performance, the inner
# folds choose hyper-parameters within each outer training set.
NUM_OUTER_FOLDS = 5
NUM_INNER_FOLDS = 3
# Every metric is called as func(obs, pred); its __name__ is used as the key in result dictionaries.
SCORE_FUNCS = [rmse, nrmse, mae, mape, bias, rel_bias]
score_names = [func.__name__ for func in SCORE_FUNCS]

In [None]:
def build_insider_results_dictionary(regions, model_names, num_outer_folds, score_funcs, target_vars):
    """Pre-allocate the nested result container for region-specific ("insider") models.

    Layout: results[region][model_name][target] ->
        {'fitted_model': None, 'best_params': None,
         'cv_results': {fold_num: {'best_params': None, 'predict_time': None,
                                   <score name>: None, ...}}}
    with fold_num running from 1 to num_outer_folds.
    """
    metric_names = [func.__name__ for func in score_funcs]

    def _empty_fold():
        # One independent dict per fold so later writes never alias each other.
        fold = {'best_params': None, 'predict_time': None}
        for metric in metric_names:
            fold[metric] = None
        return fold

    def _empty_target():
        return {
            'fitted_model': None,
            'best_params': None,
            'cv_results': {fold_num: _empty_fold() for fold_num in range(1, num_outer_folds + 1)},
        }

    return {
        region: {
            model_name: {y_col: _empty_target() for y_col in target_vars}
            for model_name in model_names
        }
        for region in regions
    }

def parse_insider_results(results):
    """Flatten one model's insider CV scores into a long-format DataFrame.

    `results` is indexed as results[ecoregion][target][fold_num][score_name].
    Regions come from the notebook-level `ecoregions` list (its last entry is
    skipped), targets from `Y_COLS` and metrics from `score_names`.
    """
    records = [
        (fold_num, ecoregion, target, score_name, fold_scores[score_name])
        for ecoregion in ecoregions[:-1]
        for target in Y_COLS
        for fold_num, fold_scores in results[ecoregion][target].items()
        for score_name in score_names
    ]
    return pd.DataFrame(records, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

def build_global_results_dictionary(regions, model_names, num_outer_folds, score_funcs, target_vars):
    """Pre-allocate the nested result container for the all-region ("global") models.

    Layout: results[model_name][target] ->
        {'fitted_model': None, 'best_params': None,
         'cv_results': {fold_num: {'best_params': None, 'predict_time': None,
                                   <region>: {<score name>: None, ...}, ...}}}
    with fold_num running from 1 to num_outer_folds.
    """
    metric_names = [func.__name__ for func in score_funcs]

    def _empty_fold():
        fold = {'best_params': None, 'predict_time': None}
        # Scores are kept per region so the global model can be compared region by region.
        for region in regions:
            fold[region] = dict.fromkeys(metric_names)
        return fold

    def _empty_target():
        return {
            'fitted_model': None,
            'best_params': None,
            'cv_results': {fold_num: _empty_fold() for fold_num in range(1, num_outer_folds + 1)},
        }

    return {
        model_name: {y_col: _empty_target() for y_col in target_vars}
        for model_name in model_names
    }

def build_outsider_results_dictionary(regions, model_names, score_funcs, target_vars):
    """Pre-allocate the result container for leave-one-region-out ("outsider") models.

    Layout: results[region][model_name][target] ->
        {'fitted_model': None, 'best_params': None, 'predict_time': None,
         <score name>: None, ...}
    """
    metric_names = [func.__name__ for func in score_funcs]

    def _empty_target():
        entry = {'fitted_model': None, 'best_params': None, 'predict_time': None}
        for metric in metric_names:
            entry[metric] = None
        return entry

    return {
        region: {
            model_name: {y_col: _empty_target() for y_col in target_vars}
            for model_name in model_names
        }
        for region in regions
    }

def build_visiting_insider_results_dictionary(regions, model_names, score_funcs, target_vars):
    """Pre-allocate the container for scoring region-specific models on other regions.

    Layout: results[target_region][train_region][model_name][score name][target] -> None,
    for every ordered pair of distinct regions.
    """
    metric_names = [func.__name__ for func in score_funcs]

    def _empty_model():
        return {metric: {y: None for y in target_vars} for metric in metric_names}

    results = {}
    for target_region in regions:
        per_train_region = {}
        for train_region in regions:
            if train_region == target_region:
                # A model is never a "visitor" in the region it was trained on.
                continue
            per_train_region[train_region] = {name: _empty_model() for name in model_names}
        results[target_region] = per_train_region
    return results

In [None]:
def tune_insider_model(model_name, num_outer_folds=NUM_OUTER_FOLDS, num_inner_folds=NUM_INNER_FOLDS):
    """Tune, score and save one region-specific ("insider") model per ecoregion and target.

    For every ecoregion other than 'all' and every column in Y_COLS:
      1. Nested cross-validation. Outer GroupKFold folds (grouped by plot `uuid`, so a
         plot never appears in both train and test) give the test scores; within each
         outer training set an inner grouped grid search picks the hyper-parameters.
         Scores, chosen parameters and mean per-sample prediction time are written into
         the notebook-level `insider_results` dictionary.
      2. A final grid search over all rows of the ecoregion selects the production
         hyper-parameters; the refitted pipeline is pickled to
         '../models/structure_models/<region>-sentinel-<model_name>-<target>.pkl'.

    Parameters
    ----------
    model_name : str, key into MODELS and FIT_PARAMS.
    num_outer_folds, num_inner_folds : int, number of GroupKFold splits.

    Returns
    -------
    dict : {ecoregion: {target: cv_results}} where cv_results maps the 1-based outer
        fold number to that fold's scores (same objects as stored in `insider_results`).
    """
    print(model_name)
    print('-'*len(model_name))
    fit_params = FIT_PARAMS[model_name]
    train_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', clone(MODELS[model_name])),
    ])
    search_params = {f'model__{key}': value for key, value in fit_params.items()}

    cv_outer = GroupKFold(num_outer_folds)
    cv_inner = GroupKFold(num_inner_folds)

    # Create the output folder up front so a long search cannot fail at the save step.
    model_dir = '../models/structure_models'
    os.makedirs(model_dir, exist_ok=True)

    for ecoregion in train_regions:
        # eco_X_dfs / eco_Y_dfs / ecoregion_display_names are parallel to `ecoregions`.
        eco_idx = ecoregions.index(ecoregion)
        ecoregion_name = ecoregion_display_names[eco_idx]
        print(f'Starting on {ecoregion_name}')
        for y_col in Y_COLS:
            print(f'    {y_col}', end='... ')
            X = eco_X_dfs[eco_idx]
            Y = eco_Y_dfs[eco_idx][y_col]
            outer_groups = df.loc[X.index, 'uuid'].values
            target_results = insider_results[ecoregion][model_name][y_col]

            splits = cv_outer.split(X, groups=outer_groups)
            for outer_fold_num, (train_ix, test_ix) in enumerate(splits, start=1):
                X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
                Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                inner_groups = outer_groups[train_ix]
                # Tolerate a results dictionary that was pre-built with fewer folds.
                fold_results = target_results['cv_results'].setdefault(outer_fold_num, {})

                inner_search = GridSearchCV(pipe, search_params,
                                            scoring='neg_mean_squared_error',
                                            n_jobs=-1, cv=cv_inner, refit=True)

                inner_result = inner_search.fit(X_train, Y_train, groups=inner_groups)
                fold_results['best_params'] = inner_result.best_params_

                inner_best_model = inner_result.best_estimator_
                start_time = time.time()
                Y_pred = inner_best_model.predict(X_test)
                end_time = time.time()
                # Average wall-clock prediction time per test sample.
                fold_results['predict_time'] = (end_time - start_time) / len(X_test)

                for score_func in SCORE_FUNCS:
                    fold_results[score_func.__name__] = score_func(Y_test, Y_pred)

                print(outer_fold_num, end='... ')
            print('Done scoring.', end='... ')

            # done with scoring of models, now time to tune a model using the whole dataset
            outer_search = GridSearchCV(pipe, search_params,
                                        scoring='neg_mean_squared_error',
                                        n_jobs=-1, cv=cv_outer, refit=True)
            outer_result = outer_search.fit(X, Y, groups=outer_groups)

            # refit=True means best_estimator_ has already been re-trained with the winning
            # parameters on every row of this ecoregion, so no further fit is needed.
            final_model = outer_result.best_estimator_

            eco_name = '_'.join(ecoregion.split('_')[:2])
            outfile = f'{eco_name}-sentinel-{model_name}-{y_col}.pkl'
            outpath = os.path.join(model_dir, outfile)
            with open(outpath, 'wb') as file:
                pickle.dump(final_model, file)

            target_results['fitted_model'] = final_model
            target_results['best_params'] = outer_result.best_params_
            print('All done.')

    # Assemble the return value once, after all regions are done (also defined when
    # there are no regions to train on).
    cv_results_dict = {ecoregion: {y_col: insider_results[ecoregion][model_name][y_col]['cv_results'] for y_col in Y_COLS}
                       for ecoregion in train_regions}

    return cv_results_dict

def tune_outsider_model(model_name, num_folds=5):
    """Tune and score "outsider" models: trained without the region they are tested on.

    For every ecoregion other than 'all', the rows of that ecoregion form the test set
    and all remaining rows of `df` form the training set. Hyper-parameters are chosen by
    a grid search whose GroupKFold folds are grouped by ecoregion, i.e. each validation
    fold consists of whole regions unseen during that fit. GroupKFold therefore needs at
    least `num_folds` distinct ecoregions in the training set. Results are written into
    the notebook-level `outsider_results` dictionary.

    Parameters
    ----------
    model_name : str, key into MODELS and FIT_PARAMS.
    num_folds : int, number of GroupKFold splits used for tuning.

    Returns
    -------
    dict : {ecoregion: {target: result entry}} (same objects as in `outsider_results`).
    """
    print(model_name)
    print('-'*len(model_name))
    fit_params = FIT_PARAMS[model_name]
    train_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', clone(MODELS[model_name])),
    ])
    search_params = {f'model__{key}': value for key, value in fit_params.items()}

    groupkfold = GroupKFold(num_folds)

    for ecoregion in train_regions:
        ecoregion_name = ecoregion_display_names[ecoregions.index(ecoregion)]
        print(f'Starting on {ecoregion_name}')

        # The split depends only on the held-out region, so build it once per region.
        held_out = df.ecoregion3 == ecoregion
        X_train = df.loc[~held_out, X_COLS].drop('ecoregion3', axis=1)
        X_test = df.loc[held_out, X_COLS].drop('ecoregion3', axis=1)
        groups = df.loc[~held_out, 'ecoregion3'].values

        for y_col in Y_COLS:
            print(f'    {y_col}', end='... ')
            Y_train = df.loc[~held_out, y_col]
            Y_test = df.loc[held_out, y_col]
            target_results = outsider_results[ecoregion][model_name][y_col]

            search = GridSearchCV(pipe, search_params,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=-1, cv=groupkfold, refit=True)

            result = search.fit(X_train, Y_train, groups=groups)
            print('Done fitting, now scoring', end='... ')
            best_model = result.best_estimator_
            target_results['best_params'] = result.best_params_
            target_results['fitted_model'] = best_model

            start_time = time.time()
            Y_pred = best_model.predict(X_test)
            end_time = time.time()
            # Average wall-clock prediction time per test sample.
            target_results['predict_time'] = (end_time - start_time) / len(X_test)

            for score_func in SCORE_FUNCS:
                target_results[score_func.__name__] = score_func(Y_test, Y_pred)
            print('All done.')

    # Assemble the return value once, after all regions are done (also defined when
    # there are no regions to evaluate).
    results_dict = {ecoregion: {y_col: outsider_results[ecoregion][model_name][y_col] for y_col in Y_COLS} for ecoregion in train_regions}

    return results_dict

def tune_global_model(model_name, num_outer_folds=NUM_OUTER_FOLDS, num_inner_folds=NUM_INNER_FOLDS):
    """Tune, score and save one all-region ("global") model per target.

    For every column in Y_COLS:
      1. Nested cross-validation over all rows of `df`. Outer and inner folds are
         GroupKFold splits grouped by plot `uuid`. Each outer test fold is scored
         separately for every ecoregion other than 'all'; scores, chosen parameters and
         mean per-sample prediction time go into the notebook-level `global_results`.
         A region with no rows in a test fold gets NaN scores for that fold.
      2. A final grid search over all rows selects the production hyper-parameters; the
         refitted pipeline is pickled to
         '../models/structure_models/global-sentinel-<model_name>-<target>.pkl'.

    Parameters
    ----------
    model_name : str, key into MODELS and FIT_PARAMS.
    num_outer_folds, num_inner_folds : int, number of GroupKFold splits.

    Returns
    -------
    dict : global_results[model_name], i.e. {target: result entry}.
    """
    print(model_name)
    print('-'*len(model_name))
    fit_params = FIT_PARAMS[model_name]
    test_regions = [x for x in ecoregions if x.upper() != 'ALL']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', clone(MODELS[model_name])),
    ])
    search_params = {f'model__{key}': value for key, value in fit_params.items()}

    cv_outer = GroupKFold(num_outer_folds)
    cv_inner = GroupKFold(num_inner_folds)

    X = df[X_COLS].drop('ecoregion3', axis=1)
    Y = df[Y_COLS]
    outer_groups = df['uuid'].values
    region_labels = df['ecoregion3'].values

    # Create the output folder up front so a long search cannot fail at the save step.
    model_dir = '../models/structure_models'
    os.makedirs(model_dir, exist_ok=True)

    for y_col in Y_COLS:
        print(f'{y_col}', end='... ')
        y = Y[y_col]
        target_results = global_results[model_name][y_col]

        splits = cv_outer.split(X, groups=outer_groups)
        for outer_fold_num, (train_ix, test_ix) in enumerate(splits, start=1):
            # The splitter yields positions, so index positionally; this stays correct
            # even if df does not carry a 0..n-1 index.
            X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
            Y_train, Y_test = y.iloc[train_ix], y.iloc[test_ix]
            inner_groups = outer_groups[train_ix]
            # Tolerate a results dictionary that was pre-built with fewer folds.
            fold_results = target_results['cv_results'].setdefault(outer_fold_num, {})

            inner_search = GridSearchCV(pipe, search_params,
                                        scoring='neg_mean_squared_error',
                                        n_jobs=-1, cv=cv_inner, refit=True)

            inner_result = inner_search.fit(X_train, Y_train, groups=inner_groups)
            fold_results['best_params'] = inner_result.best_params_

            inner_best_model = inner_result.best_estimator_
            start_time = time.time()
            Y_pred = inner_best_model.predict(X_test)
            end_time = time.time()
            # Average wall-clock prediction time per test sample.
            fold_results['predict_time'] = (end_time - start_time) / len(X_test)

            test_region_labels = region_labels[test_ix]
            for ecoregion in test_regions:
                # Reuse the fold's predictions instead of predicting again per region.
                region_mask = test_region_labels == ecoregion
                regional_Y_test = Y_test[region_mask]
                regional_Y_pred = Y_pred[region_mask]

                region_scores = fold_results.setdefault(ecoregion, {})
                for score_func in SCORE_FUNCS:
                    region_scores[score_func.__name__] = score_func(regional_Y_test, regional_Y_pred)

            print(outer_fold_num, end='... ')

        print('Done scoring. Now fitting a final model', end='... ')

        # done with scoring of models, now time to tune a model using the whole dataset
        outer_search = GridSearchCV(pipe, search_params,
                                    scoring='neg_mean_squared_error',
                                    n_jobs=-1, cv=cv_outer, refit=True)
        outer_result = outer_search.fit(X, y, groups=outer_groups)

        # refit=True means best_estimator_ has already been re-trained with the winning
        # parameters on every row, so no further fit is needed.
        final_model = outer_result.best_estimator_

        outfile = f'global-sentinel-{model_name}-{y_col}.pkl'
        outpath = os.path.join(model_dir, outfile)
        with open(outpath, 'wb') as file:
            pickle.dump(final_model, file)
        print('All done.')

        target_results['fitted_model'] = final_model
        target_results['best_params'] = outer_result.best_params_

    results_dict = global_results[model_name]

    return results_dict

In [None]:
def parse_global_results(results):
    """Flatten one global model's CV scores into a long-format DataFrame.

    `results` is indexed as results[target]['cv_results'][fold_num][ecoregion][score_name].
    Folds 1..NUM_OUTER_FOLDS are read; regions come from the notebook-level `ecoregions`
    list (its last entry is skipped), targets from `Y_COLS`, metrics from `score_names`.
    """
    records = []
    for fold_num in range(1, NUM_OUTER_FOLDS + 1):
        for ecoregion in ecoregions[:-1]:
            for target in Y_COLS:
                region_scores = results[target]['cv_results'][fold_num][ecoregion]
                records.extend(
                    (fold_num, ecoregion, target, score_name, region_scores[score_name])
                    for score_name in score_names
                )
    return pd.DataFrame(records, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

## Fit Global Models
These models get to see data from every ecoregion during training and tuning.

In [None]:
# Empty container that tune_global_model fills in; 'all' (last entry) is not a scoring region.
global_results = build_global_results_dictionary(ecoregions[:-1], MODELS.keys(), NUM_OUTER_FOLDS, SCORE_FUNCS, Y_COLS)

In [None]:
# Tune, score and save a global model of each type (one per target inside each call).
elastic_global = tune_global_model('ElasticNet')
lasso_global = tune_global_model('Lasso')
knn_global = tune_global_model('KNeighborsRegressor')
rf_global = tune_global_model('RandomForestRegressor')
gbm_global = tune_global_model('HistGradientBoostingRegressor')

In [None]:
# Stack the per-model score tables into one long table, tagging each row with a short model label.
RESULTS_TO_CONCAT = [elastic_global, lasso_global, knn_global, rf_global, gbm_global]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']
dfs_to_concat = []
for res, name in zip(RESULTS_TO_CONCAT, NAMES):
    tmp_df = parse_global_results(res).assign(model=name)
    dfs_to_concat.append(tmp_df)
all_global_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)
# Readable region label from the first two words, e.g. 'coast_range' -> 'Coast Range'.
all_global_results['ecoregion'] = [
    ' '.join(region.title().replace('_',' ').split()[:2])
    for region in all_global_results['ecoregion']
]
all_global_results.columns = all_global_results.columns.str.upper()
all_global_results.head()

In [None]:
# Persist the global-model scores (an existing file at this path is overwritten).
all_global_results.to_csv('../data/processed/nestedcv_unchained_global_results_satellite_structure.csv', header=True, index=False)

## Fit Outsider Models
These models have data from the ecoregion they're tested on held out during training.

In [None]:
# Empty container that tune_outsider_model fills in; 'all' (last entry) is not a held-out region.
outsider_results = build_outsider_results_dictionary(ecoregions[:-1], MODELS.keys(), SCORE_FUNCS, Y_COLS)

In [None]:
def parse_outsider_results(results):
    """Flatten one outsider model's scores into a long-format DataFrame.

    `results` is indexed as results[ecoregion][target][score_name]. Outsider models are
    scored once per held-out region, so the 'cv_fold' column is filled with NaN.
    Regions come from the notebook-level `ecoregions` list (its last entry is skipped),
    targets from `Y_COLS` and metrics from `score_names`.
    """
    records = [
        (np.nan, ecoregion, target, score_name, results[ecoregion][target][score_name])
        for ecoregion in ecoregions[:-1]
        for target in Y_COLS
        for score_name in score_names
    ]
    return pd.DataFrame(records, columns=['cv_fold', 'ecoregion', 'target', 'metric', 'score'])

In [None]:
# Linear and nearest-neighbour outsider models.
elastic_outsider = tune_outsider_model('ElasticNet')
lasso_outsider = tune_outsider_model('Lasso')
knn_outsider = tune_outsider_model('KNeighborsRegressor')
# The RandomForest and HistGradientBoosting outsider models are tuned in the following cell.

In [None]:
# Tree-ensemble outsider models, run in their own cell.
rf_outsider = tune_outsider_model('RandomForestRegressor')
gbm_outsider = tune_outsider_model('HistGradientBoostingRegressor')

In [None]:
# Stack the per-model score tables into one long table, tagging each row with a short model label.
RESULTS_TO_CONCAT = [elastic_outsider, lasso_outsider, knn_outsider, rf_outsider, gbm_outsider]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']
dfs_to_concat = []
for res, name in zip(RESULTS_TO_CONCAT, NAMES):
    tmp_df = parse_outsider_results(res).assign(model=name)
    dfs_to_concat.append(tmp_df)
all_outsider_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)
# Readable region label from the first two words, e.g. 'coast_range' -> 'Coast Range'.
all_outsider_results['ecoregion'] = [
    ' '.join(region.title().replace('_',' ').split()[:2])
    for region in all_outsider_results['ecoregion']
]
all_outsider_results.columns = all_outsider_results.columns.str.upper()
all_outsider_results.head()

In [None]:
# Persist the outsider-model scores (an existing file at this path is overwritten).
all_outsider_results.to_csv('../data/processed/nestedcv_unchained_outsider_results_satellite_structure.csv', header=True, index=False)

## Fit Insider Models
These models are trained with observations from a single ecoregion.

In [None]:
# Empty container that tune_insider_model fills in. The fold count must match the number
# of outer folds the tuner uses (its default is NUM_OUTER_FOLDS), so use the shared
# constant rather than a literal.
insider_results = build_insider_results_dictionary(ecoregions[:-1], MODELS.keys(), NUM_OUTER_FOLDS, SCORE_FUNCS, Y_COLS)

# Tune, score and save a region-specific model of each type for every ecoregion and target.
elastic_insider = tune_insider_model('ElasticNet')
lasso_insider = tune_insider_model('Lasso')
knn_insider = tune_insider_model('KNeighborsRegressor')
rf_insider = tune_insider_model('RandomForestRegressor')
gbm_insider = tune_insider_model('HistGradientBoostingRegressor')

In [None]:
# Stack the per-model score tables into one long table, tagging each row with a short model label.
RESULTS_TO_CONCAT = [elastic_insider, lasso_insider, knn_insider, rf_insider, gbm_insider]
NAMES = ['ElasticNet', 'Lasso', 'kNN', 'RF', 'GBM']
dfs_to_concat = []
for res, name in zip(RESULTS_TO_CONCAT, NAMES):
    tmp_df = parse_insider_results(res).assign(model=name)
    dfs_to_concat.append(tmp_df)
all_insider_results = pd.concat(dfs_to_concat, axis=0, ignore_index=True)
# Readable region label from the first two words, e.g. 'coast_range' -> 'Coast Range'.
all_insider_results['ecoregion'] = [
    ' '.join(region.title().replace('_',' ').split()[:2])
    for region in all_insider_results['ecoregion']
]
all_insider_results.columns = all_insider_results.columns.str.upper()
all_insider_results.head()

In [None]:
# Persist the insider-model scores (an existing file at this path is overwritten).
all_insider_results.to_csv('../data/processed/nestedcv_unchained_insider_results_satellite_structure.csv', header=True, index=False)

## Use Trained Insider Models to Score Visiting Insider Models
These models are trained on a single region, and scored on other regions they've never seen before. 

In [None]:
# NOTE(review): the following cell rebinds `visitor_results` to a list of tuples, so the
# dictionary built here is never read -- confirm whether this cell is still needed.
visitor_results = build_visiting_insider_results_dictionary(ecoregions[:-1], MODELS.keys(), SCORE_FUNCS, Y_COLS)

In [None]:
# Score every fitted insider model on each region it was NOT trained on.
# One output row per (target region, training region, model, metric, target variable).
visitor_results = []
for target_region in ecoregions[:-1]:
    # Features and display label depend only on the target region, so build them once here
    # instead of once per training region / model / target.
    targ_idx = df.loc[df.ecoregion3 == target_region].index.values
    targ_X = df.loc[targ_idx, X_COLS].drop(['ecoregion3'], axis=1)
    target_label = ' '.join(target_region.title().replace('_',' ').split())
    for train_region in [r for r in ecoregions[:-1] if r != target_region]:
        train_label = ' '.join(train_region.title().replace('_',' ').split())
        for model_name in MODELS.keys():
            for y_col in Y_COLS:
                # Final pipeline stored by tune_insider_model for the training region.
                model = insider_results[train_region][model_name][y_col]['fitted_model']
                pred = model.predict(targ_X)
                obs = df.loc[targ_idx, y_col]
                for score_func in SCORE_FUNCS:
                    score_func_name = score_func.__name__
                    score = score_func(obs, pred)
                    visitor_results.append(
                        (target_label, train_label,
                         model_name, score_func_name, y_col, score))
visitor_df = pd.DataFrame(visitor_results,
                          columns = ['TARGET_ECOREGION', 'TRAIN_ECOREGION',
                                     'MODEL', 'METRIC', 'TARGET', 'SCORE'])
visitor_df.head()

In [None]:
# Persist the visiting-insider scores (an existing file at this path is overwritten).
visitor_df.to_csv('../data/processed/nestedcv_unchained_visitor_results_satellite_structure.csv', 
                  header=True, index=False)