## Import libraries (XGB3 environment)

In [1]:
from rasterio.transform import from_origin
from rasterio.mask import mask
from rasterio.enums import Resampling
from shapely.geometry import box
from fiona.crs import from_epsg
import geopandas as gpd
import xgboost as xgb
import pandas as pd
import numpy as np
import rasterio
import os
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, GridSearchCV, StratifiedKFold, ParameterGrid
from scipy import ndimage
from sklearn.inspection import PartialDependenceDisplay, partial_dependence

import matplotlib.pyplot as plt
from plotnine import *
from scipy import interpolate
from sklearn.metrics import mean_squared_error, mean_absolute_error
import itertools
import sys
sys.version

import datetime
from dateutil.relativedelta import relativedelta
import os

from math import *

## Load the project helpers by executing Tools.py in the notebook namespace.
## NOTE(review): Tools.py presumably defines the helpers used below
## (make_dir, get_grid, aggregate_data, get_var_names, ...) — confirm.
## A regular import was tried previously and is kept here for reference:
#from Tools import *
#import Tools
## Read the file inside a context manager so the handle is closed promptly
## instead of being left to the garbage collector.
with open('Tools.py') as tools_file:
    exec(tools_file.read())

## Show every column when a DataFrame is displayed
pd.set_option('display.max_columns',None)

## Create storage directory for fitted models and figures

In [2]:
## Spatial block size
dr_grid = 50

## Run label: the block size is appended so runs with different grids
## are written to different directories
prefix = 'Fit_all_scaled_multigrid1_md1_new' + str(dr_grid)

## Fitted models go in model_path, figures in a sub-directory of it
model_path = 'Fitted_models/' + prefix + '/'
model_fig_path = model_path + 'Figures/'
for out_dir in (model_path, model_fig_path):
    make_dir(out_dir)

[Errno 17] File exists: 'Fitted_models/Fit_all_scaled_multigrid1_md1_new50/'
[Errno 17] File exists: 'Fitted_models/Fit_all_scaled_multigrid1_md1_new50/Figures/'


## Choose predictors, load and aggregate data 

In [3]:
## Load dataset
All_dat = pd.read_csv('Data/Trap_Data/Clean_Both_Data_By_Trap.csv', low_memory = False)

## Bounding box and mean latitude of the trap coordinates
xmin = All_dat.Longitude.min()
xmax = All_dat.Longitude.max()
ymin = All_dat.Latitude.min()
ymax = All_dat.Latitude.max()
mean_lat = All_dat.Latitude.mean()

## Block size divided by 111139
## NOTE(review): presumably converts metres to degrees of latitude — confirm
dy = dr_grid / 111139

## Aggregate the trap data over ngrid grids, each shifted by a random offset
## drawn from [0, drll) in x and y. The seed fixes the offsets between runs.
np.random.seed(47)
ngrid = 5
drll = dy  ## same value as dy; loop-invariant, so computed once
agg_frames = []
for ii in range(ngrid):
    ## Draw order (x first, then y) is kept so the seeded offsets are unchanged
    djx = drll*np.random.random(1)
    djy = drll*np.random.random(1)
    ## Assign every row to a grid box for this offset (first element returned by get_grid)
    All_dat.loc[:,'boxi'] = get_grid(All_dat, drll = dy,
                                djx = djx, djy =djy)[0]
    agg_data = aggregate_data(All_dat, 'boxi', get_var_names() + ['Longitude', 'Latitude'])
    ## Keep only boxes with TotTraps > 0; copy so the gridi assignment
    ## below writes to an independent frame rather than a slice
    agg_data = agg_data.loc[agg_data.TotTraps > 0,:].copy()
    agg_data.loc[:,'gridi'] = ii
    agg_frames.append(agg_data)

## Concatenate once instead of growing a DataFrame inside the loop
all_agg_data = pd.concat(agg_frames)

agg_data = all_agg_data



In [4]:
## Centre the response within each site: compute the mean TS_Mn per Site,
## attach it to every row as mean_ts, then subtract it from TS_Mn.
## NOTE: this cell overwrites agg_data and TS_Mn, so it is not idempotent;
## re-run the aggregation cell before running it again.
site_means = agg_data.groupby('Site')['TS_Mn'].mean().to_frame(name = 'mean_ts')
agg_data = agg_data.merge(site_means, left_on = 'Site', right_index = True)
centred_ts = agg_data.loc[:,'TS_Mn'] - agg_data.loc[:,'mean_ts']
agg_data.loc[:,'TS_Mn'] = centred_ts

In [5]:
## Earliest Date recorded for each Visit label
All_dat.groupby('Visit').apply(lambda visit_rows: visit_rows.Date.min())

Visit
1         2019-09-07
2         2019-11-05
3         2020-01-08
4         2020-03-02
5         2020-12-02
Jan-04    2004-01-26
Jan-05    2005-01-24
May-03    2003-05-24
May-04    2004-05-20
Oct-03    2003-09-23
Oct-04    2004-10-03
dtype: object

## Set model hyperparameters

In [6]:
## Every site takes a turn as the held-out test site
town_list = All_dat.Site.unique()

## Number of boosting rounds fitted, and the round counts (50, 100, ...,
## strictly below max_iters) at which the losses are recorded
max_iters = 2000
vary_n_iters = list(range(50, max_iters, 50))

## Hyperparameter grid. Single-element lists are held fixed; the grid is the
## product of subsample x colsample_bytree x test_site.
## Alternative grids tried previously (same keys, only these entries differ):
##   - max_depth = [4] with gamma = [0, 0.1, 0.5, 1, 2, 5]
##   - a single point: subsample = [0.1], colsample_bytree = [0.1]
param_dict = {
    'eval_metric': ['rmse'],
    'nthread': [2],
    'max_depth': [1],
    'subsample': [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5],
    'n_estimators': [max_iters],
    'colsample_bytree': [0.05, 0.1, 0.2, 0.3],
    'lambda': [0],
    'eta': [0.01],
    'test_site': town_list,
    'gamma': [0],
    'dr': [0],
    'booster': ['gbtree'],
    'prec': [True],
}

## Subset of the keys above that is forwarded to the XGBoost model;
## the remaining keys (test_site, dr, prec) are not in this list
xgboost_param_names = [
    'eval_metric', 'nthread',
    'max_depth', 'subsample', 'n_estimators',
    'colsample_bytree', 'lambda', 'eta', 'booster',
    'gamma',
]

param_grid = ParameterGrid(param_dict)
npars = len(param_grid)
print('Number of hyperparameters: ' + str(npars))

Number of hyperparameters: 96


In [7]:
## Display the run prefix used to name the output directory and files
prefix

'Fit_all_scaled_multigrid1_md1_new50'

In [9]:
## Set up validation fold structure
## The remaining (non-test) sites are split into nfolds - 1 groups by Site.
nfolds = 3
val_sgkf = GroupKFold(n_splits=nfolds-1)

## Loop through hyperparameters, fit models, store results in store_summ.
## Each row of store_summ holds:
##   [param index, train site, val site, n_iter,
##    MAE of each of the 3*ngrid evaluation sets, hyperparameter values...]
store_summ = list()

for param_i, param in enumerate(param_grid): 
    
    ## Retrieve the training and validation data only
    test_sites = [param['test_site']]
    X_trainval, y_trainval, W_trainval, trainval_dat, test_dat = get_trainval_data(dataset = agg_data, 
                                                                         test_sites = test_sites, 
                                                                        response = 'TS_Mn', 
                                                                        weight = 'TotTraps')
    ## Predictor columns, selected by the 'prec' flag of this parameter set
    col_names = get_var_names(prec_flag = bool(param['prec']))
    
    test_xvals = test_dat.loc[:,col_names]
    test_yvals = test_dat.loc[:,'TS_Mn']
    test_wvals = test_dat.loc[:,'TotTraps']    

    ## Create k-fold CV splitter
    fold_gen = val_sgkf.split(trainval_dat.Site, trainval_dat.TS_Mn, 
                              groups = trainval_dat.Site)

    ## Extract xgboost parameters; both metrics are tracked, MAE is the one stored
    xgboost_params = {k: param[k] for k in (xgboost_param_names)}        
    xgboost_params['eval_metric'] = ['rmse', 'mae']

    ## Boosting-round counts at which losses are recorded (loop-invariant per parameter set)
    n_iter_list = [iters for iters in vary_n_iters if (iters <= param['n_estimators'])]

    for train_index, val_index in fold_gen: 
        
        ## split() yields POSITIONAL indices, so rows are selected with .iloc
        ## everywhere. Label-based .loc only agrees when the index is 0..n-1.
        ## NOTE(review): assumes X_trainval / y_trainval / W_trainval are
        ## row-aligned with trainval_dat — confirm in get_trainval_data.
        train_rows = trainval_dat.iloc[train_index]
        val_rows = trainval_dat.iloc[val_index]

        ## Only the first site of each split is recorded as its label
        val_site = val_rows.Site.unique()[0]
        train_site = train_rows.Site.unique()[0]
        
        train_xvals = X_trainval.iloc[train_index].loc[:, col_names]    
        train_yvals = y_trainval.iloc[train_index]
        train_wvals = W_trainval.iloc[train_index]   
        
        ## Evaluation sets and their weights, in the order train, val, test
        ## NOTE(review): assumes get_eval_sets returns ngrid sets per call,
        ## which is what the 3*ngrid indexing below relies on — confirm.
        train_dat_set, train_w_set = get_eval_sets(train_rows, col_names)
        val_dat_set, val_w_set = get_eval_sets(val_rows, col_names)
        test_dat_set, test_w_set = get_eval_sets(test_dat, col_names)
        
        eval_sets = train_dat_set + val_dat_set + test_dat_set
        weight_sets = train_w_set + val_w_set + test_w_set
        
        #xgb_model = xgb.XGBRegressor(objective = 'reg:logistic', **xgboost_params)
        xgb_model = xgb.XGBRegressor(objective = 'reg:squarederror', **xgboost_params)
        xgb_model.fit(X = train_xvals, y = train_yvals, 
                     sample_weight = train_wvals, 
                     eval_set = eval_sets, 
                      sample_weight_eval_set = weight_sets, 
                     verbose = False)                     
        xgb_model.save_model(model_path + 'par_' + str(param_i) + '_fold_' + str(val_site) + '.txt')        
        
        ## Fetch the evaluation history once per fitted model
        evals = xgb_model.evals_result()
        param_values = list(param.values())
        for n_iter in n_iter_list:
            ## Evaluation sets are keyed 'validation_<k>' in eval_set order;
            ## n_iter - 1 is the 0-based position of the n_iter-th round
            losses = [evals['validation_' + str(gridi)]['mae'][n_iter-1]
                      for gridi in range(3*ngrid)]
            store_summ.append([param_i, train_site, val_site, n_iter] + losses + param_values)
    print('*** Finished parameter set: ', param_i, ' / ', npars)

## Column names for the per-grid losses: train_loss0..train_loss<ngrid-1>, then val_, then test_
name_list = ['train','val','test']
eval_name_list = [name_list[ii//ngrid] + '_loss' + str(ii%ngrid) for ii in range(3*ngrid)]
    
col_names = ['param', 'train_site', 'val_site', 'n_iter'] + eval_name_list
col_names += list(param.keys())                                                  

store_summ_df = pd.DataFrame(store_summ, columns = col_names)

## Mean and standard deviation of each loss across ALL ngrid grids.
## There are ngrid per-grid columns per set, so the range must be ngrid;
## ranging over nfolds dropped the last grids from every aggregate.
for set_name in name_list:
    grid_cols = [set_name + '_loss' + str(x) for x in range(ngrid)]
    store_summ_df.loc[:, set_name + '_loss'] = store_summ_df.loc[:, grid_cols].mean(axis = 1)
    store_summ_df.loc[:, 'sd' + set_name + '_loss'] = store_summ_df.loc[:, grid_cols].std(axis = 1)

store_summ_df.to_csv(model_path + prefix + '_store_summ.csv')

      Frac_bare.25  Frac_bare.50  Frac_bare.100  Frac_bare.200  Frac_bare.500  \
0         0.451228      0.309551       0.248592       0.243437       0.151980   
1         0.397438      0.334815       0.245603       0.274264       0.174028   
2         0.576543      0.336877       0.266117       0.258552       0.161785   
3         0.412035      0.307712       0.212617       0.224503       0.132467   
4         0.439022      0.338981       0.208907       0.199559       0.134226   
...            ...           ...            ...            ...            ...   
3038      0.931158      0.883047       0.837492       0.643596       0.390185   
3039      0.895905      0.785297       0.844422       0.743696       0.384316   
3040      0.849744      0.807831       0.844337       0.696309       0.388639   
3041      0.972110      0.897926       0.842484       0.727954       0.371403   
3042      0.874044      0.848517       0.844886       0.758338       0.379075   

      Frac_bare.1000  Frac_