In [1]:
import pandas as pd
import xgboost as xg
from xgboost import cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import reduce
from sklearn.metrics import r2_score as R2
from time import time

import warnings
warnings.filterwarnings('ignore')

from data_loader import load_datasets, nee_cumulative_to_daily, to_pd

In [2]:
target = 'NEE'

model = ['rcef_RandomForestRegressor', 'rcef_RidgeCV', 'rcef_XGBRegressor', 'xgboost'] 

extracted_features = model[-1]

In [None]:
dirs = ['warm_temp_maize_soybean_irrigated', 'warm_temp_maize-soybean_dryland', 'cool_temp_maize_soybean']

datasets = load_datasets(dirs, '/csv_outs/with_plant_soil_details/')

df_dry = nee_cumulative_to_daily(to_pd(datasets['warm_temp_maize-soybean_dryland'], True, True))
df_irr = nee_cumulative_to_daily(to_pd(datasets['warm_temp_maize_soybean_irrigated'], True, True))
df_cool = nee_cumulative_to_daily(to_pd(datasets['cool_temp_maize_soybean'], True, True))

df = pd.concat([df_dry,df_irr, df_cool])

y = df[target].copy()

### Read simulated data, extracted features, l2 normalize data  

In [5]:
xgbFeatImp = pd.read_csv('feature_analysis/xgboost/FeaturesImportance'  + target  + 'weather_soil_data' + '.csv')
feat_cols = []
for i in range(len(xgbFeatImp.values)):
    feat_cols.append(xgbFeatImp.values[i][0])

y_simulated = df[target].copy()
x_simulated = df[feat_cols]
x_simulated['WIND'] = x_simulated['WIND'] / 100

for entr in x_simulated.columns:
    x_simulated[entr] = preprocessing.normalize([x_simulated[entr]])[0]
y_simulated = pd.Series(preprocessing.normalize([y_simulated])[0], name='NEE')

### Read observed data, and normalize

In [6]:
observed_df = pd.read_csv('datasets/extractedNEE_CUT_REF_NIGHT.csv')

observed_df['initial planting density (m-2)1'] = 8.2

x_observed = observed_df[feat_cols]
y_observed = observed_df['NEE']

for entr in x_observed.columns:
    x_observed[entr] = preprocessing.normalize([x_observed[entr]])[0]
y_observed = pd.Series(preprocessing.normalize([y_observed])[0], name='NEE')

### Aggregate data and turn into xgb matrix for efficiency

In [7]:
x = pd.concat([x_simulated, x_observed])
y = pd.concat([y_simulated, y_observed])

data_dmatrix = xg.DMatrix(data=x,label=y)


### Load optimized model

In [8]:
reg = xg.XGBRegressor()
reg.load_model("models/bayesian_pretrain_gridsearch_fine_tune_xgb.json")

### Kfold CV 

In [13]:
xgb_cv = cv(dtrain=data_dmatrix, params=reg.get_xgb_params(), nfold=7
            , early_stopping_rounds=10, metrics="rmse", as_pandas=False, seed=123)

In [14]:
xgb_cv

{'train-rmse-mean': [0.48630289834970136,
  0.48151215641375433,
  0.4767687552064816,
  0.47207192427369027,
  0.46742091447901785,
  0.4628158365324438,
  0.4582563286092451,
  0.4537415947682975,
  0.4492711582176799,
  0.4448456064133661],
 'train-rmse-std': [3.558510231498195e-05,
  3.4925805317727016e-05,
  3.507741728353164e-05,
  3.495909770895408e-05,
  3.457021267154883e-05,
  3.4099143133720486e-05,
  3.4283279945923305e-05,
  3.383797199538649e-05,
  3.3188150721158984e-05,
  3.2848290133632486e-05],
 'test-rmse-mean': [0.48630282884427806,
  0.48151208656600997,
  0.47676868482864,
  0.47207185343299446,
  0.467420843276594,
  0.4628157648377899,
  0.45825625636099687,
  0.4537415221219118,
  0.4492710852536544,
  0.4448455329265806],
 'test-rmse-std': [0.00021601089562700115,
  0.00021663944680883505,
  0.00021644439453420475,
  0.00021653002076594378,
  0.00021686387268317198,
  0.0002172745064438204,
  0.0002170311304935956,
  0.00021742052890172948,
  0.000218052750297