In [1]:
import pandas as pd
import xgboost as xg
from xgboost import cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import reduce
from sklearn.metrics import r2_score as R2
from time import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
target = 'NEE'

model = ['rcef_RandomForestRegressor', 'rcef_RidgeCV', 'rcef_XGBRegressor', 'xgboost'] 

extracted_features = model[-1]

In [3]:
# loads all data sets into a dict
def load_datasets(dirs: list, load_path: str) -> dict:
    files = ['soil_c','surf_water','flux_soc','soil_water','n_flux','p_flux','temp', 
         'plant_c','plant_n','plant_p','canopcy_c','plant_stress','photosynthesis','plant_growth']
    
    #files.append('soil_temp', 'canopy_temp') missing 
    datasets = {}
    
    for dr in dirs:
        csv_list = []
        path = 'datasets/' + dr + load_path
        for f in files:
            df = pd.read_csv(os.path.join(path,f + '.csv'))
            df.drop(df.columns[0], axis=1)

            csv_list.append(df)


        data_dict = {}
        for i in range (len(csv_list)):
            data_dict[files[i]] = csv_list[i]

        datasets[dr] = data_dict

    return datasets

dirs = ['warm_temp_maize_soybean_irrigated', 'warm_temp_maize-soybean_dryland', 'cool_temp_maize_soybean']

datasets = load_datasets(dirs, '/csv_outs/with_plant_soil_details/')

In [4]:
def rename_dupes(suffix: str, df: pd.DataFrame, dupes: list) -> pd.DataFrame:
    for col in df.columns:
        if col in dupes:
            df.rename(columns={col: col + suffix}, inplace=True)
    return df

def average_numbered_columns(df):
    numbered_cols = [col for col in df.columns if '_' in col and col.split('_')[-1].isdigit()]

    col_groups = {}
    for col in numbered_cols:
        prefix = '_'.join(col.split('_')[:-1])
        if prefix not in col_groups:
            col_groups[prefix] = []
        col_groups[prefix].append(col)

    # calculate averages and add new columns
    for prefix, cols in col_groups.items():
        avg_col_name = prefix
        avg_col_values = df[cols].mean(axis=1)
        df[avg_col_name] = avg_col_values

    # drop numbered columns
    df = df.drop(columns=numbered_cols)

    return df

# turn all csv's to one dataframe
def to_pd(df: dict, handle_dupes: bool, flatten_num_cols: bool) -> pd.DataFrame:
    x = pd.DataFrame()
    for file_name in df:
        cur = df[file_name]
            
        x = pd.concat([x, df[file_name]], axis = 1)
        
    cheeky_col = 'unnamed.1'
    cheeky_col2 = 'Unnamed: 0'
    if cheeky_col in x.columns:
        x = x.drop([cheeky_col], axis=1)
    elif cheeky_col2 in x.columns:
         x = x.drop([cheeky_col2], axis=1)
    x = x.drop(['DATE'], axis=1)

    if flatten_num_cols:
        x = average_numbered_columns(x)
        
    x = x.loc[:,~x.columns.duplicated()].copy()
    
    one_hot = pd.get_dummies(x['GROWTH_STG'])
    x= x.drop('GROWTH_STG',axis = 1)
    # Join the encoded df
    x = x.join(one_hot)

    x.columns = x.columns.str.translate("".maketrans({"[":"{", "]":"}","<":"^"}))
    
    return x

df_dry = to_pd(datasets['warm_temp_maize-soybean_dryland'], True, True)
df_irr = to_pd(datasets['warm_temp_maize_soybean_irrigated'], True, True)
df_cool = to_pd(datasets['cool_temp_maize_soybean'], True, True)

# NEE = GPP - ER:
#GPP = GROSS PRIMARY PRODUCTION (TOTAL C INTAKE) 
#ER = total C uptake =  ECO_RH + ECO_RA =  autotrophic + heterotrophic respiration 
#NPP = GPP + ECO_RA
df_dry['NEE'] = df_dry['ECO_NPP'] - df_dry['ECO_RH']
df_irr['NEE'] = df_irr['ECO_NPP'] - df_irr['ECO_RH']
df_cool['NEE'] = df_cool['ECO_NPP'] - df_cool['ECO_RH']

df= pd.concat([df_dry, df_irr, df_cool])
y = df[target].copy()
#y = df[target].copy()
#df = df.drop(target, axis=1)

### Read simulated data, extracted features, l2 normalize data  

In [5]:
xgbFeatImp = pd.read_csv('feature_analysis/xgboost/FeaturesImportance'  + target  + 'weather_soil_data' + '.csv')
feat_cols = []
for i in range(len(xgbFeatImp.values)):
    feat_cols.append(xgbFeatImp.values[i][0])

y_simulated = df[target].copy()
x_simulated = df[feat_cols]
x_simulated['WIND'] = x_simulated['WIND'] / 100

for entr in x_simulated.columns:
    x_simulated[entr] = preprocessing.normalize([x_simulated[entr]])[0]
y_simulated = pd.Series(preprocessing.normalize([y_simulated])[0], name='NEE')

### Read observed data, and normalize

In [6]:
observed_df = pd.read_csv('datasets/extractedNEE_CUT_REF_NIGHT.csv')

observed_df['initial planting density (m-2)1'] = 8.2

x_observed = observed_df[feat_cols]
y_observed = observed_df['NEE']

for entr in x_observed.columns:
    x_observed[entr] = preprocessing.normalize([x_observed[entr]])[0]
y_observed = pd.Series(preprocessing.normalize([y_observed])[0], name='NEE')

### Aggregate data and turn into xgb matrix for efficiency

In [17]:
x = pd.concat([x_simulated, x_observed])
y = pd.concat([y_simulated, y_observed])

data_dmatrix = xg.DMatrix(data=x,label=y)


### Load optimized model

In [20]:
reg = xg.XGBRegressor()
reg.load_model("models/bayesian_pretrain_gridsearch_fine_tune_xgb.json")

### Kfold CV 

In [37]:
xgb_cv = cv(dtrain=data_dmatrix, params=reg.get_xgb_params(), nfold=7
            , early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)

In [38]:
xgb_cv

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.486303,3.6e-05,0.486303,0.000216
1,0.481512,3.5e-05,0.481512,0.000217
2,0.476769,3.5e-05,0.476769,0.000216
3,0.472072,3.5e-05,0.472072,0.000217
4,0.467421,3.5e-05,0.467421,0.000217
5,0.462816,3.4e-05,0.462816,0.000217
6,0.458256,3.4e-05,0.458256,0.000217
7,0.453742,3.4e-05,0.453742,0.000217
8,0.449271,3.3e-05,0.449271,0.000218
9,0.444846,3.3e-05,0.444846,0.000218
