# Model Training
Tune and train a Random Forest Regressor model.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
import joblib
from pathlib import Path

In [None]:
def set_index_variables(df):
    """Add vegetation-index feature columns computed from the band means.

    Reads the ``B2_mean``, ``B3_mean``, ``B4_mean`` and ``B8_mean`` columns
    and writes the columns ``ndvi``, ``gndvi``, ``evi``, ``sr``, ``msr``,
    ``savi``, ``ctvi``, ``ttvi``, ``rvi``, ``nrvi``, ``ipvi``, ``osavi``,
    ``tndvi``, ``grvi`` and ``arvi``.

    The formulas treat B8 as near-infrared, B4 as red, B3 as green and B2 as
    blue (presumably Sentinel-2 band naming -- confirm against the data
    source).

    Args:
        df: DataFrame containing the four ``*_mean`` band columns.

    Returns:
        The same ``df`` object, modified in place with the index columns
        added.

    Note:
        Nothing is guarded: a zero denominator or a negative square-root
        argument (e.g. ``tndvi`` when ``ndvi < -0.5``) produces inf/NaN in
        that row. Any code that feeds a model trained on these features must
        compute them with this same function.
    """
    blue = df['B2_mean']
    green = df['B3_mean']
    red = df['B4_mean']
    nir = df['B8_mean']

    df['ndvi'] = (nir - red) / (nir + red)
    df['gndvi'] = (nir - green) / (nir + green)
    # EVI = 2.5 * (NIR - R) / (NIR + 6*R - 7.5*B + 1): the red term is added
    # in the denominator, only the blue term is subtracted.
    df['evi'] = 2.5 * ((nir - red) / (nir + 6 * red - 7.5 * blue + 1))
    df['sr'] = nir / red
    # MSR = (NIR/R - 1) / (sqrt(NIR/R) + 1): the "- 1" applies to the ratio,
    # not to the red band.
    df['msr'] = (nir / red - 1) / (np.sqrt(nir / red) + 1)
    # SAVI = (1 + L) * (NIR - R) / (NIR + R + L). L = 1 matches the (1 + 1)
    # factor; without L in the denominator SAVI is just 2 * NDVI.
    soil_factor = 1
    df['savi'] = (1 + soil_factor) * (nir - red) / (nir + red + soil_factor)
    # CTVI: sign(ndvi + 0.5) * sqrt(|ndvi + 0.5|), written as x / |x| so that
    # x == 0 yields NaN.
    df['ctvi'] = (df['ndvi'] + 0.5) / (abs(df['ndvi'] + 0.5)) * np.sqrt(abs(df['ndvi'] + 0.5))
    df['ttvi'] = np.sqrt(abs((nir - red) / (nir + red) + 0.5))
    df['rvi'] = red / nir
    df['nrvi'] = (df['rvi'] - 1) / (df['rvi'] + 1)
    df['ipvi'] = nir / (nir + red)
    df['osavi'] = (nir - red) / (nir + red + 0.16)
    df['tndvi'] = np.sqrt(df['ndvi'] + 0.5)
    df['grvi'] = (green - red) / (green + red)
    df['arvi'] = (nir - (2 * red - blue)) / (nir + (2 * red - blue))
    return df

In [None]:
# Directory that holds the per-sample training CSV files.
train_data_dir = './data/train'
# Collect every file matching Hila_*.csv and order the paths.
train_samples = list(Path(train_data_dir).glob('Hila_*.csv'))
train_samples.sort()
# Last expression of the cell: shows the discovered paths in the notebook.
train_samples

In [None]:
# Read every training CSV (first column is used as the index) and stack the
# frames into one table with a fresh 0..n-1 index.
sample_frames = [pd.read_csv(csv_path, index_col=0) for csv_path in train_samples]
df = pd.concat(sample_frames).reset_index(drop=True)

# Rescale the target column.
# NOTE(review): 0.0256 and the division by 1000 look like a unit conversion -- confirm the intended units.
df['agbm'] = 0.0256 * df['agbm'] / 1000

# Add the vegetation-index features, then drop every row containing a NaN
# (including NaNs produced by the index formulas).
df = set_index_variables(df)
df = df.dropna(axis=0)

In [None]:
# Feature columns fed to the model; the order here fixes the column order of X.
selected_features = [
    'B8_max', 'grvi', 'B2_mean', 'B3_mean', 'B8_min', 'B4_std', 'B3_max', 'B8_mean',
    'gndvi', 'B8_std', 'B2_max', 'B4_min', 'B2_min', 'B4_mean', 'B3_min', 'B2_std',
    'B4_max', 'msr', 'ctvi', 'rvi', 'osavi', 'sr', 'ndvi', 'nrvi', 'ipvi', 'ttvi',
    'savi', 'tndvi', 'evi', 'B3_std',
]

# Design matrix and regression target.
X = df[selected_features]
y = df['agbm']

# Low-variance filter.
# NOTE(review): the transformed output is discarded, so X keeps all selected
# features whatever the threshold; only `sel` is fitted.
sel = VarianceThreshold(threshold=0.9 * (1 - 0.9))
sel.fit_transform(X)

# Hold out 10% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Row counts of the full target, the training split and the test split.
tuple(map(len, (y, y_train, y_test)))

In [None]:
# Random forest configuration used for the final model.
rf = RandomForestRegressor(
    n_estimators=1555,
    random_state=42,  # fixed seed for reproducible trees
    oob_score=True,  # exposes rf.oob_score_ after fitting
    # Consider all features at every split. This is what "auto" meant for
    # regressors; "auto" was deprecated in scikit-learn 1.1 and removed in
    # 1.3, where it raises an error.
    max_features=1.0,
    max_samples=20000,  # rows drawn (with bootstrap) to build each tree
    max_depth=30,
    min_samples_leaf=2,
    min_samples_split=2,
    n_jobs=4,  # How many parallel processes to use while training
)

In [None]:
# Train model (~7 min with n_jobs=4, ~4 min with n_jobs=12)
# Fits the forest on the training split only; %time is the IPython magic that
# reports how long the statement took.
%time rf.fit(X_train, y_train)

In [None]:
# Save model
saved_model_path = './model/rf-final.joblib'
# Create the output directory first so the dump cannot fail with
# FileNotFoundError after the long training run.
Path(saved_model_path).parent.mkdir(parents=True, exist_ok=True)
# compress=3 writes a compressed file in exchange for slower dump/load.
joblib.dump(rf, saved_model_path, compress=3)

The saved model can be loaded with:
`rf = joblib.load(saved_model_path)`

In [None]:
# Predict the target for the held-out test rows; used by the metrics below.
y_pred = rf.predict(X_test)

In [None]:
# Error metrics on the held-out test split, plus the forest's out-of-bag score.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # derived from the MSE computed above

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Out-of-bag score:', rf.oob_score_)