# Model training

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt

In [3]:
# Hila files used to train the model
# currently stored in the trainingdata folder
ndf = pd.read_csv('Hila_Q4233E.csv') 
ndf2 = pd.read_csv('Hila_N4243C.csv') 
ndf3 = pd.read_csv('Hila_P5333E.csv')
ndf4 = pd.read_csv('Hila_T4312C.csv')
ndff = pd.concat([ndf, ndf2, ndf3, ndf4])
ndff.head()

Unnamed: 0.1,Unnamed: 0,volume,agbm,coords,B2_mean,B2_std,B2_max,B2_min,B3_mean,B3_std,B3_max,B3_min,B4_mean,B4_std,B4_max,B4_min,B8_mean,B8_std,B8_max,B8_min
0,0,65.62,91983.5179,"[[[24.85087170343739, 64.04666583384162], [24....",230.309272,10.558041,241.0,210.0,353.507947,4.818944,361.0,347.0,214.735099,4.678556,224.0,210.0,2098.331126,40.749915,2154.0,2056.0
1,1,116.47,154540.855921,"[[[24.850882751753304, 64.04652235143483], [24...",231.17351,0.471405,233.0,232.0,365.088742,0.471405,367.0,366.0,224.619205,8.956686,238.0,219.0,2295.527152,2.828427,2316.0,2310.0
2,2,175.31,189297.285851,"[[[24.851210183134885, 64.04652719247815], [24...",233.077188,6.495191,245.0,230.0,362.765679,3.031089,372.0,365.0,224.782908,1.299038,229.0,226.0,2288.259132,8.660254,2308.0,2288.0
3,3,132.95,162919.441706,"[[[24.8511991364991, 64.04667067491562], [24.8...",233.754755,9.051933,242.0,212.0,356.177989,5.290026,366.0,351.0,214.063179,7.968689,230.0,206.0,2156.311141,103.944396,2360.0,2056.0
4,4,143.89,166333.482841,"[[[24.8515265696878, 64.04667551525253], [24.8...",229.707661,9.467452,243.0,212.0,375.510081,10.793044,387.0,354.0,210.559812,7.480587,230.0,207.0,2468.600806,87.274374,2540.0,2302.0


In [4]:
ndff.shape

(562500, 20)

In [223]:
# sample random 10% of the dataframe rows
frac = ndff.sample(frac=0.10)

In [224]:
ndff = frac

In [27]:
ndff.shape #check

(562500, 20)

In [5]:
ndff.head()

Unnamed: 0.1,Unnamed: 0,volume,agbm,coords,B2_mean,B2_std,B2_max,B2_min,B3_mean,B3_std,B3_max,B3_min,B4_mean,B4_std,B4_max,B4_min,B8_mean,B8_std,B8_max,B8_min
0,0,65.62,91983.5179,"[[[24.85087170343739, 64.04666583384162], [24....",230.309272,10.558041,241.0,210.0,353.507947,4.818944,361.0,347.0,214.735099,4.678556,224.0,210.0,2098.331126,40.749915,2154.0,2056.0
1,1,116.47,154540.855921,"[[[24.850882751753304, 64.04652235143483], [24...",231.17351,0.471405,233.0,232.0,365.088742,0.471405,367.0,366.0,224.619205,8.956686,238.0,219.0,2295.527152,2.828427,2316.0,2310.0
2,2,175.31,189297.285851,"[[[24.851210183134885, 64.04652719247815], [24...",233.077188,6.495191,245.0,230.0,362.765679,3.031089,372.0,365.0,224.782908,1.299038,229.0,226.0,2288.259132,8.660254,2308.0,2288.0
3,3,132.95,162919.441706,"[[[24.8511991364991, 64.04667067491562], [24.8...",233.754755,9.051933,242.0,212.0,356.177989,5.290026,366.0,351.0,214.063179,7.968689,230.0,206.0,2156.311141,103.944396,2360.0,2056.0
4,4,143.89,166333.482841,"[[[24.8515265696878, 64.04667551525253], [24.8...",229.707661,9.467452,243.0,212.0,375.510081,10.793044,387.0,354.0,210.559812,7.480587,230.0,207.0,2468.600806,87.274374,2540.0,2302.0


In [6]:
# scale agbm
features = ndff
features['agbm'] = 0.0256 * features['agbm']
features['agbm'] = features['agbm']/1000

In [7]:
# vegetation indices
features['ndvi'] = (features['B8_mean'] - features['B4_mean'])/(features['B8_mean'] + features['B4_mean'])
features['gndvi'] = (features['B8_mean'] - features['B3_mean'])/(features['B8_mean'] + features['B3_mean'])
features['evi'] = 2.5 * ((features['B8_mean'] - features['B4_mean'])/(features['B8_mean'] - 6*features['B4_mean'] - 7.5*features['B2_mean'] + 1))
features['sr'] = features['B8_mean'] / features['B4_mean']
features['msr'] = ((features['B8_mean'])/(features['B4_mean']-1)) / (np.sqrt((features['B8_mean'])/(features['B4_mean']))+1)
features['savi'] = (1+1) * (features['B8_mean']-features['B4_mean'])/(features['B8_mean']+features['B4_mean'])
features['ctvi'] = (features['ndvi']+0.5)/(abs(features['ndvi']+0.5)) * np.sqrt(abs(features['ndvi']+0.5))
features['ttvi'] = np.sqrt(abs((features['B8_mean']-features['B4_mean'])/(features['B8_mean']+features['B4_mean']) + 0.5))
features['rvi'] = features['B4_mean'] / features['B8_mean']
features['nrvi'] = (features['rvi']-1)/(features['rvi']+1)
features['ipvi'] = (features['B8_mean']) / (features['B8_mean']+features['B4_mean'])
features['osavi'] = (features['B8_mean']-features['B4_mean']) / (features['B8_mean']+features['B4_mean']+0.16)
features['tndvi'] = np.sqrt(features['ndvi']+0.5)
features['grvi'] = (features['B3_mean']-features['B4_mean']) / (features['B3_mean']+features['B4_mean'])

In [37]:
features.shape

(562500, 34)

In [17]:
features.describe()

Unnamed: 0.1,Unnamed: 0,volume,agbm,B2_mean,B2_std,B2_max,B2_min,B3_mean,B3_std,B3_max,...,msr,savi,ctvi,ttvi,rvi,nrvi,ipvi,osavi,tndvi,grvi
count,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,...,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0,562500.0
mean,70312.0,110.198657,2.933174,255.88342,13.980524,274.786489,237.78141,416.963958,20.588174,444.628602,...,2.400617,1.579421,1.134188,1.134188,0.123997,-0.789711,0.894855,0.789661,1.134188,0.225702
std,40594.976886,83.242988,2.359541,105.428565,18.978371,120.05468,96.585559,127.15214,22.46261,141.085434,...,0.60508,0.239162,0.057684,0.057684,0.098504,0.119581,0.059791,0.11958,0.057684,0.095567
min,0.0,0.0,0.0,52.713376,0.0,54.0,41.0,35.886994,0.0,37.0,...,0.415994,-0.242707,0.615342,0.615342,0.034679,-0.932967,0.439323,-0.121344,0.615342,-0.311921
25%,35156.0,44.28,0.93316,203.099661,5.437141,214.0,190.0,343.037035,8.389809,361.5,...,2.085914,1.552161,1.129637,1.129637,0.080238,-0.851445,0.88804,0.77603,1.129637,0.19102
50%,70312.0,103.02,2.708936,230.885792,9.227074,244.0,217.5,385.813495,14.689613,408.0,...,2.445481,1.643341,1.149639,1.149639,0.097894,-0.82167,0.910835,0.821617,1.149639,0.249491
75%,105468.0,157.06,4.40586,269.433371,14.991664,287.0,252.5,450.138941,24.402809,482.0,...,2.764623,1.702889,1.162516,1.162516,0.126075,-0.77608,0.925722,0.851393,1.162516,0.286113
max,140624.0,702.66,38.536611,2353.885517,874.761179,3400.0,2186.0,2562.717241,753.550489,3345.0,...,4.553802,1.865934,1.197066,1.197066,1.276229,0.121354,0.966483,0.932937,1.197066,0.450298


In [None]:
# from forward selection
newall = ['B8_max', 'grvi', 'B2_mean', 'B3_mean', 'B8_min', 'B4_std', 'B3_max', 'B8_mean', 'gndvi', 'B8_std', 'B2_max', 'B4_min', 'B2_min', 
'B4_mean', 'B3_min', 'B2_std', 'B4_max', 'msr', 'ctvi', 'rvi', 'osavi', 'sr', 'ndvi', 'nrvi', 'ipvi', 'ttvi', 'savi', 'tndvi', 'evi', 'B3_std']

X = features[newall]
y = features["agbm"]


X_train, X_test, y_train, y_test = train_test_split(XX, y, test_size = 0.2, random_state=42)

In [None]:
X.columns

Index(['B8_max', 'B2_mean', 'B3_mean', 'B8_min', 'B4_std', 'B3_max', 'B8_mean',
       'B8_std', 'B2_max', 'B4_min', 'B2_min', 'B4_mean', 'B3_min', 'B2_std',
       'B4_max', 'msr', 'sr', 'evi', 'B3_std'],
      dtype='object')

In [None]:
#forward selection
selected = []
rmse_values = []

for i in range(len(variables)):
  smallest_var = ""
  smallest_rmse = 1000000

  selected.append(None)

  for var in variables:
    if(var in selected):
      continue

    selected[-1] = var

    X_tr = X_train[selected].values.reshape(-1,len(selected))
    X_te = X_test[selected].values.reshape(-1,len(selected))

    rf = RandomForestRegressor(random_state = 42)
    rf.fit(X_tr, y_train)

    y_pred = rf.predict(X_te)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    if(rmse < smallest_rmse):
      smallest_rmse = rmse
      smallest_var = var
  
  selected[-1] = smallest_var
  rmse_values.append(smallest_rmse)

print(selected)
print(rmse_values)

['B8_max', 'grvi', 'B2_mean', 'B3_mean', 'B8_min', 'B4_std', 'B3_max', 'B8_mean', 'gndvi', 'B8_std', 'B2_max', 'B4_min', 'B2_min', 'B4_mean', 'B3_min', 'B2_std', 'B4_max', 'msr', 'ctvi', 'rvi', 'osavi', 'sr', 'ndvi', 'nrvi', 'ipvi', 'ttvi', 'savi', 'tndvi', 'evi', 'B3_std']
[2.220223806524729, 2.1099774307679082, 1.999619533038855, 1.9423160851546135, 1.901463351418264, 1.8876808814554042, 1.880320752773272, 1.872065132359303, 1.8672486763542937, 1.8650339119767663, 1.8646250423428523, 1.8645716413244733, 1.8661130082565331, 1.8643389333489229, 1.8652892174440112, 1.864171711224941, 1.864090890973867, 1.8663505563166787, 1.8683962886331242, 1.8678311572684516, 1.8688885056004816, 1.8680276077361155, 1.868766482261435, 1.869063945825073, 1.8697216565803199, 1.8684226413151408, 1.8688933409813866, 1.8669785292536598, 1.8691749580101678, 1.8721209034591484]


In [None]:
# Parameters to tune
bootstrap = [True] #, False]
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
max_features = ['auto', 'sqrt', 'log2']
max_samples = [int(x) for x in np.linspace(start = 100, stop = 10000, num = 10)]

# Creating the random grid
random_grid = {'bootstrap': bootstrap,
               'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features,
               'max_samples': max_samples}

In [None]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 60,
                               random_state = 42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 10000,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': True}

In [None]:
# Get the best estimator
best_random = rf_random.best_estimator_

y_pred = best_random.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1.3349908758254039
Mean Squared Error: 3.2858301010674817
Root Mean Squared Error: 1.8126858804181936


In [None]:
rf2 = RandomForestRegressor(n_estimators = 1555,
                            random_state = 42,
                            oob_score = True,
                            max_features = 'auto',
                            max_samples = 20000,
                            max_depth = 30,
                            min_samples_leaf = 2,
                            min_samples_split = 2)
rf2.fit(XX, y)

y_pred = rf2.predict(X_test)

# True values - Predicted values
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# 
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# preferably below 1, .2 - 0.5 for a fairly accurate model
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# correctly predicted rows
print('Out-of-bag score:', rf2.oob_score_)

Mean Absolute Error: 1.2612421500712483
Mean Squared Error: 2.9357767847297236
Root Mean Squared Error: 1.7134108627908613
Out-of-bag score: 0.4334717920062384


Records of different runs:

Without parameter tuning, with features selected with forward selection using 10% of the data:


Mean Absolute Error: 1.288601865232459

Mean Squared Error: 3.058570144349765

Root Mean Squared Error: 1.7488768236641954

Out-of-bag score: 0.4303017730575872


rf2 = RandomForestRegressor(n_estimators = 1400,
                            random_state = 42,
                            oob_score = True,
                            max_features = 'auto',
                            criterion = 'squared_error',
                            max_samples = 10000,
                            max_depth = 20,
                            min_samples_leaf = 2,
                            min_samples_split = 5)

With parameter tuning, with features selected with forward selection using 10% of the data,  max_samples = 30000,:

Mean Absolute Error: 1.2632760344973675

Mean Squared Error: 2.9525356765807116

Root Mean Squared Error: 1.7182944091687873

Out-of-bag score: 0.4359033969028633