In [8]:
import numpy as np
import pandas as pd
import tgess
import os
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from tqdm.auto import tqdm
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, \
                            r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: seaborn's theme with the "darkgrid" axes style
# (set in a single call instead of set_theme() followed by set_style()).
sns.set_theme(style="darkgrid")

# Enable tqdm's pandas integration for progress bars on pandas operations.
tqdm.pandas()

In [3]:
# Location of the look-up table CSV (the file name suggests PROSAIL
# simulations for Sentinel-2 bands, winter wheat — confirm with the data owner).
filepath = "../data/processed/yield/PROSAIL_LUT_Sentinel2_100000_winter_wheat.csv"

# NOTE(review): the loaded frame carries an 'Unnamed: 0' column, presumably a
# row index written out by an earlier to_csv call — confirm before using it.
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,n,cab,car,cbrown,cw,cm,lai,lidfa,lidfb,...,B4,B5,B6,B7,B8,B8A,B9,B10,B11,B12
0,0,1.734188,32.6627,6.366175,0.179771,0.0185,0.048976,0.49028,-0.43989,0.0,...,0.068401,0.093449,0.125711,0.135927,0.143495,0.147684,0.156693,0.157193,0.150868,0.113736
1,1,1.020396,77.5901,13.013725,0.124435,0.0185,0.061003,4.09772,-0.14219,0.0,...,0.012911,0.027579,0.119903,0.152466,0.155,0.156023,0.154607,0.085786,0.051301,0.009628
2,2,1.013964,54.9053,3.123925,0.060093,0.0185,0.18126,1.84636,-0.46909,0.0,...,0.022265,0.030154,0.048618,0.052922,0.054914,0.056003,0.058415,0.050093,0.040592,0.029048
3,3,1.777724,32.6825,5.482975,0.132617,0.0185,0.193821,2.35852,-0.94897,0.0,...,0.011562,0.023611,0.035636,0.038003,0.039106,0.039705,0.040843,0.032082,0.022304,0.013455
4,4,1.192892,54.0605,2.043325,0.157657,0.0185,0.038065,7.3198,0.27147,0.0,...,0.020703,0.061226,0.224777,0.276931,0.283315,0.286093,0.283068,0.15456,0.103582,0.019764


In [4]:
# List every column of the loaded table to see which names are available
# as model features and regression targets.
df.columns

Index(['Unnamed: 0', 'n', 'cab', 'car', 'cbrown', 'cw', 'cm', 'lai', 'lidfa',
       'lidfb', 'hspot', 'solar_zenith', 'observer_zenith', 'relative_azimuth',
       'rsoil', 'psoil', 'typelidf', 'alpha', 'B1', 'B2', 'B3', 'B4', 'B5',
       'B6', 'B7', 'B8', 'B8A', 'B9', 'B10', 'B11', 'B12'],
      dtype='object')

In [11]:
# Band reflectance columns used as model features: B1..B8, B8A, B9..B12,
# in the same order as they appear in the table.
bands = (
    [f"B{idx}" for idx in range(1, 9)]
    + ["B8A"]
    + [f"B{idx}" for idx in range(9, 13)]
)

# Viewing/illumination geometry columns.
angles = [
    "solar_zenith",
    "observer_zenith",
    "relative_azimuth",
]

# Default regression target column (leaf area index).
target = "lai"

## Leaf area index (LAI)

In [12]:
# Random forest on standardized band reflectances, predicting `target` (LAI).
# random_state is pinned so the forest's bootstrap samples and feature
# subsets are identical on every run; without it the reported scores change
# each time the notebook is re-executed.
rf = RandomForestRegressor(n_jobs=4, n_estimators=100, random_state=42)
pipeline = Pipeline([('scaler', StandardScaler()), ('rf', rf)])

# 5-fold cross-validation reporting R^2 and (negated) MSE.
# return_train_score=True also records training scores so the train/test gap
# can be inspected.
results = cross_validate(pipeline, X=df[bands], y=df[target], cv=5,
                         scoring=('r2', 'neg_mean_squared_error'),
                         return_train_score=True)
results

{'fit_time': array([22.09124494, 22.55903959, 22.97101951, 23.43519235, 24.158077  ]),
 'score_time': array([0.32182884, 0.2114284 , 0.21202874, 0.20758128, 0.21221304]),
 'test_r2': array([0.83587633, 0.83457002, 0.83261224, 0.83424572, 0.83309405]),
 'train_r2': array([0.97662986, 0.97659192, 0.9766823 , 0.97653093, 0.97661703]),
 'test_neg_mean_squared_error': array([-0.86921137, -0.88831222, -0.89428379, -0.87862425, -0.89413376]),
 'train_neg_mean_squared_error': array([-0.12485645, -0.12463   , -0.12430709, -0.12535948, -0.12456935])}

## Dry matter content (cm)

In [13]:
# Random forest on standardized band reflectances, predicting the "cm"
# column (dry matter content).
# random_state is pinned so the forest's bootstrap samples and feature
# subsets are identical on every run; without it the reported scores change
# each time the notebook is re-executed.
rf = RandomForestRegressor(n_jobs=4, n_estimators=100, random_state=42)
pipeline = Pipeline([('scaler', StandardScaler()), ('rf', rf)])

# 5-fold cross-validation reporting R^2 and (negated) MSE.
# return_train_score=True also records training scores so the train/test gap
# can be inspected.
results = cross_validate(pipeline, X=df[bands], y=df["cm"], cv=5,
                         scoring=('r2', 'neg_mean_squared_error'),
                         return_train_score=True)
results

{'fit_time': array([19.39045382, 20.51151013, 20.65923595, 20.76829267, 20.72150087]),
 'score_time': array([0.20683026, 0.20847631, 0.21388078, 0.20789194, 0.20805836]),
 'test_r2': array([0.95892152, 0.95943761, 0.96037739, 0.96139577, 0.96013774]),
 'train_r2': array([0.99443578, 0.99443409, 0.99435686, 0.9943024 , 0.9944312 ]),
 'test_neg_mean_squared_error': array([-0.00013511, -0.00013204, -0.00013069, -0.00012453, -0.0001302 ]),
 'train_neg_mean_squared_error': array([-1.81469928e-05, -1.82003108e-05, -1.83919420e-05, -1.86726165e-05,
        -1.81943787e-05])}

## Chlorophyll a+b (cab)

In [18]:
# Random forest on standardized band reflectances, predicting the "cab"
# column (chlorophyll a+b).
# random_state is pinned so the forest's bootstrap samples and feature
# subsets are identical on every run; without it the reported scores change
# each time the notebook is re-executed.
rf = RandomForestRegressor(n_jobs=4, n_estimators=100, random_state=42)
pipeline = Pipeline([('scaler', StandardScaler()), ('rf', rf)])

# 5-fold cross-validation reporting R^2 and (negated) MSE.
# return_train_score=True also records training scores so the train/test gap
# can be inspected.
results = cross_validate(pipeline, X=df[bands], y=df["cab"], cv=5,
                         scoring=('r2', 'neg_mean_squared_error'),
                         return_train_score=True)
results

{'fit_time': array([19.14919996, 20.33138537, 20.92085147, 20.64294004, 20.86544967]),
 'score_time': array([0.21191359, 0.21172237, 0.31363964, 0.31374907, 0.20783043]),
 'test_r2': array([0.92054192, 0.9184823 , 0.91887201, 0.91763987, 0.91835021]),
 'train_r2': array([0.98851369, 0.98860193, 0.98857588, 0.9886815 , 0.98857134]),
 'test_neg_mean_squared_error': array([-23.91602677, -24.36973078, -24.19228296, -24.81841859,
        -24.53359268]),
 'train_neg_mean_squared_error': array([-3.44305115, -3.422413  , -3.43228638, -3.3916838 , -3.42724443])}