In [1]:
# Demo for yield prediction based on https://github.com/ADA-research/AutoML4HybridEarthScienceModels

# --- standard library ---
import warnings

# --- third-party ---
import geopandas as gpd
import numpy as np
import pandas as pd
import seaborn as sns
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import ShuffleSplit, cross_val_score, cross_validate

# --- local: pixel processing functions ---
from yield_helper import (
    create_geometry,
    create_dataset,
    filter_by_2std,
    flatten_time_series,
    getYieldwithoutBorders,
)

# Plot styling: seaborn defaults with the "darkgrid" style.
sns.set_theme(style="darkgrid")

# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

  from IPython.core.display import display


In [2]:
# select the fields for prediction
fields = [
    'Baumacker', 'D8', 'Dichtlacker', 'Heindlacker', 'Heng', 'Holzacker',
    'Neulandsiedlung', 'Itzling2', 'Itzling5', 'Itzling6', 'Schluetterfabrik',
    'Thalhausen138', 'Thalhausen141', 'Voettingerfeld'
]

# Alternative field-wise split (currently unused), kept for reference:
# fields =  ['Dichtlacker', 'Heindlacker', 'Heng',
#                            'Holzacker', 'Neulandsiedlung','Itzling5',
#                            'Itzling6', 'Schluetterfabrik', 'Thalhausen138', 'Voettingerfeld']
# test_fields = ['Baumacker', 'Itzling2', 'Thalhausen141']

# load summary for each field
field_summary = pd.read_excel(
    "../data/cropdata/Bavaria/yields/fields_summary.xlsx")

# Raw yield measurements; kept under its own name so the processed frame
# below does not overwrite it.
yields_raw = pd.read_csv("../data/cropdata/Bavaria/yields/yields2018.csv")
print('Fields in yield data:', yields_raw.Name.unique().tolist())

# select bands and other features
bands = ["B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B11", "B12"]
# NOTE: `angles` is defined here but is not part of `feature_cols` below.
angles = ['solar_zenith', 'observer_zenith', 'relative_azimuth']
other_features = ["et0", "rain", "cum_rain"]
feature_cols = bands + other_features
target_col = "Ertr.masse (Nass)(tonne/ha)"

# Load shapefile
geo_df = gpd.read_file(
    '../data/cropdata/Bavaria/yields/FeldstueckeTUM/Feldstuecke_WGS84.shp')

yields_gdf = create_geometry(yields_raw)

# Passed through unchanged to getYieldwithoutBorders.
# NOTE(review): 1 presumably means "no unit conversion" — confirm in yield_helper.
conversion = 1
# Apply the helper once per field (grouped by the 'Name' column) and drop the
# group index so the result has a plain positional index.
yields_df = (
    yields_gdf
    .groupby(['Name'])
    .apply(lambda x: getYieldwithoutBorders(x, geo_df, conversion))
    .reset_index(drop=True)
)

df = create_dataset(bands=bands, yields_df=yields_df, fields=fields)
# flatten_time_series also returns the feature column names to use from here
# on, so `feature_cols` is rebound to that returned list.
out_df, feature_cols = flatten_time_series(df, feature_cols, target_col)

# Target statistics computed on the unfiltered data; a later cell passes them
# to filter_by_2std together with out_df.
mean = out_df[target_col].mean()
std = out_df[target_col].std()

Fields in yield data: ['Grafenfeld', 'Krohberg', 'Radarstation', 'Sieblerfeld', 'Striegelfeld', 'Baumacker', 'D2', 'D3', 'D4', 'D8', 'Dichtlacker', 'Feldhof1', 'Feldhof1a', 'Feldhof2', 'Heindlacker', 'Heng', 'Holzacker', 'Itzling2', 'Itzling4', 'Itzling5', 'Itzling6', 'Muehlacker', 'Neulandsiedlung', 'Schluetterfabrik', 'Thalhausen138', 'Thalhausen141', 'Viehhausen1', 'Viehhausen11', 'Viehhausen3', 'Viehhausen5', 'Voettingerfeld']


100%|██████████| 14/14 [01:10<00:00,  5.02s/it]


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column of out_df, i.e. the time-stepped feature columns plus the yield target.
# NOTE(review): in the recorded output the et0 / rain / cum_rain columns have a
# standard deviation of ~0 with min == max, i.e. one value for every row —
# confirm this is intended, since constant columns cannot inform the model.
out_df.describe()

Unnamed: 0,B04_t-20,B05_t-20,B06_t-20,B07_t-20,B08_t-20,B8A_t-20,B09_t-20,B11_t-20,B12_t-20,et0_t-20,...,B07_t-0,B08_t-0,B8A_t-0,B09_t-0,B11_t-0,B12_t-0,et0_t-0,rain_t-0,cum_rain_t-0,Ertr.masse (Nass)(tonne/ha)
count,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,...,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0,4637.0
mean,9e-06,1.2e-05,1.7e-05,1.8e-05,2e-05,2e-05,2e-05,1.9e-05,1.3e-05,1.298619,...,1.9e-05,2e-05,2.1e-05,2.2e-05,2.5e-05,1.8e-05,4.42046,0.344,321.342,7.550732
std,2e-06,3e-06,3e-06,3e-06,4e-06,3e-06,3e-06,3e-06,2e-06,2.220686e-16,...,8e-06,8e-06,9e-06,9e-06,1.2e-05,9e-06,1.776548e-15,1.110343e-16,5.684955e-14,1.721437
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298619,...,0.0,0.0,0.0,0.0,0.0,0.0,4.42046,0.344,321.342,3.368
25%,7e-06,1.1e-05,1.5e-05,1.6e-05,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.2e-05,1.298619,...,1.4e-05,1.5e-05,1.6e-05,1.7e-05,2e-05,1.4e-05,4.42046,0.344,321.342,6.274
50%,9e-06,1.3e-05,1.7e-05,1.8e-05,2e-05,1.9e-05,2e-05,1.9e-05,1.3e-05,1.298619,...,1.8e-05,1.8e-05,2e-05,2.4e-05,2.2e-05,1.6e-05,4.42046,0.344,321.342,7.671
75%,1e-05,1.4e-05,1.9e-05,2e-05,2.2e-05,2.2e-05,2.2e-05,2e-05,1.5e-05,1.298619,...,2.7e-05,2.8e-05,3e-05,3e-05,3.7e-05,2.6e-05,4.42046,0.344,321.342,8.872
max,1.6e-05,2e-05,3.2e-05,3.7e-05,3.5e-05,3.6e-05,3.8e-05,2.9e-05,2.2e-05,1.298619,...,4.2e-05,4.3e-05,4.4e-05,4.5e-05,4.7e-05,3.4e-05,4.42046,0.344,321.342,11.4


In [6]:
# Scaled copies of the feature matrix. The model cell that follows is fitted on
# the unscaled columns of out_df, so these arrays are for inspection only.
# NOTE(review): both scalers are fitted on the data *before* the outlier filter
# at the end of this cell; if that filter drops rows, x_scaled / x_scaled2 /
# test no longer line up row-for-row with out_df.
feature_matrix = out_df[feature_cols].values

# Min-max scaled features.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(feature_matrix)

# Standardised features (zero mean, unit variance per column).
scaler = preprocessing.StandardScaler()
x_scaled2 = scaler.fit_transform(feature_matrix)

test = pd.DataFrame(columns=feature_cols, data=x_scaled2)

# filter data
# `mean` and `std` are the target statistics computed earlier on the
# unfiltered out_df.
out_df = filter_by_2std(mean, std, target_col, out_df)

In [7]:
# Train RF with pixels and apply
# Five random 50/50 train/test shuffles of the rows of out_df.
# NOTE(review): rows are shuffled without regard to field, so rows from one
# field can land in both train and test — confirm this is the intended
# evaluation protocol.
cv = ShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
# random_state is fixed so the forest (bootstrap samples, feature sub-sampling)
# is reproducible, matching the seeded splitter above.
rf = RandomForestRegressor(n_jobs=4, n_estimators=100, random_state=0)

results = cross_validate(
    rf,
    X=out_df[feature_cols],
    y=out_df[target_col],
    cv=cv,
    scoring=('r2', 'neg_mean_squared_error'),
    return_train_score=True,
)

# Report mean and per-fold R2 for the train and the test side of each split.
for split in ("train", "test"):
    fold_r2 = results["{}_r2".format(split)]
    display("Mean {} R2: {}, individual folds: {}".format(
        split, np.mean(fold_r2), fold_r2))

'Mean train R2: 0.9382038496354482, individual folds: [0.93754293 0.93758485 0.94111838 0.93707099 0.93770209]'

'Mean test R2: 0.5535455259341691, individual folds: [0.5632135  0.55860653 0.54544467 0.550136   0.55032693]'