In [1]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from pycaret.regression import RegressionExperiment

from etl import ETL
from feature_creation import FeatureCreation

# Apply matplotlib's "seaborn-v0_8" style sheet to figures drawn after this call.
plt.style.use("seaborn-v0_8")

In [2]:
# Barley yield table, stored as a semicolon-separated CSV.
# Coverage per the original note: 1982 to 2018.
df_yield = pd.read_csv(
    "data/barley_yield_from_1982.csv",
    sep=";",
)

# Climate table, stored as parquet.
# Coverage per the original note: 1982-2014 and 2015-2050.
df_climate = pd.read_parquet("data/climate_data_from_1982.parquet")

In [3]:
# Run the project ETL step on both loaded tables.
# NOTE(review): df_yield and df_climate are rebound to whatever run() returns,
# so re-running this cell without re-running the load cell passes the already
# processed frames back into ETL — confirm that is safe.
etl_step = ETL(df_yield, df_climate)
df_yield, df_climate = etl_step.run()

--- df_climate---
Departments/Scenario dropped because of any missing values:                                    nom_dep  scenario
date                                                
2015-01-01 12:00:00               Calvados  ssp2_4_5
2015-01-01 12:00:00            Deux_Sevres  ssp2_4_5
2015-01-01 12:00:00                Essonne  ssp2_4_5
2015-01-01 12:00:00                   Eure  ssp2_4_5
2015-01-01 12:00:00                  Rhone  ssp2_4_5
2015-01-01 12:00:00        Tarn_et_Garonne  ssp2_4_5
2015-01-01 12:00:00  Territoire_de_Belfort  ssp2_4_5
2015-01-01 12:00:00               Vaucluse  ssp2_4_5
--- df_yield ---
Departments dropped because of almost absolute absence of data:
 ['Hauts_de_Seine' 'Paris' 'Seine_SeineOise']


In [4]:
# Build features from the ETL outputs; run() returns two objects, bound here
# as df_hist (used below for modelling) and df_forecast.
feature_step = FeatureCreation(df_yield, df_climate)
df_hist, df_forecast = feature_step.run()

--- Amplitude feature created ---


# Prepare data


In [9]:
target = "yield"

# Keep only fully populated rows of df_hist: a NaN in any column drops the row
# (the original note says this is meant to remove rows that have a yield but no
# climate data). Then discard "production" so it is not passed on as a feature.
df = df_hist.dropna(axis=0, how="any").drop(columns=["production"])

# Baseline model comparison


We use PyCaret to quickly compare the performance of several regression models on the dataset before selecting the best one for further tuning.


In [10]:
# Create a PyCaret regression experiment and configure it on df, using the
# column named by `target` as the label and session_id=123 as the seed.
# The setup call is left as the cell's last expression, so its return value
# is still displayed.
s = RegressionExperiment()
s.setup(data=df, target=target, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,yield
2,Target type,Regression
3,Original data shape,"(2934, 39)"
4,Transformed data shape,"(2934, 39)"
5,Transformed train set shape,"(2053, 39)"
6,Transformed test set shape,"(881, 39)"
7,Numeric features,38
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x3001b2070>

In [12]:
# Run PyCaret's model comparison on the configured experiment and keep its
# return value in `best`.
best = s.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.2227,0.1076,0.3275,0.9424,0.0617,0.05,0.009
lightgbm,Light Gradient Boosting Machine,0.3138,0.166,0.4065,0.9113,0.0806,0.0732,0.184
et,Extra Trees Regressor,0.3456,0.2034,0.4502,0.8917,0.0885,0.0806,0.144
rf,Random Forest Regressor,0.3594,0.2161,0.4638,0.8848,0.0911,0.084,0.405
gbr,Gradient Boosting Regressor,0.3746,0.2324,0.4809,0.8761,0.0942,0.0872,0.305
lr,Linear Regression,0.4428,0.3197,0.5638,0.8298,0.1099,0.1033,0.236
ridge,Ridge Regression,0.4448,0.3229,0.567,0.828,0.1106,0.104,0.008
br,Bayesian Ridge,0.4446,0.3229,0.567,0.828,0.1107,0.104,0.008
ada,AdaBoost Regressor,0.4784,0.3544,0.5944,0.8108,0.1099,0.1061,0.094
lar,Least Angle Regression,0.4719,0.3625,0.6013,0.8066,0.1155,0.1089,0.007


  master_display_.apply(


# Train Test Split


In [11]:
target = "yield"

# Separate the label column from the remaining (feature) columns.
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows with a fixed seed.
# NOTE(review): rows are assigned to train/test at random; if df holds several
# years per department, confirm a random split is appropriate here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import get_scorer, get_scorer_names

In [13]:
# Standardise the features, then fit a 5-nearest-neighbour regressor on the
# training split. Pipeline.fit returns the fitted pipeline, so `knn` is the
# fitted estimator.
knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor(n_neighbors=5)),
    ]
).fit(X_train, y_train)

# scikit-learn scorer names to report on the held-out split. The "neg_"
# scorers return negated errors, so values closer to zero are better.
score_names = [
    "r2",
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
    "neg_mean_absolute_percentage_error",
]
for name in score_names:
    score = get_scorer(name)(knn, X_test, y_test)
    print(name, " : ", score)

r2  :  0.7713273260059221
neg_mean_absolute_error  :  -0.5187606890764959
neg_root_mean_squared_error  :  -0.6968283947912132
neg_mean_absolute_percentage_error  :  -0.13250654408729334


# Test using df_forecast
