# Train and evaluate model

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


from datetime import datetime, date, time

In [4]:
# Read the processed dataset from the data directory next to this notebook.
data_folder = '../data/'
df = pd.read_csv(f"{data_folder}df_processed.csv")

# Lift the column display cap so wide frames render every column.
pd.set_option('display.max_columns', None)

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(180, 11)


Unnamed: 0,state,year,yield,avg_temp_C,max_temp_C,min_temp_C,total_precip_mm,avg_precip_mm,hot_days,dry_days,flooding_days
0,ILLINOIS,2005,46.5,28.029765,36.79556,10.09295,351.391859,2.296679,2.0,88,0.0
1,INDIANA,2005,49.0,27.139371,34.91238,10.4295,419.308881,2.740581,0.0,90,2.0
2,IOWA,2005,52.5,26.673741,35.29424,8.7247,400.706764,2.618998,1.0,92,2.0
3,KANSAS,2005,37.0,29.811761,37.96978,12.986,390.994898,2.555522,16.0,88,3.0
4,MINNESOTA,2005,45.5,24.188087,32.77953,6.2006,460.882343,3.012303,0.0,72,1.0


In [5]:
# Numeric predictor columns taken directly from the processed frame.
features = [
    "avg_temp_C",
    "max_temp_C",
    "min_temp_C",
    "total_precip_mm",
    "avg_precip_mm",
    "hot_days",
    "dry_days",
    "flooding_days",
]

# Regression target.
y = df["yield"]

# One-hot encode the state column; drop_first=True omits the first category,
# which then acts as the implicit baseline (all dummy columns False).
state_dummies = pd.get_dummies(df["state"], drop_first=True)

# Design matrix: numeric predictors followed by the state indicator columns.
X = pd.concat([df[features], state_dummies], axis=1)
X

Unnamed: 0,avg_temp_C,max_temp_C,min_temp_C,total_precip_mm,avg_precip_mm,hot_days,dry_days,flooding_days,INDIANA,IOWA,KANSAS,MINNESOTA,MISSOURI,NEBRASKA,OHIO,SOUTH DAKOTA,WISCONSIN
0,28.029765,36.79556,10.09295,351.391859,2.296679,2.0,88,0.0,False,False,False,False,False,False,False,False,False
1,27.139371,34.91238,10.42950,419.308881,2.740581,0.0,90,2.0,True,False,False,False,False,False,False,False,False
2,26.673741,35.29424,8.72470,400.706764,2.618998,1.0,92,2.0,False,True,False,False,False,False,False,False,False
3,29.811761,37.96978,12.98600,390.994898,2.555522,16.0,88,3.0,False,False,True,False,False,False,False,False,False
4,24.188087,32.77953,6.20060,460.882343,3.012303,0.0,72,1.0,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,28.899841,36.18893,15.47780,471.746290,3.083309,7.0,83,3.0,False,False,False,False,True,False,False,False,False
176,29.199132,37.96996,12.83010,349.044508,2.281337,14.0,98,1.0,False,False,False,False,False,True,False,False,False
177,25.809630,32.62548,14.39752,486.366338,3.178865,0.0,84,2.0,False,False,False,False,False,False,True,False,False
178,26.824554,37.28726,11.38375,283.694535,1.854213,7.0,98,1.0,False,False,False,False,False,False,False,True,False


In [6]:
# Shuffled 5-fold splitter; the fixed seed makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold validation scores, one entry appended per split.
mae_list, rmse_list, r2_list = [], [], []

# K-fold cross-validation: fit on the training rows, score on the held-out rows.
for train_index, val_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # A new forest is built for every fold, so nothing fitted carries over
    # from one split to the next.
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_val)

    mae_list.append(mean_absolute_error(y_val, y_pred))
    # RMSE is the square root of the fold's mean squared error.
    rmse_list.append(np.sqrt(mean_squared_error(y_val, y_pred)))
    r2_list.append(r2_score(y_val, y_pred))

# Summarise each metric as the unweighted mean over the folds.
for label, scores in (
    ("Average MAE:", mae_list),
    ("Average RMSE:", rmse_list),
    ("Average R²:", r2_list),
):
    print(label, np.mean(scores))

Average MAE: 3.8827777777777777
Average RMSE: 4.86424309009585
Average R²: 0.588914107068326
