In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Load the crop-yield dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="yield_df.csv")

In [6]:
# Preview the first four rows to check which columns were read from the CSV.
df.head(n=4)

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37


In [7]:
# Remove the "Unnamed: 0" column (in the preview it simply mirrors the row
# index — presumably an index that was written out with the CSV).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell re-runnable: executing it a second time is a no-op, not a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [8]:
# Preview the frame again after the column drop (head() shows 5 rows by default).
df.head()

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
count,28242.0,28242.0,28242.0,28242.0,28242.0
mean,2001.544296,77053.332094,1149.05598,37076.909344,20.542627
std,7.051905,84956.612897,709.81215,59958.784665,6.312051
min,1990.0,50.0,51.0,0.04,1.3
25%,1995.0,19919.25,593.0,1702.0,16.7025
50%,2001.0,38295.0,1083.0,17529.44,21.51
75%,2008.0,104676.75,1668.0,48687.88,26.0
max,2013.0,501412.0,3240.0,367778.0,30.65


In [10]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28242 entries, 0 to 28241
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           28242 non-null  object 
 1   Item                           28242 non-null  object 
 2   Year                           28242 non-null  int64  
 3   hg/ha_yield                    28242 non-null  int64  
 4   average_rain_fall_mm_per_year  28242 non-null  float64
 5   pesticides_tonnes              28242 non-null  float64
 6   avg_temp                       28242 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.5+ MB


In [11]:
# Count missing values per column.
df.isna().sum()

Area                             0
Item                             0
Year                             0
hg/ha_yield                      0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
avg_temp                         0
dtype: int64

In [12]:
# Count rows that exactly repeat an earlier row across every column.
df.duplicated().sum()

np.int64(2310)

In [13]:
# Remove exact duplicate rows (keeping the first occurrence), then show the
# remaining duplicate count as the cell output; it should now be 0.
# The frame is rebound rather than mutated in place so the step reads as a
# plain assignment and can be re-run safely.
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

In [14]:
# Current (rows, columns) of the frame.
df.shape

(25932, 7)

In [15]:
# Separate the regression target from the predictor columns.
y = df["hg/ha_yield"]
X = df.drop(columns=["hg/ha_yield"])

In [16]:
# Column groups fed to the preprocessing step: string-valued columns to be
# one-hot encoded, and numeric columns to be standardized.
cat_features = [
    "Area",
    "Item",
]
num_features = [
    "Year",
    "average_rain_fall_mm_per_year",
    "pesticides_tonnes",
    "avg_temp",
]

In [17]:
# Preprocessing: one-hot encode the categorical columns (categories not seen
# during fit are ignored at transform time) and standardize the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", StandardScaler(), num_features),
    ]
)

In [18]:
# Model pipeline: apply the preprocessing step, then a seeded random forest
# regressor on the transformed features.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit the full pipeline (preprocessing + regressor) on the training rows only.
model.fit(X_train, y=y_train)

In [21]:
# Predict the target for the held-out rows with the fitted pipeline.
y_pred = model.predict(X=X_test)

In [22]:
# Report the coefficient of determination on the held-out set.
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))

R² Score: 0.9842040272704149


In [23]:
# Flatten the true and predicted targets into 1-D NumPy arrays.
y_test_arr = np.ravel(np.asarray(y_test))
y_pred_arr = np.ravel(np.asarray(y_pred))

In [24]:
# Sanity check: both arrays should have the same 1-D shape.
print("shapes:", np.shape(y_test_arr), np.shape(y_pred_arr))

shapes: (5187,) (5187,)


In [25]:
# Root-mean-squared error: the square root of the mean squared error between
# the held-out targets and the predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_test_arr, y_pred=y_pred_arr))

In [26]:
# Print the RMSE formatted to four decimal places.
print(f"RMSE: {rmse:.4f}")

RMSE: 10699.8827
