In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
df = pd.read_csv("crop_yield.csv")

# Draw a reproducible 10% sample within each crop (same seed for every group).
# The columns are selected explicitly after groupby() so each group handed to
# the lambda still carries its "Crop" column. Letting apply() operate on the
# grouping column implicitly is deprecated since pandas 2.2, and later releases
# exclude it, which would silently drop "Crop" from the sampled frame.
df = (
    df.groupby("Crop", group_keys=False)[df.columns.tolist()]
      .apply(lambda crop_rows: crop_rows.sample(frac=0.1, random_state=42))
      .reset_index(drop=True)
)

# Fail fast if the grouping column did not survive the sampling step
assert "Crop" in df.columns, "sampling dropped the 'Crop' column"

  .apply(lambda x: x.sample(frac=0.1, random_state=42))


In [3]:
# Report how many rows the sampled frame holds
sampled_row_count = len(df)
print("Number of rows after sampling:", sampled_row_count)

Number of rows after sampling: 99999


In [18]:
# Label-encode the categorical columns, keeping one encoder per column
categorical_cols = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']

# Create an unfitted encoder for every column up front, keyed by column name
encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column and overwrite the column with the encoded labels
for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

In [20]:
# Separate the target column from the remaining feature columns
target_column = 'Yield_tons_per_hectare'
y = df[target_column]
X = df.drop(columns=[target_column])

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Fit a RandomForestRegressor (100 estimators, fixed seed) on the training split
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [26]:
# Persist the fitted model to disk with joblib
model_path = 'crop_yield_model.pkl'
joblib.dump(model, model_path)

['crop_yield_model.pkl']

In [28]:
# Write each column's encoder to its own "<column>_encoder.pkl" file
for col in categorical_cols:
    encoder_path = f"{col}_encoder.pkl"
    joblib.dump(encoders[col], encoder_path)

In [30]:
# Confirm that the save cells have run
status_message = "Model and encoders saved."
print(status_message)

Model and encoders saved.


In [32]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Goodness of fit and error metrics; RMSE is taken as the square root of MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)

# Emit the three report lines in a single call, one metric per line
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²): {r2:.4f}",
    sep="\n",
)


Mean Absolute Error (MAE): 0.42
Root Mean Squared Error (RMSE): 0.52
R-squared (R²): 0.9061
