In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
data = pd.read_csv("crop_yield.csv")

In [3]:
X = data.drop(columns=["Yield"])
y = data["Yield"]

In [4]:
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = data.select_dtypes(include=["object"]).columns.tolist()

In [5]:
for col in numerical_features:
    data[col] = data[col].fillna(data[col].median())

for col in categorical_features:
    data[col] = data[col].fillna("Unknown")

In [6]:
if "Area" in data.columns and "Annual_Rainfall" in data.columns:
    data["Area_Rainfall"] = data["Area"] * data["Annual_Rainfall"]

In [7]:
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [8]:
target_column = "Yield"  # Change if needed
X = data.drop(columns=[target_column])
y = data[target_column]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
param_grid = {
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [5, 7, 10],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

In [11]:
xgb = XGBRegressor(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

In [12]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


In [13]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 15.746767270725652
R² Score: 0.9270245176958274


In [15]:
def predict_yield(crop, crop_year, season, state, area, annual_rainfall, fertilizer, pesticide):
    custom_input = pd.DataFrame([{
        "Crop": crop,
        "Crop_Year": crop_year,
        "Season": season,
        "State": state,
        "Area": area,
        "Annual_Rainfall": annual_rainfall,
        "Fertilizer": fertilizer,
        "Pesticide": pesticide
    }])
    
    custom_input["Area_Rainfall"] = custom_input["Area"] * custom_input["Annual_Rainfall"]
    custom_input["log_Area"] = np.log1p(custom_input["Area"])

    custom_input = pd.get_dummies(custom_input, columns=["Crop", "Season", "State"], drop_first=True)
    
    missing_cols = set(X.columns) - set(custom_input.columns)
    for col in missing_cols:
        custom_input[col] = 0
    custom_input = custom_input[X.columns]
    
    predicted_yield = best_model.predict(custom_input)
    return predicted_yield[0]

example_prediction = predict_yield("Wheat", 2023, "Rabi", "Punjab", 5000, 1200, 300, 50)
print(f"Predicted Yield: {example_prediction:.2f}")

Predicted Yield: 1.27
