In [1]:
# ----------------------------------------
# IMPORT LIBRARIES
# ----------------------------------------
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# ----------------------------------------
# STEP 1: LOAD DATASET
# ----------------------------------------
df = pd.read_csv("crop_yield_prediction_dataset.csv")
print("✅ Dataset loaded. Shape:", df.shape)

# Drop missing values
df.dropna(inplace=True)

# Feature: Duration
if 'Days_Between_Sowing_and_Harvest' in df.columns:
    df['Crop_Duration'] = df['Days_Between_Sowing_and_Harvest']
elif 'Sowing_Date' in df.columns and 'Harvest_Date' in df.columns:
    df['Sowing_Date'] = pd.to_datetime(df['Sowing_Date'])
    df['Harvest_Date'] = pd.to_datetime(df['Harvest_Date'])
    df['Crop_Duration'] = (df['Harvest_Date'] - df['Sowing_Date']).dt.days
else:
    df['Crop_Duration'] = 120  # Fallback if missing

# ----------------------------------------
# STEP 2: ENCODE & SCALE
# ----------------------------------------
label_encoders = {}
X = df.drop(['Yield_Quintal_per_ha'], axis=1)
y = np.log1p(df['Yield_Quintal_per_ha'])  # Log-transform the target

# Encode categorical features
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save feature names to maintain order
feature_columns = X.columns.tolist()

# ----------------------------------------
# STEP 3: TRAIN/TEST SPLIT
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ----------------------------------------
# STEP 4: TRAIN MODEL
# ----------------------------------------
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=10,
    n_estimators=100,
    subsample=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# ----------------------------------------
# STEP 5: EVALUATE
# ----------------------------------------
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
accuracy = 100 - mape

print(f"\n📊 RMSE: {rmse:.2f}")
print(f"📈 R²: {r2:.4f}")
print(f"📉 MAPE: {mape:.2f}%")
print(f"✅ Accuracy: {accuracy:.2f}%")

# ----------------------------------------
# STEP 6: SAVE ARTIFACTS
# ----------------------------------------
joblib.dump(model, "xgb_crop_model_simulation.joblib")
joblib.dump(scaler, "scaler_simulation.joblib")
joblib.dump(label_encoders, "label_encoders_simulation.joblib")
joblib.dump(feature_columns, "feature_columns_simulation.joblib")
print("📂 Model, scaler, encoders, and feature list saved.")

# ----------------------------------------
# STEP 7: PREDICTION FUNCTION
# ----------------------------------------
def predict_crop_yield():
    # Load saved objects
    model = joblib.load("xgb_crop_model_simulation.joblib")
    scaler = joblib.load("scaler_simulation.joblib")
    label_encoders = joblib.load("label_encoders_simulation.joblib")
    feature_columns = joblib.load("feature_columns_simulation.joblib")

    # Prompt structure
    prompts = [
        ("Location", str),
        ("Soil_Type", str),
        ("Rainfall_mm", float),
        ("Temperature_C", float),
        ("Crop_Type", str),
        ("Fertilizer_Type", str),
        ("Irrigation_Type", str),
        ("Sowing_Month", int),
        ("Harvest_Month", int),
        ("Farm_Size_ha", float),
        ("Soil_pH", float),
        ("Soil_Organic_Matter(%)", float),
        ("Max_Temp_C", float),
        ("Min_Temp_C", float),
        ("Humidity_percent", float),
        ("Fertilizer_kg_per_ha", float),
        ("Irrigation_Count", int),
        ("Avg_Temp_C", float),
        ("Soil_Moisture_percent", float),
        ("Sunshine_Hours_per_day", float),
        ("NDVI", float),
        ("Soil_Nitrogen_mg_per_kg", float),
        ("Soil_Phosphorus_mg_per_kg", float),
        ("Soil_Potassium_mg_per_kg", float),
        ("Wind_Speed_kmph", float),
        ("Crop_Duration", int),
    ]

    print("\n📋 Enter Crop Simulation Inputs:")
    user_input = {}
    for key, dtype in prompts:
        while True:
            try:
                val = input(f"🔹 {key.replace('_', ' ')}: ")
                user_input[key] = dtype(val)
                break
            except ValueError:
                print("❌ Invalid input. Try again.")

    df_input = pd.DataFrame([user_input])

    # Apply label encoding to categorical columns
    for col, le in label_encoders.items():
        if df_input[col].iloc[0] not in le.classes_:
            le.classes_ = np.append(le.classes_, df_input[col].iloc[0])
        df_input[col] = le.transform(df_input[col])

    # Reorder columns to match training
    df_input = df_input[feature_columns]

    # Scale features
    X_input = scaler.transform(df_input)

    # Predict and inverse log
    y_pred_log = model.predict(X_input)[0]
    predicted_yield = np.expm1(y_pred_log)

    print(f"\n🌾 Estimated Yield: {predicted_yield:.2f} quintals/ha")

# ----------------------------------------
# RUN PREDICTION
# ----------------------------------------
if __name__ == "__main__":
    predict_crop_yield()


✅ Dataset loaded. Shape: (1000, 11)





📊 RMSE: 8.24
📈 R²: -0.1137
📉 MAPE: 21.11%
✅ Accuracy: 78.89%
📂 Model, scaler, encoders, and feature list saved.

📋 Enter Crop Simulation Inputs:


🔹 Location:  Haldwani
🔹 Soil Type:  Clay
🔹 Rainfall mm:  250
🔹 Temperature C:  35
🔹 Crop Type:  Rice
🔹 Fertilizer Type:  Urea
🔹 Irrigation Type:  Manual
🔹 Sowing Month:  9
🔹 Harvest Month:  2
🔹 Farm Size ha:  680
🔹 Soil pH:  9
🔹 Soil Organic Matter(%):  90
🔹 Max Temp C:  50
🔹 Min Temp C:  0
🔹 Humidity percent:  10
🔹 Fertilizer kg per ha:  84
🔹 Irrigation Count:  0
🔹 Avg Temp C:  30
🔹 Soil Moisture percent:  2
🔹 Sunshine Hours per day:  9
🔹 NDVI:  0.5346
🔹 Soil Nitrogen mg per kg:  100
🔹 Soil Phosphorus mg per kg:  90
🔹 Soil Potassium mg per kg:  84]


❌ Invalid input. Try again.


🔹 Soil Potassium mg per kg:  90
🔹 Wind Speed kmph:  78
🔹 Crop Duration:  900



🌾 Estimated Yield: 39.29 quintals/ha


# Save in PKL

In [4]:
# ----------------------------------------
# IMPORT LIBRARIES
# ----------------------------------------
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# ----------------------------------------
# STEP 1: LOAD DATASET
# ----------------------------------------
df = pd.read_csv("crop_yield_prediction_dataset.csv")
print("✅ Dataset loaded. Shape:", df.shape)

# Drop missing values
df.dropna(inplace=True)

# Feature: Duration
if 'Days_Between_Sowing_and_Harvest' in df.columns:
    df['Crop_Duration'] = df['Days_Between_Sowing_and_Harvest']
elif 'Sowing_Date' in df.columns and 'Harvest_Date' in df.columns:
    df['Sowing_Date'] = pd.to_datetime(df['Sowing_Date'])
    df['Harvest_Date'] = pd.to_datetime(df['Harvest_Date'])
    df['Crop_Duration'] = (df['Harvest_Date'] - df['Sowing_Date']).dt.days
else:
    df['Crop_Duration'] = 120  # Fallback

# ----------------------------------------
# STEP 2: ENCODE & SCALE
# ----------------------------------------
label_encoders = {}
X = df.drop(['Yield_Quintal_per_ha'], axis=1)
y = np.log1p(df['Yield_Quintal_per_ha'])  # Log-transform the target

# Encode categorical features
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save feature names to maintain order
feature_columns = X.columns.tolist()

# ----------------------------------------
# STEP 3: TRAIN/TEST SPLIT
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ----------------------------------------
# STEP 4: TRAIN MODEL
# ----------------------------------------
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=10,
    n_estimators=100,
    subsample=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# ----------------------------------------
# STEP 5: EVALUATE
# ----------------------------------------
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
accuracy = 100 - mape

print(f"\n📊 RMSE: {rmse:.2f}")
print(f"📈 R²: {r2:.4f}")
print(f"📉 MAPE: {mape:.2f}%")
print(f"✅ Accuracy: {accuracy:.2f}%")

# ----------------------------------------
# STEP 6: SAVE ARTIFACTS
# ----------------------------------------
joblib.dump(model, "Crop_Predict_model.pkl")
joblib.dump(scaler, "Crop_scaler.pkl")
joblib.dump(label_encoders, "Crop_label_encoders.pkl")
joblib.dump(feature_columns, "Crop_feature_columns.pkl")
print("📂 Model and related artifacts saved successfully.")

# ----------------------------------------
# STEP 7: PREDICTION FUNCTION
# ----------------------------------------
def predict_crop_yield():
    # Load saved objects
    model = joblib.load("Crop_Predict_model.pkl")
    scaler = joblib.load("Crop_scaler.pkl")
    label_encoders = joblib.load("Crop_label_encoders.pkl")
    feature_columns = joblib.load("Crop_feature_columns.pkl")

    # Prompt structure
    prompts = [
        ("Location", str),
        ("Soil_Type", str),
        ("Rainfall_mm", float),
        ("Temperature_C", float),
        ("Crop_Type", str),
        ("Fertilizer_Type", str),
        ("Irrigation_Type", str),
        ("Sowing_Month", int),
        ("Harvest_Month", int),
        ("Farm_Size_ha", float),
        ("Soil_pH", float),
        ("Soil_Organic_Matter(%)", float),
        ("Max_Temp_C", float),
        ("Min_Temp_C", float),
        ("Humidity_percent", float),
        ("Fertilizer_kg_per_ha", float),
        ("Irrigation_Count", int),
        ("Avg_Temp_C", float),
        ("Soil_Moisture_percent", float),
        ("Sunshine_Hours_per_day", float),
        ("NDVI", float),
        ("Soil_Nitrogen_mg_per_kg", float),
        ("Soil_Phosphorus_mg_per_kg", float),
        ("Soil_Potassium_mg_per_kg", float),
        ("Wind_Speed_kmph", float),
        ("Crop_Duration", int),
    ]

    print("\n📋 Enter Crop Simulation Inputs:")
    user_input = {}
    for key, dtype in prompts:
        while True:
            try:
                val = input(f"🔹 {key.replace('_', ' ')}: ")
                user_input[key] = dtype(val)
                break
            except ValueError:
                print("❌ Invalid input. Try again.")

    df_input = pd.DataFrame([user_input])

    # Apply label encoding to categorical columns
    for col, le in label_encoders.items():
        if df_input[col].iloc[0] not in le.classes_:
            le.classes_ = np.append(le.classes_, df_input[col].iloc[0])
        df_input[col] = le.transform(df_input[col])

    # Reorder columns
    df_input = df_input[feature_columns]

    # Scale features
    X_input = scaler.transform(df_input)

    # Predict and inverse log
    y_pred_log = model.predict(X_input)[0]
    predicted_yield = np.expm1(y_pred_log)

    print(f"\n🌾 Estimated Yield: {predicted_yield:.2f} quintals/ha")

# ----------------------------------------
# RUN PREDICTION
# ----------------------------------------
if __name__ == "__main__":
    predict_crop_yield()


✅ Dataset loaded. Shape: (1000, 11)





📊 RMSE: 8.24
📈 R²: -0.1137
📉 MAPE: 21.11%
✅ Accuracy: 78.89%
📂 Model and related artifacts saved successfully.

📋 Enter Crop Simulation Inputs:


🔹 Location:  Haldwani
🔹 Soil Type:  Clay
🔹 Rainfall mm:  345
🔹 Temperature C:  56'


❌ Invalid input. Try again.


🔹 Temperature C:  35
🔹 Crop Type:  Rice
🔹 Fertilizer Type:  Urea
🔹 Irrigation Type:  Manual
🔹 Sowing Month:  9
🔹 Harvest Month:  2
🔹 Farm Size ha:  345
🔹 Soil pH:  9
🔹 Soil Organic Matter(%):  90
🔹 Max Temp C:  90
🔹 Min Temp C:  1
🔹 Humidity percent:  10
🔹 Fertilizer kg per ha:  90
🔹 Irrigation Count:  0
🔹 Avg Temp C:  60
🔹 Soil Moisture percent:  9
🔹 Sunshine Hours per day:  9
🔹 NDVI:  0.45
🔹 Soil Nitrogen mg per kg:  100
🔹 Soil Phosphorus mg per kg:  100
🔹 Soil Potassium mg per kg:  100
🔹 Wind Speed kmph:  89
🔹 Crop Duration:  900



🌾 Estimated Yield: 39.84 quintals/ha
