In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Configure visualizations
sns.set_theme(style="whitegrid")
def process_crop(file_path):
    # Load dataset
    data = pd.read_csv(file_path)
    crop_name = file_path.split('/')[-1].split('_')[0]  # Extract crop name from file name

    # Convert PRODUCTION from metric tons to kilograms
    data['PRODUCTION'] = data['PRODUCTION'] * 1000  # Convert to kilograms

    # Handle cases where PRODUCTION is zero
    data['YIELD_PER_AREA'] = np.where(
        data['PRODUCTION'] == 0, 
        0,  # If production is zero, set YIELD_PER_AREA to 0
        data['PRODUCTION'] / data['AREA']  # Otherwise, calculate as usual
    )

    # One-hot encode DISTRICT_NAME
    data = pd.get_dummies(data, columns=['DISTRICT_NAME'], prefix='DISTRICT', drop_first=True)

    # Prepare features (X) and target (y)
    # Remove AREA and PRODUCTION, use YIELD_PER_AREA as target
    X = data.drop(columns=['YEAR', 'CROP_TYPE', 'AREA', 'PRODUCTION', 'YIELD_PER_AREA'])
    y = data['YIELD_PER_AREA']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features for SVR
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train models
    lr_model = LinearRegression()
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    svr_model = SVR(kernel='linear', C=10, epsilon=0.5)

    lr_model.fit(X_train, y_train)
    rf_model.fit(X_train, y_train)
    svr_model.fit(X_train_scaled, y_train)

    # Predictions
    y_pred_lr = lr_model.predict(X_test)
    y_pred_rf = rf_model.predict(X_test)
    y_pred_svr = svr_model.predict(X_test_scaled)

    # Evaluate models
    results = {
        "Crop": crop_name,
        "Model": ["Linear Regression", "Random Forest", "SVR"],
        "MAE": [
            mean_absolute_error(y_test, y_pred_lr),
            mean_absolute_error(y_test, y_pred_rf),
            mean_absolute_error(y_test, y_pred_svr),
        ],
        "RMSE": [
            np.sqrt(mean_squared_error(y_test, y_pred_lr)),
            np.sqrt(mean_squared_error(y_test, y_pred_rf)),
            np.sqrt(mean_squared_error(y_test, y_pred_svr)),
        ],
        "R²": [
            r2_score(y_test, y_pred_lr),
            r2_score(y_test, y_pred_rf),
            r2_score(y_test, y_pred_svr),
        ],
    }

    # Feature importance for Random Forest
    feature_importances = rf_model.feature_importances_
    importance_df = pd.DataFrame(
        {"Feature": X.columns, "Importance": feature_importances}
    ).sort_values(by="Importance", ascending=False).head(10)

    return results, y_test, y_pred_lr, y_pred_rf, y_pred_svr, importance_df

# List of file paths
file_paths = [
    "FinalDatasets/BARLEY_data_merged.csv",
    "FinalDatasets/MAIZE_data_merged.csv",
    "FinalDatasets/MILLET_data_merged.csv",
    "FinalDatasets/PADDY_data_merged.csv",
    "FinalDatasets/WHEAT_data_merged.csv",
]

# Initialize summary DataFrame
summary_df = pd.DataFrame()

# Loop through each crop
for file_path in file_paths:
    results, y_test, y_pred_lr, y_pred_rf, y_pred_svr, importance_df = process_crop(file_path)

    # Append results to summary DataFrame
    summary_df = pd.concat([summary_df, pd.DataFrame(results)], ignore_index=True)

    # Display crop-specific feature importance
    crop_name = results["Crop"]
    print(f"Top 10 Features for {crop_name} - Random Forest")
    display(importance_df)  # Jupyter-friendly display

    # Scatter plot: Actual vs Predicted
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred_lr, label="Linear Regression", alpha=0.6)
    plt.scatter(y_test, y_pred_rf, label="Random Forest", alpha=0.6)
    plt.scatter(y_test, y_pred_svr, label="SVR", alpha=0.6, marker="x")
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "--", color="red", label="Perfect Fit")
    
    # Set axis limits with padding
    padding = (y_test.max() - y_test.min()) * 0.05  # 5% padding
    plt.xlim(y_test.min() - padding, y_test.max() + padding)
    plt.ylim(y_test.min() - padding, y_test.max() + padding)
    
    # Title and labels
    plt.title(f"Actual vs Predicted Yield per Area - {crop_name}")
    plt.xlabel("Actual Yield per Area")
    plt.ylabel("Predicted Yield per Area")
    
    # Add a legend to differentiate between the models
    plt.legend()
    
    plt.show()

# Summary of all results
summary_df = summary_df.round(2)
print("Model Performance Summary:")
display(summary_df)  # Jupyter-friendly display of the summary

# Display the summary DataFrame as a clear table with formatting
plt.figure(figsize=(20, 10))
sns.set_context("talk", font_scale=1.2)

# Bar plot for R² scores by Crop and Model
sns.barplot(x="Crop", y="R²", hue="Model", data=summary_df)
plt.title("Model Performance by Crop (R² Score)")
plt.ylabel("R² Score")
plt.xlabel("Crop")
plt.legend(title="Model")
plt.show()

Number of NaN values in YIELD_PER_AREA: 100


ValueError: Input y contains NaN.