In [2]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Set style for publication-quality plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("="*80)
print("COMPREHENSIVE ERROR ANALYSIS - CORN YIELD PREDICTION")
print("="*80)

COMPREHENSIVE ERROR ANALYSIS - CORN YIELD PREDICTION


In [4]:
# ============================================================================
# 1. LOAD DATA AND MODELS
# ============================================================================
print("\n[1/9] Loading data and models...")

# Load original data
df = pd.read_csv('modeling_dataset_final.csv')
print(f"  ✓ Loaded {len(df):,} records")

# Create lag features (same as training)
df = df.sort_values(['State', 'County', 'Year'])
df['Yield_Lag1'] = df.groupby(['State', 'County'])['Yield_BU_ACRE'].shift(1)
df['Yield_Lag2'] = df.groupby(['State', 'County'])['Yield_BU_ACRE'].shift(2)
df['Yield_3yr_Avg'] = df.groupby(['State', 'County'])['Yield_BU_ACRE'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
)
df = df.dropna(subset=['Yield_Lag1', 'Yield_Lag2', 'Yield_3yr_Avg'])

# Encode State
state_encoder = {state: idx for idx, state in enumerate(df['State'].unique())}
df['State_Encoded'] = df['State'].map(state_encoder)

# Load saved models and scaler
with open('saved_models/xgboost_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)
print(f"  ✓ Loaded XGBoost model")

with open('saved_models/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
print(f"  ✓ Loaded scaler")

with open('saved_models/feature_columns.pkl', 'rb') as f:
    feature_cols = pickle.load(f)
print(f"  ✓ Loaded feature columns ({len(feature_cols)} features)")


[1/9] Loading data and models...
  ✓ Loaded 82,436 records
  ✓ Loaded XGBoost model
  ✓ Loaded scaler
  ✓ Loaded feature columns (44 features)


In [5]:
print("\n[2/9] Recreating train/validation/test splits...")

from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
TEST_SIZE = 0.15
VAL_SIZE = 0.15

# Prepare features and target
X = df[feature_cols]
y = df['Yield_BU_ACRE']

# Keep identifiers for analysis
identifiers = df[['State', 'County', 'Year', 'Yield_BU_ACRE']]

# Split (same as training)
X_temp, X_test, y_temp, y_test, idx_temp, idx_test = train_test_split(
    X, y, identifiers.index, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

val_size_adjusted = VAL_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
    X_temp, y_temp, idx_temp, test_size=val_size_adjusted, random_state=RANDOM_STATE
)

print(f"  ✓ Training set:   {len(X_train):,} samples")
print(f"  ✓ Validation set: {len(X_val):,} samples")
print(f"  ✓ Test set:       {len(X_test):,} samples")

# Get test set identifiers
test_df = identifiers.loc[idx_test].copy()
test_df.reset_index(drop=True, inplace=True)



[2/9] Recreating train/validation/test splits...
  ✓ Training set:   54,065 samples
  ✓ Validation set: 11,586 samples
  ✓ Test set:       11,586 samples


In [6]:
print("\n[3/9] Generating predictions...")

# Make predictions on test set
test_predictions = xgb_model.predict(X_test)
test_df['Predicted_Yield'] = test_predictions
test_df['Actual_Yield'] = y_test.values

# Calculate errors
test_df['Error'] = test_df['Actual_Yield'] - test_df['Predicted_Yield']
test_df['Abs_Error'] = np.abs(test_df['Error'])
test_df['Percent_Error'] = (test_df['Error'] / test_df['Actual_Yield']) * 100
test_df['Abs_Percent_Error'] = np.abs(test_df['Percent_Error'])

# Overall metrics
mae = mean_absolute_error(y_test, test_predictions)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
r2 = r2_score(y_test, test_predictions)

print(f"\n  Overall Test Set Performance:")
print(f"    R² Score:  {r2:.4f}")
print(f"    MAE:       {mae:.2f} BU/ACRE")
print(f"    RMSE:      {rmse:.2f} BU/ACRE")
print(f"    MAPE:      {test_df['Abs_Percent_Error'].mean():.2f}%")


[3/9] Generating predictions...

  Overall Test Set Performance:
    R² Score:  0.8626
    MAE:       11.22 BU/ACRE
    RMSE:      15.59 BU/ACRE
    MAPE:      inf%


In [7]:
print("\n[4/9] Analyzing error distribution...")

# Create error distribution plot
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Error histogram
axes[0, 0].hist(test_df['Error'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(0, color='red', linestyle='--', linewidth=2, label='Zero Error')
axes[0, 0].axvline(test_df['Error'].mean(), color='green', linestyle='--', 
                    linewidth=2, label=f'Mean Error: {test_df["Error"].mean():.2f}')
axes[0, 0].set_xlabel('Prediction Error (BU/ACRE)', fontsize=11)
axes[0, 0].set_ylabel('Frequency', fontsize=11)
axes[0, 0].set_title('Distribution of Prediction Errors', fontsize=12, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Actual vs Predicted scatter
axes[0, 1].scatter(test_df['Actual_Yield'], test_df['Predicted_Yield'], 
                   alpha=0.3, s=10)
axes[0, 1].plot([test_df['Actual_Yield'].min(), test_df['Actual_Yield'].max()],
                [test_df['Actual_Yield'].min(), test_df['Actual_Yield'].max()],
                'r--', linewidth=2, label='Perfect Prediction')
axes[0, 1].set_xlabel('Actual Yield (BU/ACRE)', fontsize=11)
axes[0, 1].set_ylabel('Predicted Yield (BU/ACRE)', fontsize=11)
axes[0, 1].set_title('Actual vs. Predicted Yield', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Add R² text
axes[0, 1].text(0.05, 0.95, f'R² = {r2:.4f}\nMAE = {mae:.2f}\nRMSE = {rmse:.2f}',
                transform=axes[0, 1].transAxes, fontsize=10,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Plot 3: Residual plot (Error vs Predicted)
axes[1, 0].scatter(test_df['Predicted_Yield'], test_df['Error'], alpha=0.3, s=10)
axes[1, 0].axhline(0, color='red', linestyle='--', linewidth=2)
axes[1, 0].axhline(mae, color='orange', linestyle=':', linewidth=1.5, label=f'±MAE ({mae:.2f})')
axes[1, 0].axhline(-mae, color='orange', linestyle=':', linewidth=1.5)
axes[1, 0].set_xlabel('Predicted Yield (BU/ACRE)', fontsize=11)
axes[1, 0].set_ylabel('Prediction Error (BU/ACRE)', fontsize=11)
axes[1, 0].set_title('Residual Plot: Error vs. Predicted Yield', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Absolute error vs Actual yield
axes[1, 1].scatter(test_df['Actual_Yield'], test_df['Abs_Error'], alpha=0.3, s=10)
axes[1, 1].axhline(mae, color='red', linestyle='--', linewidth=2, label=f'MAE: {mae:.2f}')
axes[1, 1].set_xlabel('Actual Yield (BU/ACRE)', fontsize=11)
axes[1, 1].set_ylabel('Absolute Error (BU/ACRE)', fontsize=11)
axes[1, 1].set_title('Absolute Error vs. Actual Yield', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/error_distribution.png', dpi=300, bbox_inches='tight')
print(f"  ✓ Saved: results/error_distribution.png")
plt.close()

# Error statistics
print(f"\n  Error Statistics:")
print(f"    Mean Error:       {test_df['Error'].mean():.2f} BU/ACRE")
print(f"    Median Error:     {test_df['Error'].median():.2f} BU/ACRE")
print(f"    Std Dev Error:    {test_df['Error'].std():.2f} BU/ACRE")
print(f"    Min Error:        {test_df['Error'].min():.2f} BU/ACRE")
print(f"    Max Error:        {test_df['Error'].max():.2f} BU/ACRE")

# Bias analysis
over_predictions = (test_df['Error'] < 0).sum()
under_predictions = (test_df['Error'] > 0).sum()
print(f"\n  Bias Analysis:")
print(f"    Over-predictions:  {over_predictions} ({over_predictions/len(test_df)*100:.1f}%)")
print(f"    Under-predictions: {under_predictions} ({under_predictions/len(test_df)*100:.1f}%)")



[4/9] Analyzing error distribution...
  ✓ Saved: results/error_distribution.png

  Error Statistics:
    Mean Error:       0.00 BU/ACRE
    Median Error:     0.00 BU/ACRE
    Std Dev Error:    15.59 BU/ACRE
    Min Error:        -127.94 BU/ACRE
    Max Error:        134.91 BU/ACRE

  Bias Analysis:
    Over-predictions:  5792 (50.0%)
    Under-predictions: 5794 (50.0%)
