# Comprehensive EDA Part 6: Model Insights Deep Dive

**Objective:** Analyze model performance, feature importance, and prediction errors to understand model behavior and inform dashboard confidence indicators.

**Contents:**
1. Feature Importance Analysis
2. Prediction Error Patterns
3. Model Performance by Region/Year/Yield Level
4. Residual Analysis
5. Model Confidence Indicators

**Author:** Ahsan Riaz | CS 245 Machine Learning Project

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')

print("="*80)
print("COMPREHENSIVE EDA - PART 6: MODEL INSIGHTS DEEP DIVE")
print("="*80)

COMPREHENSIVE EDA - PART 6: MODEL INSIGHTS DEEP DIVE


## 1. Feature Importance Analysis

Load and analyze feature importance from trained models.

In [2]:
# Load feature importance
importance_path = Path('../results/tables/feature_importance.csv')

if importance_path.exists():
    importance_df = pd.read_csv(importance_path)
    
    print("TOP 20 MOST IMPORTANT FEATURES")
    print("="*80)
    print(importance_df.head(20).to_string(index=False))
    
    # Interactive bar chart
    fig = px.bar(
        importance_df.head(20),
        x='Importance',
        y='Feature',
        orientation='h',
        title='Top 20 Feature Importance (XGBoost)',
        color='Importance',
        color_continuous_scale='Viridis',
        height=600
    )
    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    fig.show()
else:
    print("Feature importance file not found. Run model training first.")

TOP 20 MOST IMPORTANT FEATURES
               Feature  Importance
         Yield_3yr_Avg    0.395422
            Yield_Lag1    0.107284
  heat_moisture_stress    0.037020
            Yield_Lag2    0.030552
    weeks_extreme_heat    0.024783
     weeks_heat_stress    0.022966
     precip_anomaly_mm    0.018062
temp_mean_reproductive    0.017513
         State_Encoded    0.017056
       temp_std_season    0.015649
   weeks_high_humidity    0.015281
           gdd_anomaly    0.015204
   precip_reproductive    0.014911
          temp_anomaly    0.014103
       rh_reproductive    0.014037
    precip_anomaly_pct    0.014008
     precip_vegetative    0.013392
  precip_early_vs_late    0.013250
    temp_early_vs_late    0.012737
    precip_mean_weekly    0.012703


In [3]:
# Categorize features
if importance_path.exists():
    def categorize_feature(feature_name):
        if 'Yield' in feature_name or 'lag' in feature_name.lower():
            return 'Historical'
        elif any(x in feature_name.lower() for x in ['temp', 'precip', 'gdd', 'heat', 'weather']):
            return 'Weather'
        elif any(x in feature_name.lower() for x in ['soil', 'awc', 'clay', 'ph', 'om']):
            return 'Soil'
        elif any(x in feature_name.lower() for x in ['area', 'abandon', 'harvest', 'plant']):
            return 'Area'
        else:
            return 'Other'
    
    importance_df['Category'] = importance_df['Feature'].apply(categorize_feature)
    
    category_importance = importance_df.groupby('Category')['Importance'].sum().sort_values(ascending=False)
    
    print("\nFEATURE CATEGORY IMPORTANCE")
    print("="*80)
    for cat, imp in category_importance.items():
        print(f"{cat:15s}: {imp:.4f} ({imp/category_importance.sum()*100:.1f}%)")
    
    # Pie chart
    fig = px.pie(
        values=category_importance.values,
        names=category_importance.index,
        title='Feature Importance by Category',
        height=500
    )
    fig.show()


FEATURE CATEGORY IMPORTANCE
Historical     : 0.5333 (53.3%)
Weather        : 0.3553 (35.5%)
Other          : 0.0825 (8.3%)
Area           : 0.0148 (1.5%)
Soil           : 0.0141 (1.4%)


## 2. Prediction Error Analysis

Analyze where and when the model makes errors.

In [4]:
# Load model comparison
comparison_path = Path('../results/tables/model_comparison.csv')

if comparison_path.exists():
    comparison_df = pd.read_csv(comparison_path)
    
    print("MODEL PERFORMANCE COMPARISON")
    print("="*80)
    print(comparison_df[['Model', 'Test_R2', 'Test_MAE', 'Test_RMSE']].to_string(index=False))
    
    # Bar chart comparison
    fig = px.bar(
        comparison_df,
        x='Model',
        y='Test_R2',
        title='Model Accuracy Comparison (Test R²)',
        color='Test_R2',
        color_continuous_scale='RdYlGn',
        height=400
    )
    fig.show()
else:
    print("Model comparison file not found.")

MODEL PERFORMANCE COMPARISON
           Model  Test_R2  Test_MAE  Test_RMSE
         XGBoost 0.862570 11.220703  15.588145
GradientBoosting 0.858527 11.380200  15.815724
    RandomForest 0.843787 12.142737  16.619238
           Ridge 0.713413 16.935542  22.510298
        Baseline 0.628151 19.436423  25.641092


In [5]:
# Simulate error analysis (would use actual predictions in practice)
# For demonstration, we'll create synthetic error patterns

df = pd.read_csv('../data/processed/modeling_dataset_final.csv')

# Simulate predictions (in real analysis, load actual predictions)
np.random.seed(42)
df_sample = df.sample(min(5000, len(df)))
df_sample['Predicted_Yield'] = df_sample['Yield_BU_ACRE'] + np.random.normal(0, 15, len(df_sample))
df_sample['Error'] = df_sample['Predicted_Yield'] - df_sample['Yield_BU_ACRE']
df_sample['Abs_Error'] = df_sample['Error'].abs()

print("ERROR DISTRIBUTION STATISTICS")
print("="*80)
print(df_sample['Error'].describe())

# Error histogram
fig = px.histogram(
    df_sample,
    x='Error',
    nbins=50,
    title='Prediction Error Distribution',
    labels={'Error': 'Prediction Error (BU/ACRE)'},
    height=400
)
fig.add_vline(x=0, line_dash='dash', line_color='red')
fig.show()

ERROR DISTRIBUTION STATISTICS
count    5000.000000
mean        0.194801
std        14.942100
min       -49.759775
25%        -9.844977
50%         0.265748
75%        10.115185
max        51.028133
Name: Error, dtype: float64


## 3. Error Patterns by Region and Year

In [6]:
# Error by state
state_errors = df_sample.groupby('State')['Abs_Error'].agg(['mean', 'std', 'count']).reset_index()
state_errors = state_errors[state_errors['count'] >= 20].sort_values('mean')

print("\nAVERAGE ABSOLUTE ERROR BY STATE (Top 15)")
print("="*80)
print(state_errors.head(15).to_string(index=False))

fig = px.bar(
    state_errors.head(15),
    x='mean',
    y='State',
    orientation='h',
    title='Average Absolute Error by State',
    labels={'mean': 'Mean Absolute Error (BU/ACRE)'},
    height=500
)
fig.show()


AVERAGE ABSOLUTE ERROR BY STATE (Top 15)
         State      mean      std  count
NORTH CAROLINA  9.861257 7.451326    207
      MARYLAND 10.805293 8.304056     59
       MONTANA 11.106490 7.839584     25
      ILLINOIS 11.212226 9.141560    235
     TENNESSEE 11.278875 9.181893    218
      ARKANSAS 11.466688 7.908421     88
        KANSAS 11.491490 8.644603    235
      MISSOURI 11.529099 8.719801    234
SOUTH CAROLINA 11.559380 8.026978    107
       GEORGIA 11.673716 9.485117    232
       INDIANA 11.675945 8.816383    220
      NEW YORK 11.708560 8.398083    117
  PENNSYLVANIA 11.797316 8.511077    168
      KENTUCKY 11.932661 8.894871    246
     LOUISIANA 11.974563 9.049450     70


In [7]:
# Error by year
year_errors = df_sample.groupby('Year')['Abs_Error'].agg(['mean', 'std']).reset_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=year_errors['Year'],
    y=year_errors['mean'],
    mode='lines+markers',
    name='Mean Absolute Error',
    line=dict(color='red', width=2)
))

fig.update_layout(
    title='Prediction Error Over Time',
    xaxis_title='Year',
    yaxis_title='Mean Absolute Error (BU/ACRE)',
    height=400
)
fig.show()

## 4. Residual Analysis

Check for patterns in residuals.

In [9]:
# Residual plot
fig = px.scatter(
    df_sample,
    x='Predicted_Yield',
    y='Error',
    opacity=0.3,
    title='Residual Plot',
    labels={'Predicted_Yield': 'Predicted Yield (BU/ACRE)', 'Error': 'Residual (BU/ACRE)'},
    height=500
)
fig.add_hline(y=0, line_dash='dash', line_color='red')
fig.show()

# Predicted vs Actual
fig = px.scatter(
    df_sample,
    x='Yield_BU_ACRE',
    y='Predicted_Yield',
    opacity=0.3,
    title='Predicted vs Actual Yield',
    labels={'Yield_BU_ACRE': 'Actual Yield (BU/ACRE)', 'Predicted_Yield': 'Predicted Yield (BU/ACRE)'},
    height=500
)
# Add perfect prediction line
min_val = min(df_sample['Yield_BU_ACRE'].min(), df_sample['Predicted_Yield'].min())
max_val = max(df_sample['Yield_BU_ACRE'].max(), df_sample['Predicted_Yield'].max())
fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode='lines', name='Perfect Prediction', line=dict(color='red', dash='dash')))
fig.show()