# Crop Yield Prediction System

This notebook provides a step-by-step walkthrough of the crop yield prediction system,
covering data generation, exploratory analysis, preprocessing, model training and
comparison, model interpretation, and preparing an example prediction input.

## 1. Setup and Installation

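First, install the required packages if they are not already available. The install command below is commented out by default; uncomment it when running in a fresh environment such as Colab.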

In [None]:
# Install dependencies (uncomment for Colab)
# This cell is intentionally inert: it contains only a commented-out shell command.
# Tip: inside a notebook, `%pip install ...` targets the running kernel's environment,
# which `!pip install ...` is not guaranteed to do.
# !pip install pandas numpy scikit-learn xgboost matplotlib seaborn plotly shap lime pyyaml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session so cell output stays
# readable. Note that this also hides deprecation and convergence warnings.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator so repeated runs produce the same numbers.
SEED = 42
np.random.seed(SEED)

print("Libraries loaded successfully!")

## 2. Data Generation

Generate synthetic crop yield data with realistic correlations.

In [None]:
import sys
from pathlib import Path

# Make the project modules under ./src importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
SRC_DIR = './src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_generation import generate_synthetic_dataset

# Create the output folder before generating. Whether generate_synthetic_dataset
# creates missing directories itself is not visible from this notebook, and mkdir
# with exist_ok=True is a no-op when the folder is already there.
DATASET_PATH = 'data/crop_yield_dataset.csv'
Path(DATASET_PATH).parent.mkdir(parents=True, exist_ok=True)

# Generate the dataset (10,000 synthetic samples); the returned frame is kept as `df`.
df = generate_synthetic_dataset(n_samples=10000, output_path=DATASET_PATH)

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Data summary
print("\nDataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "None" line after the report.
df.info()

print("\nStatistical Summary:")
# Last expression of the cell: rendered by the notebook as a rich table.
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Side-by-side bar charts of sample counts: crop types on the left, regions on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (column to count, panel title, x-axis label, bar colour).
panels = [
    ('crop_type', 'Distribution by Crop Type', 'Crop Type', 'forestgreen'),
    ('region', 'Distribution by Region', 'Region', 'steelblue'),
]

for panel_ax, (column, title, x_label, bar_color) in zip(axes, panels):
    counts = df[column].value_counts()
    counts.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df.loc[:, numeric_cols].corr()

# Diverging colour map centred on zero so positive and negative correlations
# are visually symmetric.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Overlaid, semi-transparent yield histograms: one per crop type on a shared axis.
fig, ax = plt.subplots(figsize=(12, 6))

crops = df['crop_type'].unique()
yield_values = df['actual_yield_tons_per_ha']

for crop in crops:
    is_crop = df['crop_type'] == crop
    ax.hist(yield_values[is_crop], alpha=0.6, label=crop.capitalize(), bins=30)

ax.set(
    xlabel='Yield (tons/ha)',
    ylabel='Frequency',
    title='Yield Distribution by Crop Type',
)
ax.legend()
plt.show()

In [None]:
# Scatter of precipitation against yield, with each point coloured by temperature.
fig, ax = plt.subplots(figsize=(12, 8))

scatter = ax.scatter(
    df['precipitation_mm'],
    df['actual_yield_tons_per_ha'],
    c=df['temperature_c'],
    cmap='RdYlBu_r',
    alpha=0.5,
    s=20,
)
fig.colorbar(scatter, ax=ax, label='Temperature (°C)')

# Shade the 800-1200 mm band that the legend labels as the optimal range.
# NOTE(review): the band limits are hard-coded here; confirm they match the data generator.
ax.axvspan(800, 1200, alpha=0.2, color='green', label='Optimal Precipitation Range')

ax.set_xlabel('Precipitation (mm)')
ax.set_ylabel('Yield (tons/ha)')
ax.set_title('Precipitation vs Yield (colored by Temperature)')
ax.legend()
plt.show()

## 4. Data Preprocessing

In [None]:
from preprocessing import prepare_model_data, preprocess_pipeline

# Push the raw frame through the project's preprocessing pipeline. The call hands back
# three objects, unpacked here as the processed frame, an encoder, and a scaler.
pipeline_outputs = preprocess_pipeline(df)
df_processed, encoder, scaler = pipeline_outputs

# The "New features" line lists every column of the processed frame, not only added ones.
print(
    f"\nProcessed dataset shape: {df_processed.shape}",
    f"\nNew features: {list(df_processed.columns)}",
    sep="\n",
)

In [None]:
# Prepare data for modeling
X, y = prepare_model_data(df_processed)

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

## 5. Model Training and Comparison

In [None]:
from sklearn.model_selection import train_test_split
from models import ModelTrainer

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# The underlying arrays (.values) are passed, so the resulting splits carry no column labels.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, **split_options)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Initialize and train models
trainer = ModelTrainer()
trainer.initialize_models(include_deep_learning=False)

# Train all models
trainer.train_all_models(X_train, y_train)

# Evaluate all models
results = trainer.evaluate_all_models(X_test, y_test)

print("\nModel Comparison Results:")
results.round(4)

In [None]:
# Plot model comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# R² comparison
results['r2'].plot(kind='bar', ax=axes[0], color='forestgreen')
axes[0].set_title('R² Score Comparison')
axes[0].set_ylabel('R² Score')
axes[0].axhline(y=0.85, color='red', linestyle='--', label='Target (0.85)')
axes[0].legend()

# RMSE comparison
results['rmse'].plot(kind='bar', ax=axes[1], color='steelblue')
axes[1].set_title('RMSE Comparison')
axes[1].set_ylabel('RMSE')

plt.tight_layout()
plt.show()

In [None]:
# Get best model
best_name, best_model = trainer.get_best_model()
print(f"Best performing model: {best_name}")

# Predictions vs Actuals
predictions = best_model.predict(X_test)

plt.figure(figsize=(10, 8))
plt.scatter(y_test, predictions, alpha=0.5, s=20)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Yield (tons/ha)')
plt.ylabel('Predicted Yield (tons/ha)')
plt.title(f'{best_name}: Predicted vs Actual Yield')
plt.show()

## 6. Model Interpretation

In [None]:
# Feature importance from Random Forest
if hasattr(best_model.model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'Feature': list(X.columns),
        'Importance': best_model.model.feature_importances_
    }).sort_values('Importance', ascending=True)
    
    plt.figure(figsize=(10, 8))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

In [None]:
# SHAP analysis (if available)
try:
    import shap
    
    # Sample for efficiency
    X_sample = X_test[:100]
    
    # Create explainer
    explainer = shap.TreeExplainer(best_model.model)
    shap_values = explainer.shap_values(X_sample)
    
    # Summary plot
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values, X_sample, feature_names=list(X.columns), show=False)
    plt.title('SHAP Summary Plot')
    plt.tight_layout()
    plt.show()
except ImportError:
    print("SHAP not installed. Install with: pip install shap")

## 7. Making Predictions

In [None]:
# Example prediction
sample_input = {
    'crop_type': 'wheat',
    'region': 'north',
    'sowing_date': '15-03-2024',
    'ndvi': 0.65,
    'precipitation_mm': 850,
    'temperature_c': 22,
    'soil_organic_carbon_pct': 2.5
}

print("Sample Input:")
for key, value in sample_input.items():
    print(f"  {key}: {value}")

In [None]:
# Save models
trainer.save_models()
print("Models saved successfully!")

## 8. Summary

This notebook demonstrated:
1. Synthetic data generation with realistic agricultural patterns
2. Comprehensive preprocessing including feature engineering
3. Training and comparison of multiple ML models
4. Model interpretation using feature importance and SHAP

For production use:
- Use the FastAPI endpoint at `/predict`
- Run the Streamlit dashboard with `streamlit run app.py`
- See `model_card.md` for model details and limitations