# Comprehensive Predictive Modeling Pipeline

This notebook demonstrates a complete modeling pipeline for water quality spill prediction using:
- Pydantic validation for data and model configuration
- Training and prediction pipelines
- Enhanced error handling and validation
- Model evaluation and performance metrics

## Pipeline Overview

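1. **Setup and Configuration** - Import libraries and configure paths
2. **Data Loading** - Load cleaned data for modeling
3. **Data Preparation and Validation** - Validate the loaded data and inspect its columns
4. **Model Configuration** - Define features, target, hyperparameters, and paths with validation
5. **Feature Engineering and Data Cleaning** - Prepare features and target variables
6. **Model Training** - Train the model with validation and evaluate its performance metrics
7. **Model Persistence** - Save the trained model
8. **Model Predictions** - Make predictions with the training pipeline
9. **Using Prediction Pipeline** - Load the saved model and make predictions on new data
10. **Summary and Next Steps** - Recap and follow-up ideas
11. **Hyperparameter Tuning** - Search for better hyperparameters
12. **Model Comparison** - Compare several algorithms
13. **Advanced Feature Engineering** - Create and select features
14. **Model Visualization** - Plot model evaluation results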

## 1. Setup and Configuration

In [None]:
# Setup paths and imports
from pathlib import Path
import sys
import pandas as pd
import numpy as np
import logging
from typing import Dict, Any

# Locate the directory that holds this notebook. A Jupyter kernel does not
# define `__file__`, so the current working directory is used in that case.
if "__file__" in globals():
    NOTEBOOK_DIR = Path(__file__).resolve().parent
else:
    NOTEBOOK_DIR = Path.cwd()

# The project root is taken to be two levels above the notebook directory.
PROJECT_ROOT = NOTEBOOK_DIR.parent.parent
SCRIPTS_DIR = PROJECT_ROOT / "scripts"

# Put the project root first on the import path (once) so that the
# `scripts.*` imports in the following cells resolve.
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

# Emit INFO-and-above records with a timestamp, level and logger name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

print(f"Project root: {PROJECT_ROOT}")
print(f"Scripts dir:  {SCRIPTS_DIR}")

In [None]:
# Import ML model components
from scripts.ml_models.model_utils import ModelConfig, get_logger
from scripts.ml_models.train_pipeline import SpillTrainingPipeline
from scripts.ml_models.predict import SpillPredictionPipeline, make_prediction
from scripts.ml_models.feature_engineering import FeatureEngineer

# Import Pydantic enhancements for validation
from scripts.pydantic_enhancements import (
    ModelHyperparameterConfig,
    ModelMetricsConfig,
    ModelPathConfig,
    DataFrameInputValidator,
    ErrorHandler,
    ApplicationSettings
)

# Notebook-wide logger obtained from the project's `get_logger` helper;
# later cells use it to record failures alongside their printed messages.
logger = get_logger(__name__)
# Reaching this line means every import earlier in this cell resolved.
print("✅ All imports successful!")

## 2. Data Loading

In [None]:
# Load the modelling dataset into `df`.
# The cleaned export is preferred; when it has not been generated yet the
# original source CSV is used instead. Raises FileNotFoundError when neither
# file exists, so later cells never run against an undefined `df`.
DATA_FILE = PROJECT_ROOT / "export" / "cleaned_data" / "cleaned_water_data.csv"
ALT_DATA_FILE = PROJECT_ROOT / "data" / "national_water_plan.csv"

if DATA_FILE.exists():
    df = pd.read_csv(DATA_FILE)
    print(f"✅ Data loaded successfully!")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {list(df.columns)}")
    print(f"\nFirst few rows:")
    # A notebook only auto-renders the last top-level expression of a cell;
    # a bare `df.head()` inside this branch would be silently discarded, so
    # the preview is printed explicitly.
    print(df.head())
else:
    print(f"⚠️  Data file not found: {DATA_FILE}")
    print("   Trying alternative location...")
    # Try original data file
    if ALT_DATA_FILE.exists():
        df = pd.read_csv(ALT_DATA_FILE)
        print(f"✅ Alternative data loaded from: {ALT_DATA_FILE}")
        print(f"   Shape: {df.shape}")
        # Printed explicitly for the same reason as in the branch above.
        print(df.head())
    else:
        raise FileNotFoundError(f"Could not find data file. Checked:\n- {DATA_FILE}\n- {ALT_DATA_FILE}")

## 3. Data Preparation and Validation

In [None]:
# Validate the loaded frame with the project's DataFrameInputValidator before
# any modelling work. A failure is reported and then re-raised so that the
# notebook stops at this cell.
# NOTE(review): later cells keep working with `df`, not `validated_df` —
# confirm whether the validator's return value was meant to be used downstream.
try:
    input_validator = DataFrameInputValidator(
        data=df,
        required_columns=None,  # No column list supplied here; the configured columns are checked in Section 5
        min_rows=1  # presumably rejects an empty frame — confirm against DataFrameInputValidator
    )
    validated_df = input_validator.validate_dataframe()
    print(f"✅ Data validation passed!")
    print(f"   Rows: {len(validated_df)}")
    print(f"   Columns: {len(validated_df.columns)}")
except Exception as e:
    # Report in the cell output, then propagate the original exception.
    print(f"❌ Data validation failed: {e}")
    raise

In [None]:
# List every column with a 1-based position to help choose features/target.
print("Available columns:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position}. {column_name}")

# Summarise the frame: dimensions, deep memory footprint, and the share of
# missing cells across the whole frame.
print(f"\nData info:")
print(f"  Shape: {df.shape}")
_memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"  Memory usage: {_memory_mb:.2f} MB")
_missing_cells = df.isnull().sum().sum()
_missing_pct = _missing_cells / df.size * 100
print(f"  Missing values: {_missing_cells} ({_missing_pct:.2f}%)")

## 4. Model Configuration with Pydantic Validation

In [None]:
# Build the ModelConfig shared by the training, prediction, tuning, comparison
# and visualization cells below. Adjust `features` and `target` to match the
# columns actually present in the loaded data; Section 5 falls back to numeric
# columns when any of them are missing.
config = ModelConfig(
    artifacts_dir=PROJECT_ROOT / "artifacts",
    model_filename="spills_pipeline.joblib",
    plots_dir=PROJECT_ROOT / "plots",
    features=["Spill Events 2020", "Spill Events 2021", "Spill Events 2022", "Latitude", "Longitude"],  # Using actual columns from data
    # NOTE(review): the "Frequence" spelling presumably mirrors the dataset's
    # column header — confirm against the CSV before correcting it.
    target="Predicted Annual Spill Frequence Post Scheme",  # Adjust based on your data
    n_estimators=100,
    random_state=42,
    test_size=0.2  # presumably the fraction of rows held out for evaluation — confirm in ModelConfig
)

# Echo the effective configuration so it is visible in the notebook output.
print("✅ Model configuration created:")
print(f"   Features: {config.features}")
print(f"   Target: {config.target}")
print(f"   Model path: {config.model_path}")
print(f"   N estimators: {config.n_estimators}")
print(f"   Test size: {config.test_size}")

In [None]:
# Convert the model configuration into the enhanced hyperparameter config and
# echo the resulting values. Any failure is printed and re-raised so the
# notebook stops here.
try:
    hyperparam_config = config.to_hyperparameter_config()
    print("✅ Hyperparameters validated using enhanced validator:")
    print(f"   N estimators: {hyperparam_config.n_estimators}")
    print(f"   Random state: {hyperparam_config.random_state}")
    print(f"   Test size: {hyperparam_config.test_size}")
except Exception as e:
    print(f"❌ Hyperparameter validation failed: {e}")
    raise

# Same pattern for the filesystem settings: convert to the enhanced path
# config, echo the values, and re-raise on failure.
try:
    path_config = config.to_path_config()
    print("✅ Paths validated using enhanced validator:")
    print(f"   Artifacts dir: {path_config.artifacts_dir}")
    print(f"   Model filename: {path_config.model_filename}")
    print(f"   Model path: {path_config.model_path}")
except Exception as e:
    print(f"❌ Path validation failed: {e}")
    raise

## 5. Feature Engineering and Data Cleaning

In [None]:
# Build the modelling frame `clean_df`: make sure the configured columns are
# present (switching to numeric columns when they are not), then hand the data
# to FeatureEngineer for validation and cleaning.
try:
    # Configured columns that the loaded data does not contain
    required_cols = config.features + [config.target]
    missing_cols = [name for name in required_cols if name not in df.columns]

    if missing_cols:
        print(f"⚠️  Missing columns: {missing_cols}")
        print(f"   Available columns: {list(df.columns)}")
        print("\n   Please update config.features and config.target to match your data.")
        print("   For now, using available numeric columns as features...")

        # Fallback: the first numeric column becomes the target and the next
        # (at most three) numeric columns become the features.
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) >= 2:
            config.target = numeric_cols[0]
            config.features = numeric_cols[1:4]
            print(f"   Updated target: {config.target}")
            print(f"   Updated features: {config.features}")

    # The column list is rebuilt from config because the fallback above may
    # have replaced the target and features.
    clean_df = FeatureEngineer.validate_data(
        df=df,
        required_cols=config.features + [config.target],
        drop_missing=True,
        min_rows_after_cleaning=10,
    )

    print(f"✅ Data prepared for modeling!")
    print(f"   Shape after cleaning: {clean_df.shape}")
    print(f"   Features: {config.features}")
    print(f"   Target: {config.target}")

except Exception as err:
    # Report in the cell output, log with traceback, then propagate.
    print(f"❌ Feature engineering failed: {err}")
    logger.error(f"Feature engineering error: {err}", exc_info=True)
    raise

## 6. Model Training

In [None]:
# Create the training pipeline from the shared `config`. The same `trainer`
# object is reused below for training, saving, predicting and plotting.
trainer = SpillTrainingPipeline(config=config, model_name="WaterQualitySpillModel")

# Show the pipeline's name and its `is_trained` flag before training starts.
print("✅ Training pipeline initialized")
print(f"   Model name: {trainer.model_name}")
print(f"   Is trained: {trainer.is_trained}")

In [None]:
# Fit the model on the cleaned data and report its metrics. A training failure
# is printed, logged with traceback and re-raised; a failure while validating
# the metrics is only reported as a warning.
try:
    print("🚀 Starting model training...")
    metrics = trainer.train(clean_df)

    print("\n✅ Training completed successfully!")
    print("\nModel Performance Metrics:")
    for name, value in metrics.items():
        print(f"   {name}: {value:.4f}")

    # Second pass: run the metrics through the enhanced validator. This step
    # is best-effort and must not abort the cell.
    try:
        metrics_config = trainer.metrics.to_metrics_config()
        print(f"\n✅ Metrics validated using enhanced validator:")

        # Each metric is optional; print only the ones that are set.
        r2_value = metrics_config.r2_score
        if r2_value is not None:
            print(f"   R² Score: {r2_value:.4f}")
        rmse_value = metrics_config.rmse
        if rmse_value is not None:
            print(f"   RMSE: {rmse_value:.4f}")
        mae_value = metrics_config.mae
        if mae_value is not None:
            print(f"   MAE: {mae_value:.4f}")

        # Compare against the 0.7 threshold used by this notebook
        is_good = metrics_config.is_good_performance(threshold=0.7)
        print(f"\n   Performance meets threshold (>=0.7): {is_good}")

    except Exception as metrics_err:
        print(f"⚠️  Could not validate metrics with enhanced validator: {metrics_err}")

except Exception as train_err:
    print(f"❌ Training failed: {train_err}")
    logger.error(f"Training error: {train_err}", exc_info=True)
    raise

## 7. Model Persistence

In [None]:
# Save the trained model
try:
    trainer.save()
    print(f"✅ Model saved successfully to: {config.model_path}")
    print(f"   File exists: {config.model_path.exists()}")
except Exception as e:
    print(f"❌ Failed to save model: {e}")
    logger.error(f"Save error: {e}", exc_info=True)
    raise

## 8. Model Predictions

In [None]:
# Make predictions using the training pipeline
try:
    # Use a subset of data for prediction demonstration
    prediction_data = clean_df[config.features].head(10)
    
    print(f"Making predictions on {len(prediction_data)} samples...")
    predictions = trainer.predict(prediction_data)
    
    print(f"\n✅ Predictions generated successfully!")
    print(f"\nSample predictions:")
    results_df = prediction_data.copy()
    results_df["Prediction"] = predictions
    if config.target in clean_df.columns:
        results_df["Actual"] = clean_df[config.target].head(10).values
    print(results_df)
    
except Exception as e:
    print(f"❌ Prediction failed: {e}")
    logger.error(f"Prediction error: {e}", exc_info=True)
    raise

## 9. Using Prediction Pipeline

In [None]:
# Exercise the standalone prediction pipeline against the saved model file.
# This cell is best-effort: a missing file or a pipeline failure is reported
# as a warning and does not stop the notebook.
if not config.model_path.exists():
    print("⚠️  Model file not found. Skipping prediction pipeline test.")
else:
    try:
        predictor = SpillPredictionPipeline(config=config, model_name="WaterQualitySpillPredictor")
        print("✅ Prediction pipeline initialized")
        print(f"   Model loaded: {predictor.is_trained}")

        # Predict on the first five cleaned rows (feature columns only)
        prediction_data = clean_df[config.features].head(5)
        predictions = predictor.predict(prediction_data)

        print(f"\n✅ Predictions from prediction pipeline:")
        print(f"   Number of predictions: {len(predictions)}")
        print(f"   Sample predictions: {predictions[:5]}")

    except Exception as pipeline_err:
        # Deliberately not re-raised: the failure is reported and logged only.
        print(f"⚠️  Could not use prediction pipeline: {pipeline_err}")
        print("   This is expected if the model file format is incompatible")
        logger.warning(f"Prediction pipeline error: {pipeline_err}")

## 10. Summary and Next Steps

### Summary
- ✅ Data loaded and validated using Pydantic enhancements
- ✅ Model configuration validated with enhanced validators
- ✅ Model trained successfully with performance metrics
- ✅ Model saved and can be loaded for predictions
- ✅ Predictions generated using both training and prediction pipelines

### Next Steps
- Experiment with different hyperparameters
- Try different feature combinations
- Evaluate model on holdout test set
- Generate visualizations of model performance
- Deploy model for production use

## 11. Hyperparameter Tuning

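Hyperparameters can be tuned with grid search, random search, or Bayesian optimization. The cell below uses random search, which evaluates only a sample of the parameter combinations and therefore runs faster than an exhaustive grid search.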

In [None]:
# Hyperparameter tuning with FAST random search (faster than grid search)
from scripts.ml_models.hyperparameter_tuning import HyperparameterTuner

# Tune a random-forest model with the project's HyperparameterTuner, using the
# same cleaned features and target that the trainer was fitted on.
tuner = HyperparameterTuner(config, model_type='random_forest')

# Use random search instead of grid search (much faster!)
# Random search samples fewer combinations but finds good results quickly
print("🚀 Starting FAST hyperparameter tuning (random search)...\n")
tuning_results = tuner.random_search(
    X=clean_df[config.features],
    y=clean_df[config.target],
    n_iter=10,  # Only 10 iterations instead of 108 combinations
    cv=3,  # 3-fold CV instead of 5 (faster)
    scoring='r2',
    random_state=42
)

# The result is read as a mapping with 'best_params' and 'best_score' entries.
print(f"\n✅ Tuning completed!\n")
print(f"Best parameters: {tuning_results['best_params']}")
print(f"Best score: {tuning_results['best_score']:.4f}\n")

# Replace the trainer's pipeline with the tuner's best estimator, so later
# `trainer.predict(...)` calls go through the tuned model.
# NOTE(review): the model file written in Section 7 was saved before this
# assignment — call `trainer.save()` again if the tuned estimator should be
# the one persisted; confirm intended behaviour.
trainer.pipeline = tuner.get_best_estimator()
print("✅ Best model loaded into trainer")

## 12. Model Comparison

Compare multiple algorithms and select the best performing model.

In [None]:
# Compare multiple models
from scripts.ml_models.model_comparison import ModelComparator

# Cross-validate several algorithms on the cleaned data and report the winner.
comparator = ModelComparator(config)
comparison_results = comparator.compare_models(
    X=clean_df[config.features],
    y=clean_df[config.target],
    model_names=[
        'random_forest',
        'gradient_boosting',
        'linear_regression',
        'ridge',
    ],
    cv=3,  # 3-fold CV keeps the run short
)

# Name and estimator of the best-performing candidate
best_model_name, best_model = comparator.get_best_model()
print(f"Best model: {best_model_name}")
print(f"\nComparison results:")
for candidate, outcome in comparison_results.items():
    # Entries carrying an 'error' key are left out of the listing.
    if 'error' in outcome:
        continue
    print(f"  {candidate}: CV R² = {outcome['cv_mean']:.4f}")

## 13. Advanced Feature Engineering

Create new features and select the most important ones.

In [None]:
# Advanced feature engineering
from scripts.ml_models.advanced_feature_engineering import AdvancedFeatureEngineer

# Derive interaction features and rank the configured features by importance.
feature_engineer = AdvancedFeatureEngineer()

# Interaction features built from the configured feature columns
df_with_interactions = feature_engineer.create_interaction_features(
    clean_df[config.features],
    columns=config.features,
)
_added_count = len(df_with_interactions.columns) - len(config.features)
print(f"Created {_added_count} interaction features")

# Importance-based selection on the original (non-interaction) features
X_selected, selected_features, importance = feature_engineer.select_features_importance(
    clean_df[config.features],
    clean_df[config.target],
    max_features=7,
)
print(f"\nSelected features: {selected_features}")
print(f"\nFeature importance:")
# Show the five highest-scoring features, best first.
_ranked = sorted(importance.items(), key=lambda pair: pair[1], reverse=True)
for feature_name, score in _ranked[:5]:
    print(f"  {feature_name}: {score:.4f}")

## 14. Model Visualization

Create visualizations for model evaluation and analysis.

In [None]:
# Create visualizations
from scripts.ml_models.visualization import ModelVisualizer

# Render evaluation plots through the project's ModelVisualizer; each plot
# call's return value is printed as the location of the generated plot.
visualizer = ModelVisualizer(output_dir=config.plots_dir)

# Feature importance plot
# Skipped when `importance` (from the feature-selection cell) is empty.
if importance:
    importance_path = visualizer.plot_feature_importance(importance, top_n=10)
    print(f"✓ Feature importance plot: {importance_path}")

# Predictions vs actual
# `predictions` is rebound here to cover every cleaned row, replacing the
# small samples produced in earlier cells. The trainer predicts with whatever
# pipeline it currently holds (the tuned estimator if Section 11 was run).
# NOTE(review): these predictions are made on the same `clean_df` passed to
# `trainer.train`, so the plots are presumably not a held-out evaluation — confirm.
predictions = trainer.predict(clean_df[config.features])
actual = clean_df[config.target].values
pred_vs_actual_path = visualizer.plot_prediction_vs_actual(actual, predictions)
print(f"✓ Predictions vs actual plot: {pred_vs_actual_path}")

# Residual analysis
residuals_path = visualizer.plot_residuals(actual, predictions)
print(f"✓ Residual analysis plot: {residuals_path}")

# Model comparison
# Uses the per-model results from the comparison cell, plotted by 'cv_mean'.
if comparison_results:
    comparison_path = visualizer.plot_model_comparison(comparison_results, metric='cv_mean')
    print(f"✓ Model comparison plot: {comparison_path}")