In [None]:
%run "./00_setup_and_config"

# Generate Calibrated Synthetic Predictions - All Commodities

**Enhanced with validation metrics saved to Delta table:**
- Point accuracy: Median prediction has target MAPE (aligned with forecast_agent)
- Distribution calibration: Prediction intervals properly calibrated
- Includes 100% accurate scenario (perfect foresight for testing)
- **NEW**: Validation metrics saved to `commodity_analysis.synthetic_validation_metrics`

**Accuracy levels** (accuracy = 1 − target MAPE of the median prediction):
- 100% accurate: MAPE = 0%, MAE = 0 (all predictions exactly match actuals)
- 90% accurate: MAPE = 10%
- 80% accurate: MAPE = 20%
- 70% accurate: MAPE = 30%
- 60% accurate: MAPE = 40%

In [None]:
import pandas as pd
import numpy as np
import os
import gc
import time
import json
from datetime import datetime
from builtins import min as builtin_min, max as builtin_max

In [None]:
# Configuration
# Earliest date (inclusive) for which synthetic forecasts are generated.
SYNTHETIC_START_DATE = '2022-01-01'
# Each accuracy level maps to a target MAPE of (1 - accuracy).
ACCURACY_LEVELS = [1.00, 0.90, 0.80, 0.70, 0.60]  # 100%, 90%, 80%, 70%, 60%
VALIDATION_METRICS_TABLE = "commodity_analysis.synthetic_validation_metrics"

# The generator and the CRPS sampling both draw from NumPy's global random
# state; seeding it here makes a full "Restart & Run All" reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Synthetic prediction configuration:")
print(f"  Synthetic start date: {SYNTHETIC_START_DATE}")
print(f"  Accuracy levels: {[f'{a:.0%}' for a in ACCURACY_LEVELS]}")
print(f"  Validation metrics table: {VALIDATION_METRICS_TABLE}")
print(f"  Random seed: {RANDOM_SEED}")
print("\nAccuracy definition (aligned with forecast_agent):")
print("  - Point forecast: Median has target MAPE")
print("  - Distribution: Calibrated prediction intervals")
print("  - Validation: MAE, MAPE, Directional Accuracy, CRPS (saved to table)")
print("  - 100% accurate: Perfect foresight (MAPE = 0%, MAE = 0)")

## Load Market Data

In [None]:
# Source table with the raw market price history for every commodity.
MARKET_TABLE = "commodity.bronze.market"
print(f"\nLoading price data from {MARKET_TABLE}...")

# Collect the whole table to the driver and parse the date column in one chain.
market_df = (
    spark.table(MARKET_TABLE)
    .toPandas()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

print(f"✓ Loaded market price data (FULL HISTORY)")

# Row count per commodity as a quick sanity check of what was loaded.
commodity_counts = market_df.groupby('commodity').size()
print(f"Available commodities:")
for commodity in commodity_counts.index:
    count = commodity_counts[commodity]
    print(f"  - {commodity}: {count} rows")

print(f"\nDate range: {market_df['date'].min()} to {market_df['date'].max()}")

## Calibrated Prediction Generation

Key improvements:
1. **Target MAPE**: The median prediction has the specified MAPE
2. **Calibrated uncertainty**: Prediction spread reflects realistic uncertainty
3. **100% accuracy**: Perfect scenario for algorithm testing
4. **Aligned validation**: Uses same metrics as forecast_agent (MAE, MAPE, Directional, CRPS)
5. **Saved metrics**: Validation results saved to Delta table for later review

In [None]:
def generate_calibrated_predictions(prices_df, model_version, target_accuracy=0.90, 
                                    n_runs=2000, n_horizons=14, chunk_size=20):
    """
    Generate calibrated synthetic predictions.

    For every origin row ``i`` of ``prices_df`` that has ``n_horizons`` later
    rows, an ensemble of ``n_runs`` paths is produced. Horizon ``k`` of a path
    is a noisy version of the price ``k`` ROWS after the origin (row offsets,
    not calendar days).

    Noise model (skipped when ``target_accuracy == 1.0``, where every run
    equals the actual future prices):

    - one shared log-error per (origin, horizon), N(0, sigma): this moves the
      ensemble median away from the actual so the median has roughly the
      target MAPE (E|N(0, sigma)| = sigma * sqrt(2/pi));
    - one independent log-error per (run, horizon), N(0, sigma): with the same
      sigma as the shared error, the 10%-90% ensemble band contains the actual
      about 80% of the time;
    - a small multiplicative bias per run, N(1, 0.02).

    Without the shared error all runs would be centred on the actual, so the
    ensemble median would have ~0% MAPE regardless of ``target_accuracy`` and
    the 80% band would cover the actual almost always.

    Random numbers come from NumPy's global generator (``np.random``), so
    ``np.random.seed`` controls reproducibility.

    Parameters:
    - prices_df: DataFrame with 'date' and 'price' columns, ordered by date
    - model_version: label written to the 'model_version' column
    - target_accuracy: 0.90 means median has 10% MAPE
    - n_runs: Number of ensemble runs (2000)
    - n_horizons: Forecast horizon in rows (14)
    - chunk_size: number of origin dates assembled per intermediate DataFrame

    Returns:
    - DataFrame with columns timestamp, run_id, day_ahead, predicted_price,
      model_version (one row per origin date x run x horizon)

    Raises:
    - ValueError: if prices_df has no more than n_horizons rows
    """
    n_dates = len(prices_df) - n_horizons
    target_mape = 1.0 - target_accuracy  # 90% accurate = 10% MAPE

    if n_dates <= 0:
        raise ValueError(
            f"Need more than {n_horizons} price rows to generate predictions, "
            f"got {len(prices_df)}"
        )

    print(f"    Target MAPE: {target_mape:.1%}")
    print(f"    Calibration: 80% interval should contain actual ~80% of time")

    # Positional access keeps this independent of the frame's index labels.
    origin_dates = prices_df['date'].tolist()
    price_values = prices_df['price'].to_numpy(dtype=float)

    # For a zero-mean normal, E|x| = sigma * sqrt(2/pi); invert for sigma.
    sigma_lognormal = target_mape * np.sqrt(np.pi / 2)

    # Row layout for one origin date: run 1 (day 1..H), run 2 (day 1..H), ...
    # This matches the row-major flattening of the (n_runs, n_horizons) matrix.
    run_id_column = np.repeat(np.arange(1, n_runs + 1), n_horizons)
    day_ahead_column = np.tile(np.arange(1, n_horizons + 1), n_runs)

    all_chunks = []

    for chunk_start in range(0, n_dates, chunk_size):
        chunk_end = builtin_min(chunk_start + chunk_size, n_dates)
        date_frames = []

        for i in range(chunk_start, chunk_end):
            future_prices = price_values[i + 1:i + n_horizons + 1]

            if target_accuracy == 1.0:
                # 100% accurate: All runs exactly match actual
                predicted_prices_matrix = np.tile(future_prices, (n_runs, 1))
            else:
                # Shared error: displaces the whole ensemble (and its median).
                median_log_errors = np.random.normal(0, sigma_lognormal, (1, n_horizons))
                # Per-run error: spread of the ensemble around its median.
                run_log_errors = np.random.normal(0, sigma_lognormal, (n_runs, n_horizons))
                predicted_prices_matrix = future_prices * np.exp(median_log_errors + run_log_errors)

                # Add small run-specific bias for additional realism (±2%)
                run_biases = np.random.normal(1.0, 0.02, (n_runs, 1))
                predicted_prices_matrix = predicted_prices_matrix * run_biases

            # Build the rows for this date column-wise instead of one dict per row.
            date_frames.append(pd.DataFrame({
                'timestamp': origin_dates[i],
                'run_id': run_id_column,
                'day_ahead': day_ahead_column,
                'predicted_price': predicted_prices_matrix.ravel(),
                'model_version': model_version
            }))

        all_chunks.append(pd.concat(date_frames, ignore_index=True))

        del date_frames
        gc.collect()

        if chunk_end % 100 == 0 or chunk_end == n_dates:
            print(f"    Progress: {chunk_end}/{n_dates} dates...")

    final_df = pd.concat(all_chunks, ignore_index=True)
    del all_chunks
    gc.collect()

    return final_df

## Enhanced Validation Functions (Aligned with forecast_agent)

These functions use the exact same formulas as forecast_agent for consistency:
- **MAE**: Mean Absolute Error (from `ground_truth/core/evaluator.py`)
- **MAPE**: Mean Absolute Percentage Error (from `ground_truth/core/evaluator.py`)
- **Directional Accuracy**: Day-to-day and from Day 0 (from `ground_truth/core/evaluator.py`)
- **CRPS**: Continuous Ranked Probability Score (from `evaluate_historical_forecasts.py`)

In [None]:
def calculate_crps(actuals: np.ndarray, forecast_paths: np.ndarray) -> float:
    """
    Calculate the mean Continuous Ranked Probability Score (CRPS) of an ensemble.

    Uses the sample ("energy") form per horizon step:

        CRPS = E|X - y| - 0.5 * E|X - X'|

    where X, X' are independent draws from the ensemble and y is the actual.
    With the samples sorted ascending, 0.5 * E|X - X'| equals
    sum_i (2i - n - 1) * x_(i) / n^2, which is what is subtracted below.

    NOTE(review): an earlier version of this function subtracted only half of
    that quantity (i.e. 0.25 * E|X - X'|), which overstated CRPS. If
    forecast_agent/evaluate_historical_forecasts.py contains the same
    expression, values will no longer match it exactly - confirm there.

    Parameters:
    - actuals: 1-D array of observed values, one per horizon step; NaN entries
      are skipped
    - forecast_paths: 2-D array of shape (n_paths, horizon)

    Returns:
    - Mean CRPS over the horizon steps with a non-NaN actual, or None when
      every actual is NaN
    """
    forecast_paths = np.asarray(forecast_paths, dtype=float)
    n_paths, horizon = forecast_paths.shape
    actuals = np.asarray(actuals, dtype=float)[:horizon]

    has_actual = ~np.isnan(actuals)
    if not has_actual.any():
        return None

    observed = actuals[has_actual]
    sorted_samples = np.sort(forecast_paths[:, has_actual], axis=0)

    # E|X - y| for each step.
    abs_error = np.mean(np.abs(sorted_samples - observed), axis=0)

    # 0.5 * E|X - X'| for each step, via the sorted-sample identity.
    ranks = np.arange(1, n_paths + 1).reshape(-1, 1)
    half_spread = np.sum((2 * ranks - n_paths - 1) * sorted_samples, axis=0) / (n_paths ** 2)

    return float(np.mean(abs_error - half_spread))


def calculate_directional_accuracy(actuals: pd.Series, forecasts: pd.Series) -> dict:
    """
    Score how often a forecast path moves in the same direction as the actuals.
    (Aligned with forecast_agent/ground_truth/core/evaluator.py lines 39-67)

    Returns a dict of percentages (0-100). It is empty when there are fewer
    than two points; otherwise it holds:
    - 'directional_accuracy': share of consecutive steps whose sign of change
      matches between the two series
    - 'directional_accuracy_from_day0': share of later points that are on the
      same side of their own series' first value ("higher" vs "not higher")
    """
    n_points = len(actuals)
    if n_points <= 1:
        return {}

    scores = {}

    # Step-over-step direction: compare the signs of the first differences.
    actual_moves = np.sign(actuals.diff().dropna())
    forecast_moves = np.sign(forecasts.diff().dropna())
    matching_moves = (actual_moves == forecast_moves).sum()
    scores['directional_accuracy'] = float(matching_moves / len(actual_moves) * 100)

    # Direction relative to the first element of each series
    # (primary trading metric).
    base_actual = actuals.iloc[0]
    base_forecast = forecasts.iloc[0]
    agreements = [
        (actuals.iloc[k] > base_actual) == (forecasts.iloc[k] > base_forecast)
        for k in range(1, n_points)
    ]
    hits = sum(1 for agreed in agreements if agreed)
    scores['directional_accuracy_from_day0'] = float(hits / len(agreements) * 100)

    return scores

In [None]:
def validate_predictions(predictions_df, prices_df, commodity, model_version, target_accuracy, n_horizons=14):
    """
    Validate that generated predictions have target accuracy.
    Returns validation metrics dictionary for saving to Delta table.

    The ensemble median per (timestamp, day_ahead) is compared with the actual
    price found ``day_ahead`` ROWS after the timestamp's row in ``prices_df``.
    Row offsets (not calendar days) are used because that is how
    generate_calibrated_predictions picks the future prices; with gaps in the
    date sequence (weekends, holidays) a calendar-day lookup would drop rows
    or compare against the wrong actual.

    Parameters:
    - predictions_df: DataFrame with timestamp, run_id, day_ahead and
      predicted_price columns
    - prices_df: DataFrame with 'date' and 'price' columns, in the same row
      order that was used to generate the predictions
    - commodity, model_version: labels copied into the returned record
    - target_accuracy: target accuracy; target MAPE is 1 - target_accuracy
    - n_horizons: not used by the calculation; kept for call compatibility

    Returns:
    - dict of metrics (see the keys at the end of the function), or None when
      no prediction could be matched with an actual price. CRPS and 80%
      coverage are only computed when target_accuracy < 1.0. CRPS is estimated
      on a random sample of up to 50 timestamps drawn with np.random.
    """
    print(f"\n  Validating predictions (forecast_agent-aligned metrics)...")

    group_keys = ['timestamp', 'day_ahead']

    # Group by timestamp and day_ahead, compute median
    medians = predictions_df.groupby(group_keys)['predicted_price'].median().reset_index()
    medians.columns = ['timestamp', 'day_ahead', 'median_pred']
    medians['day_ahead'] = medians['day_ahead'].astype(int)

    # Get actual future prices
    prices_df = prices_df.copy().reset_index(drop=True)
    prices_df['date'] = pd.to_datetime(prices_df['date'])
    price_values = prices_df['price'].to_numpy(dtype=float)

    # Row position of every origin date (first occurrence wins on duplicates).
    origin_rows = pd.DataFrame({
        'timestamp': prices_df['date'],
        'origin_row': np.arange(len(prices_df)),
    }).drop_duplicates('timestamp', keep='first')

    # Merge predictions with actuals: the actual sits day_ahead rows after the origin.
    results_df = medians.merge(origin_rows, on='timestamp', how='inner')
    actual_rows = results_df['origin_row'] + results_df['day_ahead']
    has_actual = (actual_rows >= 0) & (actual_rows < len(price_values))
    results_df = results_df[has_actual].copy()

    if len(results_df) == 0:
        print(f"    ⚠️  Could not validate - no matching actuals found")
        return None

    results_df['actual'] = price_values[actual_rows[has_actual].to_numpy()]
    results_df['ae'] = (results_df['median_pred'] - results_df['actual']).abs()
    results_df['ape'] = results_df['ae'] / results_df['actual']
    results_df = (
        results_df[['timestamp', 'day_ahead', 'median_pred', 'actual', 'ape', 'ae']]
        .sort_values(group_keys)
        .reset_index(drop=True)
    )

    target_mape = 1.0 - target_accuracy

    # Calculate overall metrics
    overall_mae = results_df['ae'].mean()
    overall_mape = results_df['ape'].mean()
    median_ape = results_df['ape'].median()
    pct90_ape = results_df['ape'].quantile(0.9)

    print(f"\n    Overall Metrics:")
    print(f"      MAE:  ${overall_mae:.2f}")
    print(f"      MAPE: {overall_mape:.1%} (target: {target_mape:.1%})")
    print(f"      Median APE: {median_ape:.1%}")
    print(f"      90th pct APE: {pct90_ape:.1%}")

    # Per-horizon metrics
    print(f"\n    Per-Horizon Metrics (each should meet target {target_mape:.1%}):")
    per_horizon = results_df.groupby('day_ahead').agg({
        'ae': 'mean',
        'ape': 'mean',
        'timestamp': 'count'
    }).rename(columns={'timestamp': 'count'})

    per_horizon_list = []
    for horizon in sorted(per_horizon.index):
        horizon_mae = per_horizon.loc[horizon, 'ae']
        horizon_mape = per_horizon.loc[horizon, 'ape']
        horizon_count = int(per_horizon.loc[horizon, 'count'])
        # 15% tolerance above the target before a horizon is flagged.
        status = '✓' if horizon_mape <= target_mape * 1.15 else '⚠️'
        print(f"      Day {horizon:2d}: MAE=${horizon_mae:5.2f}, MAPE={horizon_mape:5.1%} ({horizon_count:4d} samples) {status}")

        per_horizon_list.append({
            'day_ahead': int(horizon),
            'mae': float(horizon_mae),
            'mape': float(horizon_mape),
            'n_samples': int(horizon_count)
        })

    # Calculate directional accuracy, one forecast path (timestamp) at a time
    all_dir_acc = []
    all_dir_acc_day0 = []

    for _, ts_data in results_df.groupby('timestamp', sort=False):
        if len(ts_data) < 2:
            continue
        ts_data = ts_data.sort_values('day_ahead')
        dir_metrics = calculate_directional_accuracy(
            pd.Series(ts_data['actual'].values),
            pd.Series(ts_data['median_pred'].values)
        )

        if 'directional_accuracy' in dir_metrics:
            all_dir_acc.append(dir_metrics['directional_accuracy'])
        if 'directional_accuracy_from_day0' in dir_metrics:
            all_dir_acc_day0.append(dir_metrics['directional_accuracy_from_day0'])

    dir_acc = float(np.mean(all_dir_acc)) if all_dir_acc else None
    dir_acc_day0 = float(np.mean(all_dir_acc_day0)) if all_dir_acc_day0 else None

    if all_dir_acc:
        print(f"\n    Directional Accuracy:")
        print(f"      Day-to-day: {dir_acc:.1f}%")
        if all_dir_acc_day0:
            print(f"      From Day 0: {dir_acc_day0:.1f}% (primary trading metric)")

    # Calculate CRPS and coverage (for non-100% accurate)
    crps_mean = None
    coverage_80 = None

    if target_accuracy < 1.0:
        print(f"\n    Probabilistic Metrics:")

        # Sample timestamps for CRPS (computationally expensive).
        # builtin_min is used on purpose: the file imports it under that name.
        timestamps = results_df['timestamp'].unique()
        sample_timestamps = np.random.choice(
            timestamps, size=builtin_min(50, len(timestamps)), replace=False
        )
        crps_values = []

        for ts in sample_timestamps:
            ts_predictions = predictions_df[predictions_df['timestamp'] == ts]
            forecast_matrix = ts_predictions.pivot_table(
                index='run_id',
                columns='day_ahead',
                values='predicted_price'
            ).values

            ts_actuals = results_df[results_df['timestamp'] == ts].sort_values('day_ahead')['actual'].values

            # Only score paths for which every horizon has an actual.
            if len(ts_actuals) == forecast_matrix.shape[1]:
                crps = calculate_crps(ts_actuals, forecast_matrix)
                if crps is not None:
                    crps_values.append(crps)

        if crps_values:
            crps_mean = float(np.mean(crps_values))
            print(f"      CRPS: ${crps_mean:.2f} (lower is better)")

        # Coverage: share of actuals inside the ensemble's 10%-90% band
        intervals = (
            predictions_df.groupby(group_keys)['predicted_price']
            .quantile([0.1, 0.9])
            .unstack()
        )
        intervals.columns = ['p10', 'p90']
        intervals = intervals.reset_index()

        validation = results_df.merge(intervals, on=group_keys)
        coverage_80 = float(((validation['actual'] >= validation['p10']) &
                             (validation['actual'] <= validation['p90'])).mean())

        print(f"      80% interval coverage: {coverage_80:.1%} (target: ~80%)")

    print(f"  ✓ Validation complete")

    # Return metrics dictionary
    return {
        'commodity': commodity,
        'model_version': model_version,
        'generation_timestamp': datetime.now(),
        'target_accuracy': float(target_accuracy),
        'target_mape': float(target_mape),
        'achieved_mae': float(overall_mae),
        'achieved_mape': float(overall_mape),
        'median_ape': float(median_ape),
        'pct90_ape': float(pct90_ape),
        'directional_accuracy': dir_acc,
        'directional_accuracy_day0': dir_acc_day0,
        'crps': crps_mean,
        'coverage_80': coverage_80,
        'n_samples': len(results_df),
        'per_horizon_metrics': json.dumps(per_horizon_list)
    }

In [None]:
def save_validation_metrics(metrics_list, table_name):
    """
    Append a batch of validation-metric records to a Delta table.

    Parameters:
    - metrics_list: list of metric dicts (one per record); when empty or None
      a warning is printed and nothing is written
    - table_name: name of the target table

    Returns:
    - None
    """
    has_records = bool(metrics_list)

    if has_records:
        print(f"\nSaving validation metrics to {table_name}...")

        # pandas -> Spark, then append (the table is created if it doesn't exist).
        writer = spark.createDataFrame(pd.DataFrame(metrics_list)).write
        writer.mode("append").option("mergeSchema", "true").saveAsTable(table_name)

        record_count = len(metrics_list)
        print(f"✓ Saved {record_count} validation metric records")
        print(f"\n  Query with: SELECT * FROM {table_name} ORDER BY generation_timestamp DESC")
    else:
        print("\n⚠️  No validation metrics to save")

## Process All Commodities

In [None]:
def process_single_commodity(commodity_name, prices_raw_pd, analysis_config, output_schema, 
                            accuracy_levels, synthetic_start_date):
    """
    Process a single commodity with multiple calibrated accuracy levels.
    Returns validation metrics for all accuracy levels.

    Steps: filter the raw market frame to the commodity (case-insensitive),
    use 'close' as the price, keep rows on/after ``synthetic_start_date``,
    generate and validate one prediction set per accuracy level, then
    overwrite the Delta table ``{output_schema}.predictions_{commodity}`` with
    the combined predictions.

    Parameters:
    - commodity_name: commodity to process
    - prices_raw_pd: pandas DataFrame with 'commodity', 'date' and 'close'
    - analysis_config: dict providing 'prediction_runs' and 'forecast_horizon'
    - output_schema: schema/database prefix of the output table
    - accuracy_levels: iterable of target accuracies (e.g. 0.90)
    - synthetic_start_date: first date (inclusive) to generate predictions for

    Returns:
    - dict with 'commodity', 'n_dates', 'n_accuracy_levels', 'table' and
      'validation_metrics' (list of per-accuracy-level metric dicts)

    Raises:
    - ValueError: if the commodity has no more price rows in range than the
      forecast horizon
    """
    print(f"\n{'='*80}")
    print(f"PROCESSING: {commodity_name.upper()}")
    print(f"{'='*80}")

    n_runs = analysis_config['prediction_runs']
    forecast_horizon = analysis_config['forecast_horizon']

    # Filter and prepare prices
    print(f"\nPreparing price data...")
    prices_full = prices_raw_pd[prices_raw_pd['commodity'].str.lower() == commodity_name.lower()].copy()
    prices_full['date'] = pd.to_datetime(prices_full['date'])
    prices_full['price'] = prices_full['close']
    prices_full = prices_full[['date', 'price']].sort_values('date').reset_index(drop=True)

    print(f"✓ Full price history: {len(prices_full)} days")
    print(f"  Date range: {prices_full['date'].min()} to {prices_full['date'].max()}")

    # Filter to synthetic date range
    print(f"\nFiltering to {synthetic_start_date}+ for synthetic predictions...")
    prices = prices_full[prices_full['date'] >= synthetic_start_date].copy().reset_index(drop=True)
    print(f"✓ Filtered to {len(prices)} days")

    # Fail early with a clear message instead of deep inside the generator.
    if len(prices) <= forecast_horizon:
        raise ValueError(
            f"{commodity_name}: only {len(prices)} price rows on/after {synthetic_start_date}; "
            f"need more than {forecast_horizon} to generate forecasts"
        )

    # Generate predictions for all accuracy levels
    print(f"\nGenerating calibrated predictions for {len(accuracy_levels)} accuracy levels...")

    all_predictions = []
    validation_metrics = []

    for accuracy in accuracy_levels:
        # round() rather than truncation: e.g. 0.29 * 100 == 28.999999999999996.
        model_version = f"synthetic_acc{int(round(accuracy * 100))}"

        print(f"\n  {model_version}: {accuracy:.0%} accurate (MAPE = {(1-accuracy):.0%})")

        start_time = time.time()

        predictions_df = generate_calibrated_predictions(
            prices,
            model_version=model_version,
            target_accuracy=accuracy,
            n_runs=n_runs,
            n_horizons=forecast_horizon,
            chunk_size=20
        )

        elapsed = time.time() - start_time
        print(f"    ✓ Generated {len(predictions_df):,} rows in {elapsed:.1f}s")

        # Validate accuracy and collect metrics
        metrics = validate_predictions(
            predictions_df, 
            prices, 
            commodity_name,
            model_version,
            accuracy, 
            forecast_horizon
        )

        if metrics:
            validation_metrics.append(metrics)

        all_predictions.append(predictions_df)

        del predictions_df
        gc.collect()

    # Combine all accuracy levels
    print(f"\nCombining all accuracy levels...")
    combined_predictions = pd.concat(all_predictions, ignore_index=True)
    expected_count = len(combined_predictions)
    print(f"✓ Combined: {expected_count:,} total rows")

    del all_predictions
    gc.collect()

    # Save to Delta table
    predictions_table = f"{output_schema}.predictions_{commodity_name.lower()}"

    print(f"\nSaving to Delta table: {predictions_table}")
    predictions_spark = spark.createDataFrame(combined_predictions)
    predictions_spark.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(predictions_table)

    # Read the table back and compare with what was written.
    saved_count = spark.table(predictions_table).count()
    if saved_count == expected_count:
        print(f"✓ Saved and verified: {saved_count:,} rows")
    else:
        print(f"⚠️  Row count mismatch: wrote {expected_count:,} rows but table has {saved_count:,}")

    del combined_predictions
    gc.collect()

    print(f"\n✓ {commodity_name.upper()} COMPLETE")

    return {
        'commodity': commodity_name,
        'n_dates': len(prices),
        'n_accuracy_levels': len(accuracy_levels),
        'table': predictions_table,
        'validation_metrics': validation_metrics
    }

In [None]:
# Process all commodities
# A failure in one commodity is reported and skipped so the others still run.
import traceback

all_results = []
all_validation_metrics = []
failed_commodities = []  # names of commodities whose processing raised

for commodity_name in COMMODITY_CONFIGS.keys():
    try:
        result = process_single_commodity(
            commodity_name,
            market_df,
            ANALYSIS_CONFIG,
            OUTPUT_SCHEMA,
            ACCURACY_LEVELS,
            SYNTHETIC_START_DATE
        )
    except Exception as e:
        print(f"\n❌ Error processing {commodity_name.upper()}: {e}")
        traceback.print_exc()
        print(f"   Skipping...")
        failed_commodities.append(commodity_name)
    else:
        # Keep only the summary fields; metrics are collected separately.
        all_results.append({
            'commodity': result['commodity'],
            'n_dates': result['n_dates'],
            'n_accuracy_levels': result['n_accuracy_levels'],
            'table': result['table']
        })

        # Collect validation metrics
        if result.get('validation_metrics'):
            all_validation_metrics.extend(result['validation_metrics'])

# Repeat the failures at the end so they are not lost in the long log above.
if failed_commodities:
    failed_names = ', '.join(str(name) for name in failed_commodities)
    print(f"\n⚠️  {len(failed_commodities)} commodities failed and were skipped: {failed_names}")

In [None]:
# Save all validation metrics to Delta table
# (the records collected by the processing loop; the helper appends to the
# table, and only prints a warning without writing when the list is empty)
save_validation_metrics(all_validation_metrics, VALIDATION_METRICS_TABLE)

## Summary

In [None]:
# Final report: processed commodities, the tables written and their model versions.
print("\n" + "="*80)
print("CALIBRATED PREDICTION GENERATION COMPLETE")
print("="*80)

if len(all_results) > 0:
    summary_df = pd.DataFrame(all_results)
    print(f"\nSuccessfully processed {len(all_results)} commodities:")
    print(summary_df.to_string(index=False))

    print(f"\nPrediction tables created:")
    for table in sorted(summary_df['table'].unique()):
        print(f"  - {table}")
        model_versions = spark.table(table).select("model_version").distinct().collect()

        # distinct().collect() has no guaranteed order, so parse the accuracy
        # out of each name and list the versions from most to least accurate.
        # Names without a numeric 'synthetic_acc<NN>' suffix are listed last
        # instead of raising.
        parsed_versions = []
        for mv in model_versions:
            suffix = str(mv.model_version).replace('synthetic_acc', '')
            acc = int(suffix) if suffix.isdigit() else None
            parsed_versions.append((acc, mv.model_version))
        parsed_versions.sort(key=lambda item: (item[0] is None, -(item[0] or 0), str(item[1])))

        for acc, version_name in parsed_versions:
            if acc is None:
                print(f"      • {version_name}")
            else:
                mape = 100 - acc
                print(f"      • {version_name}: {acc}% accurate (MAPE = {mape}%)")

    # Only claim the metrics were saved when there was something to save.
    if all_validation_metrics:
        print(f"\n✓ Validation metrics saved to: {VALIDATION_METRICS_TABLE}")
    else:
        print(f"\n⚠️  No validation metrics were produced for: {VALIDATION_METRICS_TABLE}")
    print(f"  Total validation records: {len(all_validation_metrics)}")

    print(f"\n✓ Key features:")
    print(f"  1. Median predictions have target MAPE (e.g., 90% accurate = 10% MAPE)")
    print(f"  2. Prediction intervals properly calibrated (80% interval ≈ 80% coverage)")
    print(f"  3. Includes 100% accurate scenario for algorithm testing")
    print(f"  4. Uses log-normal errors for realistic multiplicative noise")
    print(f"  5. Validation metrics aligned with forecast_agent:")
    print(f"     - MAE (Mean Absolute Error)")
    print(f"     - MAPE (Mean Absolute Percentage Error)")
    print(f"     - Directional Accuracy (day-to-day and from Day 0)")
    print(f"     - CRPS (Continuous Ranked Probability Score)")
    print(f"     - Calibration (80% interval coverage)")
    print(f"  6. Validation metrics saved to Delta table for later review")
else:
    print("\n⚠️  No commodities were successfully processed")

print("\n✓ Calibrated prediction generation complete")