# Commodity Price Forecasting - Example Notebook

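This notebook demonstrates the complete workflow for the commodity forecasting competition, using synthetic sample data in place of the real competition files:
1. Data loading and preprocessing
2. Feature engineering
3. Model training and evaluation
4. Visualization and analysis
5. Model persistence and submission preparation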


In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import our modules
from src.data_processing.data_loader import DataLoader, DEFAULT_DATA_FILES
from src.feature_engineering.features import FeatureEngineer
from src.models.ensemble_models import create_default_ensemble, create_custom_ensemble
from src.evaluation.metrics import CompetitionMetrics, ModelValidator, evaluate_model_performance
from src.utils.submission import CompetitionPipeline, ModelPersistence

# Seed NumPy's global random generator so that the np.random-based sample
# data generated later in the notebook is reproducible across runs
np.random.seed(42)

# Configure plotting: reset matplotlib to its default style and use
# seaborn's "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")


## 1. Data Loading and Exploration


In [None]:
# Initialize data loader, pointed at the project's data directory
# (path is relative to this notebook's location)
data_loader = DataLoader(data_path="../data/")

# For demonstration, the cells below generate synthetic sample data instead
# of reading files. In the actual competition, you would load the real data
# files here.

def create_sample_data(start='2020-01-01', end='2024-12-31', freq='D', rng=None):
    """Create synthetic multi-market price data for demonstration purposes.

    Every series is an additive Gaussian random walk: a starting level plus
    the cumulative sum of normally distributed steps. The walks are not
    floored, so a series can in principle drift to or below zero.

    Parameters
    ----------
    start, end : str or datetime-like
        First and last date of the generated index (inclusive), passed to
        ``pd.date_range``. Defaults reproduce the original 2020-2024 range.
    freq : str
        Frequency of the date index, passed to ``pd.date_range``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so a preceding ``np.random.seed(...)`` call controls the output
        exactly as before.

    Returns
    -------
    dict of str -> pandas.DataFrame
        One frame per market key ('lme', 'jpx', 'us_stock', 'forex'), each
        indexed by a 'date' index and holding that market's price columns.
    """
    dates = pd.date_range(start=start, end=end, freq=freq)
    n_days = len(dates)

    # Fall back to the global NumPy state so seeding via np.random.seed keeps
    # producing the same numbers as the earlier hard-coded implementation.
    draw = np.random.standard_normal if rng is None else rng.standard_normal

    # market -> {column: (starting level, standard deviation of one step)}.
    # The order of markets and columns fixes the order of random draws, which
    # is what keeps the output reproducible for a given seed.
    market_specs = {
        # LME data (commodity prices)
        'lme': {
            'copper_price': (8000, 50),
            'aluminum_price': (2000, 20),
            'zinc_price': (3000, 30),
        },
        # JPX data (Japanese stocks)
        'jpx': {
            'nikkei_close': (25000, 200),
            'topix_close': (1800, 15),
        },
        # US Stock data
        'us_stock': {
            'sp500_close': (4000, 50),
            'nasdaq_close': (12000, 150),
        },
        # Forex data
        'forex': {
            'usd_jpy_rate': (110, 0.5),
            'eur_usd_rate': (1.2, 0.01),
        },
    }

    def build_market_frame(column_specs):
        """Build one date-indexed frame of random-walk columns."""
        columns = {'date': dates}
        for column, (level, step_scale) in column_specs.items():
            columns[column] = level + np.cumsum(draw(n_days) * step_scale)
        return pd.DataFrame(columns).set_index('date')

    return {
        market: build_market_frame(column_specs)
        for market, column_specs in market_specs.items()
    }

# Generate the synthetic per-market frames
sample_data = create_sample_data()

# Preview each market: its shape followed by the first few rows
for market, df in sample_data.items():
    print(f"\n{market.upper()} Data Shape: {df.shape}", df.head(), sep="\n")


In [None]:
# Align all data by common dates: the dict of per-market frames goes in and a
# single object with a date index comes back (its .shape and .index are
# reported below)
aligned_data = data_loader.align_data_by_date(sample_data)
print(f"Aligned data shape: {aligned_data.shape}")
print(f"Date range: {aligned_data.index.min()} to {aligned_data.index.max()}")

# Clean data, requesting the loader's 'forward' fill method for gaps
clean_data = data_loader.clean_data(aligned_data, fill_method='forward')
print(f"Clean data shape: {clean_data.shape}")

# Display first few rows (last expression of the cell, so it renders richly)
clean_data.head()


## 2. Feature Engineering


In [None]:
# Initialize feature engineer
feature_engineer = FeatureEngineer()

# Define asset pairs for price difference features.
# Column names follow the '<market key>_<original column>' pattern that the
# rest of the notebook also relies on (e.g. clean_data['lme_copper_price']).
asset_pairs = [
    ('lme_copper_price', 'lme_aluminum_price'),
    ('lme_copper_price', 'lme_zinc_price'),
    ('jpx_nikkei_close', 'jpx_topix_close'),
    ('us_stock_sp500_close', 'us_stock_nasdaq_close'),
]

# Define market prefixes (these match the keys of the sample_data dict)
market_prefixes = ['lme', 'jpx', 'us_stock', 'forex']

# Build feature pipeline from the cleaned data; both configuration lists are
# stored again later in the saved model metadata
features = feature_engineer.build_feature_pipeline(
    clean_data,
    asset_pairs=asset_pairs,
    market_prefixes=market_prefixes
)

print(f"Features shape: {features.shape}")
print(f"Number of features: {len(features.columns)}")


In [None]:
# Target: the copper return realised on the following day.
# pct_change() yields the return from row t-1 to row t; shift(-1) then moves
# each value up one row, so row t holds the return from t to t+1.
copper_returns = clean_data['lme_copper_price'].pct_change()
target = copper_returns.shift(-1)

# Keep only the rows where every feature and the target are present
incomplete_features = features.isnull().any(axis=1)
missing_target = target.isnull()
mask = ~(incomplete_features | missing_target)
X, y = features[mask], target[mask]

print(f"Training data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("Target statistics:")
print(y.describe())


In [None]:
# Feature selection
# Rank features using only the rows that precede the hold-out period.
# Selecting on the full history would let test-period target information
# influence which features are kept (look-ahead leakage) and inflate the
# reported test metrics.
# NOTE: keep this cutoff equal to the split_date used for the train/test
# split in section 3.
feature_selection_cutoff = '2024-01-01'
selection_rows = X.index < feature_selection_cutoff

selected_features = feature_engineer.select_features(
    X[selection_rows], y[selection_rows],
    method='correlation',
    max_features=50
)

# The chosen columns are then applied to the full feature matrix
X_selected = X[selected_features]
print(f"Selected features shape: {X_selected.shape}")
print(f"Selected features: {selected_features[:10]}...")  # Show first 10


## 3. Model Training and Evaluation


In [None]:
# Chronological hold-out: rows dated before split_date train the model,
# rows dated on or after split_date are reserved for testing
split_date = '2024-01-01'
train_mask = X_selected.index < split_date
test_mask = X_selected.index >= split_date

X_train = X_selected[train_mask]
X_test = X_selected[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


In [None]:
# Create and train ensemble model.
# create_default_ensemble() is called with no arguments; the ensemble's
# composition is defined in src.models.ensemble_models (not visible here).
# Only the training rows (dated before split_date) are used for fitting.
ensemble_model = create_default_ensemble()
ensemble_model.fit(X_train, y_train)

print("Ensemble model training completed!")


In [None]:
# Predict on both partitions with the fitted ensemble
y_pred_train = ensemble_model.predict(X_train)
y_pred_test = ensemble_model.predict(X_test)

# Score each partition; predictions are passed first, actual values second
train_metrics = evaluate_model_performance(y_pred_train, y_train.values)
test_metrics = evaluate_model_performance(y_pred_test, y_test.values)

# Report both metric dicts in the same four-decimal layout
for heading, metrics in (
    ("Training Performance:", train_metrics),
    ("\nTest Performance:", test_metrics),
):
    print(heading)
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")


## 4. Visualization and Analysis


In [None]:
# Predicted-vs-actual scatter plots, one panel per data partition
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# (axis, actual returns, predicted returns, title, extra scatter styling)
panels = [
    (axes[0], y_train, y_pred_train,
     'Training Set: Predicted vs Actual Returns', {}),
    (axes[1], y_test, y_pred_test,
     'Test Set: Predicted vs Actual Returns', {'color': 'orange'}),
]

for ax, actual, predicted, title, scatter_style in panels:
    ax.scatter(actual.values, predicted, alpha=0.5, **scatter_style)
    # Dashed red diagonal marks where prediction equals the actual value
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--')
    ax.set_xlabel('Actual Returns')
    ax.set_ylabel('Predicted Returns')
    ax.set_title(title)
    ax.grid(True)

plt.tight_layout()
plt.show()


## 5. Model Persistence and Submission


In [None]:
# Save the trained model together with the information needed to reproduce
# its inputs: the selected feature names, the feature-engineering
# configuration, the hold-out metrics and the date ranges of both partitions
model_metadata = {
    'selected_features': selected_features,
    'feature_engineer_config': {
        'asset_pairs': asset_pairs,
        'market_prefixes': market_prefixes
    },
    'performance_metrics': test_metrics,
    'training_date_range': (X_train.index.min(), X_train.index.max()),
    'test_date_range': (X_test.index.min(), X_test.index.max())
}

model_path = '../submissions/trained_ensemble_model.pkl'

# Make sure the output directory exists before writing (e.g. on a fresh
# checkout). Harmless if ModelPersistence.save_model also creates it; that
# cannot be verified from this notebook.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

ModelPersistence.save_model(ensemble_model, model_path, model_metadata)

print(f"Model saved to {model_path}")


In [None]:
# Create competition pipeline and sample submission
pipeline = CompetitionPipeline(ensemble_model, feature_engineer, data_loader)

# One row per test observation: a running integer id and the model's
# predicted next-day return
sample_submission = pd.DataFrame({
    'id': range(len(y_test)),
    'prediction': y_pred_test
})

submission_path = '../submissions/sample_submission.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(submission_path, index=False)

print(f"Sample submission saved to {submission_path}")
print(sample_submission.head())

# Validate submission format using the pipeline's own checker
is_valid = pipeline.validate_submission(submission_path)
print(f"Submission validation: {'PASSED' if is_valid else 'FAILED'}")


## 6. Summary and Next Steps

This notebook demonstrated the complete workflow for the commodity forecasting competition:

### What we accomplished:
1. ✅ Data loading and preprocessing from multiple markets (demonstrated here on synthetic sample data)
2. ✅ Comprehensive feature engineering with price-difference series
3. ✅ Ensemble model training with multiple algorithms
4. ✅ Evaluation using competition metrics (Sharpe ratio variant)
5. ✅ Model persistence and submission preparation

### For the actual competition:
1. Replace sample data with real competition datasets
2. Tune hyperparameters using the validation framework
3. Experiment with additional feature engineering techniques
4. Test different ensemble configurations
5. Implement the live submission pipeline

### Key Performance Metrics:
- **Competition Metric (Sharpe Variant)**: Primary evaluation criterion
- **Spearman Correlation**: Ranking accuracy
- **Directional Accuracy**: Trend prediction capability

The framework is ready for competition deployment! 🚀
