# Mitsui Commodity Prediction Challenge - Competition Notebook

This notebook demonstrates the complete workflow for the **Mitsui Commodity Prediction Challenge**, using data laid out in the actual competition format.

## Competition Overview
- **Multiple financial time series** from global markets (LME, JPX, US, FX)
- **424 target variables** (target_0 to target_423) 
- **Log returns and price differences** between instrument pairs
- **Sharpe ratio variant metric** for evaluation
- **Two phases**: Training (3 months historical) + Forecasting (90 days live)

## Data Structure
- `train.csv`: Historical financial data with date_id + time series identifiers
- `train_labels.csv`: Target variables (target_0 to target_423)
- `target_pairs.csv`: Details of target calculations (pairs, lags)
- `test.csv`: Test set with is_scored column

Let's explore and model this data!


In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import our competition-specific modules
from src.data_processing.competition_data_loader import CompetitionDataLoader, explore_competition_data
from src.feature_engineering.features import FeatureEngineer
from src.models.ensemble_models import create_default_ensemble
from src.evaluation.metrics import CompetitionMetrics, evaluate_model_performance
from src.utils.submission import ModelPersistence

# Set random seed for reproducibility.
# This seeds NumPy's global generator, which the np.random.* calls in
# create_competition_sample_data() draw from, so re-running the notebook
# top-to-bottom regenerates the same simulated series.
np.random.seed(42)

# Configure plotting: start from matplotlib's default style sheet, then
# switch seaborn's colour cycle to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Banner printed once the imports and global configuration have succeeded.
print("📊 Mitsui Commodity Prediction Challenge Environment Ready!")


## 1. Data Loading and Exploration

**Note**: For this example, we'll create simulated data that matches the competition format. In the actual competition, you would have the real files:
- `data/train.csv`
- `data/train_labels.csv` 
- `data/target_pairs.csv`
- `data/test.csv`


In [None]:
# Create sample competition data structure
def _simulate_positive_series(start, step, n_points):
    """Return a strictly positive random walk with ``n_points`` values.

    The walk is multiplicative: every point scales the previous level by
    ``exp(eps * step / start)`` with ``eps`` drawn from a standard normal.
    Near the starting level a typical move is therefore about ``step``, but
    the series can never reach zero or change sign, which keeps percentage
    changes and log returns computed from it well defined.

    Args:
        start: Initial level of the series; must be positive.
        step: Typical absolute move per point when the series is near ``start``.
        n_points: Number of values to generate.

    Returns:
        A 1-D NumPy array of length ``n_points``.
    """
    relative_step = step / start
    return start * np.exp(np.cumsum(np.random.randn(n_points) * relative_step))


def create_competition_sample_data():
    """Create sample data matching the competition format.

    All randomness comes from NumPy's global generator, so seed it with
    ``np.random.seed`` beforehand for reproducible output.

    Returns:
        A tuple ``(train_data, train_labels, target_pairs_df, test_data)``:

        * ``train_data``: ``date_id`` plus 18 simulated LME/JPX/US/FX series,
          one row per calendar day from 2022-01-01 to 2024-12-31.
        * ``train_labels``: ``date_id`` plus ``target_0`` .. ``target_19``
          (a 20-target demo subset). Each label is the value for the *next*
          day, so the final row is NaN.
        * ``target_pairs_df``: one row per target with columns ``target``,
          ``lag`` and ``pair`` (a column name, or ``"colA-colB"``).
        * ``test_data``: 90 days starting 2025-01-01 with the same series
          columns plus a boolean ``is_scored`` (True for the first 60 rows).
    """
    # Date range (three calendar years of daily data)
    dates = pd.date_range(start='2022-01-01', end='2024-12-31', freq='D')
    n_days = len(dates)

    print(f"Creating sample data for {n_days} days...")

    # 1. Training data (train.csv format)
    train_data = pd.DataFrame({'date_id': dates})

    # LME time series (metals): close price with ~2% daily moves, plus volume
    lme_base_prices = {
        'COPPER': 8000,
        'ALUMINUM': 2000,
        'ZINC': 3000,
        'LEAD': 2200,
        'NICKEL': 18000,
    }
    for metal, base_price in lme_base_prices.items():
        train_data[f'LME_{metal}_CLOSE'] = _simulate_positive_series(
            base_price, base_price * 0.02, n_days
        )
        train_data[f'LME_{metal}_VOLUME'] = np.random.exponential(1000, n_days)

    # JPX indices, US indices and FX rates: (column, starting level, daily step)
    index_and_fx_series = [
        ('JPX_NIKKEI_CLOSE', 27000, 200),
        ('JPX_TOPIX_CLOSE', 1900, 15),
        ('US_SPX_CLOSE', 4200, 50),
        ('US_NDX_CLOSE', 12500, 150),
        ('US_DJI_CLOSE', 34000, 300),
        ('FX_USDJPY_CLOSE', 130, 0.5),
        ('FX_EURUSD_CLOSE', 1.1, 0.01),
        ('FX_GBPUSD_CLOSE', 1.3, 0.01),
    ]
    for column, start_level, daily_step in index_and_fx_series:
        train_data[column] = _simulate_positive_series(start_level, daily_step, n_days)

    feature_cols = list(train_data.columns[1:])  # everything except date_id

    # 2. Decide once which instrument(s) each target is built from, so the
    #    labels and the target_pairs metadata cannot drift apart.
    #    The real competition has 424 targets; the demo builds 20.
    n_single_targets = 10
    n_pair_targets = 10
    target_specs = [
        (f'target_{i}', (feature_cols[i],)) for i in range(n_single_targets)
    ]
    for k in range(n_pair_targets):
        first = feature_cols[k % 5]
        # Shift the partner by one column for the second group of five so
        # every pair target uses a distinct pair of instruments.
        second = feature_cols[k % 5 + 5 + k // 5]
        target_specs.append((f'target_{n_single_targets + k}', (first, second)))

    # 3. Training labels (train_labels.csv format)
    train_labels = pd.DataFrame({'date_id': dates})
    for target_name, instruments in target_specs:
        if len(instruments) == 1:
            # Next-day log return of a single instrument
            returns = train_data[instruments[0]].pct_change()
            train_labels[target_name] = np.log(1 + returns).shift(-1)
        else:
            # Next-day relative change of the spread between two instruments
            spread = train_data[instruments[0]] - train_data[instruments[1]]
            train_labels[target_name] = spread.pct_change().shift(-1)

    # 4. Target pairs metadata (target_pairs.csv format)
    target_pairs_df = pd.DataFrame([
        {'target': target_name, 'lag': 1, 'pair': '-'.join(instruments)}
        for target_name, instruments in target_specs
    ])

    # 5. Test data (test.csv format)
    n_test_days = 90
    n_scored_days = 60
    test_dates = pd.date_range(start='2025-01-01', periods=n_test_days, freq='D')
    test_data = pd.DataFrame({'date_id': test_dates})

    # Continue every training series from its last observed value
    for col in feature_cols:
        last_value = train_data[col].iloc[-1]
        test_data[col] = _simulate_positive_series(
            last_value, last_value * 0.01, n_test_days
        )

    # is_scored: first 60 days scored, last 30 not scored. A positional mask
    # avoids depending on the frame's index labels.
    test_data['is_scored'] = np.arange(n_test_days) < n_scored_days

    return train_data, train_labels, target_pairs_df, test_data

# Build the four simulated competition tables
train_data, train_labels, target_pairs_df, test_data = create_competition_sample_data()

# Summarise the shape of every generated table in one report
print("✅ Sample data created:")
print("\n".join(
    f"  {table_label}: {table.shape}"
    for table_label, table in (
        ("Train data", train_data),
        ("Train labels", train_labels),
        ("Target pairs", target_pairs_df),
        ("Test data", test_data),
    )
))

# Preview the first rows of the training features
print("\nSample training data:")
print(train_data.head())
