# Tactical Trader Agent Training

This notebook trains the Short-term Tactician agent for immediate execution and tactical trading decisions.

## Agent Overview:
The Short-term Tactician specializes in:
- Immediate price action analysis
- Entry/exit timing optimization
- Execution quality assessment
- Technical indicator-based trading (RSI, MACD, Bollinger Bands, etc.)

## Training Strategy:
- Supervised pre-training on profitable trade labels
- Reinforcement learning fine-tuning for risk-adjusted returns
- Position sizing and risk management integration

## Key Features:
- 5-minute timeframe analysis (60×7 matrix: 60 bars × 7 features per bar)
- Integration with synergy detection patterns
- Focus on technical indicators and timing

## 1. Environment Setup

In [None]:
# Environment setup and imports
import torch
import os
import sys
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
from tqdm import tqdm
import structlog
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Structured logging setup. Processors run in list order and the chain ends
# with the console renderer; loggers come from the stdlib logger factory.
# NOTE(review): filter_by_level presumably defers to the stdlib logger level,
# and no stdlib logging configuration is visible in this cell — confirm that
# info-level events are actually emitted.
_structlog_processors = [
    structlog.stdlib.filter_by_level,
    structlog.stdlib.add_logger_name,
    structlog.stdlib.add_log_level,
    structlog.stdlib.PositionalArgumentsFormatter(),
    structlog.processors.TimeStamper(fmt="iso"),
    structlog.processors.StackInfoRenderer(),
    structlog.processors.format_exc_info,
    structlog.dev.ConsoleRenderer(),
]

structlog.configure(
    processors=_structlog_processors,
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    cache_logger_on_first_use=True,
)

# Notebook-wide logger used by the cells below
logger = structlog.get_logger()

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
_cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_available else 'cpu')

if device.type != 'cuda':
    print("⚠️ No GPU available, using CPU")
else:
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    _total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"   Memory: {_total_bytes / 1e9:.2f} GB")
    # Release cached allocations and let cuDNN benchmark its kernels
    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = True

# Project root. The root and its src/ folder are placed at the front of
# sys.path (src/ first) so the project's own packages can be imported.
BASE_PATH = Path("/home/QuantNova/AlgoSpace")
sys.path[:0] = [str(BASE_PATH / "src"), str(BASE_PATH)]

# Output locations for trained models, evaluation artefacts and checkpoints
MODELS_PATH = BASE_PATH / "models" / "agents"
RESULTS_PATH = BASE_PATH / "results" / "tactical_agent"
CHECKPOINT_PATH = BASE_PATH / "checkpoints" / "tactical_agent"

# Create each output directory (and any missing parents) if it is absent
for _output_dir in (MODELS_PATH, RESULTS_PATH, CHECKPOINT_PATH):
    _output_dir.mkdir(parents=True, exist_ok=True)

print(f"✅ Base path: {BASE_PATH}")
print(f"✅ Models path: {MODELS_PATH}")
print(f"✅ Results path: {RESULTS_PATH}")
print(f"✅ Checkpoint path: {CHECKPOINT_PATH}")

In [None]:
# Import dependencies
#
# Installable third-party packages are probed one by one and pip-installed on
# demand BEFORE any names are bound. Previously a single failing import (for
# example plotly) aborted the whole try block, and the fallback only retried
# scipy/sklearn/plotly — talib, the matplotlib helpers and the local project
# modules were then never imported, surfacing later as a NameError.
import importlib
import subprocess

# Core ML libraries (torch itself is imported in the setup cell, so a pip
# install cannot repair these; import them unconditionally)
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset

# (import name, pip distribution name)
_INSTALLABLE_PACKAGES = (
    ("scipy", "scipy"),
    ("sklearn", "scikit-learn"),
    ("plotly", "plotly"),
    ("talib", "TA-Lib"),
)
for _module_name, _pip_name in _INSTALLABLE_PACKAGES:
    try:
        importlib.import_module(_module_name)
    except ImportError as e:
        print(f"❌ Import error: {e}")
        print(f"Installing {_pip_name}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", _pip_name])
        # Make the freshly installed package visible to the import system
        importlib.invalidate_caches()

# Scientific computing
from scipy import stats
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Technical indicators
import talib

# Visualization
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Local imports. These cannot be pip-installed, so a failure is reported and
# re-raised here instead of being deferred to the first use of a missing name.
try:
    from agents.marl.agents.short_term_tactician import ShortTermTactician
    from agents.synergy.detector import SynergyDetector
    from training.data_prep import MarketDataPipeline
    from training.rewards.reward_functions import TacticalReward
    from training.environments.trading_env import TradingEnvironment
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Local project modules could not be imported; check BASE_PATH and sys.path")
    raise

print("✅ All dependencies loaded successfully")

## 2. Load and Prepare Training Data

In [None]:
# Data loading configuration
DATA_PATH = BASE_PATH / "data" / "historical"

# Load 5-minute data
print("📂 Loading 5-minute market data...")

data_file = DATA_PATH / "ES - 5 min.csv"
if data_file.exists():
    df_5min = pd.read_csv(data_file)
    print(f"✅ Loaded {len(df_5min)} 5-minute bars")
    # The CSV is expected to carry a 'Date' column (a KeyError is raised otherwise)
    print(f"   Date range: {df_5min.iloc[0]['Date']} to {df_5min.iloc[-1]['Date']}")
else:
    print(f"❌ Data file not found: {data_file}")
    print("Generating synthetic data for demonstration...")

    # Generate synthetic 5-minute data
    dates = pd.date_range(start='2023-01-01', end='2024-01-01', freq='5min')
    # Keep bars stamped 09:30 through 16:00 inclusive. Only the time of day is
    # filtered; weekends and holidays are not removed.
    minute_of_day = dates.hour * 60 + dates.minute
    dates = dates[(minute_of_day >= 9 * 60 + 30) & (minute_of_day <= 16 * 60)]

    n_bars = len(dates)
    base_price = 4000

    # Random-walk closes: cumulative sum of small normally distributed log returns
    returns = np.random.normal(0, 0.0005, n_bars)
    prices = base_price * np.exp(np.cumsum(returns))

    # Open is jittered around the close. High/Low are then built from the
    # bar's body (max/min of open and close) so that every bar satisfies
    # Low <= Open, Close <= High; previously High/Low were derived from the
    # close alone and the open could fall outside the bar's range.
    open_prices = prices * (1 + np.random.uniform(-0.001, 0.001, n_bars))
    body_top = np.maximum(open_prices, prices)
    body_bottom = np.minimum(open_prices, prices)
    high_prices = body_top * (1 + np.random.uniform(0, 0.002, n_bars))
    low_prices = body_bottom * (1 - np.random.uniform(0, 0.002, n_bars))

    df_5min = pd.DataFrame({
        'Date': dates,
        'Open': open_prices,
        'High': high_prices,
        'Low': low_prices,
        'Close': prices,
        'Volume': np.random.lognormal(10, 0.5, n_bars)
    })

    print(f"✅ Generated {len(df_5min)} synthetic 5-minute bars")

# Display data info
print("\n📊 Data Overview:")
print(df_5min.head())
print(f"\nShape: {df_5min.shape}")
print(f"\nData types:\n{df_5min.dtypes}")

In [None]:
# Calculate technical indicators
def calculate_technical_indicators(df):
    """Calculate comprehensive technical indicators for tactical trading.

    Args:
        df: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume'
            columns. It is not modified; the work happens on a copy.

    Returns:
        A new DataFrame with the indicator columns added and every row that
        contains a NaN or infinite value removed (this drops the indicator
        warm-up rows at the start of the series).
    """

    print("📈 Calculating technical indicators...")

    # Work on a copy so the caller's frame keeps its original columns
    df = df.copy()
    n_input_columns = len(df.columns)

    # Price data. TA-Lib only accepts float64 arrays, and volume read from a
    # CSV is typically integer, so cast every input explicitly.
    open_ = df['Open'].to_numpy(dtype=np.float64)
    close = df['Close'].to_numpy(dtype=np.float64)
    high = df['High'].to_numpy(dtype=np.float64)
    low = df['Low'].to_numpy(dtype=np.float64)
    volume = df['Volume'].to_numpy(dtype=np.float64)

    # Trend Indicators
    df['EMA_9'] = talib.EMA(close, timeperiod=9)
    df['EMA_21'] = talib.EMA(close, timeperiod=21)
    df['EMA_50'] = talib.EMA(close, timeperiod=50)
    df['SMA_20'] = talib.SMA(close, timeperiod=20)

    # MACD
    df['MACD'], df['MACD_signal'], df['MACD_hist'] = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)

    # RSI
    df['RSI'] = talib.RSI(close, timeperiod=14)

    # Bollinger Bands
    df['BB_upper'], df['BB_middle'], df['BB_lower'] = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2)
    df['BB_width'] = df['BB_upper'] - df['BB_lower']
    # A zero band width yields inf/NaN here; such rows are removed below
    df['BB_percent'] = (close - df['BB_lower']) / df['BB_width']

    # Stochastic
    df['STOCH_K'], df['STOCH_D'] = talib.STOCH(high, low, close, fastk_period=14, slowk_period=3, slowd_period=3)

    # ATR (Average True Range)
    df['ATR'] = talib.ATR(high, low, close, timeperiod=14)

    # ADX (Average Directional Index)
    df['ADX'] = talib.ADX(high, low, close, timeperiod=14)

    # Volume indicators
    df['OBV'] = talib.OBV(close, volume)
    df['AD'] = talib.AD(high, low, close, volume)

    # Momentum indicators
    df['MOM'] = talib.MOM(close, timeperiod=10)
    df['ROC'] = talib.ROC(close, timeperiod=10)

    # Pattern recognition helpers (all four inputs as plain float arrays)
    df['CDLHAMMER'] = talib.CDLHAMMER(open_, high, low, close)
    df['CDLDOJI'] = talib.CDLDOJI(open_, high, low, close)
    df['CDLENGULFING'] = talib.CDLENGULFING(open_, high, low, close)

    # Support/Resistance levels (simplified)
    df['PIVOT'] = (high + low + close) / 3
    df['R1'] = 2 * df['PIVOT'] - low
    df['S1'] = 2 * df['PIVOT'] - high

    # Price position relative to moving averages
    df['Price_to_EMA9'] = (close - df['EMA_9']) / df['EMA_9']
    df['Price_to_EMA21'] = (close - df['EMA_21']) / df['EMA_21']
    df['Price_to_SMA20'] = (close - df['SMA_20']) / df['SMA_20']

    # Volatility measures
    df['Returns'] = df['Close'].pct_change()
    # Annualized; 252 * 78 presumably stands for 78 five-minute bars per
    # session over 252 sessions — TODO confirm against the data's session length
    df['Volatility'] = df['Returns'].rolling(window=20).std() * np.sqrt(252 * 78)

    # Volume analysis
    df['Volume_SMA'] = df['Volume'].rolling(window=20).mean()
    df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA']

    # Drop NaN values. Infinite ratios (division by a zero band width or a
    # zero volume average) are not caught by dropna on their own and would
    # later break scaler fitting, so convert them to NaN first.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Count what was added instead of assuming a fixed number of input columns
    print(f"✅ Calculated {len(df.columns) - n_input_columns} technical indicators")
    print(f"   Final dataset: {len(df)} bars")

    return df

# Calculate indicators: rebind df_5min to the returned frame, which carries the
# indicator columns and has the rows containing NaN values dropped
df_5min = calculate_technical_indicators(df_5min)

## 3. Create Training Labels

In [None]:
def create_trading_labels(df, profit_threshold=0.002, stop_loss=0.001, holding_periods=(12, 24, 36)):
    """
    Create trading labels based on future price movements.

    Labels:
    - 0: No trade (choppy/uncertain)
    - 1: Long opportunity
    - 2: Short opportunity

    For every bar that has at least ``max(holding_periods)`` bars after it,
    the closes of the next ``period`` bars are scanned for each holding
    period. A direction counts as profitable when the best favourable move
    exceeds ``profit_threshold`` while the worst adverse move stays below
    ``2 * stop_loss``. The label is then confirmed with the bar's RSI and
    MACD histogram. Bars without enough future data are labelled 0.

    Args:
        df: DataFrame with 'Close', 'RSI', 'MACD_hist' and 'BB_percent'
            columns. A 'Label' column is added to it in place.
        profit_threshold: Minimum fractional favourable move.
        stop_loss: Fractional stop distance; the adverse move must stay
            below twice this value.
        holding_periods: Look-ahead horizons, in bars.

    Returns:
        Tuple ``(df, label_metadata)``. ``label_metadata`` holds one dict per
        bar that had a full look-ahead horizon (so it is shorter than ``df``
        by ``max(holding_periods)`` rows, and empty for short frames).
    """
    print("🏷️ Creating trading labels...")

    n_rows = len(df)
    # Frames shorter than the longest horizon get no computed labels at all;
    # clamping at zero keeps the label list exactly as long as the frame.
    n_labelled = max(n_rows - max(holding_periods), 0)

    # Pull the columns out once: per-row df.iloc access dominates the runtime
    close = df['Close'].to_numpy(dtype=float)
    rsi_values = df['RSI'].to_numpy()
    macd_hist_values = df['MACD_hist'].to_numpy()
    bb_percent_values = df['BB_percent'].to_numpy()

    labels = []
    label_metadata = []

    for i in range(n_labelled):
        current_price = close[i]

        # Check multiple holding periods
        long_profitable = False
        short_profitable = False
        best_return = 0
        best_period = 0

        for period in holding_periods:
            future_prices = close[i + 1:i + period + 1]

            if len(future_prices) == 0:
                continue

            max_price = np.max(future_prices)
            min_price = np.min(future_prices)

            # A long gains on the way up to the maximum and risks the drop to
            # the minimum; a short is the mirror image.
            long_return = (max_price - current_price) / current_price
            long_risk = (current_price - min_price) / current_price

            short_return = long_risk
            short_risk = long_return

            # Check if trade would be profitable with risk management
            if long_return > profit_threshold and long_risk < stop_loss * 2:
                long_profitable = True
                if long_return > best_return:
                    best_return = long_return
                    best_period = period

            if short_return > profit_threshold and short_risk < stop_loss * 2:
                short_profitable = True
                if short_return > best_return:
                    best_return = short_return
                    best_period = period

        # Determine label based on technical confirmation
        rsi = rsi_values[i]
        macd_hist = macd_hist_values[i]
        bb_percent = bb_percent_values[i]

        if long_profitable and rsi < 70 and macd_hist > 0:
            label = 1  # Long
        elif short_profitable and rsi > 30 and macd_hist < 0:
            label = 2  # Short
        else:
            label = 0  # No trade

        labels.append(label)
        label_metadata.append({
            'best_return': best_return,
            'best_period': best_period,
            'rsi': rsi,
            'macd_hist': macd_hist,
            'bb_percent': bb_percent
        })

    # Pad the end: bars without a full look-ahead horizon are "no trade"
    labels.extend([0] * (n_rows - n_labelled))

    df['Label'] = labels

    # Print label distribution (guard the percentage against an empty frame)
    label_counts = pd.Series(labels, dtype='int64').value_counts()
    n_total = max(len(labels), 1)
    print(f"\n📊 Label Distribution:")
    print(f"   No Trade: {label_counts.get(0, 0)} ({label_counts.get(0, 0)/n_total*100:.1f}%)")
    print(f"   Long: {label_counts.get(1, 0)} ({label_counts.get(1, 0)/n_total*100:.1f}%)")
    print(f"   Short: {label_counts.get(2, 0)} ({label_counts.get(2, 0)/n_total*100:.1f}%)")

    return df, label_metadata

# Create labels: adds the 'Label' column to df_5min and keeps the per-bar
# metadata (best return/period and the indicator values used for confirmation)
df_5min, label_metadata = create_trading_labels(df_5min)

## 4. Create Tactical Trading Dataset

In [None]:
class TacticalTradingDataset(Dataset):
    """Custom dataset for tactical trading with technical indicators.

    Each sample pairs the ``window_size`` bars that precede a bar with that
    bar's label: the window covers rows ``[i - window_size, i)`` and the
    label and current price are read from row ``i``.
    """

    def __init__(self, df, window_size=60, transform=None):
        """
        Initialize dataset.

        Args:
            df: DataFrame with OHLCV, technical indicators and a 'Label' column
            window_size: Number of 5-minute bars (60 = 5 hours)
            transform: Optional callable applied to each sample dict
        """
        self.df = df.reset_index(drop=True)
        self.window_size = window_size
        self.transform = transform

        # Select key features for the 60×7 matrix
        self.feature_columns = [
            'Open', 'High', 'Low', 'Close', 'Volume',
            'EMA_21', 'ATR'
        ]

        # Additional technical features for enhanced analysis
        self.tech_feature_columns = [
            'RSI', 'MACD', 'MACD_signal', 'BB_percent',
            'STOCH_K', 'ADX', 'Volume_Ratio',
            'Price_to_EMA9', 'Price_to_EMA21'
        ]

        # Normalize features
        self.scaler = StandardScaler()
        self.price_scaler = MinMaxScaler()

        # Fit scalers on this dataset's own frame.
        # NOTE(review): every instance fits its own scalers, so separate
        # train/val/test datasets are standardised with different statistics.
        self.scaler.fit(df[self.tech_feature_columns].dropna())
        self.price_scaler.fit(df[['Close']].dropna())

        # Valid indices (ensure we have enough history)
        self.valid_indices = list(range(window_size, len(df)))

        logger.info(f"Created dataset with {len(self.valid_indices)} samples")

    def __len__(self):
        """Return the number of samples (rows that have a full history window)."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Get a single sample.

        Returns:
            dict with 'market_matrix' (float tensor, window_size × 7),
            'tech_features' (float tensor, scaled indicators of the last
            window bar), 'label' (0-d long tensor), 'timestamp' (row index)
            and 'current_price' (close of the labelled bar).
        """
        real_idx = self.valid_indices[idx]

        # Get window of data
        window_data = self.df.iloc[real_idx - self.window_size:real_idx]

        # Create market matrix (60×7) as an explicit float copy so the
        # in-place normalisation never truncates or touches the frame
        market_matrix = window_data[self.feature_columns].to_numpy(dtype=float, copy=True)

        # Capture the reference close BEFORE normalising. The close column is
        # overwritten by the first step below (its first entry becomes 0), so
        # reading market_matrix[0, 3] afterwards divided the EMA and ATR
        # columns by zero and filled them with inf/NaN.
        base_close = market_matrix[0, 3]

        # Normalize OHLCV
        market_matrix[:, :4] = (market_matrix[:, :4] - base_close) / base_close  # Relative to first close
        market_matrix[:, 4] = market_matrix[:, 4] / market_matrix[:, 4].mean()  # Volume relative to mean
        market_matrix[:, 5] = (market_matrix[:, 5] - base_close) / base_close  # EMA relative
        market_matrix[:, 6] = market_matrix[:, 6] / base_close  # ATR relative

        # Get technical indicators of the most recent bar in the window
        tech_features = window_data[self.tech_feature_columns].iloc[-1].to_numpy(dtype=float)
        tech_features = self.scaler.transform(tech_features.reshape(1, -1)).flatten()

        # Get label. Row access upcasts mixed columns, so the label can come
        # back as a float; make it an int before building the long tensor.
        label = int(self.df.iloc[real_idx]['Label'])

        # Create sample
        sample = {
            'market_matrix': torch.FloatTensor(market_matrix),
            'tech_features': torch.FloatTensor(tech_features),
            'label': torch.tensor(label, dtype=torch.long),
            'timestamp': real_idx,
            'current_price': self.df.iloc[real_idx]['Close']
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

# Chronological 70% / 15% / 15% split into train, validation and test frames
print("\n📊 Creating train/validation/test splits...")

# Slicing by position keeps the time order (no shuffling across the split).
# NOTE(review): labels are derived from closes up to the longest holding
# period ahead, so the last labelled bars of one split look into the next
# split — confirm whether a gap between splits is wanted.
n_samples = len(df_5min)
train_end, val_end = (int(n_samples * fraction) for fraction in (0.7, 0.85))

train_df, val_df, test_df = (
    df_5min.iloc[start:stop]
    for start, stop in ((None, train_end), (train_end, val_end), (val_end, None))
)

# One dataset per split, built in train → val → test order
train_dataset, val_dataset, test_dataset = (
    TacticalTradingDataset(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

print(f"\n✅ Datasets created:")
print(f"   Train: {len(train_dataset)} samples")
print(f"   Val: {len(val_dataset)} samples")
print(f"   Test: {len(test_dataset)} samples")

## 5. Initialize Tactical Trader Model

In [None]:
# Hyperparameters shared by the model constructor and the training cells
config = dict(
    window=60,  # 60 5-minute bars = 5 hours
    input_features=7,  # OHLCV + EMA21 + ATR
    hidden_dim=256,
    n_heads=8,
    n_layers=4,
    dropout=0.1,
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=32,
    gradient_accumulation=4,  # For memory optimization
)

# Build the Short-term Tactician from the config, then move it to the device
print("🏗️ Initializing Short-term Tactician model...")
model = ShortTermTactician(config)
model = model.to(device)

# Supervised-training head that sits on top of the embedded market representation
class TacticalClassificationHead(nn.Module):
    """Two small MLPs over the concatenated embedding and technical features.

    ``layers`` produces one logit per action class; ``position_size_head``
    produces a single sigmoid-squashed value in [0, 1].
    """

    def __init__(self, input_dim=256, tech_features_dim=9, hidden_dim=128, n_classes=3):
        super().__init__()

        # Width of the concatenated [embedding | technical indicators] vector
        fused_width = input_dim + tech_features_dim
        half_hidden = hidden_dim // 2

        # Action classifier: two hidden layers with ReLU and dropout
        action_stack = [
            nn.Linear(fused_width, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(half_hidden, n_classes),
        ]
        self.layers = nn.Sequential(*action_stack)

        # Position sizing: one hidden layer, sigmoid keeps the output in [0, 1]
        sizing_stack = [
            nn.Linear(fused_width, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.position_size_head = nn.Sequential(*sizing_stack)

    def forward(self, embedded_features, tech_features):
        """Return ``(action_logits, position_size)`` for the given feature pair."""
        # Both inputs are joined along their last dimension
        fused = torch.cat((embedded_features, tech_features), dim=-1)
        return self.layers(fused), self.position_size_head(fused)

# Register the classification head on the model so that its parameters are
# part of model.parameters() and model.state_dict()
_classification_head = TacticalClassificationHead(
    input_dim=256,
    tech_features_dim=9,
    n_classes=3
)
model.tactical_classification_head = _classification_head.to(device)

# Parameter counts (all parameters, and the subset that requires gradients)
total_params = 0
trainable_params = 0
for _param in model.parameters():
    _n_values = _param.numel()
    total_params += _n_values
    if _param.requires_grad:
        trainable_params += _n_values

print(f"\n✅ Model initialized successfully:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
print(f"   Device: {device}")
print(f"\n📊 Model Architecture:")
print(f"   - Window size: {config['window']} bars (5 hours)")
print(f"   - Input features: {config['input_features']}")
print(f"   - Hidden dimension: {config['hidden_dim']}")
print(f"   - Attention heads: {config['n_heads']}")
print(f"   - Transformer layers: {config['n_layers']}")

## 6. Supervised Pre-training

In [None]:
# Data loaders. All three share batch size, worker count and pinned-memory
# setting (pinning only when running on CUDA); only the training loader shuffles.
_shared_loader_options = dict(
    batch_size=config['batch_size'],
    num_workers=2,
    pin_memory=(device.type == 'cuda'),
)

train_loader = DataLoader(train_dataset, shuffle=True, **_shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_shared_loader_options)

# Training setup with class weights for imbalanced data
#
# Labels are read straight from the dataset's frame at its valid row indices.
# This yields the same values as indexing every sample, without building each
# sample's window and scaled features just to discard them.
train_labels = (
    train_dataset.df['Label']
    .to_numpy()[train_dataset.valid_indices]
    .astype(np.int64)
    .tolist()
)

# minlength=3 guarantees one weight per label class (0 no trade, 1 long,
# 2 short) even if a class is absent from the training split; otherwise the
# weight tensor would not match the 3-logit output of the classification head.
class_counts = np.bincount(train_labels, minlength=3)
# Inverse-frequency weights. A class with no samples gets weight 0 instead of
# an enormous 1/epsilon weight that would swamp the normalisation below.
class_weights = np.where(class_counts > 0, 1.0 / np.maximum(class_counts, 1), 0.0)
class_weights = class_weights / class_weights.sum() * len(class_weights)
class_weights = torch.FloatTensor(class_weights).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
# The training loop calls scheduler.step() once per epoch, so the one-cycle
# schedule is sized in epochs (50 steps in total, matching n_epochs in the
# training loop). Sizing it in optimizer steps per batch while stepping once
# per epoch would leave the learning rate at the very start of its warm-up
# for the whole run.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=config['learning_rate'],
    epochs=50,
    steps_per_epoch=1
)

# Training metrics
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': [],
    'position_sizes': []
}

print(f"📚 Training setup complete:")
print(f"   Optimizer: AdamW (lr={config['learning_rate']}, wd={config['weight_decay']})")
print(f"   Scheduler: OneCycleLR")
print(f"   Batch size: {config['batch_size']}")
print(f"   Gradient accumulation: {config['gradient_accumulation']}")
print(f"   Effective batch size: {config['batch_size'] * config['gradient_accumulation']}")
print(f"   Train batches: {len(train_loader)}")
print(f"   Val batches: {len(val_loader)}")
print(f"\n   Class weights: {class_weights.cpu().numpy()}")

In [None]:
# Training functions with gradient accumulation
def train_epoch(model, loader, criterion, optimizer, device, accumulation_steps=4):
    """Train for one epoch with gradient accumulation.

    Args:
        model: Model exposing ``embedder`` and ``tactical_classification_head``.
        loader: Iterable of batches with 'market_matrix', 'tech_features'
            and 'label' entries; must support ``len()``.
        criterion: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer stepped once per accumulation window.
        device: Device the batch tensors are moved to.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer step.

    Returns:
        Tuple ``(avg_loss, accuracy_percent, weighted_f1, avg_position_size)``.
    """
    from sklearn.metrics import f1_score

    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    all_position_sizes = []
    n_batches = len(loader)

    optimizer.zero_grad()

    progress_bar = tqdm(loader, desc="Training", leave=False)
    for batch_idx, batch in enumerate(progress_bar):
        # Move data to device
        market_data = batch['market_matrix'].to(device)
        tech_features = batch['tech_features'].to(device)
        labels = batch['label'].to(device)

        # Forward pass through embedder: swap the last two axes first
        # (presumably (batch, sequence, features) -> (batch, features,
        # sequence) — confirm against the embedder's expected layout)
        x = market_data.transpose(1, 2)
        embedded = model.embedder(x)

        # Global pooling: average over dimension 1 of the embedder output
        representation = embedded.mean(dim=1)

        # Classification with technical features. Only the classification
        # loss is back-propagated here, so the position-size head receives no
        # training signal in this function.
        logits, position_size = model.tactical_classification_head(representation, tech_features)
        loss = criterion(logits, labels)
        batch_loss = loss.item()

        # Scale loss for gradient accumulation
        (loss / accumulation_steps).backward()

        # Update weights every accumulation_steps, and also on the final
        # batch: otherwise gradients from a trailing partial window are
        # silently thrown away by the next epoch's zero_grad().
        window_complete = (batch_idx + 1) % accumulation_steps == 0
        is_last_batch = (batch_idx + 1) == n_batches
        if window_complete or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

        # Track metrics
        total_loss += batch_loss
        _, preds = torch.max(logits, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        # detach() is required: position_size is part of the autograd graph
        # in training mode and calling .numpy() on it directly raises.
        all_position_sizes.extend(position_size.detach().cpu().numpy())

        # Update progress bar
        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

        # Clear cache periodically for memory management
        if batch_idx % 50 == 0 and device.type == 'cuda':
            torch.cuda.empty_cache()

    # Calculate metrics
    accuracy = 100 * np.mean(np.array(all_preds) == np.array(all_labels))
    avg_loss = total_loss / len(loader)

    # Calculate F1 score
    f1 = f1_score(all_labels, all_preds, average='weighted')

    # Average position size
    avg_position_size = np.mean(all_position_sizes)

    return avg_loss, accuracy, f1, avg_position_size

def validate(model, loader, criterion, device):
    """Validate the model.

    Runs the model in eval mode without gradients over every batch of
    ``loader``.

    Args:
        model: Model exposing ``embedder`` and ``tactical_classification_head``.
        loader: Iterable of batches with 'market_matrix', 'tech_features'
            and 'label' entries; must support ``len()``.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy_percent, weighted_f1, all_preds,
        all_labels, all_probs, avg_position_size)`` where the three ``all_*``
        entries are per-sample lists.
    """
    # Imported here because f1_score is not part of the notebook's top-level
    # sklearn imports; the import inside train_epoch is local to that function
    # and is not visible in this one.
    from sklearn.metrics import f1_score

    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []
    all_position_sizes = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            market_data = batch['market_matrix'].to(device)
            tech_features = batch['tech_features'].to(device)
            labels = batch['label'].to(device)

            # Forward pass (same pipeline as training: transpose, embed,
            # mean-pool over dimension 1, classify)
            x = market_data.transpose(1, 2)
            embedded = model.embedder(x)
            representation = embedded.mean(dim=1)
            logits, position_size = model.tactical_classification_head(representation, tech_features)

            # Calculate loss
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Get predictions
            probs = F.softmax(logits, dim=1)
            _, preds = torch.max(logits, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_position_sizes.extend(position_size.cpu().numpy())

    # Calculate metrics
    accuracy = 100 * np.mean(np.array(all_preds) == np.array(all_labels))
    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    avg_position_size = np.mean(all_position_sizes)

    return avg_loss, accuracy, f1, all_preds, all_labels, all_probs, avg_position_size

In [None]:
# Training loop with checkpointing
#
# Per epoch: train, validate, step the LR scheduler, record history, write a
# periodic checkpoint every 5 epochs, keep the best model by validation F1 and
# stop early after `patience` epochs without improvement.
n_epochs = 50
best_val_f1 = 0
patience = 10
patience_counter = 0

print("\n🚀 Starting supervised pre-training...\n")

for epoch in range(n_epochs):
    # Train
    train_loss, train_acc, train_f1, train_pos_size = train_epoch(
        model, train_loader, criterion, optimizer, device,
        accumulation_steps=config['gradient_accumulation']
    )

    # Validate
    val_loss, val_acc, val_f1, _, _, _, val_pos_size = validate(
        model, val_loader, criterion, device
    )

    # The metrics come back as NumPy scalars (accuracy and position size are
    # computed with np.mean). Convert them to builtin floats so that history
    # and the saved checkpoints hold only plain Python numbers; torch.load
    # refuses NumPy scalars when it is restricted to weights and basic types
    # (its default from PyTorch 2.6 on), which would break reloading the
    # best model.
    train_loss, train_acc, train_f1, train_pos_size = (
        float(value) for value in (train_loss, train_acc, train_f1, train_pos_size)
    )
    val_loss, val_acc, val_f1, val_pos_size = (
        float(value) for value in (val_loss, val_acc, val_f1, val_pos_size)
    )

    # Update scheduler (once per epoch)
    scheduler.step()

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)
    history['position_sizes'].append({'train': train_pos_size, 'val': val_pos_size})

    # Save checkpoint
    if (epoch + 1) % 5 == 0:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'history': history,
            'config': config
        }
        torch.save(checkpoint, CHECKPOINT_PATH / f'checkpoint_epoch_{epoch+1}.pt')

    # Save best model (strict improvement in validation F1 resets patience)
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_f1': val_f1,
            'val_acc': val_acc,
            'config': config
        }, MODELS_PATH / 'tactical_trader_pretrained.pt')
        print(f"  💾 New best model saved (F1: {val_f1:.4f})")
    else:
        patience_counter += 1

    # Print progress
    print(f"Epoch {epoch+1}/{n_epochs}:")
    print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%, F1: {train_f1:.4f}, Pos Size: {train_pos_size:.3f}")
    print(f"  Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%, F1: {val_f1:.4f}, Pos Size: {val_pos_size:.3f}")
    print(f"  LR: {scheduler.get_last_lr()[0]:.6f}")

    # Early stopping
    if patience_counter >= patience:
        print(f"\n⚠️ Early stopping triggered after {epoch+1} epochs")
        break

    # Clear cache
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print(f"\n✅ Pre-training complete! Best validation F1: {best_val_f1:.4f}")

## 7. Model Evaluation

In [None]:
# Load best model (the file written by the training loop on each new best F1)
checkpoint = torch.load(MODELS_PATH / 'tactical_trader_pretrained.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print(f"📊 Evaluating best model from epoch {checkpoint['epoch'] + 1}")
print(f"   Validation F1: {checkpoint['val_f1']:.4f}")
print(f"   Validation Accuracy: {checkpoint['val_acc']:.2f}%")

# Test set evaluation
test_loss, test_acc, test_f1, test_preds, test_labels, test_probs, test_pos_size = validate(
    model, test_loader, criterion, device
)

print(f"\n📈 Test Set Performance:")
print(f"   Loss: {test_loss:.4f}")
print(f"   Accuracy: {test_acc:.2f}%")
print(f"   F1 Score: {test_f1:.4f}")
print(f"   Avg Position Size: {test_pos_size:.3f}")

# Confusion matrix
action_names = ['No Trade', 'Long', 'Short']
# Pin the class ids explicitly. Without `labels`, sklearn infers the classes
# from the data: if a class never occurs in the test labels or predictions,
# the matrix shrinks below 3×3 (misaligning the tick labels) and
# classification_report rejects the three target names.
class_ids = list(range(len(action_names)))
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=action_names, yticklabels=action_names)
plt.title('Tactical Trading Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig(RESULTS_PATH / 'tactical_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Classification report
print("\n📋 Classification Report:")
print(classification_report(test_labels, test_preds, labels=class_ids, target_names=action_names))

# Per-class performance analysis: share of each true class predicted
# correctly. Classes without test samples are skipped.
print("\n📊 Per-Class Performance:")
test_labels_arr = np.array(test_labels)
test_preds_arr = np.array(test_preds)
for i, name in enumerate(action_names):
    class_mask = test_labels_arr == i
    if class_mask.sum() > 0:
        class_acc = (test_preds_arr[class_mask] == i).mean() * 100
        class_samples = class_mask.sum()
        print(f"   {name}: {class_acc:.1f}% accuracy ({class_samples} samples)")

In [None]:
# Training-history figure: a 2×2 grid with loss, accuracy, F1 and average
# position size, each showing the train and validation curve per epoch
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax1, ax2, ax3, ax4 = axes.ravel()

# Position sizes are stored per epoch as {'train': ..., 'val': ...} dicts
train_pos_sizes = [h['train'] for h in history['position_sizes']]
val_pos_sizes = [h['val'] for h in history['position_sizes']]

# (axis, train series, validation series, y-axis label, panel title)
_history_panels = (
    (ax1, history['train_loss'], history['val_loss'], 'Loss', 'Training Loss'),
    (ax2, history['train_acc'], history['val_acc'], 'Accuracy (%)', 'Trading Decision Accuracy'),
    (ax3, history['train_f1'], history['val_f1'], 'F1 Score', 'F1 Score (Weighted)'),
    (ax4, train_pos_sizes, val_pos_sizes, 'Average Position Size', 'Position Sizing Evolution'),
)

for _ax, _train_curve, _val_curve, _y_label, _panel_title in _history_panels:
    _ax.plot(_train_curve, label='Train', linewidth=2)
    _ax.plot(_val_curve, label='Validation', linewidth=2)
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_y_label)
    _ax.set_title(_panel_title, fontsize=14, fontweight='bold')
    _ax.legend()
    _ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'tactical_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Reinforcement Learning Fine-tuning

In [None]:
# RL Environment for Tactical Trading
class TacticalTradingEnv:
    """Trading environment for tactical short-term trading.

    Simulates a single instrument with a cash balance (``capital``) and a
    signed share count (``position``; negative means short). ``data`` is
    indexed as ``data[row, column]``: column 3 is read as the close price and
    column 6 as the ATR.

    Accounting convention: opening a short credits the sale proceeds to
    ``capital`` and carries a negative ``position_value``, so
    ``capital + position_value`` is the portfolio value at all times.

    Args:
        data: 2-D array of bars (rows) by features (columns).
        initial_capital: Starting cash.
        transaction_cost: Proportional cost applied to every fill.
        max_position_size: Upper bound on the requested capital fraction per trade.
        window_size: Number of bars in one observation; also the index of the
            first bar that can be traded. Defaults to the original 60.
    """

    def __init__(self, data, initial_capital=100000, transaction_cost=0.0005,
                 max_position_size=0.2, window_size=60):
        self.data = data
        self.initial_capital = initial_capital
        self.transaction_cost = transaction_cost
        self.max_position_size = max_position_size
        self.window_size = window_size
        self.reset()

    def reset(self):
        """Reset environment to initial state and return the first observation."""
        self.capital = self.initial_capital
        self.position = 0  # Current position in shares (negative = short)
        self.position_value = 0
        self.current_idx = self.window_size  # Start after first window
        self.episode_rewards = []
        self.trades = []
        self.portfolio_values = [self.initial_capital]

        # Risk management
        self.max_drawdown = 0
        self.peak_value = self.initial_capital

        return self._get_observation()

    def _get_observation(self):
        """Return the ``window_size`` rows that precede ``current_idx``."""
        return self.data[self.current_idx - self.window_size:self.current_idx]

    def _close_position(self, price):
        """Flatten the open position at ``price``, paying transaction costs."""
        if self.position > 0:
            # Selling a long: receive proceeds net of costs.
            self.capital += self.position * price * (1 - self.transaction_cost)
        elif self.position < 0:
            # Covering a short: the proceeds were already credited when the
            # short was opened, so the buy-back cost is the only cash flow.
            self.capital -= abs(self.position) * price * (1 + self.transaction_cost)
        self.position = 0
        self.position_value = 0

    def step(self, action, position_size=0.1):
        """Execute one action and advance the environment by one bar.

        All fills in a step happen at the current bar's close; the resulting
        position is then marked to the next bar's close to produce the reward.

        Args:
            action: 0 = hold, 1 = buy/long, 2 = sell/short. Buying while already
                long, or selling while already short, is treated as a hold.
            position_size: Requested fraction of capital for a new position,
                capped at ``max_position_size``, scaled down by
                ``atr / price * 10`` and floored at 5%.

        Returns:
            ``(next_obs, reward, done, info)``. ``next_obs`` is ``None`` once
            ``done`` is true; ``info`` holds capital, position, portfolio value,
            total return, max drawdown and the number of opening trades.
        """
        prev_value = self.capital + self.position_value

        # Get current and next price
        current_price = self.data[self.current_idx, 3]  # Close price
        has_next_bar = self.current_idx + 1 < len(self.data)
        next_price = self.data[self.current_idx + 1, 3] if has_next_bar else current_price

        # Risk-adjusted position sizing
        atr = self.data[self.current_idx, 6]  # ATR for volatility
        risk_adjusted_size = min(position_size, self.max_position_size) * (1 - atr / current_price * 10)
        risk_adjusted_size = max(0.05, risk_adjusted_size)  # Minimum 5% position

        # Execute trade
        if action == 1 and self.position <= 0:  # Buy
            if self.position < 0:
                self._close_position(current_price)

            # Open long position
            position_capital = self.capital * risk_adjusted_size
            shares = position_capital / (current_price * (1 + self.transaction_cost))
            self.position = shares
            self.capital -= shares * current_price * (1 + self.transaction_cost)
            self.position_value = shares * current_price
            self.trades.append({
                'type': 'BUY',
                'idx': self.current_idx,
                'price': current_price,
                'shares': shares,
                'size': risk_adjusted_size
            })

        elif action == 2 and self.position >= 0:  # Sell/Short
            if self.position > 0:
                self._close_position(current_price)

            # Open short position
            position_capital = self.capital * risk_adjusted_size
            shares = position_capital / (current_price * (1 + self.transaction_cost))
            self.position = -shares
            self.capital += shares * current_price * (1 - self.transaction_cost)
            self.position_value = -shares * current_price
            self.trades.append({
                'type': 'SELL',
                'idx': self.current_idx,
                'price': current_price,
                'shares': shares,
                'size': risk_adjusted_size
            })

        # Mark the open position (long or short) to the next bar's close
        if self.position != 0:
            self.position_value = self.position * next_price

        # Calculate portfolio value and reward
        current_value = self.capital + self.position_value
        reward = (current_value - prev_value) / prev_value if prev_value != 0 else 0.0

        # Risk-adjusted reward (Sharpe-like)
        # NOTE(review): volatility is measured over previously blended rewards;
        # when those are all equal np.std is 0 and the Sharpe term is divided by
        # ~1e-6 — confirm that scaling is intended.
        if len(self.episode_rewards) > 0:
            recent_returns = self.episode_rewards[-20:]
            volatility = np.std(recent_returns) if len(recent_returns) > 1 else 0.01
            sharpe_reward = reward / (volatility + 1e-6)
            reward = 0.7 * reward + 0.3 * sharpe_reward * 0.01  # Blend raw and risk-adjusted

        self.episode_rewards.append(reward)
        self.portfolio_values.append(current_value)

        # Update drawdown
        if current_value > self.peak_value:
            self.peak_value = current_value
        drawdown = (self.peak_value - current_value) / self.peak_value
        self.max_drawdown = max(self.max_drawdown, drawdown)

        # Move to next step
        self.current_idx += 1
        done = self.current_idx >= len(self.data) - 1

        # Get next observation
        next_obs = self._get_observation() if not done else None

        info = {
            'capital': self.capital,
            'position': self.position,
            'portfolio_value': current_value,
            'total_return': (current_value - self.initial_capital) / self.initial_capital,
            'max_drawdown': self.max_drawdown,
            'n_trades': len(self.trades)
        }

        return next_obs, reward, done, info

In [None]:
# PPO implementation for tactical trading
class TacticalPPO:
    """PPO algorithm for tactical trading with position sizing.

    Wraps a policy ``model`` that exposes ``embedder`` and
    ``tactical_classification_head`` and adds a separate value head that is
    trained with its own optimizer.

    Args:
        model: Policy network; its parameters are optimized with Adam.
        lr: Learning rate shared by the policy and value optimizers.
        gamma: Discount factor used for returns.
        eps_clip: PPO clipping range for the probability ratio.
    """

    def __init__(self, model, lr=3e-4, gamma=0.99, eps_clip=0.2):
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.value_loss_coef = 0.5
        self.entropy_coef = 0.01

        # Add value head for RL
        self.value_head = nn.Sequential(
            nn.Linear(256 + 9, 128),  # embedded + tech features
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        ).to(device)

        self.value_optimizer = optim.Adam(self.value_head.parameters(), lr=lr)

    def get_action(self, state, tech_features, deterministic=False):
        """Get action, position size and value estimate for a single state.

        Args:
            state: Market window tensor; dims 1 and 2 are swapped before it is
                passed to the embedder. Must hold one sample, because the
                results are converted with ``.item()``.
            tech_features: Technical-indicator tensor for the same sample.
            deterministic: If true take the arg-max action, otherwise sample
                from the categorical policy.

        Returns:
            ``(action, position_size, value)`` as Python scalars.
        """
        with torch.no_grad():
            # Forward pass
            x = state.transpose(1, 2)
            embedded = self.model.embedder(x)
            representation = embedded.mean(dim=1)

            # Get action logits and position size
            action_logits, position_size = self.model.tactical_classification_head(
                representation, tech_features
            )

            # Sample action
            if deterministic:
                action = torch.argmax(action_logits, dim=1)
            else:
                dist = torch.distributions.Categorical(logits=action_logits)
                action = dist.sample()

            # Get value
            combined = torch.cat([representation, tech_features], dim=-1)
            value = self.value_head(combined)

            return action.item(), position_size.item(), value.item()

    def compute_returns(self, rewards, values, dones):
        """Compute discounted returns.

        Args:
            rewards: Per-step rewards (tensor or sequence); flattened to 1-D.
            values: Unused; kept for interface compatibility.
            dones: Per-step termination flags; a 1 stops the return from
                bootstrapping over the following steps.

        Returns:
            1-D float32 tensor of returns, placed on the same device as
            ``rewards`` when that is a tensor (CPU otherwise).
        """
        reward_t = torch.as_tensor(rewards, dtype=torch.float32).reshape(-1)
        done_t = torch.as_tensor(dones, dtype=torch.float32).reshape(-1)

        # Run the backward recursion on Python floats: one host transfer
        # instead of one tiny tensor operation per step.
        reward_vals = reward_t.detach().cpu().tolist()
        done_vals = done_t.detach().cpu().tolist()

        returns = [0.0] * len(reward_vals)
        running = 0.0
        for step in range(len(reward_vals) - 1, -1, -1):
            running = reward_vals[step] + self.gamma * running * (1 - done_vals[step])
            returns[step] = running

        return torch.tensor(returns, dtype=torch.float32, device=reward_t.device)

    def update(self, trajectories):
        """Update policy and value head using collected trajectories.

        Args:
            trajectories: Sequence of dicts with tensor entries ``states``,
                ``tech_features``, ``actions``, ``rewards``, ``log_probs``,
                ``values`` and ``position_sizes``. Per-step entries may carry
                an extra singleton batch axis; everything is flattened to one
                sample axis here.

        Returns:
            ``(policy_loss, value_loss, position_loss)`` from the last of the
            four PPO epochs, as Python floats.
        """
        def gather(key):
            return torch.cat([t[key] for t in trajectories])

        # Normalize shapes so that stacked (T, 1, ...) and flat (T, ...) inputs
        # both work; otherwise transpose/log_prob would broadcast incorrectly.
        states = gather('states')
        states = states.reshape(-1, states.shape[-2], states.shape[-1])
        tech_features = gather('tech_features')
        tech_features = tech_features.reshape(-1, tech_features.shape[-1])
        actions = gather('actions').reshape(-1)
        rewards = gather('rewards').reshape(-1)
        old_log_probs = gather('log_probs').reshape(-1).detach()
        values = gather('values').reshape(-1).detach().float()
        position_sizes = gather('position_sizes').reshape(-1).detach().float()

        # Mark the last step of every trajectory as terminal so discounted
        # returns do not leak from one trajectory into the previous one.
        dones = torch.zeros_like(rewards)
        end = 0
        for t in trajectories:
            n_steps = t['rewards'].numel()
            if n_steps:
                end += n_steps
                dones[end - 1] = 1

        # Compute returns
        returns = self.compute_returns(rewards, values, dones).to(values.device)
        advantages = returns - values

        # Normalize advantages (std of a single sample is NaN, so skip then)
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # PPO update
        for _ in range(4):  # PPO epochs
            # Forward pass
            x = states.transpose(1, 2)
            embedded = self.model.embedder(x)
            representation = embedded.mean(dim=1)

            # Policy output
            action_logits, new_position_sizes = self.model.tactical_classification_head(
                representation, tech_features
            )

            dist = torch.distributions.Categorical(logits=action_logits)
            new_log_probs = dist.log_prob(actions)
            entropy = dist.entropy().mean()

            # Value output
            combined = torch.cat([representation, tech_features], dim=-1)
            new_values = self.value_head(combined).reshape(-1)

            # PPO loss
            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.mse_loss(new_values, returns)
            # Keeps new position sizes close to the ones used during collection
            position_loss = F.mse_loss(new_position_sizes.reshape(-1), position_sizes)

            total_loss = policy_loss + self.value_loss_coef * value_loss - self.entropy_coef * entropy + 0.1 * position_loss

            # Update
            self.optimizer.zero_grad()
            self.value_optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()
            self.value_optimizer.step()

        return policy_loss.item(), value_loss.item(), position_loss.item()

# Initialize PPO
ppo = TacticalPPO(model, lr=1e-4)

# Emit the setup banner in one write; the printed text is unchanged line for line
print("\n".join([
    "✅ RL fine-tuning setup complete",
    "   Algorithm: PPO with position sizing",
    "   Learning rate: 1e-4",
    "   Gamma: 0.99",
    "   Epsilon clip: 0.2",
]))

In [None]:
# RL training loop
n_episodes = 100
episode_returns = []
episode_lengths = []
win_rates = []
sharpe_ratios = []
max_drawdowns = []

# Use test data for RL fine-tuning
# NOTE(review): judging by its name, test_dataset is the held-out split; fine-tuning
# on it means any later evaluation on the same data is no longer out-of-sample —
# confirm this is intended.
env_data = test_dataset.df.values
env = TacticalTradingEnv(env_data)

print("\n🚀 Starting RL fine-tuning...\n")

for episode in range(n_episodes):
    # Reset environment
    obs = env.reset()
    done = False
    trajectory = {
        'states': [],
        'tech_features': [],
        'actions': [],
        'rewards': [],
        'log_probs': [],
        'values': [],
        'position_sizes': []
    }

    # Collect trajectory
    while not done:
        # Convert observation to tensor with a leading batch axis of 1
        state_tensor = torch.FloatTensor(obs[:, :7]).unsqueeze(0).to(device)  # Use first 7 features

        # Get technical features for current state
        current_idx = env.current_idx
        tech_features = test_dataset.scaler.transform(
            env_data[current_idx, 7:16].reshape(1, -1)  # Get tech indicators
        )
        tech_tensor = torch.FloatTensor(tech_features).to(device)

        # Get action and position size
        with torch.no_grad():
            # Forward pass
            x = state_tensor.transpose(1, 2)
            embedded = model.embedder(x)
            representation = embedded.mean(dim=1)

            # Get action and position size
            action_logits, position_size = model.tactical_classification_head(
                representation, tech_tensor
            )

            dist = torch.distributions.Categorical(logits=action_logits)
            action = dist.sample()
            log_prob = dist.log_prob(action)

            # Get value
            combined = torch.cat([representation, tech_tensor], dim=-1)
            value = ppo.value_head(combined)

        # Step environment
        next_obs, reward, done, info = env.step(action.item(), position_size.item())

        # Store trajectory. Every entry keeps exactly one leading axis of
        # length 1 so the per-step tensors can be concatenated along it.
        trajectory['states'].append(state_tensor)
        trajectory['tech_features'].append(tech_tensor)
        trajectory['actions'].append(action.reshape(1))
        # float32 explicitly: the env reward may be a NumPy float64 scalar
        trajectory['rewards'].append(torch.tensor([float(reward)], dtype=torch.float32))
        trajectory['log_probs'].append(log_prob.reshape(1))
        trajectory['values'].append(value.reshape(1))
        trajectory['position_sizes'].append(
            torch.tensor([position_size.item()], dtype=torch.float32)
        )

        obs = next_obs

    # Process trajectory: concatenate (not stack) along the existing batch axis.
    # Stacking would insert a second axis, e.g. (T, 1, 60, 7) states and (T, 1)
    # actions, which ppo.update cannot transpose or score correctly.
    for key in trajectory:
        if trajectory[key]:
            trajectory[key] = torch.cat(trajectory[key]).to(device)

    # Update policy
    if len(trajectory['states']) > 0:
        policy_loss, value_loss, position_loss = ppo.update([trajectory])

    # Record metrics
    episode_return = info['total_return']
    episode_returns.append(episode_return)
    episode_lengths.append(info['n_trades'])  # number of opening trades, not steps
    max_drawdowns.append(info['max_drawdown'])

    # Calculate win rate (over per-step rewards with a non-zero value)
    winning_trades = sum(1 for r in env.episode_rewards if r > 0)
    total_trades = len([r for r in env.episode_rewards if r != 0])
    win_rate = winning_trades / total_trades if total_trades > 0 else 0
    win_rates.append(win_rate)

    # Calculate Sharpe ratio
    if len(env.episode_rewards) > 1:
        returns_array = np.array(env.episode_rewards)
        sharpe = np.mean(returns_array) / (np.std(returns_array) + 1e-6) * np.sqrt(252 * 78)  # Annualized
        sharpe_ratios.append(sharpe)
    else:
        sharpe_ratios.append(0)

    # Log progress
    if episode % 10 == 0:
        avg_return = np.mean(episode_returns[-10:])
        avg_win_rate = np.mean(win_rates[-10:])
        avg_sharpe = np.mean(sharpe_ratios[-10:])
        avg_drawdown = np.mean(max_drawdowns[-10:])
        print(f"Episode {episode}:")
        print(f"  Avg Return: {avg_return*100:.2f}%")
        print(f"  Avg Win Rate: {avg_win_rate*100:.1f}%")
        print(f"  Avg Sharpe: {avg_sharpe:.2f}")
        print(f"  Avg Max Drawdown: {avg_drawdown*100:.1f}%")
        print(f"  Trades: {info['n_trades']}")

    # Save checkpoint
    if (episode + 1) % 25 == 0:
        torch.save({
            'model_state_dict': model.state_dict(),
            'value_head_state_dict': ppo.value_head.state_dict(),
            'episode': episode,
            'metrics': {
                'avg_return': np.mean(episode_returns[-10:]),
                'avg_sharpe': np.mean(sharpe_ratios[-10:]),
                'avg_win_rate': np.mean(win_rates[-10:])
            },
            'config': config
        }, CHECKPOINT_PATH / f'rl_checkpoint_episode_{episode+1}.pt')

# Save fine-tuned model
torch.save({
    'model_state_dict': model.state_dict(),
    'value_head_state_dict': ppo.value_head.state_dict(),
    'episode': episode,
    'avg_return': np.mean(episode_returns[-10:]),
    'avg_sharpe': np.mean(sharpe_ratios[-10:]),
    'avg_win_rate': np.mean(win_rates[-10:]),
    'config': config
}, MODELS_PATH / 'tactical_trader_finetuned.pt')

print(f"\n✅ RL fine-tuning complete!")
print(f"   Final average return: {np.mean(episode_returns[-10:])*100:.2f}%")
print(f"   Final Sharpe ratio: {np.mean(sharpe_ratios[-10:]):.2f}")
print(f"   Final win rate: {np.mean(win_rates[-10:])*100:.1f}%")
print(f"   Final max drawdown: {np.mean(max_drawdowns[-10:])*100:.1f}%")

In [None]:
# Plot RL training results: raw per-episode series, 10-episode moving average,
# and horizontal reference lines, one panel per metric
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

rl_panels = [
    {
        'ax': axes[0, 0],
        'raw': episode_returns,
        'smoothed': pd.Series(episode_returns).rolling(10).mean(),
        'raw_label': 'Episode Return',
        'guides': [dict(y=0, color='k', linestyle='--', alpha=0.5)],
        'ylabel': 'Return (%)',
        'title': 'Episode Returns',
    },
    {
        'ax': axes[0, 1],
        'raw': sharpe_ratios,
        'smoothed': pd.Series(sharpe_ratios).rolling(10).mean(),
        'raw_label': 'Sharpe Ratio',
        'guides': [
            dict(y=1.0, color='g', linestyle='--', alpha=0.5, label='Good (1.0)'),
            dict(y=2.0, color='b', linestyle='--', alpha=0.5, label='Excellent (2.0)'),
        ],
        'ylabel': 'Sharpe Ratio',
        'title': 'Risk-Adjusted Returns',
    },
    {
        'ax': axes[1, 0],
        'raw': win_rates,
        'smoothed': pd.Series(win_rates).rolling(10).mean(),
        'raw_label': 'Win Rate',
        'guides': [dict(y=0.5, color='k', linestyle='--', alpha=0.5)],
        'ylabel': 'Win Rate',
        'title': 'Trading Win Rate',
    },
    {
        # Drawdowns are stored as fractions and shown as percentages
        'ax': axes[1, 1],
        'raw': np.array(max_drawdowns) * 100,
        'smoothed': pd.Series(max_drawdowns).rolling(10).mean() * 100,
        'raw_label': 'Max Drawdown',
        'guides': [
            dict(y=10, color='y', linestyle='--', alpha=0.5, label='Target (10%)'),
            dict(y=20, color='r', linestyle='--', alpha=0.5, label='Max Acceptable (20%)'),
        ],
        'ylabel': 'Max Drawdown (%)',
        'title': 'Risk Management',
    },
]

for panel in rl_panels:
    panel_ax = panel['ax']
    panel_ax.plot(panel['raw'], alpha=0.3, label=panel['raw_label'])
    panel_ax.plot(panel['smoothed'], linewidth=2, label='10-Episode MA')
    for guide in panel['guides']:
        panel_ax.axhline(**guide)
    panel_ax.set_xlabel('Episode')
    panel_ax.set_ylabel(panel['ylabel'])
    panel_ax.set_title(panel['title'], fontsize=14, fontweight='bold')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'rl_training_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Feature Analysis and Interpretability

In [None]:
# Analyze learned features and technical indicator importance
def analyze_tactical_features(model, dataset, n_samples=100):
    """Collect per-action feature statistics from the first samples of ``dataset``.

    The model is switched to eval mode and run without gradients. Samples are
    grouped by their ground-truth ``label`` (0, 1, 2 -> 'No Trade', 'Long',
    'Short'), not by the model's prediction.

    Args:
        model: Network exposing ``embedder``; an optional
            ``embedder.attention_weights`` attribute is recorded when it is set.
        dataset: Indexable collection whose items provide ``market_matrix``,
            ``tech_features`` and ``label`` tensors.
        n_samples: Maximum number of leading samples to inspect.

    Returns:
        ``(feature_importance, attention_patterns, tech_importance_by_action)``:
        raw technical features and mean-pooled embeddings per action name,
        recorded attention arrays per label, and for every action with at least
        one sample the mean absolute value of each technical feature.
    """
    model.eval()

    action_names = ['No Trade', 'Long', 'Short']
    tech_feature_names = ['RSI', 'MACD', 'MACD_signal', 'BB_percent',
                         'STOCH_K', 'ADX', 'Volume_Ratio',
                         'Price_to_EMA9', 'Price_to_EMA21']

    feature_importance = {
        name: {'tech_features': [], 'embeddings': []} for name in action_names
    }
    attention_patterns = {class_id: [] for class_id in range(3)}

    n_to_scan = min(n_samples, len(dataset))
    with torch.no_grad():
        for sample_idx in range(n_to_scan):
            sample = dataset[sample_idx]
            market_data = sample['market_matrix'].unsqueeze(0).to(device)
            tech_features = sample['tech_features'].unsqueeze(0).to(device)
            label = sample['label'].item()

            # Forward pass through the embedder only
            embedded = model.embedder(market_data.transpose(1, 2))

            # Attention weights are optional: record them only when present and set
            attention = getattr(model.embedder, 'attention_weights', None)
            if attention is not None:
                attention_patterns[label].append(attention.cpu().numpy())

            # File the sample under its ground-truth action
            bucket = feature_importance[action_names[label]]
            bucket['tech_features'].append(tech_features.cpu().numpy())
            bucket['embeddings'].append(embedded.mean(dim=1).cpu().numpy())

    # Importance proxy: mean absolute value of each technical feature per action
    tech_importance_by_action = {}
    for action in action_names:
        collected = feature_importance[action]['tech_features']
        if not collected:
            continue
        mean_abs = np.abs(np.vstack(collected)).mean(axis=0)
        tech_importance_by_action[action] = dict(zip(tech_feature_names, mean_abs))

    return feature_importance, attention_patterns, tech_importance_by_action

# Run analysis
print("🔍 Analyzing learned features...")
feature_importance, attention_patterns, tech_importance = analyze_tactical_features(model, test_dataset)

# Visualize technical indicator importance as grouped bars (one group per indicator)
fig, ax = plt.subplots(figsize=(12, 8))

actions = list(tech_importance)
indicators = list(next(iter(tech_importance.values())))
n_indicators = len(indicators)
n_actions = len(actions)

x = np.arange(n_indicators)
width = 0.25

# Each action's bars are shifted right by one bar width per action
for slot, action in enumerate(actions):
    scores = tech_importance[action]
    heights = [scores[ind] for ind in indicators]
    ax.bar(x + slot * width, heights, width, label=action)

ax.set_xlabel('Technical Indicators')
ax.set_ylabel('Feature Importance')
ax.set_title('Technical Indicator Importance by Action', fontsize=16, fontweight='bold')
ax.set_xticks(x + width)
ax.set_xticklabels(indicators, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'technical_indicator_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# Print top indicators for each action
print("\n📊 Top Technical Indicators by Action:")
for action in actions:
    ranked = sorted(tech_importance[action].items(), key=lambda pair: pair[1], reverse=True)
    print(f"\n{action}:")
    for ind, importance in ranked[:3]:
        print(f"  - {ind}: {importance:.3f}")

## 10. Training Summary and Model Export

In [None]:
# Generate comprehensive training summary
# The report is assembled from three parts: configuration and pre-training results,
# one accuracy line per action class, then RL results and static reference sections.
summary_header = f"""
# Tactical Trader Agent Training Summary

## Training Configuration
- Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- Device: {device}
- Model Parameters: {total_params:,}
- Window Size: {config['window']} bars (5-minute)
- Input Features: {config['input_features']}

## Supervised Pre-training Results
- Epochs: {len(history['train_loss'])}
- Best Validation F1: {best_val_f1:.4f}
- Test Set Performance:
  - Accuracy: {test_acc:.2f}%
  - F1 Score: {test_f1:.4f}
  - Average Position Size: {test_pos_size:.3f}

## Trading Action Performance
"""

# Add per-class performance (classes without test samples are omitted)
labels_arr = np.array(test_labels)
preds_arr = np.array(test_preds)
class_lines = []
for i, name in enumerate(action_names):
    class_mask = labels_arr == i
    if class_mask.sum() > 0:
        class_acc = (preds_arr[class_mask] == i).mean() * 100
        class_lines.append(f"- {name}: {class_acc:.1f}% accuracy\n")

summary_footer = f"""
## Reinforcement Learning Fine-tuning
- Episodes: {n_episodes}
- Final Average Return: {np.mean(episode_returns[-10:])*100:.2f}%
- Final Sharpe Ratio: {np.mean(sharpe_ratios[-10:]):.2f}
- Final Win Rate: {np.mean(win_rates[-10:])*100:.1f}%
- Final Max Drawdown: {np.mean(max_drawdowns[-10:])*100:.1f}%
- Average Trades per Episode: {np.mean(episode_lengths[-10:]):.1f}

## Model Architecture
- Embedder: Transformer-based (7 features → 256 dim)
- Attention: {config['n_heads']} heads, {config['n_layers']} layers
- Policy Head: 3 actions (hold, long, short)
- Position Sizing: Adaptive (5% - 20%)
- Timing Head: 0-5 bar delay capability

## Technical Indicators Used
- Trend: EMA(9,21,50), MACD
- Momentum: RSI, Stochastic, ROC
- Volatility: Bollinger Bands, ATR
- Volume: OBV, AD, Volume Ratio
- Market Structure: ADX, Pivot Points

## Risk Management Features
- Position sizing based on ATR
- Maximum position size: 20%
- Sharpe-adjusted rewards
- Drawdown monitoring

## Output Files
- Pre-trained Model: {MODELS_PATH}/tactical_trader_pretrained.pt
- Fine-tuned Model: {MODELS_PATH}/tactical_trader_finetuned.pt
- Training History: {RESULTS_PATH}/tactical_training_history.png
- Confusion Matrix: {RESULTS_PATH}/tactical_confusion_matrix.png
- RL Metrics: {RESULTS_PATH}/rl_training_metrics.png
- Feature Analysis: {RESULTS_PATH}/technical_indicator_importance.png

## Integration Notes
- Compatible with MatrixAssembler5m output
- Integrates with SynergyDetector patterns
- Provides timing signals for execution
- Weights 30% in multi-agent decision making
"""

summary = summary_header + "".join(class_lines) + summary_footer

print(summary)

# Save summary
# The summary text contains non-ASCII characters (e.g. "→"), so the encoding is
# pinned to UTF-8 instead of relying on the platform default, which can raise
# UnicodeEncodeError on systems whose default codec cannot encode them.
with open(RESULTS_PATH / 'training_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

# Export model configuration
# Output dimension of each head, keyed by head name
output_heads = {
    'action': 3,
    'confidence': 1,
    'timing': 5,
    'reasoning': 48,
    'position_size': 1
}

# RL figures are averaged over the last ten episodes
training_metrics = {
    'pretrain_f1': best_val_f1,
    'test_accuracy': test_acc,
    'rl_return': np.mean(episode_returns[-10:]),
    'rl_sharpe': np.mean(sharpe_ratios[-10:]),
    'rl_win_rate': np.mean(win_rates[-10:]),
    'rl_max_drawdown': np.mean(max_drawdowns[-10:])
}

model_config = {
    'architecture': 'ShortTermTactician',
    'config': config,
    'input_shape': (60, 7),  # 5m bars × features
    'tech_features': 9,  # Number of technical indicators
    'output_heads': output_heads,
    'training_metrics': training_metrics,
    'synergy_integration': True,
    'agent_weight': 0.3
}

with open(MODELS_PATH / 'tactical_trader_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)

print("\n".join([
    "\n✅ Training complete! All models and results saved.",
    f"📁 Models directory: {MODELS_PATH}",
    f"📊 Results directory: {RESULTS_PATH}",
]))

## 11. Model Deployment Integration

In [None]:
# Example integration code
# The guide below is a plain string of markdown with embedded Python snippets; it is
# only printed and written to disk, never executed by this notebook.
# NOTE(review): the snippets reference names that are not defined inside them
# (e.g. structure_agent, regime_agent, matrix_assembler, and current_price inside
# calculate_risk_adjusted_size) — they are illustrative and need adapting before use.
integration_example = """
# Integration with AlgoSpace MARL System

## 1. Load the trained model in your trading system:

```python
from agents.marl.agents.short_term_tactician import ShortTermTactician
import torch

# Initialize tactical trader
config = {
    'window': 60,
    'input_features': 7,
    'hidden_dim': 256,
    'n_heads': 8,
    'n_layers': 4,
    'dropout': 0.1
}

tactical_agent = ShortTermTactician(config)

# Load pre-trained weights
checkpoint = torch.load('models/agents/tactical_trader_finetuned.pt')
tactical_agent.load_state_dict(checkpoint['model_state_dict'])
tactical_agent.eval()
```

## 2. Use in MARL consensus:

```python
from training.marl_trainer import MARLTrainer

# Configure MARL system
marl_config = {
    'agents': {
        'structure_analyzer': structure_agent,
        'regime_detector': regime_agent,
        'tactical_trader': tactical_agent
    },
    'consensus_weights': {
        'structure_analyzer': 0.4,
        'regime_detector': 0.3,
        'tactical_trader': 0.3
    }
}

# Initialize MARL trainer
trainer = MARLTrainer(marl_config)
```

## 3. Real-time trading integration:

```python
# Process incoming market data
market_data_5m = matrix_assembler.process_5m_data(raw_data)

# Calculate technical indicators
tech_indicators = calculate_technical_indicators(market_data_5m)

# Get tactical trading decision
with torch.no_grad():
    tactical_output = tactical_agent({
        'market_matrix': market_data_5m,
        'regime_embedding': regime_embedding,
        'synergy_context': synergy_context
    })

# Extract trading signals
action = tactical_output['action']  # [pass, long, short]
confidence = tactical_output['confidence']
timing = tactical_output['timing_recommendation']  # Bars to wait
position_size = tactical_output.get('position_size', 0.1)

# Execute with timing
if confidence > 0.7 and timing == 0:  # Execute now
    execute_trade(
        action=action,
        position_size=calculate_risk_adjusted_size(position_size, current_atr)
    )
elif timing > 0:
    schedule_trade(action, timing, position_size)
```

## 4. Risk management integration:

```python
# Calculate position size with risk limits
def calculate_risk_adjusted_size(base_size, atr, max_risk=0.02):
    # Risk per trade
    account_value = get_account_value()
    risk_amount = account_value * max_risk
    
    # Position size based on ATR
    stop_distance = 2 * atr
    shares = risk_amount / stop_distance
    
    # Apply limits
    max_position = account_value * 0.2  # 20% max
    position_value = min(shares * current_price, max_position)
    
    return position_value / current_price
```

## 5. Performance monitoring:

```python
# Track tactical agent performance
metrics_tracker.log({
    'agent': 'tactical_trader',
    'action': action,
    'confidence': confidence,
    'position_size': position_size,
    'timing': timing,
    'technical_signals': {
        'rsi': tech_indicators['RSI'],
        'macd': tech_indicators['MACD'],
        'bb_percent': tech_indicators['BB_percent']
    },
    'synergy_alignment': synergy_context['synergy_type']
})
```
"""

# Echo the guide so it is visible in the notebook output
print(integration_example)

# Save integration guide
guide_path = RESULTS_PATH / 'integration_guide.md'
with open(guide_path, 'w') as f:
    f.write(integration_example)

# Closing banner followed by the numbered follow-up checklist
print("\n✅ Tactical Trader Agent Training notebook complete!")
print("\n📚 Next Steps:")
for next_step in (
    "1. Review the training summary and metrics",
    "2. Test the model with live market data",
    "3. Integrate with MARL consensus mechanism",
    "4. Monitor real-time performance",
    "5. Fine-tune based on live trading results",
):
    print(next_step)