# Day 6: Transformer Trading System

## Week 15: Attention & Transformers in Finance

### Learning Objectives
- Build a complete transformer-based trading model from scratch
- Implement feature engineering specifically designed for transformers
- Create a full backtesting framework with realistic assumptions
- Analyze trading performance with comprehensive metrics

### Topics Covered
1. **Transformer Architecture** - Multi-head attention for financial time series
2. **Feature Engineering** - Technical indicators, returns, and volatility features
3. **Backtesting Engine** - Chronological out-of-sample evaluation with transaction costs
4. **Performance Analysis** - Sharpe ratio, drawdowns, and risk metrics

---
## Part 1: Environment Setup & Data Loading

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Sklearn imports
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reproducibility: seed NumPy and PyTorch with the same fixed value.
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(42)

# Prefer the GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Notebook-wide plotting defaults.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Download financial data
# Universe of ETFs and the sample window passed to yfinance.
tickers = ['SPY', 'QQQ', 'IWM']  # S&P 500, Nasdaq 100, Russell 2000
start_date = '2015-01-01'
end_date = '2024-12-31'

# Download data
# One request per ticker; only the 'Close' column of each result is kept.
data_dict = {}
for ticker in tickers:
    df = yf.download(ticker, start=start_date, end=end_date, progress=False)
    data_dict[ticker] = df['Close']

# Combine into single DataFrame
prices = pd.DataFrame(data_dict)
# If the combined frame ends up with MultiIndex columns, drop level 1 so the
# columns are plain ticker names.
# NOTE(review): presumably this happens when yfinance returns multi-level
# columns for a single ticker - confirm against the installed version.
prices.columns = prices.columns.droplevel(1) if isinstance(prices.columns, pd.MultiIndex) else prices.columns
# Keep only dates on which every ticker has a price.
prices = prices.dropna()

print(f"Data shape: {prices.shape}")
print(f"Date range: {prices.index[0]} to {prices.index[-1]}")
# Last expression of the cell: shows the final rows when run in a notebook.
prices.tail()

---
## Part 2: Feature Engineering for Transformers

Transformers benefit from rich feature sets. We'll create:
- **Price-based features**: Returns, log returns
- **Technical indicators**: Moving averages, RSI, MACD, Bollinger Bands
- **Volatility features**: Annualized rolling std of returns, short/long volatility ratio
- **Cross-asset features**: Relative strength, correlations

In [None]:
class FeatureEngineer:
    """
    Feature engineering pipeline for transformer trading models.

    Every ``calculate_*`` method takes ``prices`` (a DataFrame with one price
    column per ticker) and returns a new DataFrame of derived features on the
    same index. Rows that fall inside an indicator's warm-up window contain
    NaN; callers are expected to drop them.
    """

    def __init__(self, target_ticker='SPY'):
        """
        Args:
            target_ticker: Column of ``prices`` whose future return defines
                the classification target.
        """
        self.target_ticker = target_ticker
        # Populated by build_features() with the generated column names.
        self.feature_names = []

    def calculate_returns(self, prices, periods=(1, 5, 10, 20)):
        """Calculate simple and log returns over multiple look-back periods."""
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            close = prices[ticker]
            for period in periods:
                features[f'{ticker}_ret_{period}d'] = close.pct_change(period)
                features[f'{ticker}_log_ret_{period}d'] = np.log(close / close.shift(period))

        return features

    def calculate_moving_averages(self, prices, windows=(5, 10, 20, 50)):
        """Calculate normalized distance to moving averages and a 5/20 MA spread."""
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            close = prices[ticker]
            for window in windows:
                ma = close.rolling(window).mean()
                # Relative distance of price from its moving average.
                features[f'{ticker}_ma_{window}'] = (close - ma) / ma

            # Fast-minus-slow MA spread, scaled by price so it is comparable
            # across tickers and through time.
            features[f'{ticker}_ma_5_20_cross'] = (
                close.rolling(5).mean() - close.rolling(20).mean()
            ) / close

        return features

    def calculate_rsi(self, prices, periods=(14, 28)):
        """
        Calculate the Relative Strength Index, rescaled from [0, 100] to [-1, 1].

        Gains and losses are averaged with a simple rolling mean (not Wilder's
        exponential smoothing).
        """
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            delta = prices[ticker].diff()
            gains = delta.where(delta > 0, 0)
            losses = -delta.where(delta < 0, 0)
            for period in periods:
                rs = gains.rolling(window=period).mean() / losses.rolling(window=period).mean()
                rsi = 100 - (100 / (1 + rs))
                features[f'{ticker}_rsi_{period}'] = (rsi - 50) / 50

        return features

    def calculate_macd(self, prices):
        """Calculate price-normalized MACD (12/26 EMA spread) and its histogram vs. the 9-EMA signal."""
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            close = prices[ticker]
            ema_12 = close.ewm(span=12, adjust=False).mean()
            ema_26 = close.ewm(span=26, adjust=False).mean()
            macd = ema_12 - ema_26
            signal = macd.ewm(span=9, adjust=False).mean()

            features[f'{ticker}_macd'] = macd / close
            # MACD minus its signal line (the histogram), also divided by price.
            features[f'{ticker}_macd_signal'] = (macd - signal) / close

        return features

    def calculate_bollinger_bands(self, prices, window=20, num_std=2):
        """
        Calculate Bollinger Band position and width.

        Position is the distance from the moving average in units of the band
        half-width: -1 at the lower band, +1 at the upper band. It is not
        clipped, so prices outside the bands fall outside [-1, 1].
        """
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            close = prices[ticker]
            ma = close.rolling(window).mean()
            std = close.rolling(window).std()
            upper = ma + num_std * std
            lower = ma - num_std * std

            features[f'{ticker}_bb_position'] = (close - ma) / (num_std * std)
            features[f'{ticker}_bb_width'] = (upper - lower) / ma

        return features

    def calculate_volatility(self, prices, windows=(5, 10, 20)):
        """Calculate annualized rolling volatility and a short/long volatility ratio."""
        features = pd.DataFrame(index=prices.index)

        for ticker in prices.columns:
            returns = prices[ticker].pct_change()

            for window in windows:
                # sqrt(252) annualizes a daily standard deviation.
                features[f'{ticker}_vol_{window}d'] = returns.rolling(window).std() * np.sqrt(252)

            # Volatility ratio (short/long)
            features[f'{ticker}_vol_ratio'] = (
                returns.rolling(5).std() / returns.rolling(20).std()
            )

        return features

    def calculate_cross_asset_features(self, prices):
        """
        Calculate pairwise relative-strength and correlation features.

        Returns a frame with no columns when ``prices`` has a single ticker.
        """
        features = pd.DataFrame(index=prices.index)
        tickers = list(prices.columns)
        # Each unordered pair once, in column order.
        pairs = [
            (first, second)
            for i, first in enumerate(tickers)
            for second in tickers[i + 1:]
        ]

        # Relative strength: 5-day change of the price ratio.
        for first, second in pairs:
            ratio = prices[first] / prices[second]
            features[f'{first}_{second}_ratio_ret'] = ratio.pct_change(5)

        # 20-day rolling correlation of daily returns.
        if pairs:
            returns = prices.pct_change()
            for first, second in pairs:
                features[f'{first}_{second}_corr_20'] = (
                    returns[first].rolling(20).corr(returns[second])
                )

        return features

    def create_target(self, prices, horizon=1, threshold=0.0):
        """
        Create the classification target for ``self.target_ticker``.

        1 = return over the next ``horizon`` rows is above ``threshold``
        0 = otherwise (sell/hold signal)

        The final ``horizon`` rows have no future return yet. They are left as
        NaN instead of being labelled 0, so a later ``dropna()`` removes them
        rather than training on fabricated labels. Because of the NaNs the
        returned Series has a float dtype.
        """
        future_returns = prices[self.target_ticker].pct_change(horizon).shift(-horizon)
        labels = (future_returns > threshold).astype(float)
        # A NaN future return compares False above; mask it back to NaN.
        return labels.where(future_returns.notna())

    def build_features(self, prices):
        """Build the complete feature set and record its column names."""
        print("Building features...")

        builders = (
            self.calculate_returns,
            self.calculate_moving_averages,
            self.calculate_rsi,
            self.calculate_macd,
            self.calculate_bollinger_bands,
            self.calculate_volatility,
            self.calculate_cross_asset_features,
        )
        features = pd.concat([build(prices) for build in builders], axis=1)
        self.feature_names = features.columns.tolist()

        print(f"Created {len(self.feature_names)} features")
        return features

In [None]:
# Run the feature pipeline and derive next-day labels for SPY.
fe = FeatureEngineer(target_ticker='SPY')
features = fe.build_features(prices)
target = fe.create_target(prices, horizon=1)

# Attach the label column, then discard every row with a missing value.
data = features.assign(target=target).dropna()

class_balance = data['target'].value_counts(normalize=True)
print(f"\nFinal dataset shape: {data.shape}")
print(f"Target distribution:\n{class_balance}")

# Preview the first five rows of the first ten columns.
print(f"\nSample features:")
data.iloc[:5, :10]

In [None]:
# Two panels: feature-to-feature correlations and feature-to-target correlations.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
heatmap_ax, bar_ax = axes[0], axes[1]

# First dozen SPY-prefixed columns (the label column is excluded).
spy_features = []
for col in data.columns:
    if col.startswith('SPY_') and not col.endswith('target'):
        spy_features.append(col)
spy_features = spy_features[:12]

# Left panel: annotated correlation heatmap.
corr_matrix = data[spy_features].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='RdBu_r',
    center=0,
    fmt='.2f',
    ax=heatmap_ax,
    annot_kws={'size': 8},
)
heatmap_ax.set_title('Feature Correlation Matrix (SPY Features)', fontsize=12)

# Right panel: the ten most negative and ten most positive correlations with
# the target, drawn as horizontal bars.
feature_frame = data.drop('target', axis=1)
target_corr = feature_frame.corrwith(data['target']).sort_values()
top_features = pd.concat([target_corr.head(10), target_corr.tail(10)])
colors = np.where(top_features.values < 0, 'red', 'green').tolist()
bar_positions = range(len(top_features))
bar_ax.barh(bar_positions, top_features.values, color=colors)
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_features.index, fontsize=8)
bar_ax.set_xlabel('Correlation with Target')
bar_ax.set_title('Top 20 Features by Target Correlation')
bar_ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
plt.show()

---
## Part 3: Transformer Model Architecture

We'll build a custom Transformer model optimized for financial time series:
- **Positional Encoding**: Inject temporal information
- **Multi-Head Self-Attention**: Capture complex dependencies
- **Feed-Forward Networks**: Learn non-linear transformations
- **Classification Head**: Output trading signals

In [None]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding for sequence position information.

    Even embedding columns hold sines and odd columns hold cosines of the
    position scaled by geometrically spaced frequencies. The table is stored
    as a non-trainable buffer of shape (1, max_len, d_model).

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Longest sequence length that can be encoded.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=500, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies get a cosine slot.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """
        Add positional encoding to input.

        Args:
            x: Tensor of shape (batch, seq_len, d_model); seq_len must not
               exceed the ``max_len`` given at construction.
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class TransformerBlock(nn.Module):
    """
    One post-norm encoder layer: self-attention followed by a position-wise
    feed-forward network, each wrapped in a residual connection and LayerNorm.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()

        # Self-attention over (batch, seq, feature) inputs.
        self.attention = nn.MultiheadAttention(
            d_model, n_heads, dropout=dropout, batch_first=True
        )

        # Expand to d_ff, apply GELU, project back to d_model.
        feed_forward_layers = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*feed_forward_layers)

        # One LayerNorm per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout applied to the attention output before the residual add.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Apply the block.

        Args:
            x: Input tensor; it is used as query, key and value.
            mask: Optional attention mask forwarded as ``attn_mask``.

        Returns:
            Tuple of (transformed tensor, attention weights returned by
            ``nn.MultiheadAttention``).
        """
        attended, attn_weights = self.attention(x, x, x, attn_mask=mask)
        after_attention = self.norm1(x + self.dropout(attended))
        output = self.norm2(after_attention + self.ffn(after_attention))
        return output, attn_weights


class TransformerTradingModel(nn.Module):
    """
    Transformer encoder that maps a window of feature vectors to a single
    probability.

    Pipeline:
    1. Linear projection of each time step to ``d_model``
    2. Sinusoidal positional encoding
    3. ``n_layers`` transformer blocks
    4. Mean pooling over the time axis
    5. Two-layer classification head ending in a sigmoid
    """

    def __init__(self, input_dim, d_model=64, n_heads=4, n_layers=3,
                 d_ff=256, dropout=0.1, seq_len=20):
        super().__init__()

        self.input_dim = input_dim
        self.d_model = d_model
        self.seq_len = seq_len

        # Per-time-step embedding of the raw feature vector.
        self.input_projection = nn.Linear(input_dim, d_model)

        # The position table is sized to exactly seq_len positions.
        self.pos_encoder = PositionalEncoding(d_model, max_len=seq_len, dropout=dropout)

        # Encoder stack.
        encoder_layers = []
        for _ in range(n_layers):
            encoder_layers.append(TransformerBlock(d_model, n_heads, d_ff, dropout))
        self.transformer_blocks = nn.ModuleList(encoder_layers)

        # Head: d_model -> d_model // 2 -> 1, squashed to (0, 1).
        hidden_width = d_model // 2
        self.classifier = nn.Sequential(
            nn.Linear(d_model, hidden_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid()
        )

        # Per-layer attention weights from the most recent forward pass.
        self.attention_weights = None

    def forward(self, x):
        """
        Run the model on a batch of windows.

        Args:
            x: Input tensor of shape (batch_size, seq_len, input_dim)

        Returns:
            Output probabilities of shape (batch_size, 1)
        """
        hidden = self.pos_encoder(self.input_projection(x))

        # Collect each layer's attention map while encoding.
        per_layer_attention = []
        for block in self.transformer_blocks:
            hidden, layer_weights = block(hidden)
            per_layer_attention.append(layer_weights)
        self.attention_weights = per_layer_attention

        # Average over the sequence dimension, then classify.
        pooled = hidden.mean(dim=1)
        return self.classifier(pooled)

    def get_attention_weights(self):
        """Return the attention weights saved by the last forward pass (None before any)."""
        return self.attention_weights


# Instantiate a throwaway model to inspect its size.
test_model = TransformerTradingModel(
    input_dim=len(fe.feature_names), d_model=64, n_heads=4, n_layers=3, seq_len=20
)

# Tally parameter counts, overall and for those that require gradients.
total_params = 0
trainable_params = 0
for p in test_model.parameters():
    total_params += p.numel()
    if p.requires_grad:
        trainable_params += p.numel()

print(f"Model Architecture:")
print(f"  Input dimension: {len(fe.feature_names)}")
print(f"  Model dimension: 64")
print(f"  Attention heads: 4")
print(f"  Transformer layers: 3")
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

---
## Part 4: Dataset and DataLoader

In [None]:
class TradingDataset(Dataset):
    """
    PyTorch Dataset for trading data with sequence windowing.

    Sample ``idx`` is the feature window covering rows
    ``idx .. idx + seq_len - 1`` paired with the target stored at the window's
    last row.

    Args:
        features: 2-D array-like of shape (n_rows, n_features).
        targets: 1-D array-like with one label per row.
        seq_len: Number of consecutive rows in each window.

    Raises:
        ValueError: If ``features`` and ``targets`` have different lengths.
    """

    def __init__(self, features, targets, seq_len=20):
        # torch.tensor always copies and converts to float32 in one step.
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        self.targets = torch.tensor(np.asarray(targets), dtype=torch.float32)
        if len(self.features) != len(self.targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(self.features)} and {len(self.targets)}"
            )
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: with fewer rows than seq_len the difference would be
        # negative, which makes len() itself raise.
        # NOTE: this is one less than the number of complete windows (the
        # window ending on the final row is not counted); the count is left
        # unchanged so existing sample/date alignment keeps working.
        return max(0, len(self.features) - self.seq_len)

    def __getitem__(self, idx):
        window_end = idx + self.seq_len
        # Window of seq_len consecutive feature rows.
        X = self.features[idx:window_end]
        # Target is the label at the last row of the window.
        y = self.targets[window_end - 1]
        return X, y


def prepare_data(data, train_ratio=0.7, val_ratio=0.15, seq_len=20):
    """
    Split, scale and window the dataset chronologically (no look-ahead bias).

    Rows are split in their existing order into train / validation / test
    segments; nothing is shuffled. The scaler is fitted on the training
    segment only and then applied to the other two.

    Args:
        data: DataFrame of features plus a 'target' column, in time order.
        train_ratio: Fraction of rows used for training.
        val_ratio: Fraction of rows used for validation; the remainder is
            the test segment.
        seq_len: Window length passed to each TradingDataset.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset, scaler, dates,
        feature_cols)`` where ``dates`` maps 'train' / 'val' / 'test' to the
        index slice of the corresponding segment.
    """
    # Separate features and target
    feature_cols = [col for col in data.columns if col != 'target']
    X = data[feature_cols].to_numpy(dtype=float)
    y = data['target'].values

    # Ratio features can produce +/-Inf (division by zero). StandardScaler
    # rejects Inf but skips NaN when fitting and passes it through when
    # transforming, so map Inf to NaN *before* scaling.
    X = np.where(np.isfinite(X), X, np.nan)

    # Calculate split indices
    n = len(X)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    # Split data chronologically
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    # Normalize features (fit only on training data)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Replace whatever is still non-finite with 0.0, which is the training
    # mean after standardization.
    X_train, X_val, X_test = (
        np.nan_to_num(part, nan=0.0, posinf=0.0, neginf=0.0)
        for part in (X_train, X_val, X_test)
    )

    # Create datasets
    train_dataset = TradingDataset(X_train, y_train, seq_len)
    val_dataset = TradingDataset(X_val, y_val, seq_len)
    test_dataset = TradingDataset(X_test, y_test, seq_len)

    # Store dates for backtest
    dates = {
        'train': data.index[:train_end],
        'val': data.index[train_end:val_end],
        'test': data.index[val_end:]
    }

    return train_dataset, val_dataset, test_dataset, scaler, dates, feature_cols


# Split, scale and window the dataset.
seq_len = 20
train_dataset, val_dataset, test_dataset, scaler, dates, feature_cols = prepare_data(
    data, train_ratio=0.7, val_ratio=0.15, seq_len=seq_len
)

# Only the training loader shuffles; validation and test keep time order.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Summary of split sizes.
print(f"Dataset sizes:")
for split_label, split_dataset in (
    ('Training', train_dataset),
    ('Validation', val_dataset),
    ('Test', test_dataset),
):
    print(f"  {split_label}: {len(split_dataset)} samples")
print(f"\nSequence length: {seq_len}")
print(f"Batch size: {batch_size}")

# First and last date of each split.
print(f"\nDate ranges:")
for split_label, split_key in (('Train', 'train'), ('Val', 'val'), ('Test', 'test')):
    split_dates = dates[split_key]
    print(f"  {split_label}: {split_dates[0]} to {split_dates[-1]}")

---
## Part 5: Model Training

In [None]:
class TransformerTrainer:
    """
    Training class for the Transformer trading model.

    Wraps a model that outputs probabilities in (0, 1) together with a BCE
    loss, an AdamW optimizer, a plateau-based learning-rate scheduler and an
    early-stopping training loop.
    """

    def __init__(self, model, device, learning_rate=1e-4, weight_decay=1e-5):
        """
        Args:
            model: Module returning one probability per sample.
            device: PyTorch device the model and batches are moved to.
            learning_rate: Initial AdamW learning rate.
            weight_decay: AdamW weight-decay coefficient.
        """
        self.model = model.to(device)
        self.device = device

        # Plain, unweighted binary cross-entropy on probabilities.
        self.criterion = nn.BCELoss()

        # Optimizer with weight decay
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate,
            weight_decay=weight_decay
        )

        # Halve the learning rate after 5 epochs without validation
        # improvement. The deprecated `verbose` flag is not passed; train()
        # reports learning-rate changes itself.
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.5, patience=5
        )

        # Training history
        self.history = {
            'train_loss': [], 'val_loss': [],
            'train_acc': [], 'val_acc': []
        }

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader``; update weights only when ``training`` is True."""
        self.model.train(training)
        total_loss = 0.0
        correct = 0
        total = 0

        with torch.set_grad_enabled(training):
            for X, y in loader:
                X, y = X.to(self.device), y.to(self.device)
                y = y.reshape(-1)

                if training:
                    self.optimizer.zero_grad()

                # reshape(-1) keeps the batch axis even for a batch of one,
                # where a bare squeeze() would collapse to a 0-d tensor and
                # no longer match the target's shape.
                outputs = self.model(X).reshape(-1)
                loss = self.criterion(outputs, y)

                if training:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    self.optimizer.step()

                # Weight the batch-mean loss by batch size so the epoch
                # average is per sample.
                total_loss += loss.item() * X.size(0)
                predictions = (outputs > 0.5).float()
                correct += (predictions == y).sum().item()
                total += y.size(0)

        if total == 0:
            raise ValueError("Data loader yielded no samples")

        return total_loss / total, correct / total

    def train_epoch(self, train_loader):
        """Train for one epoch; returns (mean loss, accuracy)."""
        return self._run_epoch(train_loader, training=True)

    def validate(self, val_loader):
        """Evaluate without gradient updates; returns (mean loss, accuracy)."""
        return self._run_epoch(val_loader, training=False)

    def train(self, train_loader, val_loader, epochs=50, early_stopping_patience=10):
        """
        Full training loop with early stopping.

        Training stops once validation loss has not improved for
        ``early_stopping_patience`` consecutive epochs. The weights from the
        best validation epoch are restored before returning.

        Returns:
            The history dict with per-epoch 'train_loss', 'val_loss',
            'train_acc' and 'val_acc' lists.
        """
        best_val_loss = float('inf')
        patience_counter = 0
        best_model_state = None

        print(f"Starting training for {epochs} epochs...\n")

        for epoch in range(epochs):
            # Train
            train_loss, train_acc = self.train_epoch(train_loader)

            # Validate
            val_loss, val_acc = self.validate(val_loader)

            # Update scheduler and report any learning-rate reduction
            previous_lr = self.optimizer.param_groups[0]['lr']
            self.scheduler.step(val_loss)
            current_lr = self.optimizer.param_groups[0]['lr']
            if current_lr < previous_lr:
                print(f"Epoch {epoch+1:3d}: reducing learning rate to {current_lr:.4e}")

            # Store history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_acc'].append(val_acc)

            # Print progress
            if (epoch + 1) % 5 == 0 or epoch == 0:
                print(f"Epoch {epoch+1:3d}/{epochs} | "
                      f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
                      f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() hands back references to the live tensors, so a
                # shallow dict copy would keep changing as training continues.
                # Clone every tensor to freeze a real snapshot.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= early_stopping_patience:
                    print(f"\nEarly stopping triggered at epoch {epoch+1}")
                    break

        # Restore best model
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)
            print(f"\nRestored best model with validation loss: {best_val_loss:.4f}")

        return self.history

In [None]:
# Model hyperparameters; input width follows the prepared feature columns.
model_config = dict(
    input_dim=len(feature_cols),
    d_model=64,
    n_heads=4,
    n_layers=3,
    d_ff=256,
    dropout=0.2,
    seq_len=seq_len,
)
model = TransformerTradingModel(**model_config)

# Optimizer settings for the trainer wrapper.
trainer = TransformerTrainer(model=model, device=device, learning_rate=1e-4, weight_decay=1e-5)

# Fit with early stopping on validation loss.
history = trainer.train(train_loader=train_loader, val_loader=val_loader, epochs=50, early_stopping_patience=10)

In [None]:
# Loss and accuracy curves side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))


def _draw_history_panel(ax, train_values, val_values, train_label, val_label,
                        ylabel, title, baseline=None):
    """Draw train/validation curves on ``ax``, optionally with a dashed reference line."""
    ax.plot(train_values, label=train_label, linewidth=2)
    ax.plot(val_values, label=val_label, linewidth=2)
    if baseline is not None:
        ax.axhline(y=baseline, color='r', linestyle='--', label='Random Baseline')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


# Left: loss.
_draw_history_panel(
    axes[0], history['train_loss'], history['val_loss'],
    'Train Loss', 'Validation Loss',
    'Loss', 'Training and Validation Loss',
)

# Right: accuracy, with the 50% coin-flip level for reference.
_draw_history_panel(
    axes[1], history['train_acc'], history['val_acc'],
    'Train Accuracy', 'Validation Accuracy',
    'Accuracy', 'Training and Validation Accuracy',
    baseline=0.5,
)

plt.tight_layout()
plt.show()

---
## Part 6: Backtesting Engine

Implement a realistic backtesting framework with:
- Transaction costs
- Position sizing
- Out-of-sample evaluation on a chronologically held-out test period

In [None]:
class Backtester:
    """
    Backtesting engine for trading strategies.

    Turns model probabilities into long/flat signals, applies them to a price
    series with a one-row execution lag and per-trade costs, and summarizes
    the result against buy-and-hold.
    """

    def __init__(self, model, device, transaction_cost=0.001):
        """
        Args:
            model: Trained transformer model
            device: PyTorch device
            transaction_cost: Fractional cost charged per unit change in
                position, i.e. on every entry and on every exit (default
                0.1%). A full round trip therefore costs twice this amount.
        """
        self.model = model
        self.device = device
        self.transaction_cost = transaction_cost

    def generate_signals(self, data_loader, threshold=0.5):
        """
        Generate trading signals from model predictions.

        Args:
            data_loader: Iterable of (features, targets) batches.
            threshold: Probability above which the signal is long.

        Returns:
            Tuple ``(signals, probabilities, targets)`` of 1-D arrays:
            signals: 1 for long, 0 for flat
            probabilities: Raw model output probabilities
            targets: Labels yielded by the loader, in the same order
        """
        self.model.eval()
        prob_batches = []
        target_batches = []

        with torch.no_grad():
            for X, y in data_loader:
                batch_probs = self.model(X.to(self.device))
                # reshape(-1) gives a 1-D array for any batch size, including 1.
                prob_batches.append(batch_probs.reshape(-1).cpu().numpy())
                target_batches.append(y.reshape(-1).cpu().numpy())

        probabilities = np.concatenate(prob_batches) if prob_batches else np.array([])
        targets = np.concatenate(target_batches) if target_batches else np.array([])
        signals = (probabilities > threshold).astype(int)

        return signals, probabilities, targets

    def calculate_returns(self, prices, signals):
        """
        Calculate strategy returns with transaction costs.

        Args:
            prices: Price series; row i must be the date on which signal i
                was produced.
            signals: Trading signals (1 for long, 0 for flat); may be shorter
                than ``prices`` but not longer.

        Returns:
            Dictionary with per-period 'strategy_returns' and
            'buy_hold_returns', their cumulative products
            'strategy_cumulative' and 'buy_hold_cumulative', the lagged
            'positions', and the summed 'transaction_costs'.

        Raises:
            ValueError: If there are more signals than prices.
        """
        signals = np.asarray(signals)
        if len(signals) > len(prices):
            raise ValueError(
                f"Got {len(signals)} signals for only {len(prices)} prices"
            )

        # Calculate market returns
        returns = prices.pct_change().fillna(0)
        market_returns = returns.iloc[:len(signals)]

        # Signal at t determines the position held over t -> t+1, whose
        # return is recorded at row t+1; the shift avoids look-ahead bias.
        positions = pd.Series(signals, index=market_returns.index)
        positions = positions.shift(1).fillna(0)

        # Cost is proportional to the absolute change in position.
        position_changes = positions.diff().abs().fillna(0)
        transaction_costs = position_changes * self.transaction_cost

        # Strategy returns
        strategy_returns = (positions * market_returns) - transaction_costs

        # Buy and hold returns
        buy_hold_returns = market_returns

        # Cumulative returns
        strategy_cumulative = (1 + strategy_returns).cumprod()
        buy_hold_cumulative = (1 + buy_hold_returns).cumprod()

        return {
            'strategy_returns': strategy_returns,
            'buy_hold_returns': buy_hold_returns,
            'strategy_cumulative': strategy_cumulative,
            'buy_hold_cumulative': buy_hold_cumulative,
            'positions': positions,
            'transaction_costs': transaction_costs.sum()
        }

    @staticmethod
    def _performance_stats(returns, name):
        """Summarize one per-period return series; keys are prefixed with ``name``."""
        n_periods = len(returns)
        if n_periods == 0:
            # Nothing to annualize; report neutral values instead of dividing by zero.
            stat_names = (
                'total_return', 'annual_return', 'annual_vol', 'sharpe',
                'sortino', 'calmar', 'max_drawdown', 'win_rate',
            )
            return {f'{name}_{stat}': 0.0 for stat in stat_names}

        total_return = (1 + returns).prod() - 1
        # Geometric annualization assuming 252 periods per year.
        annual_return = (1 + total_return) ** (252 / n_periods) - 1
        annual_vol = returns.std() * np.sqrt(252)
        sharpe = annual_return / annual_vol if annual_vol > 0 else 0

        # Maximum drawdown
        cumulative = (1 + returns).cumprod()
        rolling_max = cumulative.expanding().max()
        drawdowns = cumulative / rolling_max - 1
        max_drawdown = drawdowns.min()

        # Sortino ratio: volatility measured over losing periods only.
        downside_returns = returns[returns < 0]
        downside_std = downside_returns.std() * np.sqrt(252)
        sortino = annual_return / downside_std if downside_std > 0 else 0

        # Calmar ratio
        calmar = annual_return / abs(max_drawdown) if max_drawdown != 0 else 0

        # Win rate: share of all periods (flat ones included) with a gain.
        win_rate = (returns > 0).sum() / n_periods

        return {
            f'{name}_total_return': total_return,
            f'{name}_annual_return': annual_return,
            f'{name}_annual_vol': annual_vol,
            f'{name}_sharpe': sharpe,
            f'{name}_sortino': sortino,
            f'{name}_calmar': calmar,
            f'{name}_max_drawdown': max_drawdown,
            f'{name}_win_rate': win_rate
        }

    def calculate_metrics(self, results):
        """
        Calculate comprehensive performance metrics.

        Args:
            results: Dictionary produced by ``calculate_returns``.

        Returns:
            Dictionary with 'strategy_*' and 'benchmark_*' statistics (total
            and annual return, annual vol, Sharpe, Sortino, Calmar, max
            drawdown, win rate) plus 'avg_position', 'num_trades' (number of
            position changes), 'total_transaction_costs' and 'time_in_market'.
        """
        strategy_metrics = self._performance_stats(results['strategy_returns'], 'strategy')
        benchmark_metrics = self._performance_stats(results['buy_hold_returns'], 'benchmark')

        # Additional metrics
        positions = results['positions']
        n_positions = len(positions)
        additional = {
            'avg_position': positions.mean() if n_positions else 0.0,
            'num_trades': (positions.diff().abs() > 0).sum(),
            'total_transaction_costs': results['transaction_costs'],
            'time_in_market': (positions > 0).sum() / n_positions if n_positions else 0.0
        }

        return {**strategy_metrics, **benchmark_metrics, **additional}

In [None]:
# Run backtest on test set
backtester = Backtester(model, device, transaction_cost=0.001)

# Generate signals
signals, probabilities, targets = backtester.generate_signals(test_loader, threshold=0.5)

# Get test period prices.
# Test sample i covers test rows i .. i + seq_len - 1 and carries the label of
# row i + seq_len - 1, so signal i becomes known on that row's date. Dating
# the signals from row seq_len - 1 lets the one-row shift inside
# calculate_returns put each position on the very return the model predicted;
# starting at row seq_len would delay every trade by an extra day.
test_dates = dates['test'][seq_len - 1:][:len(signals)]
test_prices = prices.loc[test_dates, 'SPY']

# Calculate returns
results = backtester.calculate_returns(test_prices, signals)

# Calculate metrics
metrics = backtester.calculate_metrics(results)

# Display metrics
print("="*60)
print("BACKTEST RESULTS")
print("="*60)
print(f"\nTest Period: {test_dates[0]} to {test_dates[-1]}")
print(f"Number of trading days: {len(test_prices)}")
print("\n" + "-"*60)
print("Strategy Performance:")
print("-"*60)
print(f"  Total Return:      {metrics['strategy_total_return']*100:>10.2f}%")
print(f"  Annual Return:     {metrics['strategy_annual_return']*100:>10.2f}%")
print(f"  Annual Volatility: {metrics['strategy_annual_vol']*100:>10.2f}%")
print(f"  Sharpe Ratio:      {metrics['strategy_sharpe']:>10.3f}")
print(f"  Sortino Ratio:     {metrics['strategy_sortino']:>10.3f}")
print(f"  Calmar Ratio:      {metrics['strategy_calmar']:>10.3f}")
print(f"  Max Drawdown:      {metrics['strategy_max_drawdown']*100:>10.2f}%")
print(f"  Win Rate:          {metrics['strategy_win_rate']*100:>10.2f}%")

print("\n" + "-"*60)
print("Benchmark (Buy & Hold) Performance:")
print("-"*60)
print(f"  Total Return:      {metrics['benchmark_total_return']*100:>10.2f}%")
print(f"  Annual Return:     {metrics['benchmark_annual_return']*100:>10.2f}%")
print(f"  Sharpe Ratio:      {metrics['benchmark_sharpe']:>10.3f}")
print(f"  Max Drawdown:      {metrics['benchmark_max_drawdown']*100:>10.2f}%")

print("\n" + "-"*60)
print("Trading Statistics:")
print("-"*60)
print(f"  Number of Trades:  {metrics['num_trades']:>10.0f}")
print(f"  Avg Position:      {metrics['avg_position']*100:>10.2f}%")
print(f"  Time in Market:    {metrics['time_in_market']*100:>10.2f}%")
print(f"  Transaction Costs: {metrics['total_transaction_costs']*100:>10.4f}%")
print("="*60)

---
## Part 7: Performance Analysis & Visualization

In [None]:
# Comprehensive performance visualization: a 2x2 figure with cumulative
# returns, drawdown, predicted-probability histogram and monthly heatmap
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Cumulative Returns
ax1 = axes[0, 0]
results['strategy_cumulative'].plot(ax=ax1, label='Transformer Strategy', linewidth=2, color='blue')
results['buy_hold_cumulative'].plot(ax=ax1, label='Buy & Hold', linewidth=2, color='gray', alpha=0.7)
# Shade the area between the strategy curve and the break-even level (1):
# green where the curve is above it, red where it is below
ax1.fill_between(results['strategy_cumulative'].index, 1, results['strategy_cumulative'], 
                 where=results['strategy_cumulative'] > 1, alpha=0.3, color='green')
ax1.fill_between(results['strategy_cumulative'].index, 1, results['strategy_cumulative'], 
                 where=results['strategy_cumulative'] < 1, alpha=0.3, color='red')
ax1.axhline(y=1, color='black', linestyle='--', linewidth=0.5)
ax1.set_title('Cumulative Returns', fontsize=14)
ax1.set_xlabel('Date')
ax1.set_ylabel('Cumulative Return')
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)

# 2. Drawdown Analysis
ax2 = axes[0, 1]
strategy_cumulative = results['strategy_cumulative']
# Drawdown = relative distance of the curve from its running maximum
rolling_max = strategy_cumulative.expanding().max()
drawdowns = strategy_cumulative / rolling_max - 1

ax2.fill_between(drawdowns.index, 0, drawdowns * 100, color='red', alpha=0.6)
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax2.set_title('Strategy Drawdown', fontsize=14)
ax2.set_xlabel('Date')
ax2.set_ylabel('Drawdown (%)')
ax2.grid(True, alpha=0.3)

# 3. Signal Distribution
ax3 = axes[1, 0]
ax3.hist(probabilities, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax3.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Threshold')
ax3.set_title('Prediction Probability Distribution', fontsize=14)
ax3.set_xlabel('Predicted Probability')
ax3.set_ylabel('Frequency')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Monthly Returns Heatmap
ax4 = axes[1, 1]
daily_returns = results['strategy_returns'].copy()
daily_returns.index = pd.to_datetime(daily_returns.index)

# Aggregate per calendar (year, month) with a groupby instead of
# resample('M'): the 'M' offset alias is deprecated since pandas 2.2 and its
# replacement 'ME' is rejected by older releases, whereas grouping on the
# index's year/month behaves the same on every version.
monthly_df = pd.DataFrame({
    'Year': daily_returns.index.year,
    'Month': daily_returns.index.month,
    'Return': daily_returns.values
})
# Monthly figure = sum of that month's daily returns, expressed in percent
monthly_returns = monthly_df.groupby(['Year', 'Month'])['Return'].sum() * 100

# Create year-month pivot (rows: years, columns: months); months without
# any observation stay blank in the heatmap
monthly_pivot = monthly_returns.unstack('Month')

sns.heatmap(monthly_pivot, annot=True, fmt='.1f', cmap='RdYlGn', center=0, 
            ax=ax4, annot_kws={'size': 8}, cbar_kws={'label': 'Return (%)'})
ax4.set_title('Monthly Returns Heatmap (%)', fontsize=14)
ax4.set_xlabel('Month')
ax4.set_ylabel('Year')

plt.tight_layout()
plt.show()

In [None]:
# Classification performance analysis: metric bars, rolling accuracy and a
# probability-vs-return scatter in a 1x3 figure
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# 1. Confusion Matrix Style Analysis
ax1 = axes[0]
# Same 0.5 cut-off that was used to generate the trading signals
predictions = (probabilities > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(targets, predictions)
precision = precision_score(targets, predictions, zero_division=0)
recall = recall_score(targets, predictions, zero_division=0)
f1 = f1_score(targets, predictions, zero_division=0)

metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metrics_values = [accuracy, precision, recall, f1]
colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(metrics_names)))

bars = ax1.bar(metrics_names, metrics_values, color=colors, edgecolor='black')
ax1.axhline(y=0.5, color='red', linestyle='--', label='Random Baseline')
ax1.set_ylim(0, 1)
ax1.set_title('Classification Metrics', fontsize=14)
ax1.set_ylabel('Score')
ax1.legend()

# Add value labels on bars
for bar, value in zip(bars, metrics_values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, 
             f'{value:.3f}', ha='center', va='bottom', fontsize=10)

# 2. Rolling Accuracy
ax2 = axes[1]
correct = (predictions == targets).astype(int)
# The first 49 entries are NaN until a full 50-observation window is available
rolling_accuracy = pd.Series(correct).rolling(window=50).mean()

ax2.plot(rolling_accuracy, linewidth=2, color='blue')
ax2.axhline(y=0.5, color='red', linestyle='--', label='Random Baseline')
ax2.fill_between(range(len(rolling_accuracy)), 0.5, rolling_accuracy, 
                 where=rolling_accuracy > 0.5, alpha=0.3, color='green')
ax2.fill_between(range(len(rolling_accuracy)), 0.5, rolling_accuracy, 
                 where=rolling_accuracy < 0.5, alpha=0.3, color='red')
ax2.set_title('Rolling Accuracy (50-day window)', fontsize=14)
ax2.set_xlabel('Trading Days')
ax2.set_ylabel('Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Prediction vs Actual Returns
ax3 = axes[2]
actual_returns = test_prices.pct_change().dropna().values

# pct_change() has no value for the first price, so the return vector can be
# one element shorter than the probability vector. scatter() rejects x/y of
# different sizes, so trim both to their common length before plotting.
# NOTE(review): this pairs probabilities[i] with the i-th available return;
# confirm that matches how the dataset defines each sample's target day.
n_points = min(len(probabilities), len(actual_returns))
scatter_probabilities = probabilities[:n_points]
actual_returns = actual_returns[:n_points]

# Color points by prediction accuracy
colors = ['green' if (p > 0.5 and r > 0) or (p <= 0.5 and r <= 0) else 'red' 
          for p, r in zip(scatter_probabilities, actual_returns)]

ax3.scatter(scatter_probabilities, actual_returns * 100, c=colors, alpha=0.5, s=10)
ax3.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax3.axvline(x=0.5, color='black', linestyle='-', linewidth=0.5)
ax3.set_title('Prediction Probability vs Actual Return', fontsize=14)
ax3.set_xlabel('Predicted Probability (Long)')
ax3.set_ylabel('Actual Return (%)')
ax3.grid(True, alpha=0.3)

# Add quadrant labels (placed at 80% of the current y-limits)
ax3.text(0.75, ax3.get_ylim()[1]*0.8, 'True Positive', ha='center', fontsize=10, color='green')
ax3.text(0.25, ax3.get_ylim()[1]*0.8, 'False Negative', ha='center', fontsize=10, color='red')
ax3.text(0.25, ax3.get_ylim()[0]*0.8, 'True Negative', ha='center', fontsize=10, color='green')
ax3.text(0.75, ax3.get_ylim()[0]*0.8, 'False Positive', ha='center', fontsize=10, color='red')

plt.tight_layout()
plt.show()

print(f"\nClassification Performance:")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")

In [None]:
# Attention weights visualization
def visualize_attention(model, data_loader, device, sample_idx=0):
    """
    Visualize attention weights for a sample.

    Runs a single forward pass on the first batch of ``data_loader``, reads
    the attention maps through ``model.get_attention_weights()``, draws the
    last map for one sample as a heatmap and prints the average attention
    each key position receives.

    Args:
        model: Object providing ``eval()``, ``__call__`` and
            ``get_attention_weights()``.
        data_loader: Iterable of ``(X, y)`` batches; only the first one is used.
        device: Device the input batch is moved to before the forward pass.
        sample_idx: Position of the sample inside the batch to display.

    Returns:
        None. The output is a matplotlib figure plus printed text.
    """
    model.eval()

    # Get a sample batch; without one there is nothing fresh to visualize
    first_batch = next(iter(data_loader), None)
    if first_batch is None:
        print("No data available to compute attention weights")
        return

    X, _ = first_batch
    # Inference only, so skip autograd bookkeeping for the forward pass
    with torch.no_grad():
        model(X.to(device))

    # Get attention weights recorded during the forward pass
    attention_weights = model.get_attention_weights()
    if attention_weights is None:
        print("No attention weights available")
        return

    # Take the last entry (last transformer block) for the chosen sample
    last_attn = attention_weights[-1][sample_idx].detach().cpu().numpy()
    # The heatmap needs a 2-D matrix: average over any leading axes
    # (e.g. one map per attention head) if the model keeps them separate
    if last_attn.ndim > 2:
        last_attn = last_attn.reshape(-1, *last_attn.shape[-2:]).mean(axis=0)

    # Label from the matrix itself rather than the global seq_len, so labels
    # always match the number of rows/columns that are actually drawn
    n_queries, n_keys = last_attn.shape
    key_labels = [f't-{n_keys-1-i}' for i in range(n_keys)]
    query_labels = [f't-{n_queries-1-i}' for i in range(n_queries)]

    # Plot attention heatmap
    fig, ax = plt.subplots(figsize=(10, 8))

    sns.heatmap(last_attn, cmap='Blues', ax=ax)
    ax.set_title('Attention Weights (Last Transformer Layer)', fontsize=14)
    ax.set_xlabel('Key Position (Past Time Steps)')
    ax.set_ylabel('Query Position (Time Steps)')

    # Add time labels. Tick positions are pinned to the cell centers first:
    # seaborn may thin its automatic ticks, and set_*ticklabels requires the
    # number of labels to equal the number of ticks.
    ax.set_xticks(np.arange(n_keys) + 0.5)
    ax.set_xticklabels(key_labels, rotation=45)
    ax.set_yticks(np.arange(n_queries) + 0.5)
    ax.set_yticklabels(query_labels, rotation=0)

    plt.tight_layout()
    plt.show()

    # Show which time steps get the most attention (mean over query rows)
    avg_attention = last_attn.mean(axis=0)
    print("\nAverage attention by time step (most recent = t-0):")
    for i, att in enumerate(avg_attention):
        print(f"  t-{n_keys-1-i}: {att:.4f}")

# Visualize attention for the first test batch (sample_idx keeps its default of 0)
visualize_attention(model, test_loader, device)

---
## Part 8: Risk Analysis

In [None]:
def analyze_risk(results, confidence_level=0.95):
    """
    Perform comprehensive risk analysis.

    Computes Value at Risk, Conditional VaR (expected shortfall), skewness,
    kurtosis and the tail ratio of ``results['strategy_returns']``, prints a
    formatted report and returns the figures. Missing (NaN) returns are
    ignored rather than turning every statistic into NaN.

    Args:
        results: Mapping or DataFrame whose ``'strategy_returns'`` entry is a
            pandas Series of per-period returns.
        confidence_level: Confidence level of the primary VaR/CVaR pair
            (0.95 by default). The 99% pair is always computed as well.

    Returns:
        dict with keys ``var_95``, ``var_99``, ``cvar_95``, ``cvar_99``,
        ``skewness``, ``kurtosis`` and ``tail_ratio``. The ``*_95`` entries
        hold the values at ``confidence_level``; the key names reflect the
        default level.
    """
    strategy_returns = results['strategy_returns']
    return_values = np.asarray(strategy_returns, dtype=float)

    # Value at Risk (VaR): lower-tail percentiles of the return distribution.
    # nanpercentile skips missing returns, which np.percentile would
    # propagate into every result.
    var_95 = np.nanpercentile(return_values, (1 - confidence_level) * 100)
    var_99 = np.nanpercentile(return_values, 1)

    # Conditional VaR (Expected Shortfall): mean of returns at or below VaR
    cvar_95 = strategy_returns[strategy_returns <= var_95].mean()
    cvar_99 = strategy_returns[strategy_returns <= var_99].mean()

    # Tail ratio: 95th percentile gain over the magnitude of the 5th percentile
    right_tail = np.nanpercentile(return_values, 95)
    left_tail = abs(np.nanpercentile(return_values, 5))
    tail_ratio = right_tail / left_tail if left_tail != 0 else np.inf

    # Skewness and Kurtosis (pandas estimators, NaN-skipping)
    skewness = strategy_returns.skew()
    kurtosis = strategy_returns.kurtosis()

    # Label the primary pair with the level actually used ("95%" by default)
    level_label = f"{confidence_level * 100:g}%"

    print("="*60)
    print("RISK ANALYSIS")
    print("="*60)
    print("\nValue at Risk (VaR):")
    print(f"  VaR {level_label}: {var_95*100:>10.4f}%")
    print(f"  VaR 99%: {var_99*100:>10.4f}%")
    print("\nConditional VaR (Expected Shortfall):")
    print(f"  CVaR {level_label}: {cvar_95*100:>10.4f}%")
    print(f"  CVaR 99%: {cvar_99*100:>10.4f}%")
    print("\nDistribution Statistics:")
    print(f"  Skewness: {skewness:>10.4f}")
    print(f"  Kurtosis: {kurtosis:>10.4f}")
    print(f"  Tail Ratio: {tail_ratio:>10.4f}")
    print("="*60)

    return {
        'var_95': var_95, 'var_99': var_99,
        'cvar_95': cvar_95, 'cvar_99': cvar_99,
        'skewness': skewness, 'kurtosis': kurtosis,
        'tail_ratio': tail_ratio
    }

# Perform risk analysis on the backtest results (default confidence_level=0.95);
# the call prints a report and the returned dict is kept for later cells
risk_metrics = analyze_risk(results)

In [None]:
# Risk visualization: return histogram with VaR markers (left panel) and
# rolling volatility / Sharpe ratio on twin y-axes (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes
strategy_returns = results['strategy_returns']

# 1. Return Distribution with VaR
ax1.hist(
    strategy_returns * 100,
    bins=50,
    density=True,
    alpha=0.7,
    color='steelblue',
    edgecolor='black',
    label='Strategy Returns',
)

# Vertical markers: both VaR levels (converted to percent) and the zero line
var_95 = risk_metrics['var_95'] * 100
var_99 = risk_metrics['var_99'] * 100
ax1.axvline(x=var_95, color='orange', linestyle='--', linewidth=2, label=f'VaR 95% ({var_95:.2f}%)')
ax1.axvline(x=var_99, color='red', linestyle='--', linewidth=2, label=f'VaR 99% ({var_99:.2f}%)')
ax1.axvline(x=0, color='black', linestyle='-', linewidth=1)

ax1.set_title('Return Distribution with Value at Risk', fontsize=14)
ax1.set_xlabel('Daily Return (%)')
ax1.set_ylabel('Density')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Rolling Risk Metrics
rolling_window = 50
annualization = np.sqrt(252)

# One rolling object feeds both statistics
windowed_returns = strategy_returns.rolling(rolling_window)
rolling_std = windowed_returns.std()
rolling_vol = rolling_std * annualization * 100
rolling_sharpe = windowed_returns.mean() * 252 / (rolling_std * annualization)

# Volatility on the left axis, Sharpe ratio on a twin right axis
ax2.plot(rolling_vol.index, rolling_vol, label='Rolling Volatility (%)', linewidth=2, color='blue')
ax2_twin = ax2.twinx()
ax2_twin.plot(rolling_sharpe.index, rolling_sharpe, label='Rolling Sharpe', linewidth=2, color='green', alpha=0.7)

ax2.set_xlabel('Date')
ax2.set_ylabel('Annualized Volatility (%)', color='blue')
ax2_twin.set_ylabel('Sharpe Ratio', color='green')
ax2.set_title(f'Rolling Risk Metrics ({rolling_window}-day window)', fontsize=14)

# Merge the entries of both axes into a single legend box
vol_handles, vol_labels = ax2.get_legend_handles_labels()
sharpe_handles, sharpe_labels = ax2_twin.get_legend_handles_labels()
ax2.legend(vol_handles + sharpe_handles, vol_labels + sharpe_labels, loc='upper right')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---
## Part 9: Summary & Key Takeaways

### What We Built
1. **Complete Transformer Trading System**: End-to-end pipeline from data to trading signals
2. **Rich Feature Engineering**: Technical indicators, volatility, cross-asset features
3. **Custom Transformer Architecture**: Multi-head attention for financial time series
4. **Realistic Backtesting**: Walk-forward validation with transaction costs
5. **Comprehensive Analysis**: Performance metrics, risk analysis, attention visualization

### Key Insights
- **Transformers can capture complex temporal dependencies** in financial data
- **Feature engineering remains crucial** even with attention mechanisms
- **Walk-forward validation prevents look-ahead bias**
- **Transaction costs significantly impact strategy performance**
- **Attention weights reveal which time steps the model focuses on**

### Future Improvements
- Add more sophisticated position sizing (Kelly criterion, risk parity)
- Implement ensemble methods with multiple transformers
- Include regime detection for adaptive strategies
- Add alternative data sources (sentiment, fundamentals)
- Implement online learning for model adaptation

In [None]:
# Final summary: recap of data, model, performance, risk and accuracy using
# values computed in the earlier cells. The section icons are written as
# proper Unicode emoji (they had been corrupted into mojibake).
print("="*70)
print("TRANSFORMER TRADING SYSTEM - FINAL SUMMARY")
print("="*70)

print("\n📊 DATA")
print(f"   Assets: {tickers}")
print(f"   Period: {start_date} to {end_date}")
print(f"   Features: {len(feature_cols)}")

print("\n🏗️ MODEL ARCHITECTURE")
print("   Type: Transformer Encoder")
# NOTE(review): layers / heads / model dimension are hard-coded literals here;
# confirm they match the configuration the model was actually built with.
print("   Layers: 3")
print("   Attention Heads: 4")
print("   Model Dimension: 64")
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("\n📈 STRATEGY PERFORMANCE (Test Period)")
print(f"   Total Return: {metrics['strategy_total_return']*100:.2f}%")
print(f"   Sharpe Ratio: {metrics['strategy_sharpe']:.3f}")
print(f"   Max Drawdown: {metrics['strategy_max_drawdown']*100:.2f}%")

print("\n📊 BENCHMARK (Buy & Hold)")
print(f"   Total Return: {metrics['benchmark_total_return']*100:.2f}%")
print(f"   Sharpe Ratio: {metrics['benchmark_sharpe']:.3f}")

print("\n⚠️ RISK METRICS")
print(f"   VaR 95%: {risk_metrics['var_95']*100:.4f}%")
print(f"   CVaR 95%: {risk_metrics['cvar_95']*100:.4f}%")

print("\n🎯 MODEL ACCURACY")
print(f"   Accuracy: {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   F1 Score: {f1:.4f}")

print("\n" + "="*70)
print("Notebook completed successfully!")
print("="*70)