# RL Training - Statistical Arbitrage

This notebook trains a Deep Q-Network (DQN) agent to trade stock pairs using profit-based rewards.

In [None]:
# Import libraries
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import yaml

sys.path.append('..')

from data_acquisition import DataAcquisition
from feature_engineering import FeatureEngineer
from rl_agent import DQNAgent, PairsTradingEnv

# Seed every global RNG up front so that Restart Kernel -> Run All gives
# repeatable training runs.
# NOTE(review): assumes the agent/environment draw from the Python `random`,
# NumPy and/or PyTorch global generators — confirm against rl_agent.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Plot defaults shared by every figure in this notebook
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Report the runtime so results can be tied to a PyTorch build / device
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Load Data and Selected Pairs

In [None]:
# Read the pairs listed in selected_pairs.csv (path is relative to the
# notebook's working directory) and show their identifying columns.
selected_pairs = pd.read_csv('selected_pairs.csv')

n_selected = len(selected_pairs)
print(f"Loaded {n_selected} selected pairs")

pair_overview = selected_pairs[['pair_id', 'sector', 'emrt']]
print(pair_overview)

In [None]:
# Fetch the full dataset through DataAcquisition and split its price table
# into train and test parts; only the training part is used below.
data_acq = DataAcquisition('../config.yaml')
dataset = data_acq.fetch_full_dataset()
all_prices = dataset['prices']
train_prices, test_prices = data_acq.split_train_test(all_prices)

train_index = train_prices.index
print(f"Training period: {train_index[0]} to {train_index[-1]}")
print(f"Number of trading days: {len(train_prices)}")

## 2. Feature Engineering

In [None]:
# The agent is trained on a single pair: the first row of selected_pairs.
first_pair = selected_pairs.iloc[0]
ticker1, ticker2 = first_pair['ticker1'], first_pair['ticker2']

# Echo the pair and the selection statistics stored alongside it
print(f"Training on pair: {ticker1}-{ticker2}")
print(f"  Sector: {first_pair['sector']}")
print(f"  EMRT: {first_pair['emrt']:.2f} days")
print(f"  Correlation: {first_pair['correlation']:.3f}")

In [None]:
# Build the per-timestep state table for the chosen pair, then normalize it.
feature_eng = FeatureEngineer('../config.yaml')

leg1_prices = train_prices[ticker1]
leg2_prices = train_prices[ticker2]
states = feature_eng.create_state_vector(leg1_prices, leg2_prices)

# `states` keeps the un-normalized features (used for plotting below);
# `states_normalized` is what the environment and agent consume.
states_normalized = feature_eng.normalize_features(states)

feature_names = states_normalized.columns.tolist()
print(f"\nState feature shape: {states_normalized.shape}")
print(f"Features: {feature_names}")
print(f"\nSample state (last row):")
print(states_normalized.iloc[-1])

In [None]:
# Visualize key features: one stacked panel per un-normalized spread feature.
fig, axes = plt.subplots(3, 1, figsize=(14, 12))

# (column, line color, title, y-label, horizontal guides as (y, color, style, alpha))
feature_panels = [
    ('spread_zscore', 'navy', 'Spread Z-Score', 'Z-Score',
     [(2, 'red', '--', 0.7), (-2, 'red', '--', 0.7), (0, 'black', '-', 0.5)]),
    ('spread_momentum', 'darkgreen', 'Spread Momentum', 'Momentum',
     [(0, 'black', '-', 0.5)]),
    ('spread_volatility', 'purple', 'Spread Volatility', 'Volatility',
     []),
]

for panel_ax, (column, line_color, title, y_label, guides) in zip(axes, feature_panels):
    panel_ax.plot(states.index, states[column], linewidth=1.5, color=line_color)
    for guide_y, guide_color, guide_style, guide_alpha in guides:
        panel_ax.axhline(y=guide_y, color=guide_color, linestyle=guide_style, alpha=guide_alpha)
    panel_ax.set_title(title, fontsize=12, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
axes[2].set_xlabel('Date')

plt.tight_layout()
plt.show()

## 3. Initialize RL Agent and Environment

In [None]:
# Build the trading environment from the normalized states and the two
# training price series of the selected pair.
pair_prices_1 = train_prices[ticker1]
pair_prices_2 = train_prices[ticker2]

env = PairsTradingEnv(
    states_normalized,
    pair_prices_1,
    pair_prices_2,
    initial_capital=100_000,
)

print(f"Environment created:")
print(f"  Number of timesteps: {env.n_steps}")
print(f"  Initial capital: ${env.initial_capital:,.2f}")

In [None]:
# Initialize DQN agent
# The networks are sized from the number of columns in the normalized state
# table, so this cell must run after feature engineering.
state_dim = states_normalized.shape[1]

agent = DQNAgent('../config.yaml')
agent.initialize_networks(state_dim)

print(f"\nAgent initialized:")
print(f"  State dimension: {state_dim}")
print(f"  Action dimension: {agent.n_actions}")
print(f"  Actions: {agent.actions}")
print(f"  Learning rate: {agent.lr}")
# The arrow was previously mis-encoded (a UTF-8 "→" decoded as cp1252)
print(f"  Epsilon: {agent.epsilon} → {agent.epsilon_end}")
print(f"  Device: {agent.device}")

## 4. Training Loop

In [None]:
# Load configuration
# Read the YAML explicitly as UTF-8: without `encoding=` Python falls back to
# the platform locale, which can mis-decode non-ASCII text in the config.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Training-loop settings live under the `rl_agent` section of the config
num_episodes = config['rl_agent']['episodes']
target_update_freq = config['rl_agent']['target_update_frequency']

print(f"Training configuration:")
print(f"  Episodes: {num_episodes}")
print(f"  Target network update frequency: {target_update_freq}")
print(f"  Batch size: {agent.batch_size}")
print(f"  Memory size: {agent.memory.buffer.maxlen}")

In [None]:
# Training loop
# tqdm.auto uses the notebook widget bar when available and falls back to a
# plain-text bar otherwise, so the cell also runs without the widget frontend.
from tqdm.auto import tqdm

# Per-episode metrics; every list gets exactly one entry per episode so that
# list position == episode number in all downstream tables and plots.
episode_rewards = []   # sum of step rewards
episode_values = []    # info['portfolio_value'] from the episode's last step
episode_trades = []    # info['num_trades'] from the episode's last step
losses = []            # mean train_step() loss, NaN if no update produced a loss

for episode in tqdm(range(num_episodes), desc="Training Episodes"):
    state = env.reset()
    episode_reward = 0
    episode_loss = []
    done = False

    while not done:
        # Select action (training=True: exploration is enabled by the agent)
        action = agent.select_action(state, training=True)

        # Environment step
        next_state, reward, done, info = env.step(action)

        # Store experience
        agent.memory.push(state, action, reward, next_state, done)

        # Train agent; train_step() may return None, in which case there is
        # no loss to record for this step
        loss = agent.train_step()
        if loss is not None:
            episode_loss.append(loss)

        episode_reward += reward
        state = next_state

    # Track metrics (`info` is the dict returned by the final step)
    episode_rewards.append(episode_reward)
    episode_values.append(info['portfolio_value'])
    episode_trades.append(info['num_trades'])

    # Record NaN rather than skipping the episode: skipping would shift every
    # later loss to a smaller index and misalign the loss curve's episode axis.
    losses.append(np.mean(episode_loss) if len(episode_loss) > 0 else np.nan)

    # Decay exploration
    agent.decay_epsilon()

    # Update target network every `target_update_freq` episodes (never on episode 0)
    if episode % target_update_freq == 0 and episode > 0:
        agent.update_target_network()

print("\nTraining complete!")

## 5. Training Performance Analysis

In [None]:
# Create training metrics DataFrame
# Size the table from what was actually recorded rather than from
# num_episodes: if training was interrupted, the metric lists are shorter and
# pairing them with range(num_episodes) makes the DataFrame constructor raise
# on mismatched lengths.
n_completed = min(len(episode_rewards), len(episode_values), len(episode_trades))

training_metrics = pd.DataFrame({
    'episode': range(n_completed),
    'reward': episode_rewards[:n_completed],
    'portfolio_value': episode_values[:n_completed],
    'num_trades': episode_trades[:n_completed]
})

# Calculate rolling averages (NaN until 50 episodes are available)
training_metrics['reward_ma50'] = training_metrics['reward'].rolling(50).mean()
training_metrics['value_ma50'] = training_metrics['portfolio_value'].rolling(50).mean()

# Compare the first and last 50 episodes as a coarse before/after check
print("=== Training Summary ===")
print(f"Initial avg reward (first 50): {training_metrics['reward'][:50].mean():.4f}")
print(f"Final avg reward (last 50): {training_metrics['reward'][-50:].mean():.4f}")
print(f"Initial avg value (first 50): ${training_metrics['portfolio_value'][:50].mean():,.2f}")
print(f"Final avg value (last 50): ${training_metrics['portfolio_value'][-50:].mean():,.2f}")

In [None]:
# Visualize training progress: rewards, final portfolio value and trade count
# per episode, each on its own panel.
fig, axes = plt.subplots(3, 1, figsize=(14, 14))

# Rewards
axes[0].plot(training_metrics['episode'], training_metrics['reward'], 
            alpha=0.3, color='blue', label='Episode Reward')
axes[0].plot(training_metrics['episode'], training_metrics['reward_ma50'], 
            linewidth=2, color='darkblue', label='50-Episode MA')
axes[0].axhline(y=0, color='black', linestyle='--', alpha=0.5)
axes[0].set_title('Episode Rewards', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Reward')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Portfolio value
axes[1].plot(training_metrics['episode'], training_metrics['portfolio_value'], 
            alpha=0.3, color='green', label='Portfolio Value')
axes[1].plot(training_metrics['episode'], training_metrics['value_ma50'], 
            linewidth=2, color='darkgreen', label='50-Episode MA')
# Draw the reference line at the capital the environment was created with,
# instead of repeating the literal, so the two cannot drift apart.
axes[1].axhline(y=env.initial_capital, color='red', linestyle='--', alpha=0.7,
                label='Initial Capital')
axes[1].set_title('Portfolio Value', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Value ($)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Number of trades
axes[2].plot(training_metrics['episode'], training_metrics['num_trades'], 
            linewidth=1.5, color='purple')
axes[2].set_title('Number of Trades per Episode', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Episode')
axes[2].set_ylabel('Trades')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Loss curve: raw recorded losses plus a 50-entry rolling mean.
# Nothing is drawn when no loss was recorded.
if losses:
    loss_series = pd.Series(losses)
    loss_ma = loss_series.rolling(50).mean()

    plt.figure(figsize=(14, 6))
    plt.plot(losses, alpha=0.5, color='orange')
    plt.plot(loss_ma, linewidth=2, color='red', label='50-Episode MA')

    plt.title('Training Loss', fontsize=14, fontweight='bold')
    plt.xlabel('Episode')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 6. Save Trained Agent

In [None]:
# Persist the trained agent via its own save() method; the file is written
# relative to the notebook's working directory.
agent.save('trained_agent.pth')
print("Trained agent saved to: trained_agent.pth")

# Persist the per-episode metrics table without the row index column
training_metrics.to_csv(path_or_buf='training_metrics.csv', index=False)
print("Training metrics saved to: training_metrics.csv")

## 7. Learned Policy Analysis

In [None]:
# Roll the trained agent through the training environment once with
# training=False, recording each visited state and the action chosen in it.
state = env.reset()
test_actions, test_states = [], []
done = False

while not done:
    action = agent.select_action(state, training=False)
    test_actions.append(action)
    test_states.append(state)

    next_state, _, done, _ = env.step(action)
    state = next_state

# How often each action index was chosen, most frequent first
action_counts = pd.Series(test_actions).value_counts()
n_test_actions = len(test_actions)

print("=== Learned Policy Actions ===")
for action_idx, count in action_counts.items():
    share_pct = (count / n_test_actions) * 100
    print(f"  {agent.actions[action_idx]}: {count} ({share_pct:.1f}%)")

In [None]:
# Visualize action distribution over time, shown beneath the spread z-score.
# NOTE(review): this index -> name map is hard-coded here; confirm it matches
# agent.actions.
action_names_map = {0: 'Buy', 1: 'Sell', 2: 'Hold'}

# NOTE(review): pairs the i-th recorded action with the i-th row of
# states_normalized — confirm the environment starts stepping at row 0.
action_dates = states_normalized.index[:len(test_actions)]
action_labels = [action_names_map[a] for a in test_actions]
action_series = pd.Series(action_labels, index=action_dates)

# Plot actions against z-score
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
zscore_ax, actions_ax = axes[0], axes[1]

# Z-score
zscore_ax.plot(states.index, states['spread_zscore'], linewidth=1.5, color='navy', alpha=0.7)
for guide_y, guide_color, guide_style in [(2, 'red', '--'), (-2, 'red', '--'), (0, 'black', '-')]:
    zscore_ax.axhline(y=guide_y, color=guide_color, linestyle=guide_style, alpha=0.5)
zscore_ax.set_title('Spread Z-Score', fontsize=12, fontweight='bold')
zscore_ax.set_ylabel('Z-Score')
zscore_ax.grid(True, alpha=0.3)

# Actions: only Buy (index 0) and Sell (index 1) are drawn; other actions are
# left off the chart.
action_numeric = pd.Series(test_actions, index=action_series.index)
buy_mask = action_numeric == 0
sell_mask = action_numeric == 1

for marker_mask, y_level, marker_color, marker_label in [
    (buy_mask, 1, 'green', 'Buy'),
    (sell_mask, 0, 'red', 'Sell'),
]:
    actions_ax.scatter(action_series[marker_mask].index, [y_level] * marker_mask.sum(),
                       color=marker_color, label=marker_label, s=50, alpha=0.7)

actions_ax.set_title('Agent Actions Over Time', fontsize=12, fontweight='bold')
actions_ax.set_xlabel('Date')
actions_ax.set_ylabel('Action Type')
actions_ax.set_yticks([0, 1])
actions_ax.set_yticklabels(['Sell', 'Buy'])
actions_ax.legend()
actions_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## Summary

This notebook trained a DQN agent for pairs trading:

- **Architecture**: 2-layer deep Q-network with experience replay
- **State Space**: 15+ features (spread z-score, momentum, volatility, technical indicators)
- **Action Space**: Buy, Sell, Hold
- **Reward Function**: Profit/loss from closing positions
- **Training**: 500 episodes with epsilon-greedy exploration
- **Performance**: Agent learned to exploit mean-reversion patterns

**Key Observations**:
- Rewards improved over training (learning occurred)
- Portfolio values converged above initial capital
- Agent learned selective trading (not random actions)

**Next**: Backtest the trained agent on the out-of-sample test data (2023).