# Data Exploration - Statistical Arbitrage RL

This notebook explores the S&P 500 universe, sector distributions, and price characteristics for pairs trading.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import sys

# Add parent directory to path
sys.path.append('..')

from data_acquisition import DataAcquisition

# Notebook-wide plotting defaults: seaborn's dark-grid theme and a 14x8 inch
# default figure size (cells that pass their own figsize override the latter).
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("Libraries imported successfully")

## 1. Load Configuration and Data

In [None]:
# Parse the project configuration (safe_load: plain YAML data only)
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Echo the settings this notebook relies on so the run is self-describing
print("Configuration loaded:")
data_cfg = config['data']
print(f"  Universe: {data_cfg['universe']}")
print(f"  Target sectors: {config['pair_selection']['sectors']}")
train_window = f"{data_cfg['train_start']} to {data_cfg['train_end']}"
print(f"  Training period: {train_window}")
test_window = f"{data_cfg['test_start']} to {data_cfg['test_end']}"
print(f"  Testing period: {test_window}")

In [None]:
# Build the acquisition helper from the same config file and pull the dataset.
# `dataset` is indexed by key below and in later cells
# ('metadata', 'constituents', 'prices', 'volumes').
data_acq = DataAcquisition('../config.yaml')
dataset = data_acq.fetch_full_dataset()

print("\n=== Dataset Summary ===")
metadata = dataset['metadata']
for label, key in (
    ("Total tickers", 'total_tickers'),
    ("Date range", 'date_range'),
    ("Trading days", 'trading_days'),
):
    print(f"{label}: {metadata[key]}")

## 2. Sector Analysis

In [None]:
# Sector distribution: how many constituents fall in each sector
constituents = dataset['constituents']
sector_counts = constituents['sector'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: absolute counts
sector_counts.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Stocks per Sector', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Sector', ylabel='Number of Stocks')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the same counts as shares of the whole
pie_ax.pie(
    sector_counts,
    labels=sector_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Sector Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Text version of the chart; percentages are relative to all constituent rows
total_constituents = len(constituents)
print("\nSector breakdown:")
for sector_name, n_stocks in sector_counts.items():
    share = n_stocks / total_constituents * 100
    print(f"  {sector_name}: {n_stocks} stocks ({share:.1f}%)")

## 3. Price Data Quality

In [None]:
# Check for missing data
prices = dataset['prices']

# Per-ticker count and percentage of NaN observations over the full date range
missing_data = prices.isna().sum()
missing_pct = (missing_data / len(prices)) * 100

# Rank only tickers that actually have gaps. Taking nlargest() on the
# unfiltered series would pad the report with 0%-missing tickers whenever
# fewer than MAX_TICKERS_SHOWN tickers have any missing values.
MAX_TICKERS_SHOWN = 10
top_missing = missing_pct[missing_pct > 0].nlargest(MAX_TICKERS_SHOWN)

if prices.empty:
    # With no rows or columns the percentages are undefined, so do not
    # report this as a clean dataset.
    print("Price table is empty - nothing to check.")
elif not top_missing.empty:
    print(f"Top {len(top_missing)} tickers with missing data:")
    for ticker, pct in top_missing.items():
        print(f"  {ticker}: {pct:.2f}%")
else:
    print("No significant missing data found!")

In [None]:
# Count, for every date, how many tickers have a non-missing price
data_availability = prices.notna().sum(axis=1)

avail_fig, avail_ax = plt.subplots(figsize=(14, 6))
avail_ax.plot(prices.index, data_availability, linewidth=1.5, color='darkgreen')
avail_ax.set_title('Data Availability Over Time', fontsize=14, fontweight='bold')
avail_ax.set_xlabel('Date')
avail_ax.set_ylabel('Number of Stocks with Data')
avail_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Summary of the daily coverage series plotted above
print(f"\nAverage stocks with data per day: {data_availability.mean():.1f}")
print(f"Minimum: {data_availability.min()}")
print(f"Maximum: {data_availability.max()}")

## 4. Price Statistics by Sector

In [None]:
# Calculate daily simple returns.
# - fill_method=None keeps gaps as NaN instead of forward-filling prices
#   (forward-filling would fabricate 0% returns across missing days and the
#   implicit default is deprecated in recent pandas).
# - dropna(how='all') removes only dates with no return for any ticker (such
#   as the first row). A plain dropna() would discard every date on which a
#   single ticker is missing, which can empty the table when one stock has a
#   short price history.
returns = prices.pct_change(fill_method=None).dropna(how='all')

# Annualization factor used throughout this cell
TRADING_DAYS_PER_YEAR = 252

# Per-ticker statistics; pandas reductions skip NaN by default, so each
# ticker is measured over the days on which it actually has a return.
daily_mean = returns.mean()
daily_std = returns.std()
# Mask zero/undefined volatility so the ratio becomes NaN (ignored by the
# sector average below) instead of +/-inf.
usable_std = daily_std.where(daily_std > 0)

ticker_stats = pd.DataFrame({
    'mean_return': daily_mean * TRADING_DAYS_PER_YEAR,            # Annualized
    'volatility': daily_std * np.sqrt(TRADING_DAYS_PER_YEAR),     # Annualized
    # Ratio of mean to standard deviation of daily returns, annualized;
    # no risk-free rate is subtracted.
    'sharpe': (daily_mean / usable_std) * np.sqrt(TRADING_DAYS_PER_YEAR),
})

# Merge with sector info: one row per constituent that has return data,
# in constituent order, with columns ticker, sector, mean_return,
# volatility, sharpe.
sector_stats = (
    constituents[['ticker', 'sector']]
    .merge(ticker_stats, left_on='ticker', right_index=True, how='inner')
    .reset_index(drop=True)
)

# Aggregate by sector (simple average across the sector's tickers)
sector_agg = (
    sector_stats
    .groupby('sector')[['mean_return', 'volatility', 'sharpe']]
    .mean()
    .round(3)
)

print("\n=== Sector Statistics (Annualized) ===")
print(sector_agg)

In [None]:
# Risk-return scatter: one point per ticker, one colour/legend entry per sector
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 8))

for sector_name in sector_stats['sector'].unique():
    in_sector = sector_stats['sector'] == sector_name
    members = sector_stats.loc[in_sector]
    scatter_ax.scatter(
        members['volatility'],
        members['mean_return'],
        label=sector_name,
        alpha=0.6,
        s=100,
    )

scatter_ax.set_xlabel('Annualized Volatility', fontsize=12)
scatter_ax.set_ylabel('Annualized Return', fontsize=12)
scatter_ax.set_title('Risk-Return Profile by Sector', fontsize=14, fontweight='bold')
# Anchor the legend just outside the right edge so it does not cover points
scatter_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
scatter_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Sample Stock Pairs Visualization

In [None]:
# Plot sample pairs from target sectors.
# Each entry is (first ticker, second ticker, label shown in the panel title).
sample_pairs = [
    ('MSFT', 'GOOGL', 'Technology'),
    ('CVS', 'JNJ', 'Healthcare'),
    ('PG', 'KO', 'Consumer Goods')
]

# squeeze=False always returns a 2-D array of Axes, so indexing below works
# even if sample_pairs is reduced to a single pair.
fig, axes_grid = plt.subplots(len(sample_pairs), 1, figsize=(14, 12), squeeze=False)
axes = axes_grid[:, 0]

for ax, (ticker1, ticker2, sector) in zip(axes, sample_pairs):
    pair_title = f'{sector}: {ticker1} vs {ticker2}'

    # Rebase each series to 100 at its first *observed* price. Dividing by
    # .iloc[0] would turn the whole line into NaN when the first row is missing.
    rebased = {}
    for symbol in (ticker1, ticker2):
        if symbol not in prices.columns:
            continue
        observed = prices[symbol].dropna()
        if observed.empty:
            continue
        rebased[symbol] = prices[symbol] / observed.iloc[0] * 100

    unavailable = [s for s in (ticker1, ticker2) if s not in rebased]
    if unavailable:
        # Say why the panel is blank instead of leaving it silently empty
        skipped = ', '.join(unavailable)
        ax.set_title(f'{pair_title} (no price data for {skipped})',
                     fontsize=12, fontweight='bold')
        continue

    for symbol in (ticker1, ticker2):
        ax.plot(rebased[symbol].index, rebased[symbol], label=symbol, linewidth=2)
    ax.set_title(pair_title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Normalized Price (Base=100)')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the shared x-axis label
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

## 6. Volume Analysis

In [None]:
# Volume statistics: mean daily volume per ticker, largest first
volumes = dataset['volumes']
avg_volume_by_ticker = volumes.mean().sort_values(ascending=False)

# Keep the ten tickers with the highest average volume
top_liquid = avg_volume_by_ticker.iloc[:10]

volume_fig, volume_ax = plt.subplots(figsize=(12, 6))
top_liquid.plot.barh(ax=volume_ax, color='coral')
volume_ax.set_title('Top 10 Most Liquid Stocks (Avg Daily Volume)', fontsize=14, fontweight='bold')
volume_ax.set_xlabel('Average Daily Volume')
volume_ax.set_ylabel('Ticker')
# Vertical grid lines only, to help read bar lengths
volume_ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("\nTop 10 most liquid stocks:")
for symbol, mean_volume in top_liquid.items():
    print(f"  {symbol}: {mean_volume:,.0f}")

## 7. Train/Test Split

In [None]:
# Split the price table into training and testing frames
train_prices, test_prices = data_acq.split_train_test(prices)

# Report the first/last date, row count and column count of each frame
train_start, train_end = train_prices.index[0], train_prices.index[-1]
print(f"Training period: {train_start} to {train_end}")
print(f"  Trading days: {len(train_prices)}")
print(f"  Stocks: {train_prices.shape[1]}")

test_start, test_end = test_prices.index[0], test_prices.index[-1]
print(f"\nTesting period: {test_start} to {test_end}")
print(f"  Trading days: {len(test_prices)}")
print(f"  Stocks: {test_prices.shape[1]}")

# Visualize split as two shaded date bands (no price data is drawn)
split_fig, split_ax = plt.subplots(figsize=(14, 6))
split_ax.axvspan(train_start, train_end, alpha=0.3, color='blue', label='Training')
split_ax.axvspan(test_start, test_end, alpha=0.3, color='orange', label='Testing')
split_ax.set_title('Train/Test Split', fontsize=14, fontweight='bold')
split_ax.set_xlabel('Date')
split_ax.set_ylabel('Period')
split_ax.legend()
split_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Summary

This notebook explored the S&P 500 dataset for statistical arbitrage:

- **Universe**: Focused on 4 target sectors (Technology, Healthcare, Consumer Goods, Financials)
- **Data Quality**: Verified completeness and availability
- **Sector Characteristics**: Analyzed risk-return profiles
- **Sample Pairs**: Visualized co-movement patterns
- **Liquidity**: Identified most liquid stocks for trading
- **Data Split**: Prepared train (2022) and test (2023) periods

**Next**: Proceed to pair selection using correlation analysis and EMRT calculation.