# Foreign Market Lead-Lag ML Strategy
## Notebook 1: Data Exploration

This notebook explores the data for the Foreign Market Lead-Lag ML strategy:
- Download S&P 500 constituent data
- Download data for 47 foreign market ETFs
- Analyze data quality and coverage
- Visualize correlations and lead-lag relationships

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import warnings
warnings.filterwarnings('ignore')

from data_acquisition import DataAcquisition

# Set plotting style for every figure drawn in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require matplotlib >= 3.6
# (older releases spell it 'seaborn-darkgrid') — confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's 'husl' palette the default colour cycle.
sns.set_palette('husl')

%matplotlib inline
%load_ext autoreload
%autoreload 2

## 1. Load Configuration

In [None]:
# Read the strategy settings from the YAML file one directory above the notebook.
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Echo the settings this notebook relies on so the run is self-describing.
print("Strategy Configuration:")
data_cfg = config['data']
print(f"  Target Universe: {data_cfg['target_universe']}")
print(f"  Date Range: {data_cfg['start_date']} to {data_cfg['end_date']}")
n_foreign_markets = len(data_cfg['foreign_markets'])
print(f"  Foreign Markets: {n_foreign_markets}")
print(f"  Lags: {config['features']['lags']}")

## 2. Download Data

In [None]:
# Initialize data acquisition
# Build the project's data-acquisition helper from the loaded config dict.
data_acq = DataAcquisition(config)

# Download all data
# NOTE(review): no cache check is visible in this cell, so this presumably
# re-downloads on every run — confirm against data_acquisition.py.
print("Downloading data... (this may take several minutes)")
# The three returned objects are used below as S&P 500 prices, S&P 500 returns
# and foreign-market returns respectively.
sp500_prices, sp500_returns, foreign_returns = data_acq.get_all_data()

# Save data
# Hand the three datasets to save_data together with the '../data' location;
# file names and format are decided inside save_data (not visible here).
data_acq.save_data(sp500_prices, sp500_returns, foreign_returns, '../data')

print("\nData downloaded successfully!")

## 3. Data Summary

In [None]:
def _avg_mean_and_std(returns):
    """Return (mean of per-column means, mean of per-column stds) for a returns frame."""
    return returns.mean().mean(), returns.std().mean()


# --- S&P 500 prices: size, covered span and count of missing cells ---
price_dates = sp500_prices.index
n_missing_prices = sp500_prices.isnull().sum().sum()
print("S&P 500 Daily Prices:")
print(f"  Shape: {sp500_prices.shape}")
print(f"  Date Range: {price_dates[0]} to {price_dates[-1]}")
print(f"  Missing Values: {n_missing_prices}")

# --- S&P 500 returns: size and average moments across columns ---
sp500_mean, sp500_std = _avg_mean_and_std(sp500_returns)
print("\nS&P 500 Daily Returns:")
print(f"  Shape: {sp500_returns.shape}")
print(f"  Mean: {sp500_mean:.4f}")
print(f"  Std: {sp500_std:.4f}")

# --- Foreign-market returns: size, covered span and average moments ---
foreign_dates = foreign_returns.index
foreign_mean, foreign_std = _avg_mean_and_std(foreign_returns)
print("\nForeign Weekly Returns:")
print(f"  Shape: {foreign_returns.shape}")
print(f"  Date Range: {foreign_dates[0]} to {foreign_dates[-1]}")
print(f"  Mean: {foreign_mean:.4f}")
print(f"  Std: {foreign_std:.4f}")

## 4. Visualize Foreign Market Returns

In [None]:
# Plot cumulative returns for the best-performing foreign markets.

# Number of markets to highlight; drives the ranking slice, the plot title and
# the printed table so the three can never disagree.
TOP_N = 10

# Growth of one unit invested in each market, compounded over the return series.
# cumprod skips NaNs by default, so a missing observation leaves a gap in the
# path instead of wiping out everything after it.
cumulative_returns = (1 + foreign_returns).cumprod()

# Rank markets by their last *available* cumulative value. Reading the raw last
# row would rank a market with a missing final observation as NaN; forward-fill
# carries its latest value to the end, and dropna removes markets with no data
# at all so they are never plotted or printed as "nan%".
total_returns = (
    cumulative_returns.ffill().iloc[-1].dropna().sort_values(ascending=False)
)
top_returns = total_returns.head(TOP_N)
top_markets = top_returns.index

fig, ax = plt.subplots(figsize=(15, 8))

for market in top_markets:
    ax.plot(cumulative_returns.index, cumulative_returns[market],
            label=market, linewidth=2, alpha=0.7)

ax.set_title(f'Cumulative Returns: Top {TOP_N} Foreign Markets',
             fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Return')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Total return in percent = final cumulative growth factor minus the initial 1.
print(f"\nTop {TOP_N} Markets by Total Return:")
for market, ret in top_returns.items():
    print(f"  {market}: {(ret - 1) * 100:.1f}%")

## 5. Correlation Analysis

In [None]:
# Calculate correlation between foreign markets and S&P 500

# Compound the S&P 500 returns into Friday-anchored weekly returns.
# min_count=1 keeps a stock/week with no observations as NaN. The default
# (min_count=0) would yield an empty product of 1, i.e. a fabricated 0% return
# that drags the cross-sectional average below toward zero.
sp500_weekly = (1 + sp500_returns).resample('W-FRI').prod(min_count=1) - 1

# Align dates: keep only the weeks present in both datasets.
common_dates = foreign_returns.index.intersection(sp500_weekly.index)
if common_dates.empty:
    # Fail with a clear message instead of producing an empty/NaN chart below.
    raise ValueError(
        "No overlapping dates between foreign_returns and the W-FRI weekly "
        "S&P 500 returns; check that both use the same week-ending anchor."
    )
foreign_aligned = foreign_returns.loc[common_dates]
sp500_aligned = sp500_weekly.loc[common_dates]

# Equal-weighted average weekly return across the S&P 500 columns
# (NaN entries are skipped by mean, so only stocks with data contribute).
sp500_avg = sp500_aligned.mean(axis=1)

# Same-week correlation of each foreign market with that average.
correlations = foreign_aligned.corrwith(sp500_avg).sort_values(ascending=False)

# Plot correlations
fig, ax = plt.subplots(figsize=(12, 8))
correlations.plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('Correlation: Foreign Markets vs S&P 500', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation')
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Correlated Markets:")
for market, corr in correlations.head(10).items():
    print(f"  {market}: {corr:.3f}")

## 6. Lead-Lag Analysis

In [None]:
# Lead-lag test: correlate each foreign market's return from `lag` rows earlier
# with the current-row average S&P 500 return.

lags = [1, 2, 3, 4]

# One entry per lag; shifting the foreign frame forward by `lag` rows pairs
# each S&P 500 observation with the foreign return observed `lag` rows before.
lag_correlations = {
    f'Lag {lag}': foreign_aligned.shift(lag).corrwith(sp500_avg)
    for lag in lags
}

# Rows = foreign markets, columns = lags.
lag_corr_df = pd.DataFrame(lag_correlations)

# Heatmap of the lagged correlations, colour scale centred on zero.
fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(
    lag_corr_df,
    annot=False,
    cmap='RdYlGn',
    center=0,
    cbar_kws={'label': 'Correlation'},
    ax=ax,
)
ax.set_title(
    'Lead-Lag Correlations: Foreign Markets vs S&P 500',
    fontsize=14,
    fontweight='bold',
)
ax.set_xlabel('Lag (weeks)')
ax.set_ylabel('Foreign Market')
plt.tight_layout()
plt.show()

# For each lag, report the market whose correlation is largest in magnitude
# (the signed value is printed).
print("\nMarkets with Strongest Lagged Correlations:")
for lag_label, lag_column in lag_corr_df.items():
    strongest_market = lag_column.abs().idxmax()
    strongest_corr = lag_column.loc[strongest_market]
    print(f"  {lag_label}: {strongest_market} ({strongest_corr:.3f})")

## 7. Data Quality Check

In [None]:
# Share of missing observations per market, in percent of all rows.
n_rows = len(foreign_returns)
missing_pct = (foreign_returns.isna().sum() / n_rows) * 100
markets_with_gaps = missing_pct[missing_pct > 0]

print("Missing Data by Market:")
if markets_with_gaps.empty:
    print("  No missing data!")
else:
    print(markets_with_gaps.sort_values(ascending=False))

# Count, per market, the observations whose absolute return exceeds 50%.
extreme_returns = (foreign_returns.abs() > 0.5).sum()
markets_with_extremes = extreme_returns[extreme_returns > 0]

print("\nExtreme Returns (>50%) by Market:")
if markets_with_extremes.empty:
    print("  No extreme returns detected")
else:
    print(markets_with_extremes.sort_values(ascending=False))

## Summary

This notebook explored the data for the Foreign Market Lead-Lag ML strategy:
- Downloaded S&P 500 and foreign market data
- Analyzed correlations between foreign markets and S&P 500
- Examined lead-lag relationships at different time horizons
- Verified data quality

**Next Steps**: Proceed to Notebook 2 for feature engineering.