# Exploratory Data Analysis notebook
This notebook explores the raw data specified in settings.yaml and processes it for backtesting.

## Key Activities:
    1. Download the raw data for the tickers listed in settings.yaml
    2. Remove bad data from the dataset
    3. Add the technical indicators
    4. Plot graphs of these indicators
    5. Save the new dataset into data/processed

In [None]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import yaml
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path to import custom modules
sys.path.append('../src')
from data_pipeline import download_raw_data, preprocess_data, technical_indicators, save_data
from signals import zscore_normalize, order_book_imbalance

# Set plotting style: select matplotlib's 'default' style sheet, then switch
# seaborn's colour palette to "husl" for the figures drawn further down.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation message; reaching this line means every import above succeeded.
print("Libraries imported successfully!")

In [None]:
# Load configuration from settings.yaml
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('../configs/settings.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Extract configuration parameters once; the printout below and every later
# cell read from these names. A missing key raises KeyError right here.
data_config = config['data']
path_config = config['paths']

tickers = data_config['tickers']
start_date = data_config['start_date']
end_date = data_config['end_date']
ma_windows = data_config['moving_average_windows']
raw_data_path = path_config['raw_data_paths']
processed_data_path = path_config['processed_data_paths']

print("Configuration loaded:")
print(f"Tickers: {tickers}")
print(f"Date range: {start_date} to {end_date}")
print(f"Moving average windows: {ma_windows}")
print(f"Technical indicators: {data_config['technical_indicators']}")

In [None]:
# Step 1: Download raw data for all tickers
print("Downloading raw data for all tickers...")
raw_data = {}  # ticker -> downloaded frame; only non-empty downloads are kept

for ticker in tickers:
    # Keep the try body down to the call that can fail so that unrelated
    # errors are not reported as download errors.
    try:
        print(f"Downloading data for {ticker}...")
        data = download_raw_data(ticker, start_date, end_date)
    except Exception as e:
        # Best-effort: report the failure and carry on with the other tickers.
        print(f"✗ Error downloading {ticker}: {str(e)}")
        continue

    # A download that yields no rows is treated as a failure. Storing it
    # would make the summary below crash on data.index[0].
    if data is None or len(data) == 0:
        print(f"✗ Error downloading {ticker}: no records returned")
        continue

    raw_data[ticker] = data
    print(f"✓ Successfully downloaded {len(data)} records for {ticker}")

print(f"\nCompleted downloading data for {len(raw_data)} tickers out of {len(tickers)} requested.")

# Display summary of downloaded data (first and last index entries per ticker)
print("\nData Summary:")
for ticker, data in raw_data.items():
    print(f"{ticker}: {len(data)} records from {data.index[0].date()} to {data.index[-1].date()}")

In [None]:
# Step 2: Data Quality Assessment and Preprocessing
print("Performing data quality assessment...")

# Per-ticker report: frame shape, total missing cells, repeated index labels
# and the number of one-step close-price moves larger than 20%.
for ticker, data in raw_data.items():
    print(f"\n{ticker} - Data Quality Check:")
    print(f"  Shape: {data.shape}")
    print(f"  Missing values: {data.isnull().sum().sum()}")
    print(f"  Duplicate indices: {data.index.duplicated().sum()}")

    # The outlier check only applies to frames that carry a close price
    if 'Close' not in data.columns:
        continue

    close_pct_change = data['Close'].pct_change()
    extreme_moves = close_pct_change[close_pct_change.abs() > 0.2]
    if not extreme_moves.empty:
        print(f"  Extreme price movements (>20%): {len(extreme_moves)}")

# Run the project's preprocessing step on a copy of each raw frame
print("\n" + "="*50)
print("Preprocessing raw data...")
processed_data = {}

for ticker, data in raw_data.items():
    try:
        cleaned = preprocess_data(data.copy())
    except Exception as e:
        # A failing ticker is reported and left out of processed_data
        print(f"✗ Error preprocessing {ticker}: {str(e)}")
    else:
        processed_data[ticker] = cleaned
        print(f"✓ Preprocessed data for {ticker}")

print(f"Successfully preprocessed {len(processed_data)} datasets")

In [None]:
# Step 3: Basic Statistical Analysis
print("Performing basic statistical analysis...")

# Collect one Series per ticker and build each frame in a single step.
# Assigning columns one at a time into an empty DataFrame would pin the index
# to the first ticker's dates and silently drop any other ticker's rows that
# fall outside it. Building from a dict of Series keeps the union of all dates
# (NaN where a ticker has no observation).
close_by_ticker = {}
returns_by_ticker = {}

for ticker, data in processed_data.items():
    if 'Close' in data.columns:
        # Store closing prices
        close_by_ticker[ticker] = data['Close']

        # Calculate simple period-over-period returns; the first row has no
        # predecessor and is dropped.
        returns = data['Close'].pct_change().dropna()
        returns_by_ticker[ticker] = returns

combined_data = pd.DataFrame(close_by_ticker)
returns_data = pd.DataFrame(returns_by_ticker)

print(f"Combined dataset shape: {combined_data.shape}")
print(f"Returns dataset shape: {returns_data.shape}")

# Display basic statistics
print("\n" + "="*60)
print("PRICE STATISTICS")
print("="*60)
print(combined_data.describe())

print("\n" + "="*60)
print("RETURNS STATISTICS")
print("="*60)
print(returns_data.describe())

# Calculate correlation matrix of returns; the average below is taken over
# the strict upper triangle (k=1) so each pair counts once and the diagonal
# of ones is excluded.
correlation_matrix = returns_data.corr()
print(f"\nAverage correlation between stocks: {correlation_matrix.values[np.triu_indices_from(correlation_matrix.values, k=1)].mean():.3f}")

In [None]:
# Step 4: Visualization - Price Charts and Returns
# Draws a 2x2 figure from combined_data, returns_data and correlation_matrix,
# then prints the annualized volatility per ticker.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Normalized price evolution
# Every column is divided by its own value in the first row and scaled to 100,
# so all series start at the same base.
# NOTE(review): a ticker whose first-row price is NaN would plot as all-NaN.
ax1 = axes[0, 0]
normalized_prices = combined_data.div(combined_data.iloc[0]) * 100
for ticker in normalized_prices.columns:
    ax1.plot(normalized_prices.index, normalized_prices[ticker], label=ticker, alpha=0.8)
ax1.set_title('Normalized Price Evolution (Base 100)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Date')
ax1.set_ylabel('Normalized Price')
# Legend is anchored just outside the right edge of the axes
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Plot 2: Returns distribution
# One overlaid 50-bin histogram per ticker column
ax2 = axes[0, 1]
returns_data.plot(kind='hist', bins=50, alpha=0.7, ax=ax2)
ax2.set_title('Returns Distribution', fontsize=14, fontweight='bold')
ax2.set_xlabel('Daily Returns')
ax2.set_ylabel('Frequency')
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Plot 3: Correlation heatmap
# Annotated heatmap of the returns correlation matrix, colour scale centred on 0
ax3 = axes[1, 0]
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, ax=ax3, cbar_kws={'shrink': 0.8})
ax3.set_title('Returns Correlation Matrix', fontsize=14, fontweight='bold')

# Plot 4: Rolling volatility (30-day)
# Standard deviation over a 30-row window scaled by sqrt(252)
# (presumably 252 trading days per year - confirm the data is daily).
ax4 = axes[1, 1]
rolling_vol = returns_data.rolling(window=30).std() * np.sqrt(252)  # Annualized volatility
for ticker in rolling_vol.columns:
    ax4.plot(rolling_vol.index, rolling_vol[ticker], label=ticker, alpha=0.8)
ax4.set_title('30-Day Rolling Volatility (Annualized)', fontsize=14, fontweight='bold')
ax4.set_xlabel('Date')
ax4.set_ylabel('Volatility')
ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print volatility summary
# Full-sample standard deviation per ticker with the same sqrt(252) scaling,
# formatted as a percentage.
print("\nVolatility Summary (Annualized):")
vol_summary = returns_data.std() * np.sqrt(252)
for ticker, vol in vol_summary.items():
    print(f"{ticker}: {vol:.1%}")

In [None]:
# Step 5: Technical Indicators Analysis
print("Computing technical indicators...")

# Enhanced technical indicators function
def compute_comprehensive_indicators(df, ma_windows):
    """Return a copy of ``df`` with trend, momentum and volatility indicators.

    Args:
        df: DataFrame holding at least ``Close``, ``High`` and ``Low``
            columns. It is copied first, so the caller's frame is untouched.
        ma_windows: Iterable of window lengths; each one produces an
            ``SMA_<window>`` and an ``EMA_<window>`` column.

    Returns:
        The copied frame with these columns appended, in this order:
        ``SMA_*``/``EMA_*`` pairs, ``RSI``, ``MACD``, ``MACD_Signal``,
        ``MACD_Histogram``, ``BB_Middle``, ``BB_Upper``, ``BB_Lower``,
        ``BB_Width``, ``BB_Position`` and ``ATR``. Rolling-window indicators
        are NaN until their window (14 or 20 rows, or the MA window) is full.
    """
    enriched = df.copy()
    close = enriched['Close']

    # Simple and exponential moving averages, one pair per requested window
    for span in ma_windows:
        enriched[f'SMA_{span}'] = close.rolling(window=span).mean()
        enriched[f'EMA_{span}'] = close.ewm(span=span).mean()

    # RSI from 14-row simple averages of gains and losses. `where` replaces
    # non-matching entries (including the leading NaN of diff) with 0.
    price_change = close.diff()
    avg_gain = price_change.where(price_change > 0, 0).rolling(window=14).mean()
    avg_loss = (-price_change.where(price_change < 0, 0)).rolling(window=14).mean()
    relative_strength = avg_gain / avg_loss
    enriched['RSI'] = 100 - (100 / (1 + relative_strength))

    # MACD: span-12 EMA minus span-26 EMA, with a span-9 EMA as signal line
    macd_line = close.ewm(span=12).mean() - close.ewm(span=26).mean()
    signal_line = macd_line.ewm(span=9).mean()
    enriched['MACD'] = macd_line
    enriched['MACD_Signal'] = signal_line
    enriched['MACD_Histogram'] = macd_line - signal_line

    # Bollinger Bands: 20-row mean +/- two 20-row standard deviations
    band_center = close.rolling(window=20).mean()
    band_offset = close.rolling(window=20).std() * 2
    upper_band = band_center + band_offset
    lower_band = band_center - band_offset
    band_width = upper_band - lower_band
    enriched['BB_Middle'] = band_center
    enriched['BB_Upper'] = upper_band
    enriched['BB_Lower'] = lower_band
    enriched['BB_Width'] = band_width
    # Position of the close inside the band: 0 at the lower band, 1 at the upper
    enriched['BB_Position'] = (close - lower_band) / band_width

    # Average True Range: 14-row mean of the largest of the three range
    # candidates (NaN candidates on the first row are skipped by max).
    previous_close = close.shift()
    range_candidates = pd.DataFrame({
        'high_low': enriched['High'] - enriched['Low'],
        'high_prev_close': (enriched['High'] - previous_close).abs(),
        'low_prev_close': (enriched['Low'] - previous_close).abs(),
    })
    enriched['ATR'] = range_candidates.max(axis=1).rolling(window=14).mean()

    return enriched

# Run the indicator function over every preprocessed frame, keyed by ticker.
# Only the first four configured moving-average windows are used.
indicator_windows = ma_windows[:4]
technical_data = {}
for ticker, data in processed_data.items():
    print(f"Computing indicators for {ticker}...")
    enriched_frame = compute_comprehensive_indicators(data, indicator_windows)
    technical_data[ticker] = enriched_frame

print("Technical indicators computed successfully!")

In [None]:
# Step 6: Technical Indicators Visualization
# Select one stock for detailed technical analysis visualization: the first
# configured ticker that actually has computed indicators. Indexing
# tickers[0] directly would raise KeyError when that ticker failed to
# download or preprocess.
sample_ticker = next((ticker for ticker in tickers if ticker in technical_data), None)
if sample_ticker is None:
    raise ValueError("No ticker has computed technical indicators to visualize")
sample_data = technical_data[sample_ticker].copy()

# Moving averages to overlay: keep the 25/50 pair when those windows were
# computed, otherwise fall back to the first two SMA columns that exist, so a
# different moving_average_windows setting does not break the chart.
available_sma = [col for col in sample_data.columns if str(col).startswith('SMA_')]
sma_columns = [col for col in ('SMA_25', 'SMA_50') if col in sample_data.columns] or available_sma[:2]

# Create comprehensive technical analysis charts (four stacked panels)
fig, axes = plt.subplots(4, 1, figsize=(15, 16))

# Chart 1: Price with Moving Averages and Bollinger Bands
ax1 = axes[0]
ax1.plot(sample_data.index, sample_data['Close'], label='Close Price', linewidth=2)
for sma_column in sma_columns:
    # 'SMA_25' is labelled 'SMA 25', and so on
    ax1.plot(sample_data.index, sample_data[sma_column], label=sma_column.replace('_', ' '), alpha=0.8)
ax1.plot(sample_data.index, sample_data['BB_Upper'], label='BB Upper', linestyle='--', alpha=0.6)
ax1.plot(sample_data.index, sample_data['BB_Lower'], label='BB Lower', linestyle='--', alpha=0.6)
ax1.fill_between(sample_data.index, sample_data['BB_Upper'], sample_data['BB_Lower'], alpha=0.1)
ax1.set_title(f'{sample_ticker} - Price, Moving Averages & Bollinger Bands', fontsize=14, fontweight='bold')
ax1.set_ylabel('Price ($)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Chart 2: RSI with the 70/30 reference lines and a shaded band between them
ax2 = axes[1]
ax2.plot(sample_data.index, sample_data['RSI'], color='purple', linewidth=2)
ax2.axhline(y=70, color='r', linestyle='--', alpha=0.7, label='Overbought (70)')
ax2.axhline(y=30, color='g', linestyle='--', alpha=0.7, label='Oversold (30)')
ax2.fill_between(sample_data.index, 30, 70, alpha=0.1, color='gray')
ax2.set_title('Relative Strength Index (RSI)', fontsize=14, fontweight='bold')
ax2.set_ylabel('RSI')
ax2.set_ylim(0, 100)
ax2.legend()
ax2.grid(True, alpha=0.3)

# Chart 3: MACD line, signal line and their difference as bars
ax3 = axes[2]
ax3.plot(sample_data.index, sample_data['MACD'], label='MACD', linewidth=2)
ax3.plot(sample_data.index, sample_data['MACD_Signal'], label='Signal', linewidth=2)
ax3.bar(sample_data.index, sample_data['MACD_Histogram'], label='Histogram', alpha=0.6)
ax3.axhline(y=0, color='black', linestyle='-', alpha=0.5)
ax3.set_title('MACD (Moving Average Convergence Divergence)', fontsize=14, fontweight='bold')
ax3.set_ylabel('MACD')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Chart 4: Volume and ATR share the x-axis but use separate y-axes
ax4 = axes[3]
ax4_twin = ax4.twinx()
ax4.bar(sample_data.index, sample_data['Volume'], alpha=0.6, color='blue', label='Volume')
ax4_twin.plot(sample_data.index, sample_data['ATR'], color='red', linewidth=2, label='ATR')
ax4.set_title('Volume and Average True Range (ATR)', fontsize=14, fontweight='bold')
ax4.set_ylabel('Volume', color='blue')
ax4_twin.set_ylabel('ATR', color='red')
ax4.set_xlabel('Date')
ax4.grid(True, alpha=0.3)

# Add legends (one per y-axis, placed in opposite corners)
ax4.legend(loc='upper left')
ax4_twin.legend(loc='upper right')

plt.tight_layout()
plt.show()

# Print technical indicator summary using the last row of each indicator
print(f"\nTechnical Indicators Summary for {sample_ticker}:")
print(f"Current RSI: {sample_data['RSI'].iloc[-1]:.1f}")
print(f"Current MACD: {sample_data['MACD'].iloc[-1]:.3f}")
print(f"Current BB Position: {sample_data['BB_Position'].iloc[-1]:.2f} (0=lower band, 1=upper band)")
print(f"Current ATR: {sample_data['ATR'].iloc[-1]:.2f}")

In [None]:
# Step 7: Cross-Asset Analysis and Signal Generation
print("Performing cross-asset analysis...")

# Create signals using custom functions from signals.py
print("Generating trading signals...")

# Calculate z-scores for mean reversion signals.
# The per-ticker Series are collected first and combined in one step so the
# frame's index is the union of all tickers' dates rather than being pinned
# to whichever ticker happens to be inserted first.
zscore_by_ticker = {}
for ticker in tickers:
    if ticker in technical_data:
        close_prices = technical_data[ticker]['Close']
        # Calculate rolling z-score (mean reversion signal): distance of the
        # close from its 60-row mean, in units of the 60-row std.
        rolling_mean = close_prices.rolling(window=60).mean()
        rolling_std = close_prices.rolling(window=60).std()
        zscore_by_ticker[ticker] = (close_prices - rolling_mean) / rolling_std
zscore_data = pd.DataFrame(zscore_by_ticker)

# Apply z-score normalization using custom function
normalized_signals = pd.DataFrame()
for ticker in zscore_data.columns:
    try:
        normalized_signals[ticker] = zscore_normalize(zscore_data[ticker].dropna())
    except Exception as e:
        # Best-effort: keep going with the other tickers, but say why this
        # one failed. A bare `except:` would also swallow KeyboardInterrupt
        # and hide the cause.
        print(f"Warning: Could not normalize {ticker}: {e}")

print(f"Generated signals for {len(normalized_signals.columns)} assets")

# Visualization of signals
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Plot 1: Z-scores for all assets
ax1 = axes[0]
for ticker in zscore_data.columns[:4]:  # Plot first 4 for clarity
    ax1.plot(zscore_data.index, zscore_data[ticker], label=ticker, alpha=0.8)
ax1.axhline(y=2, color='r', linestyle='--', alpha=0.7, label='Overbought (+2σ)')
ax1.axhline(y=-2, color='g', linestyle='--', alpha=0.7, label='Oversold (-2σ)')
ax1.fill_between(zscore_data.index, -2, 2, alpha=0.1, color='gray')
ax1.set_title('60-Day Rolling Z-Scores (Mean Reversion Signals)', fontsize=14, fontweight='bold')
ax1.set_ylabel('Z-Score')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Signal distribution
# DataFrame.plot(kind='hist') overlays every column on the given axes.
# DataFrame.hist(ax=...) wants one subplot per column, and with more than one
# column pandas clears the whole figure to make room, which erased Plot 1.
ax2 = axes[1]
zscore_data.plot(kind='hist', bins=50, alpha=0.7, ax=ax2)
ax2.set_title('Distribution of Z-Score Signals', fontsize=14, fontweight='bold')
ax2.set_xlabel('Z-Score')
ax2.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Signal analysis: count observations more than two standard deviations out
extreme_signals = zscore_data[abs(zscore_data) > 2].count()
print("\nExtreme Signals Analysis (|Z-Score| > 2):")
for ticker, count in extreme_signals.items():
    if count > 0:
        total_obs = len(zscore_data[ticker].dropna())
        percentage = (count / total_obs) * 100
        print(f"{ticker}: {count} signals ({percentage:.1f}% of observations)")

In [None]:
# Step 8: Statistical Arbitrage Opportunity Analysis
print("Analyzing statistical arbitrage opportunities...")

# Pairs analysis - find potential pairs for statistical arbitrage
from itertools import combinations

# Per-pair price correlation and raw price-spread statistics.
# NOTE(review): no cointegration test is performed here, and the correlation
# is computed on price levels rather than returns.
pair_columns = ['Pair', 'Correlation', 'Spread_Mean', 'Spread_Std', 'Spread_CV']
pairs_analysis = []
# Pair label -> (ticker1, ticker2). The label cannot be split back on '-'
# because ticker symbols may themselves contain a hyphen (e.g. 'BRK-B').
pair_members = {}

for ticker1, ticker2 in combinations(tickers, 2):
    if ticker1 in returns_data.columns and ticker2 in returns_data.columns:
        # Get price series
        price1 = combined_data[ticker1].dropna()
        price2 = combined_data[ticker2].dropna()

        # Align dates
        common_dates = price1.index.intersection(price2.index)
        if len(common_dates) > 100:  # Ensure sufficient data
            price1_aligned = price1[common_dates]
            price2_aligned = price2[common_dates]

            # Calculate correlation
            correlation = price1_aligned.corr(price2_aligned)

            # Simple spread analysis
            spread = price1_aligned - price2_aligned
            spread_std = spread.std()
            spread_mean = spread.mean()

            pair_label = f"{ticker1}-{ticker2}"
            pair_members[pair_label] = (ticker1, ticker2)
            pairs_analysis.append({
                'Pair': pair_label,
                'Correlation': correlation,
                'Spread_Mean': spread_mean,
                'Spread_Std': spread_std,
                # Coefficient of variation; infinite when the mean spread is 0
                'Spread_CV': spread_std / abs(spread_mean) if spread_mean != 0 else np.inf
            })

# Convert to DataFrame and sort by correlation. Passing the column list keeps
# the frame well-formed (and sortable) even when no pair qualified.
pairs_df = pd.DataFrame(pairs_analysis, columns=pair_columns)
pairs_df = pairs_df.sort_values('Correlation', ascending=False)

print("Top 10 Most Correlated Pairs:")
print(pairs_df.head(10))

if pairs_df.empty:
    print("No pair has more than 100 overlapping observations; skipping pair plots.")
else:
    # Visualize top pairs: normalized prices on the top row, spreads below
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Plot top 2 most correlated pairs
    top_pairs = pairs_df.head(2)
    for i, (_, row) in enumerate(top_pairs.iterrows()):
        ticker1, ticker2 = pair_members[row['Pair']]

        ax = axes[0, i]

        # Normalize prices to same scale
        price1_norm = combined_data[ticker1] / combined_data[ticker1].iloc[0] * 100
        price2_norm = combined_data[ticker2] / combined_data[ticker2].iloc[0] * 100

        ax.plot(price1_norm.index, price1_norm, label=ticker1, linewidth=2)
        ax.plot(price2_norm.index, price2_norm, label=ticker2, linewidth=2)
        ax.set_title(f'Normalized Prices: {ticker1} vs {ticker2}\nCorrelation: {row["Correlation"]:.3f}',
                     fontsize=12, fontweight='bold')
        ax.set_ylabel('Normalized Price (Base 100)')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Plot spreads for top 2 pairs with mean and +/- 2 sigma reference lines
    for i, (_, row) in enumerate(top_pairs.iterrows()):
        ticker1, ticker2 = pair_members[row['Pair']]

        ax = axes[1, i]

        spread = combined_data[ticker1] - combined_data[ticker2]
        ax.plot(spread.index, spread, color='red', linewidth=2)
        ax.axhline(y=spread.mean(), color='blue', linestyle='--', alpha=0.7, label='Mean')
        ax.axhline(y=spread.mean() + 2*spread.std(), color='orange', linestyle='--', alpha=0.7, label='+2σ')
        ax.axhline(y=spread.mean() - 2*spread.std(), color='orange', linestyle='--', alpha=0.7, label='-2σ')
        ax.fill_between(spread.index,
                        spread.mean() - 2*spread.std(),
                        spread.mean() + 2*spread.std(),
                        alpha=0.1, color='orange')
        ax.set_title(f'Price Spread: {ticker1} - {ticker2}', fontsize=12, fontweight='bold')
        ax.set_ylabel('Price Spread ($)')
        ax.set_xlabel('Date')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Step 9: Risk Metrics and Portfolio Analysis
print("Computing risk metrics...")

# Calculate risk metrics for each asset.
# One dict of metrics is collected per ticker and the frame is built once at
# the end. Growing an empty DataFrame one cell at a time via .loc is slow and
# leaves the column dtypes to chance.
risk_columns = [
    'Annualized_Return', 'Annualized_Volatility', 'Sharpe_Ratio',
    'Downside_Deviation', 'Sortino_Ratio', 'Max_Drawdown',
    'VaR_95', 'CVaR_95',
]
periods_per_year = 252  # annualization factor used throughout this notebook
risk_rows = {}

for ticker in tickers:
    if ticker in returns_data.columns:
        returns = returns_data[ticker].dropna()
        if returns.empty:
            # No observations: percentiles and ratios are undefined
            print(f"Skipping {ticker}: no return observations")
            continue

        # Basic risk metrics (the Sharpe ratio here uses no risk-free rate)
        annualized_return = returns.mean() * periods_per_year
        annualized_volatility = returns.std() * np.sqrt(periods_per_year)

        # Downside metrics: deviation is the std of the negative returns only
        negative_returns = returns[returns < 0]
        downside_deviation = negative_returns.std() * np.sqrt(periods_per_year)

        # Maximum drawdown: largest relative drop of the compounded return
        # curve below its running maximum
        cumulative_returns = (1 + returns).cumprod()
        rolling_max = cumulative_returns.expanding().max()
        drawdown = (cumulative_returns - rolling_max) / rolling_max

        # VaR (95% confidence) is the 5th percentile of returns; CVaR is the
        # mean of the returns at or below that threshold
        var_95 = np.percentile(returns, 5)

        risk_rows[ticker] = {
            'Annualized_Return': annualized_return,
            'Annualized_Volatility': annualized_volatility,
            'Sharpe_Ratio': annualized_return / annualized_volatility,
            'Downside_Deviation': downside_deviation,
            'Sortino_Ratio': annualized_return / downside_deviation,
            'Max_Drawdown': drawdown.min(),
            'VaR_95': var_95,
            'CVaR_95': returns[returns <= var_95].mean(),
        }

# Rows are tickers, columns are the metrics listed in risk_columns; passing
# the column list keeps the columns present even when no ticker qualified.
risk_metrics = pd.DataFrame.from_dict(risk_rows, orient='index', columns=risk_columns)

# Display risk metrics
print("\nRisk Metrics Summary:")
print(risk_metrics.round(4))

# Risk-Return visualization
# Draws a 2x2 figure from the risk_metrics frame (one row per ticker).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Risk-Return scatter
# x = annualized volatility, y = annualized return, colour = Sharpe ratio
ax1 = axes[0, 0]
scatter = ax1.scatter(risk_metrics['Annualized_Volatility'], 
                     risk_metrics['Annualized_Return'],
                     s=100, alpha=0.7, c=risk_metrics['Sharpe_Ratio'], 
                     cmap='viridis')
ax1.set_xlabel('Annualized Volatility')
ax1.set_ylabel('Annualized Return')
ax1.set_title('Risk-Return Profile', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Add ticker labels, offset 5 points up and right of each marker
for ticker in risk_metrics.index:
    ax1.annotate(ticker, 
                (risk_metrics.loc[ticker, 'Annualized_Volatility'], 
                 risk_metrics.loc[ticker, 'Annualized_Return']),
                xytext=(5, 5), textcoords='offset points', fontsize=8)

# Colour bar explaining the Sharpe-ratio colouring of the scatter above
plt.colorbar(scatter, ax=ax1, label='Sharpe Ratio')

# Plot 2: Sharpe vs Sortino Ratio
ax2 = axes[0, 1]
ax2.scatter(risk_metrics['Sharpe_Ratio'], risk_metrics['Sortino_Ratio'], 
           s=100, alpha=0.7)
ax2.set_xlabel('Sharpe Ratio')
ax2.set_ylabel('Sortino Ratio')
ax2.set_title('Sharpe vs Sortino Ratio', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

# Label each point with its ticker
for ticker in risk_metrics.index:
    ax2.annotate(ticker, 
                (risk_metrics.loc[ticker, 'Sharpe_Ratio'], 
                 risk_metrics.loc[ticker, 'Sortino_Ratio']),
                xytext=(5, 5), textcoords='offset points', fontsize=8)

# Plot 3: Maximum Drawdown
# One bar per ticker, read from the Max_Drawdown column
ax3 = axes[1, 0]
risk_metrics['Max_Drawdown'].plot(kind='bar', ax=ax3, color='red', alpha=0.7)
ax3.set_title('Maximum Drawdown by Asset', fontsize=14, fontweight='bold')
ax3.set_ylabel('Max Drawdown')
ax3.tick_params(axis='x', rotation=45)
ax3.grid(True, alpha=0.3)

# Plot 4: VaR and CVaR
# Grouped bars: VaR shifted half a bar-width left, CVaR half a bar-width right
ax4 = axes[1, 1]
x_pos = np.arange(len(risk_metrics.index))
width = 0.35
ax4.bar(x_pos - width/2, risk_metrics['VaR_95'], width, label='VaR (95%)', alpha=0.7)
ax4.bar(x_pos + width/2, risk_metrics['CVaR_95'], width, label='CVaR (95%)', alpha=0.7)
ax4.set_xlabel('Assets')
ax4.set_ylabel('Daily Return')
ax4.set_title('Value at Risk and Conditional VaR', fontsize=14, fontweight='bold')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(risk_metrics.index, rotation=45)
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Portfolio analysis
print("\n" + "="*60)
print("PORTFOLIO ANALYSIS")
print("="*60)

# Equal weight portfolio: every column of returns_data gets weight 1/N and the
# weighted returns are summed across each row.
asset_count = len(returns_data.columns)
equal_weights = np.ones(asset_count) / asset_count
weighted_returns = returns_data * equal_weights
portfolio_returns = weighted_returns.sum(axis=1)

# Portfolio metrics, annualized with the same 252 factor as the asset metrics
portfolio_annual_return = portfolio_returns.mean() * 252
portfolio_annual_volatility = portfolio_returns.std() * np.sqrt(252)
# Compounded growth curve, used to measure the deepest fall below its running peak
portfolio_growth = (1 + portfolio_returns).cumprod()
portfolio_drawdown = portfolio_growth / portfolio_growth.expanding().max() - 1

portfolio_metrics = {
    'Portfolio Annualized Return': portfolio_annual_return,
    'Portfolio Annualized Volatility': portfolio_annual_volatility,
    'Portfolio Sharpe Ratio': portfolio_annual_return / portfolio_annual_volatility,
    'Portfolio Max Drawdown': portfolio_drawdown.min()
}

print("Equal-Weight Portfolio Metrics:")
for metric, value in portfolio_metrics.items():
    print(f"{metric}: {value:.4f}")

# Diversification benefit: weighted average of the individual volatilities
# divided by the volatility of the combined portfolio
portfolio_risk = portfolio_metrics['Portfolio Annualized Volatility']
individual_risk = (risk_metrics['Annualized_Volatility'] * equal_weights).sum()
diversification_ratio = individual_risk / portfolio_risk

print(f"\nDiversification Ratio: {diversification_ratio:.2f}")
print(f"Risk Reduction: {(1 - 1/diversification_ratio)*100:.1f}%")

In [None]:
# Step 10: Save Processed Data
print("Saving processed datasets...")

# Create directories if they don't exist
os.makedirs(raw_data_path, exist_ok=True)
os.makedirs(processed_data_path, exist_ok=True)

# Save raw data
print("Saving raw data...")
for ticker, data in raw_data.items():
    filename = os.path.join(raw_data_path, f"{ticker}_raw.csv")
    save_data(data, filename)
    print(f"✓ Saved raw data for {ticker}")

# Save processed data with technical indicators
print("\nSaving processed data with technical indicators...")
for ticker, data in technical_data.items():
    filename = os.path.join(processed_data_path, f"{ticker}_processed.csv")
    save_data(data, filename)
    print(f"✓ Saved processed data for {ticker}")

# Save combined datasets
print("\nSaving combined datasets...")

# Combined prices
combined_data.to_csv(os.path.join(processed_data_path, "combined_prices.csv"))
print("✓ Saved combined prices dataset")

# Combined returns
returns_data.to_csv(os.path.join(processed_data_path, "combined_returns.csv"))
print("✓ Saved combined returns dataset")

# Risk metrics
risk_metrics.to_csv(os.path.join(processed_data_path, "risk_metrics.csv"))
print("✓ Saved risk metrics")

# Pairs analysis
pairs_df.to_csv(os.path.join(processed_data_path, "pairs_analysis.csv"), index=False)
print("✓ Saved pairs analysis")

# Z-score signals
zscore_data.to_csv(os.path.join(processed_data_path, "zscore_signals.csv"))
print("✓ Saved z-score signals")

# Average pairwise correlation over the strict upper triangle, computed once
# and converted to a built-in float. A numpy scalar would be written to YAML
# as a Python-specific object tag that yaml.safe_load cannot read back.
upper_triangle = correlation_matrix.values[np.triu_indices_from(correlation_matrix.values, k=1)]
average_correlation = float(upper_triangle.mean()) if upper_triangle.size else float('nan')

# Summary statistics (plain built-in types only, so the YAML stays portable)
summary_stats = {
    'dataset_info': {
        'tickers': list(tickers),
        'date_range': f"{start_date} to {end_date}",
        'total_observations': len(combined_data),
        'number_of_assets': len(tickers)
    },
    'data_quality': {
        'successful_downloads': len(raw_data),
        'successful_processing': len(technical_data),
        'average_correlation': average_correlation
    }
}

# Save summary. safe_dump only emits standard YAML tags and raises if a
# non-plain object slips into the summary instead of serializing it opaquely.
with open(os.path.join(processed_data_path, "eda_summary.yaml"), 'w', encoding='utf-8') as f:
    yaml.safe_dump(summary_stats, f, default_flow_style=False)
print("✓ Saved EDA summary")

print(f"\n{'='*60}")
print("EDA COMPLETE!")
print(f"{'='*60}")
print(f"Data saved to: {processed_data_path}")
print(f"Raw data files: {len(raw_data)}")
print(f"Processed data files: {len(technical_data)}")
print(f"Analysis period: {start_date} to {end_date}")
print(f"Assets analyzed: {', '.join(tickers)}")
print("\nKey findings:")
print(f"- Average correlation between assets: {average_correlation:.3f}")
# The remaining findings need at least one row; skip them rather than crash
# at the very end of the run when a table came out empty.
if not risk_metrics.empty:
    print(f"- Best Sharpe ratio: {risk_metrics['Sharpe_Ratio'].max():.3f} ({risk_metrics['Sharpe_Ratio'].idxmax()})")
    print(f"- Highest volatility: {risk_metrics['Annualized_Volatility'].max():.1%} ({risk_metrics['Annualized_Volatility'].idxmax()})")
if not pairs_df.empty:
    print(f"- Most correlated pair: {pairs_df.iloc[0]['Pair']} (correlation: {pairs_df.iloc[0]['Correlation']:.3f})")

print("\nReady for backtesting and strategy development!")