# 03 — Exploratory Data Analysis & PCA

In [None]:
# Notebook setup: imports, import path, and display/plot defaults.
import sys, os
# Put the parent of the current working directory at the front of sys.path;
# the `src` import below presumably relies on this when the kernel starts in
# a subdirectory of the repository — confirm against the project layout.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.pca_analysis import StockPCA

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Use seaborn's white-grid theme for every figure in this notebook.
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Feature Matrix

In [None]:
# Input locations, relative to the directory the notebook is run from.
feature_matrix_path = "../data/processed/feature_matrix.parquet"
fundamentals_path = "../data/raw/fundamentals.parquet"

# Read the engineered feature table first, then the raw fundamentals table
# (the latter is only consulted later for its optional 'sector' column).
feature_matrix = pd.read_parquet(feature_matrix_path)
fundamentals_df = pd.read_parquet(fundamentals_path)

print(f"Feature matrix: {feature_matrix.shape}")
feature_matrix.head()

## 2. Feature Distributions

In [None]:
# Plot histograms for up to six features, preferring a curated list of indicators.
key_features = ['RSI', 'Return_1d', 'Volatility_21d', 'BB_PctB', 'Volume_Ratio', 'Momentum_21d']
available_features = [f for f in key_features if f in feature_matrix.columns]

# Keep every curated feature that is present and, if some are missing, top the
# list up with other *numeric* columns. (Previously a single missing feature
# discarded the whole curated list in favour of the first six columns of any
# dtype, which may not be meaningful to histogram.)
hist_candidates = feature_matrix.select_dtypes(include=[np.number]).columns
filler_features = [c for c in hist_candidates if c not in available_features]
features_to_plot = (available_features + filler_features)[:6]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    # NaNs are dropped per feature so one sparse column does not affect the others.
    ax.hist(feature_matrix[feature].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(feature, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Hide panels that have no feature assigned instead of showing empty axes.
for ax in axes[len(features_to_plot):]:
    ax.set_visible(False)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/03_feature_distributions.png", dpi=150, bbox_inches="tight")
plt.show()

## 3. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric features.
numeric_features = feature_matrix.select_dtypes(include=[np.number])
corr_matrix = numeric_features.corr()

# Collect every unordered feature pair once (strict upper triangle) whose
# absolute correlation exceeds 0.8. The index arrays come out in row-major
# order, i.e. the same order a nested i < j loop would visit them. NaN
# correlations compare False against the threshold and are skipped.
corr_values = corr_matrix.to_numpy()
row_idx, col_idx = np.triu_indices_from(corr_values, k=1)
is_strong = np.abs(corr_values[row_idx, col_idx]) > 0.8
high_corr_pairs = [
    (corr_matrix.columns[i], corr_matrix.columns[j], corr_values[i, j])
    for i, j in zip(row_idx[is_strong], col_idx[is_strong])
]

# Strongest relationships first, regardless of sign.
high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
print("Top 10 Correlated Pairs:")
for feat1, feat2, corr in high_corr_pairs[:10]:
    print(f"  {feat1} <-> {feat2}: {corr:.3f}")

# Clustered heatmap restricted to the 20 highest-variance features.
# sns.clustermap is a figure-level function that builds its own figure, so no
# plt.subplots() call precedes it (doing so only left an empty extra figure in
# the cell output). The clustermap figure is the current figure afterwards,
# which is what plt.suptitle / plt.savefig act on.
top_features = numeric_features.var().nlargest(20).index
sns.clustermap(corr_matrix.loc[top_features, top_features], cmap='coolwarm', center=0, figsize=(14, 12))
plt.suptitle('Feature Correlation Clustermap (Top 20)', fontsize=14, fontweight='bold', y=0.98)
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/03_correlation_clustermap.png", dpi=150, bbox_inches="tight")
plt.show()

## 4. Apply PCA

In [None]:
# Fit the project's PCA wrapper on the numeric features, asking it for a 0.90
# variance threshold, and keep the transformed output.
stock_pca = StockPCA(variance_threshold=0.90)
pca_data = stock_pca.fit_transform(numeric_features)
print(f"\nPCA output shape: {pca_data.shape}")

# Wrap the scores in a DataFrame labelled PC1..PCk that reuses the feature
# matrix index. NOTE(review): this assumes fit_transform returns one row per
# input row, in the same order — confirm against StockPCA.
pc_labels = [f'PC{k}' for k in range(1, pca_data.shape[1] + 1)]
pca_data_df = pd.DataFrame(pca_data, index=numeric_features.index, columns=pc_labels)
pca_data_df.head()

## 5. Explained Variance Analysis

In [None]:
# Per-component explained variance and its running total.
variance_ratio = stock_pca.explained_variance_ratio_
cumsum_var = np.cumsum(variance_ratio)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scree_ax, cumulative_ax = axes

# Left panel: scree plot, limited to the first 20 components.
n_comp = min(20, len(variance_ratio))
scree_ax.bar(
    range(1, n_comp + 1),
    variance_ratio[:n_comp],
    alpha=0.7,
    color='steelblue',
    edgecolor='black',
)
scree_ax.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Scree Plot',
)
scree_ax.grid(alpha=0.3)

# Right panel: cumulative variance with a reference line at 0.90.
component_counts = range(1, len(cumsum_var) + 1)
cumulative_ax.plot(component_counts, cumsum_var, marker='o', linewidth=2, markersize=6)
cumulative_ax.axhline(y=0.90, color='red', linestyle='--', linewidth=2, label='90% Threshold')
cumulative_ax.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    title='Cumulative Explained Variance',
)
cumulative_ax.legend()
cumulative_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Text table of the same numbers, one row per component.
component_names = [f'PC{k}' for k in range(1, len(variance_ratio) + 1)]
var_df = pd.DataFrame({
    'Component': component_names,
    'Variance': variance_ratio,
    'Cumulative': cumsum_var,
})
print(var_df.to_string())

## 6. PCA Loadings Analysis

In [ ]:
# Loadings table: one row per input feature, one column per principal component.
# NOTE(review): assumes StockPCA was fitted on exactly the columns of
# numeric_features, in the same order — confirm against StockPCA.
loadings = pd.DataFrame(
    stock_pca.components_.T,
    columns=[f'PC{i+1}' for i in range(stock_pca.components_.shape[0])],
    index=numeric_features.columns
)

# Inspect at most the first three components. Fewer than three may have been
# retained, in which case hard-coded PC1..PC3 lookups would raise KeyError.
pcs_to_show = list(loadings.columns[:3])

# Features with the five largest and five smallest loadings on each component.
for pc_name in pcs_to_show:
    top_pos = loadings[pc_name].nlargest(5)
    top_neg = loadings[pc_name].nsmallest(5)
    print(f"\n{pc_name} Top Features:")
    print(f"  Positive: {list(top_pos.index)}")
    print(f"  Negative: {list(top_neg.index)}")

# Heatmap of the loadings for the 20 highest-variance input features.
top_20_var = numeric_features.var().nlargest(20).index
top_loadings = loadings.loc[top_20_var, pcs_to_show]
plt.figure(figsize=(10, 12))
sns.heatmap(top_loadings, cmap='RdBu_r', center=0, annot=True, fmt='.2f', cbar_kws={'label': 'Loading'})
plt.title('PCA Loadings (Top 20 Features)', fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Stocks in PCA Space

In [None]:
# Look up a sector label for every row of the PCA table.
# NOTE(review): assumes fundamentals_df shares its index labels with
# pca_data_df (the annotation loop below treats them as tickers) — confirm.
if 'sector' in fundamentals_df.columns:
    sector_lookup = fundamentals_df['sector']
    # Keep one sector per label so the result lines up 1:1 with pca_data_df.
    sector_lookup = sector_lookup[~sector_lookup.index.duplicated(keep='first')]
    # reindex yields NaN for labels missing from fundamentals (a list-based
    # .loc lookup raises KeyError instead), so fillna can label them 'Unknown'.
    sectors = sector_lookup.reindex(pca_data_df.index).fillna('Unknown')
else:
    sectors = pd.Series('Unknown', index=pca_data_df.index)

# Scatter the first two principal components, one colour/legend entry per sector.
fig, ax = plt.subplots(figsize=(14, 10))
for sector in sectors.unique():
    mask = sectors == sector
    ax.scatter(pca_data_df[mask].iloc[:, 0], pca_data_df[mask].iloc[:, 1], s=150, alpha=0.7, label=sector, edgecolors='black', linewidth=1)

# Label each point with its index value, centred on the marker.
for i, ticker in enumerate(pca_data_df.index):
    ax.annotate(ticker, (pca_data_df.iloc[i, 0], pca_data_df.iloc[i, 1]), fontsize=8, ha='center', va='center')

# Axis labels carry the share of variance explained by each component.
ax.set_xlabel(f'PC1 ({stock_pca.explained_variance_ratio_[0]:.2%})', fontweight='bold')
ax.set_ylabel(f'PC2 ({stock_pca.explained_variance_ratio_[1]:.2%})', fontweight='bold')
ax.set_title('S&P 500 Stocks in PCA Space (Colored by Sector)', fontweight='bold')
ax.legend(loc='best')
ax.grid(alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/03_pca_scatter.png", dpi=150, bbox_inches="tight")
plt.show()

## 8. Save PCA Results

In [None]:
# Make sure the output directory exists before writing anything.
os.makedirs("../data/processed", exist_ok=True)

# (label used in the log line, frame to persist, destination path)
outputs = [
    ("PCA data", pca_data_df, "../data/processed/pca_data.parquet"),
    ("feature_matrix", feature_matrix, "../data/processed/feature_matrix.parquet"),
]

# Write every table first, then report the shapes that were saved.
# Note: the feature matrix is written back to the same path it was loaded from.
for _, frame, destination in outputs:
    frame.to_parquet(destination)

for label, frame, _ in outputs:
    print(f"Saved {label}: {frame.shape}")