# 02 — Feature Engineering

Computes technical indicators, fundamental features, and sentiment scores.

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import fetch_multiple_stocks
from src.feature_engineering import engineer_features, create_feature_matrix
from src.sentiment import compute_batch_sentiment

pd.set_option('display.max_columns', 100)
sns.set_style('whitegrid')
%matplotlib inline

## 1. Load Data

In [None]:
# The 30 tickers analysed in this notebook (order preserved).
SELECTED_TICKERS = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA",
    "JPM", "V", "MA", "GS",
    "JNJ", "UNH", "LLY", "MRK", "ABBV",
    "XOM", "CVX",
    "PG", "KO", "PEP", "WMT", "HD",
    "CRM", "AMD", "NFLX", "DIS", "INTC",
    "BA", "GE",
]

# Price data comes from the project loader; later cells index the result by
# ticker (stock_data["AAPL"], stock_data.keys()), so it is used as a mapping.
stock_data = fetch_multiple_stocks(
    SELECTED_TICKERS,
    period="2y",
)

# Fundamentals are read from a parquet file that must already exist on disk.
fundamentals_df = pd.read_parquet(
    "../data/raw/fundamentals.parquet"
)

print(f"Loaded {len(stock_data)} stocks and fundamentals for {len(fundamentals_df)} stocks")

## 2. Engineer Technical Features (Single Stock Example)

In [None]:
# Run the feature pipeline on a single ticker so its output can be inspected.
sample_ticker = "AAPL"
sample_features = engineer_features(stock_data[sample_ticker])

# One print call with a newline separator emits the same text as three
# consecutive prints: shape, column-count header, then the column list.
print(
    f"Features for {sample_ticker}: {sample_features.shape}",
    f"\nFeature columns ({len(sample_features.columns)}):",
    list(sample_features.columns),
    sep="\n",
)

# Last expression: rich display of the most recent rows.
sample_features.tail()

## 3. Visualize Technical Indicators

In [None]:
# Four stacked panels sharing the x-axis: price with moving averages, RSI,
# MACD and Bollinger Bands. Every indicator is drawn only when its column is
# present in sample_features, so a missing indicator leaves its panel blank
# instead of raising KeyError.
fig, axes = plt.subplots(4, 1, figsize=(14, 16), sharex=True)
price_ax, rsi_ax, macd_ax, bands_ax = axes

dates = sample_features.index
available = sample_features.columns

# Close + SMAs
price_ax.plot(dates, sample_features['Close'], label='Close', linewidth=2, color='black')
for sma_col, sma_label in (('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50')):
    if sma_col in available:
        price_ax.plot(dates, sample_features[sma_col], label=sma_label, linewidth=1.5, alpha=0.7)
price_ax.set_title('Close Price with Moving Averages', fontweight='bold')
price_ax.set_ylabel('Price')
price_ax.legend()
price_ax.grid(alpha=0.3)

# RSI, with reference lines at 70 (labelled overbought) and 30 (labelled oversold)
if 'RSI' in available:
    rsi_ax.plot(dates, sample_features['RSI'], label='RSI', linewidth=2, color='orange')
    rsi_ax.axhline(y=70, color='r', linestyle='--', alpha=0.5, label='Overbought')
    rsi_ax.axhline(y=30, color='g', linestyle='--', alpha=0.5, label='Oversold')
    rsi_ax.set_title('Relative Strength Index (RSI)', fontweight='bold')
    rsi_ax.set_ylabel('RSI')
    rsi_ax.set_ylim([0, 100])
    rsi_ax.legend()
    rsi_ax.grid(alpha=0.3)

# MACD, with its signal line when available and a zero reference line
if 'MACD' in available:
    macd_ax.plot(dates, sample_features['MACD'], label='MACD', linewidth=2, color='purple')
    if 'MACD_Signal' in available:
        macd_ax.plot(dates, sample_features['MACD_Signal'], label='Signal', linewidth=2, color='red')
    macd_ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    macd_ax.set_title('MACD', fontweight='bold')
    macd_ax.set_ylabel('MACD')
    macd_ax.legend()
    macd_ax.grid(alpha=0.3)

# Bollinger Bands: shaded region between the upper and lower band around Close
if 'BB_High' in available and 'BB_Low' in available:
    bands_ax.plot(dates, sample_features['Close'], label='Close', linewidth=2, color='black')
    bands_ax.fill_between(dates, sample_features['BB_High'], sample_features['BB_Low'], alpha=0.2, label='Bollinger Bands')
    bands_ax.set_title('Bollinger Bands', fontweight='bold')
    bands_ax.set_ylabel('Price')
    bands_ax.set_xlabel('Date')
    bands_ax.legend()
    bands_ax.grid(alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/02_technical_indicators.png", dpi=150, bbox_inches="tight")
plt.show()

## 4. Create Cross-Sectional Feature Matrix

In [None]:
# Build the cross-sectional matrix from every loaded stock; its index is what
# the summary below reports as the list of stocks.
feature_matrix = create_feature_matrix(stock_data)

# Single print call, newline-separated: same output as three separate prints.
print(
    f"Feature matrix shape: {feature_matrix.shape}",
    f"Stocks: {list(feature_matrix.index)}",
    f"\nFeature stats:",
    sep="\n",
)

# Last expression: per-column summary statistics rounded to two decimals.
feature_matrix.describe().round(2)

## 5. Sentiment Analysis

Note: Sentiment requires API access. If unavailable, we use zero-filled placeholders.

In [None]:
# Sentiment is best-effort: if scoring fails for any reason, fall back to an
# all-zero placeholder so the remaining cells can still run.
try:
    # Only the first five loaded tickers are scored, over the last 30 days.
    sentiment_tickers = list(stock_data.keys())[:5]
    sentiment_df = compute_batch_sentiment(sentiment_tickers, days_back=30)
    print(f"Sentiment computed for {len(sentiment_df)} stocks")
    print(sentiment_df)
except Exception as e:  # deliberately broad: any failure means "no sentiment available"
    print(f"Sentiment computation skipped: {e}")
    # Build the placeholder directly as float64 zeros. Creating an empty
    # (object-dtype, all-NaN) frame and calling .fillna(0.0) depends on pandas
    # silently downcasting object columns, which is deprecated and would leave
    # non-numeric columns behind.
    sentiment_df = pd.DataFrame(
        0.0,
        index=feature_matrix.index,
        columns=["compound_mean","compound_std","positive_ratio","negative_ratio","num_articles"],
    )
    print("Using placeholder sentiment data")

## 6. Combine All Features

In [None]:
# Merge feature_matrix with fundamentals (numeric only) and sentiment.
numeric_fundamentals = fundamentals_df.select_dtypes(include=[np.number])

# pd.concat only supports join='inner' or 'outer'; a left join is expressed by
# aligning every other frame to feature_matrix's index before concatenating.
# NOTE(review): assumes fundamentals and sentiment are indexed by the same
# ticker labels as feature_matrix — rows that do not match become all-NaN and
# are zero-filled below. Confirm against the fundamentals parquet.
stock_index = feature_matrix.index
combined_features = pd.concat(
    [
        feature_matrix,
        numeric_fundamentals.reindex(stock_index),
        sentiment_df.reindex(stock_index),
    ],
    axis=1,
)

# Rows are individual stocks, so a forward fill would copy one ticker's values
# into the next ticker's gaps; missing values are therefore only filled with 0.
combined_features = combined_features.fillna(0)

print(f"Combined feature matrix shape: {combined_features.shape}")
combined_features.head()

## 7. Feature Correlation Analysis

In [None]:
# Select top 20 features by variance. Variance is computed on numeric columns
# only, so a stray non-numeric column cannot make the reduction raise.
top_20_features = combined_features.var(numeric_only=True).nlargest(20).index
corr_matrix = combined_features[top_20_features].corr()

# Annotated heatmap of pairwise correlations, colour scale centred on zero.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, annot=True, fmt='.2f', ax=ax, cbar_kws={'label': 'Correlation'})
ax.set_title('Feature Correlation Matrix (Top 20 by Variance)', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/02_correlation.png", dpi=150, bbox_inches="tight")
plt.show()

## 8. Save Engineered Features

In [None]:
# Persist the three engineered tables as parquet files for later notebooks.
os.makedirs("../data/processed", exist_ok=True)

# Write every table first, in the same order as before ...
outputs = (
    (feature_matrix, "../data/processed/feature_matrix.parquet"),
    (numeric_fundamentals, "../data/processed/fundamentals_clean.parquet"),
    (sentiment_df, "../data/processed/sentiment.parquet"),
)
for frame, destination in outputs:
    frame.to_parquet(destination)

# ... then report the shapes, so nothing is printed unless all writes succeeded.
print(f"Saved feature_matrix: {feature_matrix.shape}")
print(f"Saved fundamentals_clean: {numeric_fundamentals.shape}")
print(f"Saved sentiment: {sentiment_df.shape}")