# 02 — Feature Engineering

This notebook computes technical indicators, fundamental features, and sentiment scores for the selected stocks. We will:
1. Load historical price data
2. Engineer technical features (RSI, MACD, Bollinger Bands, etc.)
3. Visualize technical indicators
4. Create cross-sectional feature matrix
5. Compute sentiment scores
6. Combine all features into a unified dataset
7. Analyze feature correlations
8. Save engineered features for modeling

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import get_stock_data, fetch_multiple_stocks
from feature_engineering import engineer_features, create_feature_matrix
from sentiment import compute_batch_sentiment

# Display and plot settings
# Allow up to 100 columns when pandas renders a DataFrame.
pd.set_option('display.max_columns', 100)
# Apply seaborn's 'whitegrid' style to subsequent figures.
sns.set_style('whitegrid')
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load the artifacts saved by the previous notebook. If any of them cannot be
# read, fall back to fetching prices through data_loader.
#
# Every name later cells rely on (`stock_data`, `price_matrix`,
# `returns_matrix`, `selected_tickers`) is bound on BOTH paths so the notebook
# survives Restart Kernel -> Run All:
#   * parquet path : stock_data stays None, matrices come from disk
#   * fallback path: stock_data holds the fetched frames, matrices are rebuilt
stock_data = None
try:
    price_matrix = pd.read_parquet('../data/raw/price_matrix.parquet')
    returns_matrix = pd.read_parquet('../data/raw/returns_matrix.parquet')
    fundamentals_df = pd.read_parquet('../data/raw/fundamentals.parquet')
    selected_tickers = pd.read_csv('../data/raw/selected_tickers.csv')['ticker'].tolist()
    print("Data loaded successfully from parquet files")
except Exception as e:
    print(f"Error loading parquet files: {e}")
    print("Falling back to data_loader functions...")
    selected_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']
    stock_data = fetch_multiple_stocks(selected_tickers, period='2y')

    # NOTE(review): assumes fetch_multiple_stocks returns a dict mapping
    # ticker -> DataFrame that has a 'Close' column -- confirm against
    # data_loader. Fail loudly here rather than with a NameError further down.
    if not isinstance(stock_data, dict):
        raise TypeError(
            "fetch_multiple_stocks returned "
            f"{type(stock_data).__name__}; expected a dict of per-ticker DataFrames"
        )
    price_matrix = pd.DataFrame(
        {ticker: frame['Close'] for ticker, frame in stock_data.items()}
    )
    # Simple period-over-period returns; the first row has no predecessor and
    # is dropped. NOTE(review): confirm this matches how the previous notebook
    # defined returns_matrix.
    returns_matrix = price_matrix.pct_change(fill_method=None).iloc[1:]
    # fundamentals_df is intentionally left unbound on this path; the
    # "Combine All Features" cell checks for its presence before using it.

print(f"\nLoaded data for {len(selected_tickers)} tickers")
print(f"Price matrix shape: {price_matrix.shape}")
print(f"Returns matrix shape: {returns_matrix.shape}")

## 2. Engineer Technical Features

In [None]:
# Pick one sample stock for demonstration
sample_ticker = 'AAPL'

# `stock_data` only exists when the load cell took its data_loader fallback,
# so look it up defensively instead of referencing a possibly-unbound name.
loaded_stock_data = globals().get('stock_data')
if isinstance(loaded_stock_data, dict):
    sample_stock_data = loaded_stock_data[sample_ticker]
else:
    # price_matrix[ticker] is a Series; turn it into a one-column frame named
    # 'Close' because the plotting cell reads engineered_data['Close'].
    # NOTE(review): engineer_features may need more than a close price
    # (e.g. High/Low/Volume) -- confirm against feature_engineering.
    sample_stock_data = price_matrix[sample_ticker].to_frame(name='Close')

# Engineer technical features
engineered_data = engineer_features(sample_stock_data)

print(f"Engineered features for {sample_ticker}:")
print(f"Shape: {engineered_data.shape}")
print(f"\nColumns: {engineered_data.columns.tolist()}")
print(f"\nFirst few rows:")
engineered_data.head()

## 3. Visualize Technical Indicators

In [None]:
# Three stacked panels: price (+ Bollinger Bands), RSI, and MACD.
# Each optional panel is drawn only if its indicator column is present.
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
price_ax, rsi_ax, macd_ax = axes
dates = engineered_data.index
indicator_columns = engineered_data.columns
title_style = dict(fontsize=12, fontweight='bold')

# Panel 1: closing price with the Bollinger envelope shaded behind it
price_ax.plot(dates, engineered_data['Close'], label='Close Price', linewidth=2)
if all(name in indicator_columns for name in ('BB_High', 'BB_Low')):
    price_ax.fill_between(
        dates,
        engineered_data['BB_High'],
        engineered_data['BB_Low'],
        alpha=0.2,
        label='Bollinger Bands',
    )
price_ax.set_title(f'{sample_ticker} Price with Bollinger Bands', **title_style)
price_ax.set_ylabel('Price ($)')
price_ax.legend()
price_ax.grid(alpha=0.3)

# Panel 2: RSI with the conventional 70 / 30 reference lines
if 'RSI' in indicator_columns:
    rsi_ax.plot(dates, engineered_data['RSI'], label='RSI', linewidth=2, color='orange')
    for level, line_color, line_label in (
        (70, 'r', 'Overbought (70)'),
        (30, 'g', 'Oversold (30)'),
    ):
        rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5, label=line_label)
    rsi_ax.set_title('Relative Strength Index (RSI)', **title_style)
    rsi_ax.set_ylabel('RSI')
    rsi_ax.set_ylim([0, 100])
    rsi_ax.legend()
    rsi_ax.grid(alpha=0.3)

# Panel 3: MACD, its signal line when available, and a zero baseline
if 'MACD' in indicator_columns:
    macd_ax.plot(dates, engineered_data['MACD'], label='MACD', linewidth=2, color='purple')
    if 'MACD_Signal' in indicator_columns:
        macd_ax.plot(
            dates,
            engineered_data['MACD_Signal'],
            label='Signal Line',
            linewidth=2,
            color='red',
        )
    macd_ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    macd_ax.set_title('MACD', **title_style)
    macd_ax.set_ylabel('MACD')
    macd_ax.set_xlabel('Date')
    macd_ax.legend()
    macd_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Create Cross-Sectional Feature Matrix

In [None]:
# Create feature matrix for all selected stocks
#
# `stock_data` is only bound when the load cell fell back to data_loader, so
# look it up defensively; otherwise derive one frame per ticker from
# price_matrix.
loaded_stock_data = globals().get('stock_data')
if isinstance(loaded_stock_data, dict):
    per_ticker_frames = loaded_stock_data
else:
    # One single-column frame per ticker, with the price column named 'Close'.
    # NOTE(review): assumes create_feature_matrix accepts close-only frames
    # keyed by ticker -- confirm against feature_engineering.
    per_ticker_frames = {
        t: price_matrix[t].to_frame(name='Close') for t in selected_tickers
    }
feature_matrix = create_feature_matrix(per_ticker_frames)

print(f"Feature matrix shape: {feature_matrix.shape}")
print(f"\nData types:")
print(feature_matrix.dtypes)
print(f"\nFeature matrix info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
feature_matrix.info()
print(f"\nFirst few rows:")
feature_matrix.head()

## 5. Sentiment Analysis

In [None]:
# Compute sentiment for sample tickers
sample_sentiment_tickers = selected_tickers[:10]  # Sentiment analysis for first 10 stocks

try:
    sentiment_scores = compute_batch_sentiment(sample_sentiment_tickers)
    print(f"Sentiment scores computed for {len(sentiment_scores)} tickers")
    print(f"\nSentiment Scores:")
    print(sentiment_scores)
except Exception as e:
    print(f"Note: Sentiment analysis not available: {e}")
    print("Creating dummy sentiment data for demonstration...")
    # Placeholder values only. A fixed seed keeps the dummy data (and the
    # feature matrix saved at the end of the notebook) identical across
    # Restart & Run All instead of changing on every execution.
    dummy_rng = np.random.default_rng(42)
    n_dummy = len(sample_sentiment_tickers)
    sentiment_scores = pd.DataFrame({
        'ticker': sample_sentiment_tickers,
        # standard-normal scores, as before
        'sentiment': dummy_rng.standard_normal(n_dummy),
        # integers in [5, 50), same range np.random.randint(5, 50, ...) produced
        'news_count': dummy_rng.integers(5, 50, size=n_dummy),
    })
    print(sentiment_scores)

## 6. Combine All Features

In [None]:
# Merge feature_matrix with fundamentals and sentiment.
# Work on a copy so re-running this cell never mutates feature_matrix.
combined_features = feature_matrix.copy()

# Add fundamentals data (only bound when the parquet load succeeded)
if 'fundamentals_df' in locals():
    # Re-key fundamentals by ticker so each column can be looked up from the
    # feature matrix index.
    # NOTE(review): reset_index() also turns the previous index into a column,
    # which then becomes a `fund_<name>` feature -- confirm that is intended.
    fundamentals_indexed = fundamentals_df.reset_index().set_index('ticker')
    for col in fundamentals_indexed.columns:
        # NOTE(review): the lookup is keyed on combined_features.index, so it
        # presumably holds tickers; rows with no match receive NaN.
        combined_features[f'fund_{col}'] = combined_features.index.map(
            fundamentals_indexed[col].to_dict()
        )

# Add sentiment data
if 'sentiment_scores' in locals():
    sentiment_indexed = sentiment_scores.set_index('ticker')
    for col in sentiment_indexed.columns:
        combined_features[f'sent_{col}'] = combined_features.index.map(
            sentiment_indexed[col].to_dict()
        )

# Handle missing values
print(f"Missing values before handling:")
print(combined_features.isnull().sum().sum())

# Fill missing values: forward-fill first, then back-fill whatever is still
# missing at the top. DataFrame.ffill()/bfill() replace the deprecated
# fillna(method=...) spelling and produce the same result.
# NOTE(review): filling runs down the rows; if each row is a different ticker
# this copies one ticker's value into another -- confirm that is acceptable.
combined_features = combined_features.ffill().bfill()

print(f"\nCombined feature matrix shape: {combined_features.shape}")
print(f"Total columns: {len(combined_features.columns)}")
print(f"Missing values after handling: {combined_features.isnull().sum().sum()}")
print(f"\nFirst few rows:")
combined_features.head()

## 7. Feature Correlation Analysis

In [None]:
# Pairwise correlations are computed on the numeric columns only
numeric_features = combined_features.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# The heatmap is limited to the top_n highest-variance features
top_n = 20
top_features = numeric_features.var().nlargest(top_n).index
top_correlation = correlation_matrix.loc[top_features, top_features]

heatmap_options = dict(
    cmap='coolwarm',
    center=0,
    square=True,
    annot=True,
    fmt='.2f',
    cbar_kws={'label': 'Correlation'},
)
plt.figure(figsize=(12, 10))
sns.heatmap(top_correlation, **heatmap_options)
plt.title(f'Top {top_n} Features Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Report every pair above the diagonal whose |correlation| exceeds 0.8.
# np.triu_indices(..., k=1) yields (row, col) pairs in row-major order.
print(f"\nHighly correlated feature pairs (> 0.8):")
feature_names = correlation_matrix.columns
corr_values = correlation_matrix.to_numpy()
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
for i, j in zip(upper_rows, upper_cols):
    pair_corr = corr_values[i, j]
    if abs(pair_corr) > 0.8:
        print(f"{feature_names[i]} <-> {feature_names[j]}: {pair_corr:.3f}")

## 8. Save Engineered Features

In [None]:
import os

# Make sure the output directory exists (no error if it already does)
os.makedirs('../data/processed', exist_ok=True)

# Persist the combined feature matrix
combined_features.to_parquet('../data/processed/feature_matrix.parquet')

# Per-feature metadata: column name, dtype as text, and percentage of rows missing
null_counts = combined_features.isnull().sum()
missing_percentages = null_counts / len(combined_features) * 100
feature_metadata = pd.DataFrame({
    'feature': combined_features.columns,
    'data_type': combined_features.dtypes.astype(str),
    'missing_pct': missing_percentages.values,
})
feature_metadata.to_csv('../data/processed/feature_metadata.csv', index=False)

# Summary of what was written
for summary_line in (
    "Features saved successfully!",
    f"  - feature_matrix.parquet ({combined_features.shape})",
    f"  - feature_metadata.csv (metadata for {len(feature_metadata)} features)",
):
    print(summary_line)