# Batch Features Validation

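This notebook runs the batch feature ETL jobs, loads the resulting technical and fundamental feature tables from Delta, and validates them with schema checks, summary statistics, plots, and a correlation analysis.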

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession

# Plot defaults for the notebook: seaborn's 'whitegrid' theme and a 12x8
# default figure size (cells that pass their own figsize override this).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Initialize Spark Session

In [None]:
# Build (or reuse, via getOrCreate) the Spark session used to read the
# feature tables. The two config entries point Spark at the Delta SQL
# extension and the Delta catalog implementation.
session_builder = SparkSession.builder.appName("validate_batch_features")
session_builder = session_builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = session_builder.getOrCreate()

## 2. Run Batch Feature Jobs

First, let's run the batch feature jobs to generate the features.

In [None]:
# Make the ETL job modules importable and import them.
# The directory is resolved to an absolute path once, against the current
# working directory (assumed to be this notebook's folder -- TODO confirm),
# and is added only when missing so that re-running this cell does not pile
# up duplicate sys.path entries.
import sys

ETL_SRC_DIR = os.path.abspath('../src/etl')
if ETL_SRC_DIR not in sys.path:
    sys.path.append(ETL_SRC_DIR)

import batch_features
import batch_fundamentals

In [None]:
# Execute each ETL job's main() in sequence, announcing it first.
# Presumably these jobs write the feature tables loaded in the following
# sections -- confirm against the job modules.
for banner, job_module in (
    ("Running batch_features.py...", batch_features),
    ("\nRunning batch_fundamentals.py...", batch_fundamentals),
):
    print(banner)
    job_module.main()

## 3. Validate Technical Indicators

In [None]:
# Read the technical-indicator feature table (Delta format) into Spark.
delta_reader = spark.read.format("delta")
technical_df = delta_reader.load("../data/features/batch/technical")

# Column names and types as stored in the table.
print("Technical Indicators Schema:")
technical_df.printSchema()

# First five rows as a quick sanity check.
print("\nSample Technical Indicators:")
technical_df.show(n=5)

In [None]:
# Collect the whole Spark DataFrame into a pandas DataFrame so the remaining
# cells can work with pandas and matplotlib directly.
technical_pd = technical_df.toPandas()

# Summary statistics; the bare last expression is what the cell displays.
print("Technical Indicators Summary Statistics:")
technical_summary = technical_pd.describe()
technical_summary

In [None]:
# Chart the technical indicators of a single ticker over time.
symbol = "AAPL"  # Change this to any symbol in your data
symbol_data = technical_pd.loc[technical_pd['symbol'] == symbol].sort_values('timestamp')


def _finish_time_series_figure(title, y_label, show_legend=False):
    """Apply the shared title, axis labels, grid and layout to the current figure, then show it."""
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel(y_label)
    if show_legend:
        plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


if len(symbol_data) == 0:
    print(f"No data available for symbol {symbol}")
else:
    # Figure 1: close price overlaid with the three moving averages.
    plt.figure(figsize=(14, 7))
    price_series = (
        ('close', 'Close Price'),
        ('ma_5', '5-period MA'),
        ('ma_15', '15-period MA'),
        ('ma_60', '60-period MA'),
    )
    for column, legend_label in price_series:
        plt.plot(symbol_data['timestamp'], symbol_data[column], label=legend_label)
    _finish_time_series_figure(f'{symbol} Price and Moving Averages', 'Price', show_legend=True)

    # Figure 2: RSI with horizontal guide lines at 70 (red) and 30 (green).
    plt.figure(figsize=(14, 5))
    plt.plot(symbol_data['timestamp'], symbol_data['rsi_14'])
    for level, line_color in ((70, 'r'), (30, 'g')):
        plt.axhline(y=level, color=line_color, linestyle='-', alpha=0.3)
    _finish_time_series_figure(f'{symbol} RSI (14-period)', 'RSI')

    # Figure 3: ATR.
    plt.figure(figsize=(14, 5))
    plt.plot(symbol_data['timestamp'], symbol_data['atr_14'])
    _finish_time_series_figure(f'{symbol} ATR (14-period)', 'ATR')

## 4. Validate Fundamental Indicators

In [None]:
# Read the fundamental-indicator feature table (Delta format) into Spark.
fundamental_reader = spark.read.format("delta")
fundamental_df = fundamental_reader.load("../data/features/batch/fundamental")

# Column names and types as stored in the table.
print("Fundamental Indicators Schema:")
fundamental_df.printSchema()

# First five rows as a quick sanity check.
print("\nSample Fundamental Indicators:")
fundamental_df.show(n=5)

In [None]:
# Collect the whole Spark DataFrame into a pandas DataFrame so the remaining
# cells can work with pandas and seaborn directly.
fundamental_pd = fundamental_df.toPandas()

# Summary statistics; the bare last expression is what the cell displays.
print("Fundamental Indicators Summary Statistics:")
fundamental_summary = fundamental_pd.describe()
fundamental_summary

In [None]:
# Plot fundamental indicators: one bar chart per metric, with symbols on the
# x-axis. The charts differ only in the column plotted, the title and the
# y-axis label, so they are described once here and drawn in a single loop
# instead of four copy-pasted stanzas.
FUNDAMENTAL_BAR_CHARTS = (
    # (column, title, y-axis label)
    ('price_to_earnings', 'Price-to-Earnings Ratio by Symbol', 'P/E Ratio'),
    ('debt_to_equity', 'Debt-to-Equity Ratio by Symbol', 'D/E Ratio'),
    ('avg_sentiment', 'Average Sentiment by Symbol', 'Sentiment Score'),
    ('mention_count', 'Mention Count by Symbol', 'Mentions'),
)

if len(fundamental_pd) > 0:
    for metric_column, chart_title, y_axis_label in FUNDAMENTAL_BAR_CHARTS:
        plt.figure(figsize=(12, 6))
        sns.barplot(x='symbol', y=metric_column, data=fundamental_pd)
        plt.title(chart_title)
        plt.xlabel('Symbol')
        plt.ylabel(y_axis_label)
        plt.grid(True)
        plt.tight_layout()
        plt.show()
else:
    print("No fundamental data available")

## 5. Correlation Analysis

In [None]:
# Join the per-day technical snapshot with the fundamental features and plot
# the correlation matrix of every numeric column in the joined frame.
if len(technical_pd) > 0 and len(fundamental_pd) > 0:
    # Normalise both join keys to plain datetime.date values. New frames are
    # built with .assign() so technical_pd / fundamental_pd -- already
    # displayed by earlier cells -- are not modified by this cell.
    technical_dated = technical_pd.assign(
        date=pd.to_datetime(technical_pd['date']).dt.date
    )
    fundamental_dated = fundamental_pd.assign(
        date=pd.to_datetime(fundamental_pd['date']).dt.date
    )

    # Reduce the technical data to one row per (symbol, date).
    # 'last' takes the final value of each group in row order, and
    # technical_pd was collected from Spark without any explicit ordering,
    # so sort by timestamp first to make "last" mean "latest in the day".
    tech_daily = (
        technical_dated
        .sort_values('timestamp', kind='stable')
        .groupby(['symbol', 'date'])
        .agg({
            'close': 'last',
            'ma_5': 'last',
            'ma_15': 'last',
            'ma_60': 'last',
            'rsi_14': 'last',
            'atr_14': 'last'
        })
        .reset_index()
    )

    # Keep only (symbol, date) pairs present in both feature sets.
    merged_df = pd.merge(tech_daily, fundamental_dated, on=['symbol', 'date'], how='inner')

    if len(merged_df) > 0:
        # Select numeric columns for correlation
        numeric_cols = merged_df.select_dtypes(include=[np.number]).columns

        # Compute correlation matrix
        corr_matrix = merged_df[numeric_cols].corr()

        # Plot correlation heatmap
        plt.figure(figsize=(14, 12))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
        plt.title('Correlation Matrix of Technical and Fundamental Features')
        plt.tight_layout()
        plt.show()
    else:
        print("No overlapping data between technical and fundamental indicators")
else:
    print("Insufficient data for correlation analysis")

## 6. Summary and Findings

### Technical Indicators
- Moving Averages (5, 15, 60-period) provide trend information at different time scales
- RSI (14-period) helps identify overbought/oversold conditions
- ATR (14-period) measures volatility

### Fundamental Indicators
- Price-to-Earnings ratio provides valuation context
- Debt-to-Equity ratio indicates financial leverage
- Sentiment analysis from news provides market perception

### Correlation Analysis
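- The correlation heatmap shows the pairwise correlations between the daily technical and fundamental features, computed over the symbol/date pairs present in both feature sets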
- Potential for feature selection based on correlation analysis

### Next Steps
- Integrate these features into the Feast feature store
- Use these features for model training and backtesting

In [None]:
# Stop the Spark session (`spark`) now that all validation cells have run.
spark.stop()