# Data Exploration: Sentiment-Based LLM Equity Strategy

This notebook explores the text and market data used for the market-labeled sentiment strategy.

**Objectives:**
- Analyze universe composition and characteristics
- Examine text data coverage and quality
- Validate data integrity and timestamp alignment
- Identify potential data issues before model training

In [None]:
# Import libraries
# Data handling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Config parsing and filesystem paths
import yaml
from pathlib import Path
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# which can hide real data or API problems — consider narrowing the filter
# to specific categories/modules.
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
# Default figure size; cells that pass their own figsize override this.
plt.rcParams['figure.figsize'] = (14, 6)

# Simple confirmation that the setup cell ran to completion.
print("Libraries imported successfully")

In [None]:
# Load market and text data.
# Both inputs are CSV files addressed relative to the working directory.
# When a file is absent the corresponding frame is left empty so that the
# `.empty` guards in later cells skip their analysis.
market_data_path = Path('data/market/prices.csv')
text_data_path = Path('data/text/news_social.csv')

# --- Market prices ---
if not market_data_path.exists():
    print("Market data not found. Run data acquisition first.")
    market_df = pd.DataFrame()
else:
    # Parse the 'date' column into datetimes right after reading.
    market_df = pd.read_csv(market_data_path).assign(
        date=lambda frame: pd.to_datetime(frame['date'])
    )
    market_dates = market_df['date']
    print(f"Market data loaded: {len(market_df):,} rows")
    print(f"Date range: {market_dates.min()} to {market_dates.max()}")
    print(f"Unique stocks: {market_df['ticker'].nunique()}")

# --- News / social text ---
if not text_data_path.exists():
    print("\nText data not found. Run data acquisition first.")
    text_df = pd.DataFrame()
else:
    text_df = pd.read_csv(text_data_path).assign(
        date=lambda frame: pd.to_datetime(frame['date'])
    )
    text_dates = text_df['date']
    print(f"\nText data loaded: {len(text_df):,} rows")
    print(f"Date range: {text_dates.min()} to {text_dates.max()}")
    print(f"Unique stocks: {text_df['ticker'].nunique()}")

## 5. Summary and Next Steps

**Key Findings:**
1. Universe size and sector distribution validated
2. Text data coverage and quality assessed
3. Missing values identified and quantified
4. Data ready for sentiment model training

**Next Steps:**
- Proceed to `02_sentiment_analysis.ipynb` to analyze the model's predictions
- Address any data quality issues before training
- Ensure survivorship-bias control in production (e.g., use a source such as CRSP that retains delisted stocks)

In [None]:
# Ticker coverage: how many stocks have text data each day
#
# Coverage is measured against the market universe, so the market dates are
# the reference set. Text is left-joined onto them: a date that only appears
# in the text data has no universe to compare against and would otherwise
# produce a division by zero (inf), corrupting the average and the plot.
if not market_df.empty and not text_df.empty:
    market_coverage = (
        market_df.groupby('date')['ticker']
        .nunique()
        .rename('market_tickers')
        .reset_index()
    )
    text_coverage = (
        text_df.groupby('date')['ticker']
        .nunique()
        .rename('text_tickers')
        .reset_index()
    )

    coverage = market_coverage.merge(text_coverage, on='date', how='left')
    # Market dates without any text rows have zero covered tickers.
    coverage['text_tickers'] = coverage['text_tickers'].fillna(0)

    # Guard the denominator: a date whose tickers are all missing yields NaN
    # (excluded from the mean) instead of inf/NaN from dividing by zero.
    universe_count = coverage['market_tickers'].where(coverage['market_tickers'] > 0)
    coverage['coverage_pct'] = (coverage['text_tickers'] / universe_count * 100).round(1)

    # NOTE(review): text tickers are counted without checking that they belong
    # to the market universe on that date, so values above 100% are possible
    # and would be clipped by the y-limit below — confirm against the data.
    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(coverage['date'], coverage['coverage_pct'], linewidth=2, color='darkblue')
    ax.set_title('Text Data Coverage (%)', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('% of Stocks with Text Data')
    ax.set_ylim([0, 105])
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print(f"Average text coverage: {coverage['coverage_pct'].mean():.1f}%")
    print(f"Days with <50% coverage: {(coverage['coverage_pct'] < 50).sum()}")

In [None]:
# Missing values analysis
def _summarize_missing(frame):
    """Build a per-column table of null counts and null percentages.

    Returns a DataFrame indexed by column name with a 'Count' column (number
    of nulls) and a 'Percentage' column (share of rows, rounded to 2 places).
    """
    null_counts = frame.isnull().sum()
    null_share = (null_counts / len(frame) * 100).round(2)
    return pd.DataFrame({'Count': null_counts, 'Percentage': null_share})


print("Missing Values in Market Data:")
print("="*50)
if market_df.empty:
    print("No market data loaded")
else:
    missing_df = _summarize_missing(market_df)
    # Show only the columns that actually contain nulls.
    print(missing_df[missing_df['Count'] > 0])

print("\n\nMissing Values in Text Data:")
print("="*50)
if text_df.empty:
    print("No text data loaded")
else:
    missing_text_df = _summarize_missing(text_df)
    print(missing_text_df[missing_text_df['Count'] > 0])

## 4. Data Quality Checks

In [None]:
# Sample text examples
# Prints the first five rows' ticker, date, and up to 200 characters of text.
# A missing text value (NaN) is not sliceable, so it is shown as a placeholder
# instead of raising a TypeError; other non-string values are stringified.
if not text_df.empty and 'text' in text_df.columns:
    print("Sample Text Data:")
    print("="*80)
    for _, row in text_df.head(5).iterrows():
        raw_text = row['text']
        if isinstance(raw_text, str):
            snippet = raw_text[:200]
        elif pd.isna(raw_text):
            snippet = "<missing>"
        else:
            snippet = str(raw_text)[:200]
        print(f"\nTicker: {row['ticker']} | Date: {row['date']}")
        print(f"Text: {snippet}...")
        print("-"*80)

In [None]:
# Text length distribution
# Adds a 'text_length' column (character count per row) to text_df and plots
# its histogram with a reference line at 512 characters.
if not text_df.empty and 'text' in text_df.columns:
    text_df['text_length'] = text_df['text'].str.len()
    # Rows with missing text have NaN length; exclude them from the histogram
    # (the summary statistics below already skip NaN by default).
    text_lengths = text_df['text_length'].dropna()

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.hist(text_lengths, bins=50, edgecolor='black', alpha=0.7, color='purple')
    ax.set_title('Text Length Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel('Characters')
    ax.set_ylabel('Frequency')
    # The x-axis is in characters while BERT's limit is in tokens, so the
    # line is only a rough reference and the label says so explicitly.
    ax.axvline(512, color='red', linestyle='--',
               label='512 characters (note: BERT limit is 512 tokens)')
    ax.legend()
    plt.tight_layout()
    plt.show()

    print(f"\nText Length Stats:")
    print(f"Mean: {text_lengths.mean():.0f} characters")
    print(f"Median: {text_lengths.median():.0f} characters")
    print(f"Max: {text_lengths.max():.0f} characters")

In [None]:
# Text volume over time
# Counts text rows per date and plots the daily count as a line chart.
if not text_df.empty:
    text_volume = text_df.groupby('date').size().reset_index(name='n_articles')

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(
        text_volume['date'],
        text_volume['n_articles'],
        linewidth=2,
        color='darkgreen',
    )
    ax.set_title('Text Data Volume Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Articles/Mentions')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    mean_daily_articles = text_volume['n_articles'].mean()
    total_articles = len(text_df)
    print(f"Average daily articles: {mean_daily_articles:.0f}")
    print(f"Total articles: {total_articles:,}")

## 3. Text Data Analysis

In [None]:
# Price and market cap distributions
# 2x2 panel: price histogram, log10 market-cap histogram (if the column
# exists), median close over time, and median volume over time (if the
# column exists). A missing optional column leaves its panel empty.
if not market_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (ax_price, ax_cap), (ax_median_price, ax_median_volume) = axes

    # Top-left: histogram of closing prices, with the $5 filter marked.
    close_prices = market_df['Close']
    ax_price.hist(close_prices.dropna(), bins=50, edgecolor='black', alpha=0.7)
    ax_price.set_title('Price Distribution', fontsize=12, fontweight='bold')
    ax_price.set_xlabel('Price ($)')
    ax_price.set_ylabel('Frequency')
    ax_price.axvline(5, color='red', linestyle='--', label='$5 Filter')
    ax_price.legend()

    # Top-right: market cap on a log10 scale; non-positive values are dropped
    # because their logarithm is undefined.
    if 'market_cap' in market_df.columns:
        market_cap_clean = market_df['market_cap'].dropna()
        market_cap_clean = market_cap_clean[market_cap_clean > 0]
        ax_cap.hist(np.log10(market_cap_clean), bins=50, edgecolor='black', alpha=0.7, color='green')
        ax_cap.set_title('Market Cap Distribution (Log Scale)', fontsize=12, fontweight='bold')
        ax_cap.set_xlabel('Log10(Market Cap)')
        ax_cap.set_ylabel('Frequency')

    # Bottom-left: cross-sectional median close per date.
    price_over_time = market_df.groupby('date')['Close'].median().reset_index()
    ax_median_price.plot(price_over_time['date'], price_over_time['Close'], linewidth=2, color='navy')
    ax_median_price.set_title('Median Price Over Time', fontsize=12, fontweight='bold')
    ax_median_price.set_xlabel('Date')
    ax_median_price.set_ylabel('Median Price ($)')
    ax_median_price.grid(True, alpha=0.3)

    # Bottom-right: cross-sectional median volume per date.
    if 'Volume' in market_df.columns:
        volume_over_time = market_df.groupby('date')['Volume'].median().reset_index()
        ax_median_volume.plot(volume_over_time['date'], volume_over_time['Volume'], linewidth=2, color='orange')
        ax_median_volume.set_title('Median Volume Over Time', fontsize=12, fontweight='bold')
        ax_median_volume.set_xlabel('Date')
        ax_median_volume.set_ylabel('Median Volume')
        ax_median_volume.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # NOTE(review): the "below $5" figure is the share of rows in market_df
    # (rows with a missing Close count in the denominator), not the share of
    # distinct stocks.
    below_five_pct = (close_prices < 5).sum() / len(market_df) * 100
    print(f"\nPrice Stats:")
    print(f"Mean: ${close_prices.mean():.2f}")
    print(f"Median: ${close_prices.median():.2f}")
    print(f"Stocks below $5: {below_five_pct:.1f}%")

In [None]:
# Sector distribution (GICS Level 1)
# Uses only the rows from the most recent date in market_df and counts rows
# per sector, shown as both a pie chart and a bar chart.
if not market_df.empty and 'sector' in market_df.columns:
    latest_date = market_df['date'].max()
    latest_data = market_df.loc[market_df['date'] == latest_date]
    sector_dist = latest_data['sector'].value_counts()

    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: share of each sector.
    pie_ax.pie(
        sector_dist.values,
        labels=sector_dist.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    pie_ax.set_title('Sector Distribution (Latest Date)', fontsize=14, fontweight='bold')

    # Right panel: absolute counts per sector.
    sector_dist.plot(kind='bar', ax=bar_ax, color='steelblue')
    bar_ax.set_title('Number of Stocks by Sector', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Sector')
    bar_ax.set_ylabel('Number of Stocks')
    bar_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("\nSector Distribution:")
    print(sector_dist)

In [None]:
# Universe composition over time
# Number of distinct tickers present in market_df on each date.
if not market_df.empty:
    universe_size = (
        market_df.groupby('date')['ticker']
        .nunique()
        .rename('n_stocks')
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(universe_size['date'], universe_size['n_stocks'], linewidth=2)
    ax.set_title('Universe Size Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Stocks')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    stock_counts = universe_size['n_stocks']
    print(f"Average universe size: {stock_counts.mean():.0f} stocks")
    print(f"Min: {stock_counts.min()}, Max: {stock_counts.max()}")

## 2. Market Data Analysis

In [None]:
# Load configuration
# Reads config.yaml from the working directory and echoes the strategy name
# and the benchmark targets stored under evaluation -> benchmarks.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

print("Strategy:", config['strategy']['name'])

# Looked up after the first print so a missing key fails at the same point
# as a direct nested access would.
benchmark_targets = config['evaluation']['benchmarks']
print("Target Annual Return:", benchmark_targets['annualized_return'])
print("Target Sharpe Ratio:", benchmark_targets['sharpe_ratio'])

## 1. Load Configuration and Data