## Summary

**Key Findings:**
1. Sentiment score distribution characteristics validated
2. Correlation with next-day returns measured
3. Sentiment spread tracked for signal decay detection
4. Sector biases identified for neutrality constraints
5. Market cap hypothesis tested (information asymmetry)

**Next Steps:**
- Proceed to `03_portfolio_construction.ipynb` for portfolio formation analysis
- Monitor sentiment spread for signal decay
- Consider sector-neutral constraints in portfolio construction

In [None]:
if not merged_df.empty and 'market_cap' in merged_df.columns:
    # Market-cap analysis: bucket rows into three market-cap groups and compare
    # the sentiment vs next-day-return correlation across the groups.
    cap_labels = ['Small', 'Mid', 'Large']

    if merged_df['market_cap'].notna().sum() < len(cap_labels):
        # Three non-empty buckets cannot be formed from fewer than three values.
        print("Not enough market-cap observations to form Small/Mid/Large buckets")
    else:
        # Classify stocks by market cap.
        # qcut on the raw values with duplicates='drop' can return fewer than
        # three bins when bin edges tie, which then raises because three labels
        # were supplied. Ranking with method='first' makes every value unique,
        # so exactly three buckets are always produced (ties are split by row order).
        cap_rank = merged_df['market_cap'].rank(method='first')
        merged_df['cap_category'] = pd.qcut(cap_rank, q=len(cap_labels), labels=cap_labels)

        # Sentiment correlation with returns by market cap
        cap_analysis = (
            merged_df.groupby('cap_category', observed=True)[['sentiment', 'next_return']]
            .apply(lambda g: g['sentiment'].corr(g['next_return']))
            .rename('Correlation')
            .rename_axis('Market Cap')
            .reset_index()
        )
        # Plain strings make both plotting and label lookup straightforward.
        cap_analysis['Market Cap'] = cap_analysis['Market Cap'].astype(str)

        bar_colors = ['lightcoral', 'lightblue', 'lightgreen'][:len(cap_analysis)]
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.bar(cap_analysis['Market Cap'], cap_analysis['Correlation'],
               color=bar_colors, edgecolor='black')
        ax.set_title('Sentiment-Return Correlation by Market Cap\n(Information Asymmetry Test)',
                     fontsize=14, fontweight='bold')
        ax.set_ylabel('Correlation')
        ax.axhline(0, color='red', linestyle='--', alpha=0.5)
        ax.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.show()

        print("\nSentiment-Return Correlation by Market Cap:")
        print("="*50)
        print(cap_analysis)
        print("\nHypothesis: Smaller companies more sensitive to sentiment")
        # Compare by label rather than by row position so the check cannot
        # silently compare the wrong buckets.
        corr_by_cap = cap_analysis.set_index('Market Cap')['Correlation']
        small_beats_large = corr_by_cap['Small'] > corr_by_cap['Large']
        print(f"Validated: {'✓ Yes' if small_beats_large else '✗ No'}")

## 7. Market Cap Analysis (Information Asymmetry Hypothesis)

In [None]:
if not (merged_df.empty or 'sector' not in merged_df.columns):
    # One box per sector showing how the sentiment scores are distributed.
    fig, ax = plt.subplots(figsize=(14, 6))
    merged_df.boxplot(column='sentiment', by='sector', ax=ax, rot=45)
    ax.set_title('Sentiment Distribution by Sector', fontsize=14, fontweight='bold')
    ax.set(xlabel='Sector (GICS L1)', ylabel='Sentiment Score')
    # pandas adds its own "grouped by" super-title; blank it out.
    fig.suptitle('')
    fig.tight_layout()
    plt.show()

    # Mean sentiment per sector, lowest first.
    sector_means = merged_df.groupby('sector')['sentiment'].mean()
    sector_sentiment = sector_means.sort_values()
    print("\nAverage Sentiment by Sector:", "=" * 50, sector_sentiment, sep="\n")

## 6. Sector Analysis

In [None]:
if not merged_df.empty:
    # Sentiment spread monitoring: per date, average sentiment of the top
    # decile (long leg) minus average sentiment of the bottom decile (short leg).

    # Calculate sentiment spread (Long - Short)
    merged_df['rank'] = merged_df.groupby('date')['sentiment'].rank(pct=True)
    long_leg = merged_df[merged_df['rank'] >= 0.9]
    short_leg = merged_df[merged_df['rank'] <= 0.1]

    long_sentiment = long_leg.groupby('date')['sentiment'].mean().rename('long_sentiment')
    short_sentiment = short_leg.groupby('date')['sentiment'].mean().rename('short_sentiment')

    # Join the two legs on their date index. A date with fewer than ten names
    # has no row with rank <= 0.1, so the legs can cover different dates;
    # stitching raw .values arrays together would then either raise on a
    # length mismatch or pair values from different dates.
    sentiment_spread = (
        pd.concat([long_sentiment, short_sentiment], axis=1, join='inner')
        .sort_index()
        .rename_axis('date')
        .reset_index()
    )
    sentiment_spread['spread'] = sentiment_spread['long_sentiment'] - sentiment_spread['short_sentiment']

    if sentiment_spread.empty:
        print("No dates have both a long and a short leg; cannot compute sentiment spread")
    else:
        mean_spread = sentiment_spread['spread'].mean()

        fig, ax = plt.subplots(figsize=(14, 5))
        ax.plot(sentiment_spread['date'], sentiment_spread['spread'], linewidth=2, color='darkblue')
        ax.set_title('Sentiment Spread (Long - Short)', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Sentiment Spread')
        ax.axhline(mean_spread, color='red', linestyle='--',
                   label=f"Mean: {mean_spread:.3f}")
        ax.grid(True, alpha=0.3)
        ax.legend()
        plt.tight_layout()
        plt.show()

        # Alert if spread narrowing
        # NOTE(review): the threshold is half the spread's standard deviation,
        # compared against the recent spread *level*; confirm this is the
        # intended decay rule rather than e.g. a distance from the historical mean.
        threshold = 0.5 * sentiment_spread['spread'].std()
        recent_spread = sentiment_spread['spread'].iloc[-20:].mean()
        print(f"Recent 20-day avg spread: {recent_spread:.3f}")
        print(f"Alert threshold: {threshold:.3f}")
        print(f"Status: {'⚠️ ALERT - Spread narrowing!' if recent_spread < threshold else '✓ Normal'}")

## 5. Sentiment Spread Monitoring

In [None]:
if not merged_df.empty:
    # Bucket every row by sentiment score: 0-based qcut codes shifted to start at 1.
    decile_codes = pd.qcut(merged_df['sentiment'], q=10, labels=False, duplicates='drop')
    merged_df['sentiment_decile'] = decile_codes + 1

    # Mean next-day return within each sentiment bucket.
    mean_by_decile = merged_df.groupby('sentiment_decile')['next_return'].mean()
    decile_returns = mean_by_decile.reset_index()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(
        decile_returns['sentiment_decile'],
        decile_returns['next_return'],
        color='steelblue',
        edgecolor='black',
        alpha=0.7,
    )
    ax.set_title('Average Next-Day Return by Sentiment Decile', fontsize=14, fontweight='bold')
    ax.set(xlabel='Sentiment Decile (1=Lowest, 10=Highest)', ylabel='Average Next-Day Return')
    ax.axhline(0, color='red', linestyle='--', alpha=0.5)
    ax.grid(True, alpha=0.3, axis='y')
    fig.tight_layout()
    plt.show()

    print("\nAverage Return by Decile:", "=" * 50, decile_returns, sep="\n")

    # Spread between the last and first rows of the table, shown in percent.
    top_return = decile_returns.iloc[-1]['next_return']
    bottom_return = decile_returns.iloc[0]['next_return']
    print(f"\nLong-Short Spread (D10 - D1): {(top_return - bottom_return) * 100:.2f}%")

## 4. Cross-Sectional Analysis: Decile Returns

In [None]:
# Rolling correlation over time
# Pearson correlation between sentiment and next-day return, pooled over every
# observation in the trailing 252 dates.
#
# A row-based rolling window on the stacked ticker/date panel would span 252
# rows (often only a handful of dates), not 252 days, and plotting one value
# per row against every second date fails because the lengths differ.
# Per-date sums are accumulated first so that the window counts dates.
if not merged_df.empty and len(merged_df) > 100:
    window_days = 252

    pairs = merged_df.dropna(subset=['sentiment', 'next_return'])
    # Correlation is unaffected by shifting either variable; centring on the
    # global means keeps the sums of squares numerically well behaved.
    x = pairs['sentiment'] - pairs['sentiment'].mean()
    y = pairs['next_return'] - pairs['next_return'].mean()

    daily_sums = (
        pd.DataFrame({
            'date': pairs['date'],
            'n': 1.0,
            'x': x,
            'y': y,
            'xy': x * y,
            'xx': x * x,
            'yy': y * y,
        })
        .groupby('date')
        .sum()
        .sort_index()
    )

    if len(daily_sums) < window_days:
        print(f"Need at least {window_days} dates for the rolling correlation; found {len(daily_sums)}")
    else:
        window_sums = daily_sums.rolling(window=window_days).sum()
        n_obs = window_sums['n']
        cov_xy = window_sums['xy'] - window_sums['x'] * window_sums['y'] / n_obs
        var_x = window_sums['xx'] - window_sums['x'] ** 2 / n_obs
        var_y = window_sums['yy'] - window_sums['y'] ** 2 / n_obs
        # Clip tiny negative rounding errors and avoid dividing by zero.
        denom = np.sqrt((var_x * var_y).clip(lower=0)).replace(0, np.nan)
        rolling_corr = cov_xy / denom

        fig, ax = plt.subplots(figsize=(14, 5))
        ax.plot(rolling_corr.index, rolling_corr.values, linewidth=2, color='darkgreen')
        ax.set_title('Rolling 252-Day Correlation (Sentiment vs Next-Day Return)', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Correlation')
        ax.axhline(0, color='red', linestyle='--', alpha=0.5)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

In [None]:
if not merged_df.empty:
    # Scatter plot: Sentiment vs Next-Day Return
    # Size the sample from the rows that survive dropna: asking for
    # min(1000, len(merged_df)) rows can exceed what is left once rows with a
    # missing sentiment or next_return are removed, which makes sample() raise.
    # A fixed random_state keeps the figure reproducible.
    valid_pairs = merged_df.dropna(subset=['sentiment', 'next_return'])
    sample_df = valid_pairs.sample(min(1000, len(valid_pairs)), random_state=42)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(sample_df['sentiment'], sample_df['next_return'], alpha=0.3, s=20)

    # Add regression line (needs at least two distinct x values to be defined)
    if sample_df['sentiment'].nunique() >= 2:
        slope, intercept = np.polyfit(sample_df['sentiment'], sample_df['next_return'], 1)
        fitted_line = np.poly1d([slope, intercept])
        sorted_sentiment = sample_df['sentiment'].sort_values()
        # '+' in the format spec prints the intercept's own sign, avoiding "x+-0.0001".
        ax.plot(sorted_sentiment, fitted_line(sorted_sentiment),
                "r--", linewidth=2, label=f"y={slope:.4f}x{intercept:+.4f}")
        ax.legend()

    ax.set_title('Sentiment Score vs Next-Day Return', fontsize=14, fontweight='bold')
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Next-Day Return')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Correlation (computed on the full data, not just the plotted sample)
    corr = merged_df[['sentiment', 'next_return']].corr().iloc[0, 1]
    print(f"Correlation (Sentiment vs Next-Day Return): {corr:.4f}")

## 3. Sentiment vs Returns Correlation

In [None]:
if not merged_df.empty:
    # Sentiment distribution: histogram, Q-Q plot, descriptive statistics and a
    # Shapiro-Wilk normality test, all computed on the same non-missing scores.
    # Missing values are dropped once up front so that the histogram and the
    # Shapiro-Wilk test see the same data as the Q-Q plot.
    sentiment = merged_df['sentiment'].dropna()

    if len(sentiment) < 3:
        # Shapiro-Wilk needs at least three observations.
        print("Not enough non-missing sentiment scores for distribution analysis")
    else:
        sentiment_mean = sentiment.mean()

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Histogram
        axes[0].hist(sentiment, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
        axes[0].set_title('Sentiment Score Distribution', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Sentiment Score')
        axes[0].set_ylabel('Frequency')
        axes[0].axvline(sentiment_mean, color='red', linestyle='--', label=f"Mean: {sentiment_mean:.3f}")
        axes[0].legend()

        # Q-Q plot (normality test)
        stats.probplot(sentiment, dist="norm", plot=axes[1])
        axes[1].set_title('Q-Q Plot (Normality Test)', fontsize=14, fontweight='bold')
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Descriptive stats
        print("Sentiment Score Statistics:")
        print("="*50)
        print(sentiment.describe())

        # Normality test on at most 5000 scores; seeded so the reported
        # p-value does not change between runs.
        shapiro_sample = sentiment.sample(min(5000, len(sentiment)), random_state=42)
        _, p_value = stats.shapiro(shapiro_sample)
        print(f"\nShapiro-Wilk p-value: {p_value:.4f}")
        print(f"Normal distribution: {'Yes' if p_value > 0.05 else 'No'}")

## 2. Sentiment Score Distribution Analysis

In [None]:
# Load sentiment scores
# Falls back to an empty frame when the CSV is missing; the merge step below
# then leaves merged_df empty as well.
try:
    sentiment_df = pd.read_csv('data/portfolios/sentiment_scores.csv')
    sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])
    print(f"Sentiment scores loaded: {len(sentiment_df):,} rows")
except FileNotFoundError:
    print("Run inference first: python main.py --mode inference")
    sentiment_df = pd.DataFrame()

# Load market data
try:
    market_df = pd.read_csv('data/market/prices.csv')
    market_df['date'] = pd.to_datetime(market_df['date'])
    print(f"Market data loaded: {len(market_df):,} rows")
except FileNotFoundError:
    print("Run data acquisition first")
    market_df = pd.DataFrame()

# Merge
if not sentiment_df.empty and not market_df.empty:
    # Calculate next-day return on the full price history BEFORE merging.
    # Shifting after the inner join would, for any ticker without a sentiment
    # score on every date, pick up the return of the next *scored* date rather
    # than the next row in the price data.
    market_sorted = market_df.sort_values(['ticker', 'date'])
    market_sorted = market_sorted.assign(
        next_return=market_sorted.groupby('ticker')['return'].shift(-1)
    )

    # 'sector' and 'market_cap' are only carried over when the price file has
    # them, so a file without these columns does not raise a KeyError here.
    optional_cols = [col for col in ('sector', 'market_cap') if col in market_sorted.columns]
    market_cols = ['ticker', 'date', 'return', 'next_return'] + optional_cols

    merged_df = sentiment_df.merge(
        market_sorted[market_cols],
        on=['ticker', 'date'],
        how='inner'
    )
    merged_df = merged_df.sort_values(['ticker', 'date'])
    print(f"\nMerged data: {len(merged_df):,} rows")
else:
    merged_df = pd.DataFrame()

## 1. Load Sentiment Scores and Market Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/numpy deprecation and runtime warnings; consider narrowing
# the filter to specific categories.
warnings.filterwarnings('ignore')

# Global plot styling: seaborn white-grid theme and a default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Sentiment Analysis: Market-Labeled BERT Predictions

This notebook analyzes the sentiment scores generated by the Smarty BERT model.

**Key Questions:**
1. Are sentiment scores normally distributed?
2. Do sentiment scores predict next-day returns?
3. Is there a sentiment spread between long/short portfolios?
4. Are there sector or market-cap biases?