# Task 3: Correlation Analysis between News Sentiment and Multiple Stocks

This notebook analyzes the correlation between news sentiment and multiple stock price movements.
It uses stock-specific news sentiment when the news data includes a ticker column;
otherwise, it falls back to general market sentiment computed from all headlines.

In [None]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import scipy.stats as stats
from typing import Dict, List, Tuple, Optional, Union



# Import functions from utility modules
from src.utils.sentiment_analysis import analyze_headlines, aggregate_daily_sentiment,plot_sentiment_returns_timeseries
from src.utils.correlation_analysis import calculate_returns, align_sentiment_returns, calculate_correlation_metrics, analyze_lagged_correlations

%matplotlib inline
# Apply the 'ggplot' style sheet to every figure created after this point.
plt.style.use('ggplot')
# Default figure size (width, height in inches); cells that pass their own
# `figsize` override it.
plt.rcParams['figure.figsize'] = [12, 6]

## 1. Load and Prepare Data

In [None]:
# Load the raw news data (path is relative to the notebook's working directory).
NEWS_FILE_PATH = '../data/raw_analyst_ratings.csv/raw_analyst_ratings.csv'
try:
    news_df_raw = pd.read_csv(NEWS_FILE_PATH)
except FileNotFoundError:
    print(f"Error: News data file not found at {NEWS_FILE_PATH}")
    # Fall back to an empty frame so later cells can check `news_df_raw.empty`
    # instead of failing with a NameError.
    news_df_raw = pd.DataFrame()
else:
    # Success path kept out of the `try` so only the read itself is guarded.
    print("Raw news data shape:", news_df_raw.shape)
    print("News data columns:", news_df_raw.columns)
    # A bare `news_df_raw.head()` nested in a block is not the cell's last
    # expression, so Jupyter never renders it; display it explicitly.
    display(news_df_raw.head())

In [None]:
# Load one price DataFrame per CSV file in the stock data directory.
# `stock_dfs` maps an upper-cased symbol (the file name without extension)
# to the DataFrame read from that file.
# NOTE(review): the key is the *whole* file stem; if files are named like
# 'AAPL_historical_data.csv' the key will not equal the ticker used in the
# news data — confirm the naming convention of the data directory.
STOCK_DATA_DIR = Path('../data/yfinance_data')
stock_dfs = {}
if STOCK_DATA_DIR.is_dir():
    # glob() yields files in arbitrary order; sort so the load order (and the
    # "first" stock previewed below) is the same on every run.
    stock_files = sorted(STOCK_DATA_DIR.glob('*.csv'))
    if not stock_files:
        print(f"No CSV files found in {STOCK_DATA_DIR}")
    for file in stock_files:
        try:
            symbol = file.stem.upper()  # Filename without extension, upper-cased for consistency
            df = pd.read_csv(file)
            # Later cells need 'Close'; check it before doing any other work on the frame.
            if 'Close' not in df.columns:
                print(f"Warning: 'Close' column not found in {file.name}. Skipping this stock.")
                continue
            df['Date'] = pd.to_datetime(df['Date'])
            stock_dfs[symbol] = df
            print(f"Loaded {symbol} data with shape: {df.shape}")
        except Exception as e:
            # Best effort: one unreadable/malformed file must not stop the others from loading.
            print(f"Error loading or processing stock file {file.name}: {e}")
else:
    print(f"Error: Stock data directory not found at {STOCK_DATA_DIR}")

if stock_dfs:
    # A bare `.head()` nested inside an `if` is never rendered by Jupyter;
    # display the preview of the first loaded stock explicitly.
    display(next(iter(stock_dfs.values())).head())
else:
    print("No stock data loaded.")

## 2. Sentiment Analysis of News
We will perform sentiment analysis on the headlines. If the news data contains a `stock`, `ticker`, or `symbol` column,
sentiment is also aggregated per stock; otherwise it is aggregated by date only.

In [None]:
# Defined up front so later cells can test `.empty` even when the sentiment
# analysis below is skipped.
all_sentiment_results = pd.DataFrame()
daily_sentiment_aggregated = pd.DataFrame()

# Detect which column (if any) of the news data holds the stock symbol.
# Common names: 'stock', 'ticker', 'symbol'
NEWS_SYMBOL_COL = None
potential_symbol_cols = ['stock', 'ticker', 'symbol']
for col in potential_symbol_cols:
    if col in news_df_raw.columns:
        NEWS_SYMBOL_COL = col
        print(f"Found stock symbol column: '{col}'")
        break

# Guard clauses: each message now matches the condition that triggers it
# (the two messages were previously attached to the opposite branches).
if news_df_raw.empty:
    print("News data is empty. Skipping sentiment analysis.")
elif 'headline' not in news_df_raw.columns:
    print("Error: 'headline' column not found in news_df_raw. Cannot perform sentiment analysis.")
else:
    # Score every headline.
    print("Analyzing sentiment for news headlines...")
    sentiment_scores_df = analyze_headlines(news_df_raw['headline'])

    # Attach the publication date; rows whose date cannot be parsed are dropped.
    # NOTE(review): the assignment aligns on index, so this assumes
    # analyze_headlines keeps the index of the input Series — confirm.
    if 'date' in news_df_raw.columns:
        sentiment_scores_df['date'] = pd.to_datetime(news_df_raw['date'], errors='coerce')
        sentiment_scores_df = sentiment_scores_df.dropna(subset=['date'])

    # Attach the stock symbol (upper-cased to match the keys of the stock data).
    if NEWS_SYMBOL_COL:
        sentiment_scores_df[NEWS_SYMBOL_COL] = news_df_raw[NEWS_SYMBOL_COL].str.upper()

    # Keep the per-headline results for later inspection.
    all_sentiment_results = sentiment_scores_df

    # Aggregate sentiment by date (and by symbol when a symbol column was found).
    if 'date' in sentiment_scores_df.columns:
        print("Aggregating daily sentiment...")
        daily_sentiment_aggregated = aggregate_daily_sentiment(
            sentiment_scores_df, 'date', NEWS_SYMBOL_COL
        )
        print(f"Daily sentiment shape: {daily_sentiment_aggregated.shape}")
        # A bare `.head()` nested in a block is never rendered; display it explicitly.
        display(daily_sentiment_aggregated.head())
    else:
        print("Warning: No date column found in news data. Cannot aggregate by date.")

    # Distribution of per-headline polarity scores.
    if not all_sentiment_results.empty:
        plt.figure(figsize=(10, 6))
        sns.histplot(data=all_sentiment_results, x='polarity', bins=50, kde=True)
        plt.title('Distribution of Overall Sentiment Polarity in News')
        plt.xlabel('Polarity Score')
        plt.show()

    # Daily mean polarity over time.
    # NOTE(review): when a symbol column is present the aggregate presumably has
    # one row per (date, symbol), in which case this single line mixes all
    # symbols — confirm against aggregate_daily_sentiment.
    if not daily_sentiment_aggregated.empty:
        plt.figure(figsize=(12, 6))
        plt.plot(daily_sentiment_aggregated['date'], daily_sentiment_aggregated['mean_polarity'])
        plt.title('Daily Average Sentiment Polarity Over Time')
        plt.xlabel('Date')
        plt.ylabel('Mean Polarity')
        plt.grid(True)
        plt.show()

## 3. Calculate Stock Returns

In [None]:
# Build, for every loaded stock, a DataFrame indexed by 'Date' with the
# closing price and its daily return. Results are stored in `stock_returns`
# keyed by symbol.
stock_returns = {}
if stock_dfs:
    for symbol, df in stock_dfs.items():
        try:
            # Daily returns computed from the closing prices.
            returns = calculate_returns(df['Close'])

            # Combine date, close and returns into one frame indexed by date.
            returns_df = pd.DataFrame({
                'Date': df['Date'],
                'Close': df['Close'],
                'Returns': returns
            }).set_index('Date')

            stock_returns[symbol] = returns_df

            # The argument-less plot_sentiment_returns_timeseries() call that
            # used to sit here was removed: it was given no data to plot, and
            # any exception it raised was reported below as a returns failure.
            print(f"Calculated returns for {symbol}")
        except Exception as e:
            # Best effort: a failure for one stock must not stop the others.
            print(f"Error calculating returns for {symbol}: {e}")

    # Preview the returns of the first stock.
    if stock_returns:
        first_symbol = next(iter(stock_returns))
        print(f"\nExample returns for {first_symbol}:")
        # A bare `.head()` nested in a block is never rendered; display it explicitly.
        display(stock_returns[first_symbol].head())
else:
    print("No stock data loaded to calculate returns.")

## 4. Correlation Analysis for Each Stock
We will now correlate the daily sentiment scores with stock returns.
If the news data has a symbol column, each stock is correlated with its own sentiment series, and a stock with
fewer than two days of sentiment is skipped. If there is no symbol column, the general (all-headline) sentiment is used for every stock.

In [None]:
def _failed_correlation(n_observations, error):
    """Return the placeholder metrics record for a stock without correlation values.

    All four correlation fields are None; `n_observations` and `error`
    record how much data was available and why the analysis did not complete.
    """
    return {
        'pearson_correlation': None, 'pearson_p_value': None,
        'spearman_correlation': None, 'spearman_p_value': None,
        'n_observations': n_observations,
        'error': error
    }


correlation_results = {}  # symbol -> metrics dict (or a placeholder record on failure)
aligned_data_cache = {}   # symbol -> aligned sentiment/returns, reused for plotting later

if stock_returns and not daily_sentiment_aggregated.empty:
    for symbol, returns_df in stock_returns.items():
        print(f"\nAnalyzing correlation for {symbol}...")

        # Pick the sentiment series: stock-specific when a symbol column exists,
        # otherwise the general (all-headline) sentiment.
        if NEWS_SYMBOL_COL and NEWS_SYMBOL_COL in daily_sentiment_aggregated.columns:
            symbol_sentiment = daily_sentiment_aggregated[daily_sentiment_aggregated[NEWS_SYMBOL_COL] == symbol]
            if len(symbol_sentiment) > 1:  # Need at least 2 points for correlation
                print(f"Found {len(symbol_sentiment)} days with sentiment data for {symbol}")
                sentiment_to_use = symbol_sentiment
            else:
                print(f"Insufficient sentiment data for {symbol} (only {len(symbol_sentiment)} days). Skipping.")
                correlation_results[symbol] = _failed_correlation(
                    len(symbol_sentiment), 'Insufficient sentiment data'
                )
                continue
        else:
            print(f"No stock-specific sentiment found. Using general market sentiment for {symbol}.")
            sentiment_to_use = daily_sentiment_aggregated

        try:
            # Align sentiment and returns by date. The sentiment date column is
            # 'date' when present, else the first column of the aggregate.
            aligned_sentiment, aligned_returns = align_sentiment_returns(
                sentiment_to_use,
                returns_df.reset_index(),
                'date' if 'date' in sentiment_to_use.columns else daily_sentiment_aggregated.columns[0],
                'Date'
            )

            if len(aligned_sentiment) >= 2:  # Need at least 2 points for correlation
                print(f"Found {len(aligned_sentiment)} days with both sentiment and returns data")

                # Keep the aligned data for the scatter plots in the next section.
                aligned_data_cache[symbol] = {
                    'sentiment': aligned_sentiment,
                    'returns': aligned_returns
                }

                metrics = calculate_correlation_metrics(
                    aligned_sentiment['mean_polarity'],
                    aligned_returns['Returns']
                )

                correlation_results[symbol] = metrics

                print(f"Pearson correlation: {metrics['pearson_correlation']:.4f} (p-value: {metrics['pearson_p_value']:.4f})")
                print(f"Spearman correlation: {metrics['spearman_correlation']:.4f} (p-value: {metrics['spearman_p_value']:.4f})")

                print("\nAnalyzing lagged correlations...")
                lagged_corr = analyze_lagged_correlations(
                    aligned_sentiment['mean_polarity'],
                    aligned_returns['Returns'],
                    max_lag=5
                )

                if not lagged_corr.empty:
                    # Bar chart of the Pearson correlation at each lag.
                    plt.figure(figsize=(12, 6))
                    plt.bar(lagged_corr['lag'], lagged_corr['pearson_correlation'])
                    plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
                    plt.title(f'Lagged Correlations for {symbol} (Sentiment vs. Returns)')
                    plt.xlabel('Lag (days): negative = sentiment leads, positive = returns lead')
                    plt.ylabel('Pearson Correlation')

                    # Annotate each bar with its sample size. Iterating the
                    # columns directly (instead of iterrows, which upcasts a
                    # mixed int/float row to float) and formatting with .0f
                    # keeps the count from rendering as e.g. "N=100.0".
                    for lag, corr, n_obs in zip(
                        lagged_corr['lag'],
                        lagged_corr['pearson_correlation'],
                        lagged_corr['n_observations'],
                    ):
                        plt.text(lag, corr,
                                 f"N={n_obs:.0f}",
                                 ha='center', va='bottom' if corr > 0 else 'top',
                                 fontsize=8)

                    plt.grid(True)
                    plt.show()
                else:
                    print("Could not calculate lagged correlations due to insufficient data")
            else:
                print(f"Insufficient aligned data for {symbol} (only {len(aligned_sentiment)} days). Skipping correlation analysis.")
                correlation_results[symbol] = _failed_correlation(
                    len(aligned_sentiment), 'Insufficient aligned data'
                )
        except Exception as e:
            # Best effort: record the failure for this stock and continue with
            # the next one. The record now carries 'n_observations' like the
            # other failure records (0, matching the summary's default).
            print(f"Error analyzing correlation for {symbol}: {str(e)}")
            correlation_results[symbol] = _failed_correlation(0, str(e))
else:
    print("Cannot perform correlation analysis: missing sentiment data or stock returns.")

## 5. Visualize Correlation Results

In [None]:
# Scatter plot of daily mean sentiment vs. returns for each stock that has
# at least 5 aligned observations, with a fitted line when a correlation
# was computed for that stock.
if aligned_data_cache:
    for symbol, data in aligned_data_cache.items():
        if len(data['sentiment']) < 5:  # Too few points for a meaningful plot
            continue

        x = data['sentiment']['mean_polarity']
        y = data['returns']['Returns']

        plt.figure(figsize=(10, 6))
        plt.scatter(x, y, alpha=0.7)

        # Same validity rule as the summary cell: a result counts as valid
        # unless it is explicitly flagged otherwise, and it must carry a
        # Pearson value. (Defaulting 'valid' to False here meant the line was
        # never drawn for metrics dicts that have no 'valid' key.)
        symbol_metrics = correlation_results.get(symbol, {})
        has_correlation = (
            symbol_metrics.get('valid', True)
            and symbol_metrics.get('pearson_correlation') is not None
        )
        if has_correlation:
            try:
                # Degree-1 least-squares fit: slope m, intercept b.
                m, b = np.polyfit(x, y, 1)
                plt.plot(x, m*x + b, 'r-', alpha=0.7)

                # Correlation summary in the top-left corner (axes coordinates).
                corr = symbol_metrics['pearson_correlation']
                p_val = symbol_metrics['pearson_p_value']
                plt.text(0.05, 0.95, f"Correlation: {corr:.4f}\np-value: {p_val:.4f}",
                         transform=plt.gca().transAxes, fontsize=12,
                         verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.1))
            except Exception as e:
                # The scatter plot is still shown even if the fit/annotation fails.
                print(f"Could not add regression line for {symbol}: {e}")

        plt.title(f'Sentiment vs. Returns for {symbol}')
        plt.xlabel('Mean Sentiment Polarity')
        plt.ylabel('Returns')
        plt.grid(True)
        plt.show()

## 6. Summary of Correlation Analysis

In [None]:
# Summarize the per-stock correlation results as one table, one row per symbol.
if not correlation_results:
    print("No correlation results to summarize.")
else:
    summary_data = []

    for symbol, metrics in correlation_results.items():
        # A result is usable unless flagged invalid, and only if it carries a Pearson value.
        has_correlation = metrics.get('valid', True) and metrics.get('pearson_correlation') is not None

        if has_correlation:
            row = {
                'Symbol': symbol,
                'Pearson Correlation': metrics['pearson_correlation'],
                'Pearson p-value': metrics['pearson_p_value'],
                'Spearman Correlation': metrics['spearman_correlation'],
                'Spearman p-value': metrics['spearman_p_value'],
                'Observations': metrics.get('n_observations', 0),
                'Significant at 5% (Pearson)': metrics['pearson_p_value'] < 0.05,
                'Notes': 'Valid correlation'
            }
        else:
            # Failed or skipped analyses stay in the table, with the reason in 'Notes'.
            row = {
                'Symbol': symbol,
                'Pearson Correlation': None, 'Pearson p-value': None,
                'Spearman Correlation': None, 'Spearman p-value': None,
                'Observations': metrics.get('n_observations', 0),
                'Significant at 5% (Pearson)': False,
                'Notes': metrics.get('error', 'Skipped or failed')
            }
        summary_data.append(row)

    if not summary_data:
        print("No data to generate summary insights.")
    else:
        summary_df = pd.DataFrame(summary_data)
        # Strongest positive correlation first; rows without a value go last.
        try:
            summary_df = summary_df.sort_values('Pearson Correlation', ascending=False, na_position='last')
        except TypeError:
            print("Note: Could not sort summary by Pearson Correlation due to mixed types (likely None values).")

        print("\nComprehensive Summary of Correlation Analysis:")
        print(summary_df)

---
