# In [None]:
# Correlation Analysis Notebook
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import DataLoader
from sentiment_analyzer import SentimentAnalyzer
from technical_indicators import TechnicalIndicators
from correlation_analysis import CorrelationAnalysis

# Initialize the project helper classes imported above (resolved through the
# '../src' entry appended to sys.path). All are constructed with no arguments.
data_loader = DataLoader()
sentiment_analyzer = SentimentAnalyzer()
tech_indicators = TechnicalIndicators()
correlation_analyzer = CorrelationAnalysis()

# Load the news data and the collection of tickers to iterate over later.
# NOTE(review): the data sources and returned schema are defined inside
# DataLoader and are not visible here; the code below only relies on the news
# data having a 'stock' column after sentiment analysis - confirm.
news_data = data_loader.load_news_data()
tickers = data_loader.get_available_tickers()

# Score the news, then aggregate the scored news into daily sentiment.
# NOTE(review): `daily_sentiment` (aggregated over all tickers) is not
# referenced again in the visible part of this file; the per-ticker loop
# recomputes daily scores on a filtered subset - confirm whether it is needed.
news_with_sentiment = sentiment_analyzer.analyze_news_sentiment(news_data)
daily_sentiment = sentiment_analyzer.get_daily_sentiment_scores(news_with_sentiment)

# Analyze each stock.
#
# For every ticker: load its price data, add technical indicators, restrict
# the scored news to that ticker, run the correlation analysis, print the
# report and draw a 2x2 diagnostic figure. Tickers with no price data or no
# sentiment rows are skipped. Results are collected per ticker in
# `correlation_results_all` for the summary further down.
correlation_results_all = {}

for ticker in tickers:
    print(f"\n{'='*50}")
    print(f"Analyzing {ticker}")
    print(f"{'='*50}")

    # Load stock data; the loader's result is looked up by ticker, so a
    # missing key means nothing was loaded for this symbol.
    stock_data = data_loader.load_stock_data([ticker])
    if ticker not in stock_data:
        continue

    stock_df = stock_data[ticker]
    stock_with_indicators = tech_indicators.calculate_all_indicators(stock_df)

    # Filter sentiment for this ticker (rows whose 'stock' column matches).
    ticker_sentiment = news_with_sentiment[news_with_sentiment['stock'] == ticker]
    daily_ticker_sentiment = sentiment_analyzer.get_daily_sentiment_scores(ticker_sentiment)

    if len(daily_ticker_sentiment) == 0:
        print(f"No sentiment data for {ticker}")
        continue

    # Perform correlation analysis
    correlation_results = correlation_analyzer.comprehensive_correlation_analysis(
        daily_ticker_sentiment, stock_with_indicators
    )

    correlation_results_all[ticker] = correlation_results

    # Print correlation report
    report = correlation_analyzer.generate_correlation_report(correlation_results)
    print(report)

    # Visualization: Sentiment vs Returns
    sentiment_aligned, stock_aligned = correlation_analyzer.align_data(
        daily_ticker_sentiment, stock_with_indicators
    )

    if len(sentiment_aligned) > 0:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Plot 1: Sentiment vs Price (two y-axes sharing the same x-axis)
        ax1 = axes[0, 0]
        ax1.plot(sentiment_aligned.index, sentiment_aligned['avg_sentiment'],
                 color='blue', label='Avg Sentiment', alpha=0.7)
        ax1.set_ylabel('Sentiment Score', color='blue')
        ax1.tick_params(axis='y', labelcolor='blue')
        ax1.legend(loc='upper left')

        ax1b = ax1.twinx()
        ax1b.plot(stock_aligned.index, stock_aligned['Close'],
                  color='red', label='Close Price', alpha=0.7)
        ax1b.set_ylabel('Price', color='red')
        ax1b.tick_params(axis='y', labelcolor='red')
        ax1b.legend(loc='upper right')
        ax1.set_title(f'{ticker} - Sentiment vs Price')

        # Plot 2: Sentiment vs Daily Returns
        ax2 = axes[0, 1]
        scatter = ax2.scatter(sentiment_aligned['avg_sentiment'],
                              stock_aligned['daily_return'] * 100,
                              alpha=0.6, c=stock_aligned.index, cmap='viridis')
        ax2.set_xlabel('Average Sentiment')
        ax2.set_ylabel('Daily Return (%)')
        # The summary code treats 'sentiment_vs_returns' as optional, so do
        # the same here instead of raising KeyError halfway through a figure.
        if 'sentiment_vs_returns' in correlation_results:
            returns_stats = correlation_results['sentiment_vs_returns']
        else:
            returns_stats = {}
        pearson = returns_stats.get('pearson_correlation', 0)
        ax2.set_title(f'{ticker} - Sentiment vs Returns\nCorrelation: {pearson:.3f}')
        plt.colorbar(scatter, ax=ax2, label='Time')

        # Plot 3: Sentiment distribution by return sign
        ax3 = axes[1, 0]
        positive_returns = sentiment_aligned[stock_aligned['daily_return'] > 0]['avg_sentiment']
        negative_returns = sentiment_aligned[stock_aligned['daily_return'] < 0]['avg_sentiment']

        ax3.boxplot([positive_returns, negative_returns],
                    labels=['Positive Returns', 'Negative Returns'])
        ax3.set_ylabel('Sentiment Score')
        ax3.set_title('Sentiment Distribution by Return Sign')

        # Plot 4: Cumulative returns by sentiment days, split at the median
        # sentiment (computed once; days equal to the median count as "low").
        ax4 = axes[1, 1]
        median_sentiment = sentiment_aligned['avg_sentiment'].median()
        high_sentiment_days = sentiment_aligned['avg_sentiment'] > median_sentiment
        low_sentiment_days = sentiment_aligned['avg_sentiment'] <= median_sentiment

        high_sentiment_cumulative = (1 + stock_aligned[high_sentiment_days]['daily_return']).cumprod()
        low_sentiment_cumulative = (1 + stock_aligned[low_sentiment_days]['daily_return']).cumprod()

        ax4.plot(high_sentiment_cumulative.index, high_sentiment_cumulative,
                 label='High Sentiment Days', color='green')
        ax4.plot(low_sentiment_cumulative.index, low_sentiment_cumulative,
                 label='Low Sentiment Days', color='red')
        ax4.set_ylabel('Cumulative Return')
        ax4.set_title('Cumulative Returns: High vs Low Sentiment Days')
        ax4.legend()

        plt.tight_layout()
        plt.show()
        # Release the figure so one-figure-per-ticker does not accumulate
        # open figures on backends that do not close them on show().
        plt.close(fig)

# Summary correlation analysis.
#
# Collect one row per ticker whose results contain 'sentiment_vs_returns',
# print the table, and plot the distribution of Pearson correlations.
summary_columns = [
    'ticker',
    'pearson_correlation',
    'spearman_correlation',
    'sample_size',
    'p_value',
]
summary_data = []
for ticker, results in correlation_results_all.items():
    if 'sentiment_vs_returns' in results:
        returns_stats = results['sentiment_vs_returns']
        summary_data.append({
            'ticker': ticker,
            'pearson_correlation': returns_stats.get('pearson_correlation', 0),
            'spearman_correlation': returns_stats.get('spearman_correlation', 0),
            'sample_size': returns_stats.get('sample_size', 0),
            'p_value': returns_stats.get('pearson_p_value', 1),
        })

# Passing the columns explicitly keeps the frame's schema stable even when no
# ticker produced results; a bare DataFrame([]) would have no columns and the
# column lookups below would raise KeyError.
summary_df = pd.DataFrame(summary_data, columns=summary_columns)
print("\nSUMMARY CORRELATION ACROSS ALL STOCKS:")
print(summary_df)

# Plot overall correlation distribution. Missing/undefined correlations are
# dropped first: a NaN in the data makes the histogram's automatic range
# non-finite, and the mean already ignores them.
pearson_values = pd.to_numeric(
    summary_df['pearson_correlation'], errors='coerce'
).dropna()

if pearson_values.empty:
    print("No sentiment-return correlations available to plot.")
else:
    mean_correlation = pearson_values.mean()
    plt.figure(figsize=(10, 6))
    plt.hist(pearson_values, bins=20, edgecolor='black', alpha=0.7)
    plt.axvline(mean_correlation, color='red', linestyle='--',
                label=f'Mean: {mean_correlation:.3f}')
    plt.xlabel('Pearson Correlation Coefficient')
    plt.ylabel('Frequency')
    plt.title('Distribution of Sentiment-Return Correlations Across Stocks')
    plt.legend()
    plt.show()