## Importing dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.stats import pearsonr
import nltk
%matplotlib inline
import re

## Load news and stock data

In [None]:
print("=== LOADING NEWS AND ALL STOCK DATA ===")

# Read the raw headlines. The 'date' column is treated as plain text here
# (same approach as Task 1) and its parts are pulled out with regexes rather
# than parsed into datetimes.
news_df = pd.read_csv('../data/raw_analyst_ratings.csv')
news_df['date_str'] = news_df['date'].astype(str)

# Rows whose text contains no YYYY-MM-DD substring get NaN in all three
# derived columns. expand=False makes str.extract return a Series directly.
date_text = news_df['date_str'].str
news_df['year'] = date_text.extract(r'(\d{4})-\d{2}-\d{2}', expand=False).astype(float)
news_df['month'] = date_text.extract(r'\d{4}-(\d{2})-\d{2}', expand=False).astype(float)
news_df['date_simple'] = date_text.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

# Extract time if available
def extract_hour(date_str):
    """Return the hour of the first ``H:MM:SS`` time found in *date_str*.

    Args:
        date_str: Any value. It is converted with ``str()`` before searching,
            so non-string inputs such as ``None`` or NaN are accepted.

    Returns:
        The hour as an ``int`` when an ``H:MM:SS`` / ``HH:MM:SS`` pattern is
        present, otherwise ``0``. A result of ``0`` therefore means either
        "midnight" or "no time component found".
    """
    time_match = re.search(r'(\d{1,2}):(\d{2}):(\d{2})', str(date_str))
    if time_match is None:
        return 0  # Default to midnight if no time found
    # The captured group is digits-only by construction, so int() cannot fail
    # and no blanket exception handler is needed.
    return int(time_match.group(1))

# Publication hour per article (0 when the timestamp text has no time part).
news_df['hour'] = news_df['date_str'].map(extract_hour)

articles_with_dates = news_df['date_simple'].notna().sum()
print(f"News Data: {len(news_df):,} articles")
print(f"   Date extraction: {articles_with_dates:,} articles with dates")

# Price-history CSV for each of the six tickers, keyed by symbol.
stock_files = {
    'AAPL': '../data/AAPL.csv',
    'MSFT': '../data/MSFT.csv',
    'GOOG': '../data/GOOG.csv',
    'AMZN': '../data/AMZN.csv',
    'NVDA': '../data/NVDA.csv',
    'META': '../data/META.csv',
}

# One frame per ticker: parsed 'Date' becomes the index and a 'stock' column
# carrying the symbol is appended.
stocks_data = {}
for stock_name, file_path in stock_files.items():
    df = (
        pd.read_csv(file_path)
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
        .assign(stock=stock_name)
    )
    stocks_data[stock_name] = df
    print(f" {stock_name}: {len(df):,} trading days")

# Stack every ticker into one long frame and add a plain-date column.
all_stocks_df = pd.concat(stocks_data.values()).assign(
    date_only=lambda frame: frame.index.date
)

news_dates = news_df['date_simple']
stock_index = all_stocks_df.index
print(f"\n Date Ranges:")
print(f"   News: {news_dates.min()} to {news_dates.max()}")
print(f"   Stocks: {stock_index.min()} to {stock_index.max()}")
print(f"   Total Stock Records: {len(all_stocks_df):,}")

# Quick look at both datasets.
news_preview_columns = ['headline', 'date_simple', 'hour', 'publisher', 'stock']
stock_preview_columns = ['stock', 'Close', 'Volume']
print(f"\n News Sample:")
print(news_df[news_preview_columns].head(3))
print(f"\n Stocks Sample:")
print(all_stocks_df[stock_preview_columns].head(6))

## Date alignment

In [None]:
print("=== DATE ALIGNMENT ===")

# Simple alignment: match each article to a trading day through its extracted
# 'date_simple' string, because the raw 'date' column has mixed formats.

def simple_date_alignment(news_row, stock_dates_dict):
    """Map a news row to the trading day it should be attributed to.

    Args:
        news_row: Mapping-like row (``dict`` or ``pd.Series``) providing
            ``'date_simple'`` (a ``YYYY-MM-DD`` string or NaN) and ``'stock'``.
        stock_dates_dict: Dict of ticker -> collection of ``datetime.date``
            trading days. The collection does not need to be sorted.

    Returns:
        A ``datetime.date`` or ``None``:

        * the earliest trading day on or after the news date, which is the
          news date itself when that is a trading day;
        * when the news is later than every trading day, the latest trading
          day;
        * ``None`` when the date is missing or cannot be parsed, or when the
          ticker has no trading days in ``stock_dates_dict``.

        NOTE: news dated before the first trading day is mapped to the first
        trading day, however large the gap.
    """
    date_simple = news_row['date_simple']
    if pd.isna(date_simple):
        return None

    # Tickers without price data can never align, so skip date parsing.
    stock_dates = stock_dates_dict.get(news_row['stock'], [])
    if len(stock_dates) == 0:
        return None

    try:
        news_date = pd.to_datetime(date_simple).date()
        # A trading day equal to news_date satisfies ">=", so this single scan
        # also covers the exact-match case.
        aligned = min((d for d in stock_dates if d >= news_date), default=None)
        if aligned is None:
            aligned = max((d for d in stock_dates if d <= news_date), default=None)
    except (ValueError, TypeError, OverflowError):
        # Unparseable date text or non-comparable values: treat as unaligned.
        return None

    return aligned

# Sorted trading dates (as datetime.date objects) for each stock.
trading_dates_by_stock = {}
for stock_name, df in stocks_data.items():
    trading_dates_by_stock[stock_name] = sorted(df.index.date)

print(" Aligning news with trading days...")

# Only articles whose ticker has price data can be aligned; every other row
# would come back as None anyway. Restricting the row-wise apply to those
# rows avoids running it over the whole news file.
has_price_data = news_df['stock'].isin(list(trading_dates_by_stock))
aligned_dates = pd.Series(None, index=news_df.index, dtype=object)
if has_price_data.any():
    aligned_dates.loc[has_price_data] = news_df.loc[has_price_data].apply(
        lambda row: simple_date_alignment(row, trading_dates_by_stock),
        axis=1
    )
news_df['aligned_date'] = aligned_dates

# Check alignment success (guarding the percentage against an empty frame).
aligned_count = news_df['aligned_date'].notna().sum()
total_articles = len(news_df)
aligned_pct = aligned_count / total_articles * 100 if total_articles else 0.0
print(f" Successfully aligned: {aligned_count:,}/{total_articles:,} articles ({aligned_pct:.1f}%)")

# Show alignment by stock
print(f"\n Alignment by Stock:")
for stock in stock_files:
    stock_news = news_df[news_df['stock'] == stock]
    aligned_stock = stock_news['aligned_date'].notna().sum()
    total_stock = len(stock_news)
    if total_stock > 0:
        print(f"   {stock}: {aligned_stock:,}/{total_stock:,} articles ({aligned_stock/total_stock*100:.1f}%)")

# Show some examples
print(f"\n Alignment Examples:")
aligned_samples = news_df[news_df['aligned_date'].notna()].head(5)
for _, row in aligned_samples.iterrows():
    # The arrow was mis-encoded ("‚Üí") in the original string; restored to "→".
    print(f"   {row['date_simple']} (hour: {row['hour']:02d}) → {row['aligned_date']} | {row['stock']}")

## Sentiments

In [None]:
print("=== SENTIMENT ANALYSIS FOR ALL STOCKS ===")

# Initialize sentiment analyzers. If the VADER analyzer cannot be built, fall
# back to TextBlob. `except Exception` (not a bare `except`) keeps the
# best-effort fallback without swallowing KeyboardInterrupt / SystemExit.
try:
    vader_analyzer = SentimentIntensityAnalyzer()
except Exception:
    vader_available = False
    print("  VADER not available, using TextBlob only")
else:
    vader_available = True
    print(" Using VADER for sentiment analysis")

def analyze_sentiment_vader(text):
    """Return a single sentiment score for *text*.

    The input is converted with ``str()`` first. When the module-level flag
    ``vader_available`` is true, the ``'compound'`` entry of
    ``vader_analyzer.polarity_scores`` is returned; otherwise the function
    falls back to TextBlob's ``sentiment.polarity``.
    """
    content = str(text)
    if not vader_available:
        # TextBlob fallback
        return TextBlob(content).sentiment.polarity
    return vader_analyzer.polarity_scores(content)['compound']

print("Analyzing sentiment for aligned articles...")

# Score only the articles that were matched to a trading day.
aligned_news = news_df[news_df['aligned_date'].notna()].copy()

# Primary sentiment method
aligned_news['sentiment'] = aligned_news['headline'].apply(analyze_sentiment_vader)

print(f" Sentiment analysis completed on {len(aligned_news):,} aligned articles")

# Sentiment distribution by stock
print(f"\n Sentiment Distribution by Stock:")
for stock in stock_files:
    stock_sentiments = aligned_news.loc[aligned_news['stock'] == stock, 'sentiment']
    if len(stock_sentiments) > 0:
        print(f"   {stock}: {len(stock_sentiments):>5,} articles | Mean: {stock_sentiments.mean():.3f} | Range: {stock_sentiments.min():.3f} to {stock_sentiments.max():.3f}")

# Overall sentiment stats. Each bucket is counted once, and the percentage is
# guarded so an empty `aligned_news` does not raise ZeroDivisionError.
sentiment_scores = aligned_news['sentiment']
total_aligned = len(aligned_news)
positive_count = int((sentiment_scores > 0.05).sum())
negative_count = int((sentiment_scores < -0.05).sum())
neutral_count = int(((sentiment_scores >= -0.05) & (sentiment_scores <= 0.05)).sum())

print(f"\n Overall Sentiment Statistics:")
print(f"   Mean Sentiment: {sentiment_scores.mean():.3f}")
for bucket_label, bucket_count in (
    ("Positive Articles (>0.05)", positive_count),
    ("Negative Articles (<-0.05)", negative_count),
    ("Neutral Articles", neutral_count),
):
    bucket_pct = bucket_count / total_aligned * 100 if total_aligned else 0.0
    print(f"   {bucket_label}: {bucket_count:,} ({bucket_pct:.1f}%)")

# Show sentiment examples
print(f"\n Sentiment Examples:")
sample_sentiments = aligned_news[['headline', 'stock', 'sentiment']].head(8)
for _, row in sample_sentiments.iterrows():
    sentiment_label = "POSITIVE" if row['sentiment'] > 0.05 else "NEGATIVE" if row['sentiment'] < -0.05 else "NEUTRAL"
    # str() keeps the preview from failing on a missing (NaN) headline.
    headline_preview = str(row['headline'])[:55]
    print(f"   {sentiment_label:>8} ({row['sentiment']:6.3f}) | {row['stock']} | {headline_preview}...")

## Daily aggregation

In [None]:
print("=== DAILY AGGREGATION: SENTIMENT & RETURNS ===")

# Add a 'daily_return' column (percentage change of 'Close' from the previous
# row) to every per-ticker frame in place.
print(" Calculating daily returns for all stocks...")
for price_frame in stocks_data.values():
    price_frame['daily_return'] = price_frame['Close'].pct_change() * 100

# One row per (stock, aligned_date): mean sentiment plus the non-null counts
# of the 'sentiment' and 'headline' columns, rounded to 4 decimals.
print(" Aggregating daily sentiment scores...")
daily_sentiment = (
    aligned_news
    .groupby(['stock', 'aligned_date'])
    .agg(
        sentiment_mean=('sentiment', 'mean'),
        sentiment_count=('sentiment', 'count'),
        article_count=('headline', 'count'),
    )
    .round(4)
    .reset_index()
)

first_day = daily_sentiment['aligned_date'].min()
last_day = daily_sentiment['aligned_date'].max()
print(f" Created {len(daily_sentiment):,} daily sentiment records")
print(f"Date range: {first_day} to {last_day}")

# Preview the first three aggregated days for the first two tickers.
print(f"\n Daily Aggregation Examples:")
for stock in list(stock_files)[:2]:
    preview = daily_sentiment.loc[daily_sentiment['stock'] == stock].head(3)
    print(f"\n   {stock}:")
    for day, n_articles, mean_score in zip(
        preview['aligned_date'], preview['article_count'], preview['sentiment_mean']
    ):
        print(f"      {day}: {n_articles} articles, sentiment: {mean_score:.3f}")

## Correlation analysis

In [None]:
print("=== CORRELATION ANALYSIS ===")

# For each ticker, join daily mean sentiment with that day's percentage
# return and compute the Pearson correlation between the two series.
correlation_results = []

for stock_name in stock_files:
    # The emoji below were mis-encoded in the original strings and are restored.
    print(f"\n🔍 Analyzing {stock_name}...")

    # Get sentiment for this stock
    stock_sentiment = daily_sentiment[daily_sentiment['stock'] == stock_name].copy()
    if len(stock_sentiment) == 0:
        print(f"     No sentiment data for {stock_name}")
        continue

    # Get stock returns, with a plain-date column to join on
    stock_returns = stocks_data[stock_name][['daily_return']].copy()
    stock_returns = stock_returns.reset_index()
    stock_returns['date_only'] = stock_returns['Date'].dt.date

    # Merge on date (inner join: only days present on both sides)
    merged_data = pd.merge(stock_sentiment, stock_returns,
                           left_on='aligned_date', right_on='date_only')

    # pct_change() leaves NaN on the first trading day, and pearsonr does not
    # handle NaN input, so incomplete rows are dropped before correlating.
    merged_data = merged_data.dropna(subset=['sentiment_mean', 'daily_return'])

    if len(merged_data) <= 10:  # Need sufficient data points
        print(f"    Insufficient data: {len(merged_data)} points")
        continue

    correlation, p_value = pearsonr(merged_data['sentiment_mean'],
                                    merged_data['daily_return'])
    mean_sentiment = merged_data['sentiment_mean'].mean()
    mean_return = merged_data['daily_return'].mean()

    correlation_results.append({
        'stock': stock_name,
        'correlation': correlation,
        'p_value': p_value,
        'data_points': len(merged_data),
        'mean_sentiment': mean_sentiment,
        'mean_return': mean_return
    })

    print(f"    Correlation: {correlation:.4f} (p-value: {p_value:.4f})")
    print(f"    Data points: {len(merged_data)} days")
    print(f"    Mean return: {mean_return:.3f}%")
    print(f"    Mean sentiment: {mean_sentiment:.3f}")

# Rank the tickers by correlation, marking conventional significance levels
# (*** p<0.001, ** p<0.01, * p<0.05).
if correlation_results:
    corr_df = pd.DataFrame(correlation_results)
    print(f"\n🏆 CORRELATION RANKING:")
    corr_df = corr_df.sort_values('correlation', ascending=False)

    for _, row in corr_df.iterrows():
        significance = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
        print(f"   {row['stock']}: {row['correlation']:.4f} {significance} (p={row['p_value']:.4f})")
else:
    print("❌ No correlation results calculated")