In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment import SentimentIntensityAnalyzer
import yfinance as yf

In [2]:
# Notebook-wide plot defaults: seaborn "whitegrid" styling, then a 12x6 inch
# default figure size (applied in the same order as before).
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [16]:
# --- Load and Combine Stock Data ---
def load_stock_data(folder_path):
    """Read every CSV in ``folder_path`` and stack them into one DataFrame.

    Each file is parsed with its ``Date`` column as datetimes and tagged with a
    ``Ticker`` column holding the upper-cased file name without its extension.

    NOTE(review): the ticker is the *whole* file stem, so a file named
    ``AAPL_historical_data.csv`` yields ``AAPL_HISTORICAL_DATA`` -- confirm the
    file naming matches the symbols this data is later joined against.

    Parameters
    ----------
    folder_path : str or path-like
        Directory containing one CSV per stock; each CSV must have a ``Date``
        column.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index, in file-name order.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    stock_frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined frame reproducible across machines and runs.
    for file in sorted(os.listdir(folder_path)):
        # splitext removes only the real extension (str.replace would also
        # strip a '.csv' occurring in the middle of the name); the comparison
        # is case-insensitive so 'AAPL.CSV' is not silently skipped.
        stem, ext = os.path.splitext(file)
        if ext.lower() != '.csv':
            continue
        df_stock = pd.read_csv(os.path.join(folder_path, file), parse_dates=['Date'])
        df_stock['Ticker'] = stem.upper()
        stock_frames.append(df_stock)

    if not stock_frames:
        # pd.concat([]) raises an opaque "No objects to concatenate"; keep the
        # same exception type but say what is actually wrong.
        raise ValueError(f"No .csv files found in {folder_path!r}")
    return pd.concat(stock_frames, ignore_index=True)

In [26]:
# --- Sentiment Analysis Function ---
def sentiment_analysis(df):
    """Score each headline with VADER and attach the results to ``df``.

    Three columns are added to ``df`` **in place** (the same object is also
    returned, so callers may use either reference):

    * ``sentiment_scores``   - the full dict returned by ``polarity_scores``
    * ``sentiment_compound`` - the ``'compound'`` entry of that dict
    * ``sentiment_label``    - ``'positive'`` / ``'negative'`` / ``'neutral'``

    Labels use VADER's documented cut-offs: compound >= 0.05 is positive,
    compound <= -0.05 is negative, anything in between is neutral.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``headline`` column. Missing headlines are scored as an
        empty string and other non-string values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    print("\n=== Sentiment Analysis ===")
    sia = SentimentIntensityAnalyzer()

    def _label(compound):
        # Inclusive bounds, per the VADER scoring guidance.
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    # polarity_scores fails on NaN/None (it expects text), so normalise first.
    headlines = df['headline'].fillna('').astype(str)

    df['sentiment_scores'] = headlines.apply(sia.polarity_scores)
    df['sentiment_compound'] = df['sentiment_scores'].apply(lambda scores: scores['compound'])
    df['sentiment_label'] = df['sentiment_compound'].apply(_label)
    print(df[['headline', 'sentiment_compound', 'sentiment_label']].head())
    return df

In [29]:
# --- Correlation Analysis Function ---

def correlation_analysis(merged_df):
    print("\n=== Correlation Analysis ===")
    
    # Ensure date columns are datetime.date
    merged_df['date'] = pd.to_datetime(merged_df['date']).dt.date
    
    # Aggregate daily sentiment by mean compound score for each stock
    daily_sentiment = merged_df.groupby(['date', 'stock'])['sentiment_compound'].mean().reset_index()
    
    # Aggregate daily stock closing price (mean in case multiple entries per day)
    daily_price = merged_df.groupby(['date', 'stock'])['Close'].mean().reset_index()
    
    # Merge sentiment and price on date and stock
    combined = pd.merge(daily_sentiment, daily_price, on=['date', 'stock'], how='inner')
    
    # Calculate correlation per stock
    stocks = combined['stock'].unique()
    for ticker in stocks:
        df_stock = combined[combined['stock'] == ticker]
        corr = df_stock['sentiment_compound'].corr(df_stock['Close'])
        print(f"Correlation between sentiment and close price for {ticker}: {corr:.4f}")
        
        # Plotting
        fig, ax1 = plt.subplots(figsize=(10,5))
        ax2 = ax1.twinx()
        
        ax1.plot(df_stock['date'], df_stock['sentiment_compound'], 'g-', label='Avg Daily Sentiment')
        ax2.plot(df_stock['date'], df_stock['Close'], 'b-', label='Avg Close Price')
        
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Sentiment', color='g')
        ax2.set_ylabel('Close Price', color='b')
        plt.title(f'Daily Sentiment vs. Stock Price for {ticker}')
        fig.tight_layout()
        plt.show()


In [None]:
# Download the VADER lexicon used by nltk's SentimentIntensityAnalyzer.
# This call contacts the NLTK data server each time the cell runs.
import nltk
nltk.download('vader_lexicon')

# --- Main Function ---
def main(news_path='../data/raw_analyst_ratings.csv', stock_dir='../data/yfinance_data/'):
    """Run the news-sentiment vs. stock-price pipeline end to end.

    Steps: load the news CSV, parse its dates, derive headline features, load
    the per-ticker price CSVs, left-join prices onto the news rows by calendar
    day and ticker, score headline sentiment, and run the correlation analysis.

    Parameters
    ----------
    news_path : str
        CSV of news headlines; must provide ``date``, ``headline`` and
        ``stock`` columns (names are matched after stripping and lower-casing).
    stock_dir : str
        Directory of per-ticker price CSVs, passed to ``load_stock_data``.

    Returns
    -------
    None
        Results are printed and plotted. If a required column is missing, a
        message is printed and the function returns early.
    """
    # Load news data
    df = pd.read_csv(news_path)
    df.columns = df.columns.str.strip().str.lower()

    if not {'date', 'headline', 'stock'}.issubset(df.columns):
        print("❌ Required columns 'date', 'headline', or 'stock' not found.")
        return

    # NOTE(review): rows whose date cannot be parsed become NaT and are
    # dropped; confirm the raw file uses one consistent timestamp format.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the original.
    df = df.dropna(subset=['date']).copy()

    # Feature Engineering
    df['headline_length'] = df['headline'].apply(lambda x: len(str(x).split()))
    df['publication_day'] = df['date'].dt.day_name()
    df['publication_hour'] = df['date'].dt.hour
    df['stock'] = df['stock'].str.upper()
    df['date'] = df['date'].dt.tz_localize(None)
    # News timestamps carry a time of day while the price rows are daily, so
    # the join key is the calendar day; 'date' keeps the full timestamp.
    df['trade_date'] = df['date'].dt.normalize()

    # Load pre-downloaded stock data
    stock_data = load_stock_data(stock_dir)  # directory with multiple stock CSVs
    stock_data = stock_data.rename(columns={'Date': 'trade_date'})
    stock_data['Ticker'] = stock_data['Ticker'].str.upper()
    stock_data['trade_date'] = stock_data['trade_date'].dt.tz_localize(None).dt.normalize()

    # Merge on calendar day and ticker
    merged_df = pd.merge(
        df,
        stock_data,
        left_on=['trade_date', 'stock'],
        right_on=['trade_date', 'Ticker'],
        how='left',
    )

    # Apply sentiment
    merged = sentiment_analysis(merged_df)

    # Run all analysis
    #descriptive_stats(merged)
    #topic_modeling(merged)
    #publisher_analysis(merged)
    # Use the frame returned by sentiment_analysis instead of relying on it
    # having modified merged_df in place.
    correlation_analysis(merged)

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()