In [1]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pearsonr


In [5]:
# Load datasets
news_df = pd.read_csv('../data/raw_analyst_ratings.csv', parse_dates=['date'])
stock_df = pd.read_csv('../data/stock_metrics.csv', parse_dates=['Date'])

# Ensure 'date' column is datetime
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
stock_df['Date'] = pd.to_datetime(stock_df['Date'], errors='coerce')

# Normalize dates (remove time component)
news_df['date'] = news_df['date'].dt.date
stock_df['Date'] = stock_df['Date'].dt.date

# Sentiment analysis on news headlines
def get_sentiment(headline):
    return TextBlob(str(headline)).sentiment.polarity  # Returns score from -1 (negative) to 1 (positive)

news_df['Sentiment'] = news_df['headline'].apply(get_sentiment)

In [6]:
# Aggregate daily sentiment scores per stock
daily_sentiment = news_df.groupby(['stock', 'date'])['Sentiment'].mean().reset_index()
daily_sentiment.rename(columns={'stock': 'Stock', 'date': 'Date'}, inplace=True)

# Merge news sentiment with stock data
merged_df = pd.merge(
    daily_sentiment,
    stock_df[['Date', 'Stock', 'Daily_Return']],
    on=['Date', 'Stock'],
    how='inner'
)

# Calculate Pearson correlation per stock
tickers = merged_df['Stock'].unique()
correlations = {}
for ticker in tickers:
    df_ticker = merged_df[merged_df['Stock'] == ticker]
    if len(df_ticker) > 1:  # Need at least 2 data points for correlation
        corr, _ = pearsonr(df_ticker['Sentiment'], df_ticker['Daily_Return'])
        correlations[ticker] = corr
    else:
        correlations[ticker] = None
        print(f"Warning: Insufficient data for {ticker}")

# Save merged data and correlations
merged_df.to_csv('../data/news_stock_merged.csv', index=False)
pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation']).to_csv('../data/correlations.csv')

print("Merged data saved to '../data/news_stock_merged.csv'")
print("Correlations saved to '../data/correlations.csv'")
print("Correlation results:", correlations)

Merged data saved to '../data/news_stock_merged.csv'
Correlations saved to '../data/correlations.csv'
Correlation results: {'AAPL': np.float64(-1.0), 'AMZN': np.float64(-1.0), 'GOOG': np.float64(-0.08944881499680815), 'NVDA': np.float64(0.17571428527519445), 'TSLA': None}
