In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import nltk
from scipy.stats import pearsonr
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Try to import textblob, if not available we'll use VADER only
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    print("TextBlob not available, using VADER only")
    TEXTBLOB_AVAILABLE = False

# Download required NLTK data
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

print("=== TASK 3: NEWS SENTIMENT & STOCK MOVEMENT CORRELATION ===\n")

=== TASK 3: NEWS SENTIMENT & STOCK MOVEMENT CORRELATION ===



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/liluebuy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
def parse_mixed_dates_fixed(date_series):
    # Convert all to timezone-naive first
    dates_parsed = pd.to_datetime(date_series, errors='coerce')
    # Remove timezone info from all dates
    dates_parsed = dates_parsed.dt.tz_localize(None)
    return dates_parsed

print("Loading news data with proper date parsing...")
df_news = pd.read_csv('../data/raw_analyst_ratings.csv')
df_news['date'] = parse_mixed_dates_fixed(df_news['date'])

# Get the most mentioned stock from news data
top_stock = df_news['stock'].value_counts().head(1).index[0]
print(f"Most mentioned stock: {top_stock}")

# Download stock data for the same period as news
end_date = df_news['date'].max()
start_date = end_date - timedelta(days=365)  # 1 year of data

# Convert to string for yfinance
start_str = start_date.strftime('%Y-%m-%d')
end_str = end_date.strftime('%Y-%m-%d')

print(f"Downloading {top_stock} data from {start_str} to {end_str}...")
stock_data = yf.download(top_stock, start=start_str, end=end_str)
stock_data.columns = [col[0] for col in stock_data.columns]  # Flatten columns

print(f"Downloaded {len(stock_data)} trading days")

Loading news data with proper date parsing...
Most mentioned stock: MRK
Downloading MRK data from 2019-06-12 to 2020-06-11...


[*********************100%***********************]  1 of 1 completed

Downloaded 252 trading days





In [16]:
print("\n=== DATE ALIGNMENT ===\n")

# First, check for and remove any NaN dates
print(f"News data before cleaning: {len(df_news)}")
df_news_clean = df_news.dropna(subset=['date']).copy()
print(f"News data after removing NaN dates: {len(df_news_clean)}")

# Extract date-only for alignment
df_news_clean['date_only'] = df_news_clean['date'].dt.date
stock_data['date_only'] = stock_data.index.date

print(f"News date range: {df_news_clean['date_only'].min()} to {df_news_clean['date_only'].max()}")
print(f"Stock date range: {stock_data['date_only'].min()} to {stock_data['date_only'].max()}")

# Filter news to match stock trading days
news_in_range = df_news_clean[df_news_clean['date_only'].isin(stock_data['date_only'])]
print(f"News articles on trading days: {len(news_in_range)}")

# Use the cleaned data for the rest of the analysis
df_news = df_news_clean


=== DATE ALIGNMENT ===

News data before cleaning: 1407328
News data after removing NaN dates: 55987
News date range: 2011-04-27 to 2020-06-11
Stock date range: 2019-06-12 to 2020-06-10
News articles on trading days: 31846


## Sentiment analysis

In [17]:
print("\n=== SENTIMENT ANALYSIS ===\n")

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(text):
    """Analyze sentiment using VADER"""
    if pd.isna(text):
        return 0.0
    scores = sia.polarity_scores(str(text))
    return scores['compound']  # Returns between -1 (negative) and +1 (positive)

# Apply sentiment analysis to headlines
print("Analyzing sentiment for news headlines...")
df_news['sentiment_vader'] = df_news['headline'].apply(analyze_sentiment_vader)

# If TextBlob is available, use it too
if TEXTBLOB_AVAILABLE:
    def analyze_sentiment_textblob(text):
        if pd.isna(text):
            return 0.0
        blob = TextBlob(str(text))
        return blob.sentiment.polarity  # Returns between -1 and +1
    
    df_news['sentiment_textblob'] = df_news['headline'].apply(analyze_sentiment_textblob)
    print("TextBlob sentiment analysis completed")
else:
    df_news['sentiment_textblob'] = df_news['sentiment_vader']
    print("Using VADER sentiment for both measures")

print("Sentiment analysis completed!")
print(f"Sentiment statistics (VADER):")
print(f"  Mean: {df_news['sentiment_vader'].mean():.4f}")
print(f"  Std:  {df_news['sentiment_vader'].std():.4f}")
print(f"  Min:  {df_news['sentiment_vader'].min():.4f}")
print(f"  Max:  {df_news['sentiment_vader'].max():.4f}")


=== SENTIMENT ANALYSIS ===

Analyzing sentiment for news headlines...
TextBlob sentiment analysis completed
Sentiment analysis completed!
Sentiment statistics (VADER):
  Mean: 0.0666
  Std:  0.3135
  Min:  -0.9382
  Max:  0.9666


In [18]:
print("\n=== STOCK RETURNS CALCULATION ===\n")

# Calculate daily returns
stock_data['daily_return'] = stock_data['Close'].pct_change()
stock_data['daily_return_pct'] = stock_data['daily_return'] * 100  # Percentage

print("Stock returns calculated:")
print(stock_data[['Close', 'daily_return_pct']].describe())

# Remove the first row (NaN return)
stock_data_clean = stock_data.dropna(subset=['daily_return']).copy()
print(f"Trading days with returns: {len(stock_data_clean)}")


=== STOCK RETURNS CALCULATION ===

Stock returns calculated:
            Close  daily_return_pct
count  252.000000        251.000000
mean    66.047177          0.021362
std      3.398457          1.973164
min     53.279583         -8.899009
25%     64.597492         -0.827409
50%     66.082150          0.000000
75%     67.683992          0.942763
max     73.248100          7.783653
Trading days with returns: 251


In [19]:
#  Aggregate Daily Sentiment
print("\n=== AGGREGATE DAILY SENTIMENT ===\n")

# Group news by date and calculate average daily sentiment
daily_sentiment = df_news.groupby('date_only').agg({
    'sentiment_vader': ['mean', 'count'],
    'sentiment_textblob': 'mean'
}).round(4)

# Flatten column names
daily_sentiment.columns = ['sentiment_vader_mean', 'article_count', 'sentiment_textblob_mean']
daily_sentiment = daily_sentiment.reset_index()

print("Daily sentiment summary:")
print(f"Days with news: {len(daily_sentiment)}")
print(f"Average articles per day: {daily_sentiment['article_count'].mean():.2f}")
print(daily_sentiment.describe())


=== AGGREGATE DAILY SENTIMENT ===

Daily sentiment summary:
Days with news: 2528
Average articles per day: 22.15
       sentiment_vader_mean  article_count  sentiment_textblob_mean
count           2528.000000    2528.000000              2528.000000
mean               0.076574      22.146756                 0.047167
std                0.156459      68.144109                 0.110425
min               -0.802000       1.000000                -1.000000
25%                0.000000       3.000000                 0.000000
50%                0.063450       9.000000                 0.025000
75%                0.148425      17.000000                 0.075050
max                0.790600     973.000000                 1.000000


In [None]:
#  Merge Sentiment with Stock Returns
print("\n=== MERGING DATASETS ===\n")

# Merge sentiment with stock returns
merged_data = pd.merge(
    daily_sentiment,
    stock_data_clean[['date_only', 'daily_return', 'daily_return_pct', 'Close']],
    on='date_only',
    how='inner'
)

print(f"Merged dataset size: {len(merged_data)}")
print("Merged data sample:")
print(merged_data.head(10))

In [22]:
# Step 7: Merge Sentiment with Stock Returns (Redo)
print("\n=== MERGING DATASETS ===\n")

# Group news by date and calculate average daily sentiment
daily_sentiment = df_news.groupby('date_only').agg({
    'sentiment_vader': ['mean', 'count'],
    'sentiment_textblob': 'mean'
}).round(4)

# Flatten column names
daily_sentiment.columns = ['sentiment_vader_mean', 'article_count', 'sentiment_textblob_mean']
daily_sentiment = daily_sentiment.reset_index()

# Merge sentiment with stock returns
merged_data = pd.merge(
    daily_sentiment,
    stock_data_clean[['date_only', 'daily_return', 'daily_return_pct', 'Close']],
    on='date_only',
    how='inner'
)

print(f"Merged dataset size: {len(merged_data)}")
print("Merged data sample:")
print(merged_data.head())


=== MERGING DATASETS ===

Merged dataset size: 251
Merged data sample:
    date_only  sentiment_vader_mean  article_count  sentiment_textblob_mean  \
0  2019-06-13                0.1178             19                   0.1382   
1  2019-06-14                0.1229             27                   0.1623   
2  2019-06-17                0.1595             13                   0.0881   
3  2019-06-18                0.1441             18                   0.1132   
4  2019-06-19                0.1417             19                   0.0887   

   daily_return  daily_return_pct      Close  
0     -0.009676         -0.967646  64.660477  
1      0.005221          0.522125  64.998085  
2      0.006282          0.628176  65.406387  
3      0.014286          1.428575  66.340767  
4      0.010297          1.029715  67.023888  


In [24]:
# Step 8: Correlation Analysis
print("\n=== CORRELATION ANALYSIS ===\n")

# Calculate Pearson correlation
corr_vader = pearsonr(merged_data['sentiment_vader_mean'], merged_data['daily_return'])[0]
corr_textblob = pearsonr(merged_data['sentiment_textblob_mean'], merged_data['daily_return'])[0]

print("PEARSON CORRELATION RESULTS:")
print(f"VADER Sentiment vs Daily Returns: {corr_vader:.4f}")
print(f"TextBlob Sentiment vs Daily Returns: {corr_textblob:.4f}")

# Interpret correlation strength
def interpret_correlation(corr):
    abs_corr = abs(corr)
    if abs_corr >= 0.7:
        return "Strong"
    elif abs_corr >= 0.5:
        return "Moderate" 
    elif abs_corr >= 0.3:
        return "Weak"
    else:
        return "Very Weak"

print(f"\nCorrelation Strength (VADER): {interpret_correlation(corr_vader)}")
print(f"Correlation Strength (TextBlob): {interpret_correlation(corr_textblob)}")


=== CORRELATION ANALYSIS ===

PEARSON CORRELATION RESULTS:
VADER Sentiment vs Daily Returns: 0.1763
TextBlob Sentiment vs Daily Returns: 0.0852

Correlation Strength (VADER): Very Weak
Correlation Strength (TextBlob): Very Weak
