In [9]:
# --- Imports for Task 3 ---
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging
from scipy.stats import pearsonr
from datetime import datetime

In [22]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename='task3_analysis.log', filemode='w')
logger = logging.getLogger(__name__)
logger.handlers = []  # Clear existing handlers
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)

# Stock symbols
stock_symbols = ['TSLA', 'NVDA', 'META', 'AMZN', 'GOOG', 'AAPL', 'MSFT']

# Directories
DATA_DIR = os.path.abspath('../data')
PLOTS_DIR = os.path.abspath('../Plots-task3')
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# Set plotting style
sns.set_theme(style="whitegrid")
logger.info("Starting Task 3: Correlation Analysis for %s", ', '.join(stock_symbols))

Starting Task 3: Correlation Analysis for TSLA, NVDA, META, AMZN, GOOG, AAPL, MSFT


In [28]:
# Load processed stock data
def load_stock_data(symbol, data_dir):
    for pkl_name in [f"{symbol}_indicators.pkl", f"{symbol}_processed.pkl"]:
        pkl_path = os.path.join(data_dir, pkl_name)
        if os.path.exists(pkl_path):
            logger.info("Attempting to load stock data for %s from %s", symbol, pkl_path)
            try:
                df = pd.read_pickle(pkl_path)
                if 'Close' not in df.columns:
                    logger.error("No 'Close' column in %s", pkl_path)
                    continue
                df.index = pd.to_datetime(df.index).tz_localize(None)
                df['Daily_Return'] = df['Close'].pct_change()
                logger.info("Loaded stock data from %s. Date range: %s to %s", pkl_path, df.index.min(), df.index.max())
                return df[['Close', 'Daily_Return']]
            except Exception as e:
                logger.error("Error loading %s: %s", pkl_path, e)

    csv_path = os.path.join(data_dir, f"{symbol}_historical_data.csv")
    logger.info("Falling back to CSV for %s from %s", symbol, csv_path)
    try:
        if not os.path.exists(csv_path):
            logger.error("No CSV file found for %s at %s", symbol, csv_path)
            return None
        df = pd.read_csv(csv_path)
        if 'Date' not in df.columns or 'Close' not in df.columns:
            logger.error("Missing 'Date' or 'Close' column in %s", csv_path)
            return None
        df['Date'] = pd.to_datetime(df['Date']).dt.tz_localize(None)
        df.set_index('Date', inplace=True)
        df['Daily_Return'] = df['Close'].pct_change()
        logger.info("Loaded stock data from %s. Date range: %s to %s", csv_path, df.index.min(), df.index.max())
        return df[['Close', 'Daily_Return']]
    except Exception as e:
        logger.error("Error loading CSV %s: %s", csv_path, e)
        return None

In [29]:
# Load news data
def load_news_data(data_dir, target_stocks):
    pkl_path = os.path.join(data_dir, 'processed_news_data.pkl')
    logger.info("Attempting to load news data from %s", pkl_path)
    if os.path.exists(pkl_path):
        try:
            df = pd.read_pickle(pkl_path)
            required_cols = ['date', 'headline', 'stock']
            if not all(col in df.columns for col in required_cols):
                logger.error("Missing required columns in %s: %s", pkl_path, required_cols)
                return None
            df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_convert('US/Eastern').dt.tz_localize(None)
            df = df[df['stock'].isin(target_stocks)]
            logger.info("Loaded news data from %s. Sample:\n%s", pkl_path, df.head().to_string())
            logger.info("News headlines per stock:\n%s", df['stock'].value_counts().to_string())
            return df[required_cols]
        except Exception as e:
            logger.error("Error loading %s: %s", pkl_path, e)

    csv_path = os.path.join(data_dir, 'raw_analyst_ratings.csv')
    logger.info("Falling back to CSV for news from %s", csv_path)
    try:
        if not os.path.exists(csv_path):
            logger.error("No CSV file found for news at %s", csv_path)
            return None
        df = pd.read_csv(csv_path)
        # Log raw counts for debugging
        logger.info("Raw news headline counts:\n%s", df[df['stock'].isin(target_stocks + ['FB'])]['stock'].value_counts().to_string())
        required_cols = ['date', 'stock', 'headline']
        if not all(col in df.columns for col in required_cols):
            logger.error("Missing required columns in %s: %s", csv_path, required_cols)
            return None
        df['date'] = pd.to_datetime(df['date'], utc=True, errors='coerce').dt.tz_convert('US/Eastern').dt.tz_localize(None)
        df = df.dropna(subset=['date'])
        df = df[df['stock'].isin(target_stocks)]
        logger.info("Filtered news data from %s. Sample:\n%s", csv_path, df.head().to_string())
        logger.info("Filtered news headlines per stock:\n%s", df['stock'].value_counts().to_string())
        return df[required_cols]
    except Exception as e:
        logger.error("Error loading news CSV %s: %s", csv_path, e)
        return None

In [30]:
# Perform sentiment analysis
def get_sentiment(headline):
    try:
        if not isinstance(headline, str) or not headline.strip():
            return 0.0
        blob = TextBlob(headline)
        return blob.sentiment.polarity
    except Exception as e:
        logger.warning("Error analyzing sentiment for headline '%s': %s", headline, e)
        return 0.0

In [31]:
# Load data
all_stocks_data = {}
for symbol in stock_symbols:
    df = load_stock_data(symbol, DATA_DIR)
    if df is not None:
        all_stocks_data[symbol] = df

news_df = load_news_data(DATA_DIR, stock_symbols)
if news_df is None or not all_stocks_data:
    logger.error("Failed to load news or stock data. Exiting.")
    exit(1)

logger.info("Loaded stock data for %d symbols: %s", len(all_stocks_data), list(all_stocks_data.keys()))

Attempting to load stock data for TSLA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_indicators.pkl
Error loading d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_indicators.pkl: invalid load key, '\x1f'.
Attempting to load stock data for TSLA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_processed.pkl
Error loading d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_processed.pkl: invalid load key, '\x1f'.
Falling back to CSV for TSLA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_historical_data.csv
Loaded stock data from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_historical_data.csv. Date range: 2010-06-29 00:00:00 to 2024-07-30 00:00:00
Attempting to load stock data for NVDA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\NVDA_indicators.pkl


In [32]:
# Sentiment analysis
logger.info("Performing sentiment analysis on news headlines")
news_df['sentiment'] = news_df['headline'].apply(get_sentiment)
logger.info("Sentiment analysis completed. Sample sentiments:\n%s", news_df[['headline', 'sentiment']].head().to_string())

# Aggregate daily sentiment per stock
daily_sentiment = news_df.groupby(['stock', news_df['date'].dt.date])['sentiment'].mean().reset_index()
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
logger.info("Aggregated daily sentiment scores:\n%s", daily_sentiment.head().to_string())

Performing sentiment analysis on news headlines
Sentiment analysis completed. Sample sentiments:
                                                                                     headline  sentiment
6680                    Tech Stocks And FAANGS Strong Again To Start Day As Market Awaits Fed   0.433333
6681                                            10 Biggest Price Target Changes For Wednesday   0.000000
6682  Benzinga Pro's Top 5 Stocks To Watch For Wed., Jun. 10, 2020: AAPL, BAC, NIO, SONO, GLW   0.500000
6683                        Deutsche Bank Maintains Buy on Apple, Raises Price Target to $350   0.000000
6684  Apple To Let Users Trade In Their Mac Computers For Credit At US, Canada Stores: Report   0.000000
Aggregated daily sentiment scores:
  stock       date  sentiment
0  AAPL 2020-06-09   0.088333
1  AAPL 2020-06-10   0.166919
2  AMZN 2020-06-09  -0.020833
3  AMZN 2020-06-10   0.204798
4  GOOG 2020-06-04   0.000000


In [33]:
# Correlation analysis
correlation_results = {}
for symbol in stock_symbols:
    logger.info("Analyzing correlation for %s", symbol)
    try:
        stock_df = all_stocks_data.get(symbol)
        if stock_df is None:
            logger.warning("No stock data for %s", symbol)
            continue

        sentiment_df = daily_sentiment[daily_sentiment['stock'] == symbol][['date', 'sentiment']]
        if sentiment_df.empty:
            logger.warning("No sentiment data for %s", symbol)
            continue

        # Create full date range to forward-fill sentiment
        date_range = pd.date_range(start=max(stock_df.index.min(), sentiment_df['date'].min()),
                                  end=min(stock_df.index.max(), sentiment_df['date'].max()))
        sentiment_df = sentiment_df.set_index('date').reindex(date_range, method='ffill').reset_index()
        sentiment_df.columns = ['date', 'sentiment']
        sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

        stock_df = stock_df.reset_index().rename(columns={'Date': 'date'})
        merged_df = pd.merge(stock_df[['date', 'Daily_Return']], sentiment_df, on='date', how='inner')
        merged_df = merged_df.dropna()

        logger.info("Overlapping data for %s: %d rows\n%s", symbol, len(merged_df), merged_df.head().to_string())
        if len(merged_df) < 1:
            logger.warning("No overlapping data for %s", symbol)
            continue

        corr, p_value = pearsonr(merged_df['Daily_Return'], merged_df['sentiment'])
        correlation_results[symbol] = {'correlation': corr, 'p_value': p_value}
        logger.info("%s correlation: %.4f, p-value: %.4f", symbol, corr, p_value)

        plt.figure(figsize=(12, 6))
        plt.plot(merged_df['date'], merged_df['Daily_Return'], label='Daily Return', color='blue')
        plt.plot(merged_df['date'], merged_df['sentiment'], label='Average Daily Sentiment', color='orange')
        plt.title(f'{symbol} Daily Returns vs. News Sentiment\nCorrelation: {corr:.4f} (p={p_value:.4f})')
        plt.xlabel('Date')
        plt.ylabel('Value')
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(PLOTS_DIR, f'{symbol}_sentiment_vs_returns.png'))
        plt.close()

        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=merged_df['sentiment'], y=merged_df['Daily_Return'])
        plt.title(f'{symbol} Sentiment vs. Daily Returns')
        plt.xlabel('Average Daily Sentiment Score')
        plt.ylabel('Daily Return')
        plt.tight_layout()
        plt.savefig(os.path.join(PLOTS_DIR, f'{symbol}_sentiment_scatter.png'))
        plt.close()

    except Exception as e:
        logger.error("Error analyzing correlation for %s: %s", symbol, e)
        continue

Analyzing correlation for TSLA
Overlapping data for TSLA: 1 rows
        date  Daily_Return  sentiment
0 2020-06-10      0.089702   0.037515
Error analyzing correlation for TSLA: `x` and `y` must have length at least 2.
Analyzing correlation for NVDA
Overlapping data for NVDA: 8 rows
        date  Daily_Return  sentiment
0 2020-06-01     -0.007802       0.00
1 2020-06-02      0.002158       0.25
2 2020-06-03     -0.006317       0.25
3 2020-06-04     -0.000342       0.25
4 2020-06-05      0.017510       0.25
NVDA correlation: 0.1598, p-value: 0.7054
Analyzing correlation for META
No sentiment data for META
Analyzing correlation for AMZN
Overlapping data for AMZN: 2 rows
        date  Daily_Return  sentiment
0 2020-06-09      0.030427  -0.020833
1 2020-06-10      0.017913   0.204798
AMZN correlation: -1.0000, p-value: 1.0000
Analyzing correlation for GOOG
Overlapping data for GOOG: 5 rows
        date  Daily_Return  sentiment
0 2020-06-04     -0.016848   0.000000
1 2020-06-05      0.0185

In [34]:
# Save correlation results
if correlation_results:
    results_df = pd.DataFrame(correlation_results).T
    results_path = os.path.join(DATA_DIR, 'correlation_results.csv')
    results_df.to_csv(results_path)
    logger.info("Saved correlation results to %s:\n%s", results_path, results_df.to_string())

    plt.figure(figsize=(10, 6))
    sns.barplot(x=results_df.index, y=results_df['correlation'])
    plt.title('Correlation Between News Sentiment and Stock Returns')
    plt.xlabel('Stock Symbol')
    plt.ylabel('Pearson Correlation Coefficient')
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, 'correlation_summary.png'))
    plt.close()
else:
    logger.warning("No correlation results to save.")

logger.info("Task 3 completed. Plots saved in %s", PLOTS_DIR)

Saved correlation results to d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\correlation_results.csv:
      correlation   p_value
NVDA     0.159802  0.705433
AMZN    -1.000000  1.000000
GOOG    -0.089449  0.886262
AAPL    -1.000000  1.000000
Task 3 completed. Plots saved in d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\Plots-task3
