In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Sentiment-Stock Analysis Pipeline

## Overview
1. ✅ Load tweets from Kaggle (80,793 tweets)
2. ✅ Run sentiment analysis using Twitter-roBERTa NLP model
3. Aggregate sentiment by date & ticker
4. Fetch stock prices from Yahoo Finance
5. Merge sentiment + prices
6. Analyze correlation between sentiment and stock movements
7. Train predictive models

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Read the raw tweet dataset from the project data folder
df_tweets = pd.read_csv('../data/stock_tweets.csv')

# Quick summary: row count, span of the 'Date' column and the tickers present.
# 'Date' is not parsed here (read_csv is called without parse_dates), so
# min/max below operate on the raw column values.
tweet_dates = df_tweets['Date']
ticker_names = df_tweets['Stock Name'].unique().tolist()

print(f"✅ Loaded {len(df_tweets):,} tweets")
print(f"Date range: {tweet_dates.min()} to {tweet_dates.max()}")
print(f"Tickers: {ticker_names}")

                        Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  
0  Mainstream media has done an amazing job at br...       TSLA  Tesla, Inc.  
1  Tesla delivery estimates are at around 364k fr...       TSLA  Tesla, Inc.  
2  3/ Even if I include 63.0M unvested RSUs as of...       TSLA  Tesla, Inc.  
3  @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...       TSLA  Tesla, Inc.  
4  @RealDanODowd @Tesla Stop trying to kill kids,...       TSLA  Tesla, Inc.  
Index(['Date', 'Tweet', 'Stock Name', 'Company Name'], dtype='object')
(80793, 4)
Date            object
Tweet           object
Stock Name      object
Company Name    object
dtype: object


In [None]:
# Load sentiment model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use a GPU when one is available; the model and every batch must sit on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Class order assumed for the logits: index 0 = negative, 1 = neutral, 2 = positive.
# NOTE(review): confirm against model.config.id2label if the checkpoint is ever swapped.
LABEL_NAMES = ['Negative', 'Neutral', 'Positive']

# Tweets scored per forward pass. Batching replaces the one-tweet-at-a-time loop;
# lower this value if memory is tight.
BATCH_SIZE = 32

# A missing (NaN) or non-string tweet would make the tokenizer raise,
# so missing values are treated as empty text and everything is coerced to str.
tweet_texts = df_tweets['Tweet'].fillna('').astype(str).tolist()

# Run sentiment analysis on all tweets
sentiment_scores = []
sentiment_labels = []

print("Running sentiment analysis on all tweets...")
with tqdm(total=len(tweet_texts), desc="Processing") as progress:
    for start in range(0, len(tweet_texts), BATCH_SIZE):
        batch = tweet_texts[start:start + BATCH_SIZE]

        # padding=True pads to the longest tweet in the batch; the attention mask
        # returned by the tokenizer keeps padded positions out of the prediction.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Shape (batch, 3): one probability per sentiment class
            probabilities = torch.softmax(outputs.logits, dim=-1).cpu().numpy()

        # Score = P(positive) - P(negative), so it lies in [-1, 1]
        sentiment_scores.extend((probabilities[:, 2] - probabilities[:, 0]).tolist())
        sentiment_labels.extend(LABEL_NAMES[class_idx] for class_idx in probabilities.argmax(axis=1))

        progress.update(len(batch))

df_tweets['sentiment_score'] = sentiment_scores
df_tweets['sentiment_label'] = sentiment_labels

print("\n✅ Sentiment analysis complete!")
print(f"\nSentiment distribution:")
print(df_tweets['sentiment_label'].value_counts())
print(f"\nSentiment score stats:")
print(df_tweets['sentiment_score'].describe())

Running sentiment analysis on all 80,793 tweets...


Processing tweets: 100%|██████████| 80793/80793 [23:58<00:00, 56.17it/s]


✅ Sentiment analysis complete!

Sentiment distribution:
sentiment_label
Neutral     37921
Positive    30418
Negative    12454
Name: count, dtype: int64

Sentiment score stats:
count    80793.000000
mean         0.237951
std          0.532447
min         -0.953388
25%         -0.078064
50%          0.232134
75%          0.725453
max          0.990082
Name: sentiment_score, dtype: float64





In [None]:
# Convert the raw timestamps to datetimes, then derive the calendar day of each tweet.
# The day is taken from the timestamp as stored; no conversion to another time zone is applied.
df_tweets['Date'] = pd.to_datetime(df_tweets['Date'])
df_tweets['date'] = df_tweets['Date'].dt.date

# One row per (day, ticker): mean, spread and volume of the sentiment score,
# plus the number of tweets labelled 'Positive'.
daily_sentiment = (
    df_tweets
    .groupby(['date', 'Stock Name'])
    .agg(
        sentiment_mean=('sentiment_score', 'mean'),
        sentiment_std=('sentiment_score', 'std'),
        tweet_count=('sentiment_score', 'count'),
        positive_count=('sentiment_label', lambda labels: (labels == 'Positive').sum()),
    )
    .reset_index()
    .rename(columns={'Stock Name': 'ticker'})
)

# The group key holds datetime.date objects; turn it back into a datetime column
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

print("✅ Aggregated sentiment by date and ticker")
print("\nSample of daily sentiment:")
print(daily_sentiment.head(10))
print(f"\nShape: {daily_sentiment.shape}")

In [None]:
# Get stock price data
tickers = daily_sentiment['ticker'].unique().tolist()
# Pad the window by a few days on both sides of the sentiment date range
start_date = daily_sentiment['date'].min() - pd.Timedelta(days=5)
end_date = daily_sentiment['date'].max() + pd.Timedelta(days=5)

print(f"Fetching prices for: {tickers}")
print(f"Date range: {start_date.date()} to {end_date.date()}\n")

stock_data = yf.download(tickers, start=start_date, end=end_date, progress=False)

# Process stock data.
# 'Close' may come back as a DataFrame (one column per ticker) or as a single Series,
# presumably depending on ticker count and yfinance version — TODO confirm for the
# installed version. Dispatching on the actual type handles either layout.
close_prices = stock_data['Close']

price_list = []
for ticker in tickers:
    if isinstance(close_prices, pd.DataFrame):
        if ticker not in close_prices.columns:
            print(f"⚠️ No price data returned for {ticker}; skipping")
            continue
        close_series = close_prices[ticker]
    else:
        close_series = close_prices

    # Tickers downloaded together share one date index, so a ticker can have NaN closes
    # on dates where only other tickers have data. Drop them BEFORE computing returns,
    # otherwise pct_change would produce padded/NaN returns around the gaps.
    df_price = close_series.dropna().reset_index()
    df_price.columns = ['date', 'close_price']
    df_price['ticker'] = ticker
    df_price['date'] = pd.to_datetime(df_price['date'])
    # Close-to-close return in percent between consecutive available rows;
    # fill_method=None disables the implicit forward-fill.
    df_price['price_change_pct'] = df_price['close_price'].pct_change(fill_method=None) * 100

    price_list.append(df_price)

stock_prices = pd.concat(price_list, ignore_index=True)

# len(stock_prices) counts rows across ALL tickers, so report it as rows, not trading days
print(f"✅ Fetched {len(stock_prices):,} price rows across {stock_prices['ticker'].nunique()} tickers")
print(f"\nSample stock data:")
print(stock_prices.head(10))

## Step 5: Merge Sentiment + Stock Prices

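Inner-join the daily sentiment aggregates with the stock prices on date and ticker, so only dates that have both tweets and a price record are kept for analysis.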

In [None]:
# Join sentiment with prices on (date, ticker). The inner join keeps only keys
# present in both frames; rows are then ordered by ticker and date.
merged_df = (
    pd.merge(daily_sentiment, stock_prices, how='inner', on=['date', 'ticker'])
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

# Report the shape, columns and coverage of the merged frame
print("✅ Merged sentiment + stock prices")
print(f"\nFinal dataset shape: {merged_df.shape}")
print(f"Columns: {merged_df.columns.tolist()}")
print(f"\nTickers: {merged_df['ticker'].unique()}")
print(f"Date range: {merged_df['date'].min().date()} to {merged_df['date'].max().date()}")

print("\nRecords per ticker:")
print(merged_df['ticker'].value_counts().sort_index())

print("\nSample merged data:")
print(merged_df.head(10))

## Step 6: Analyze Sentiment-Price Correlation

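Is daily sentiment associated with stock price movements? Calculate, for each ticker, the correlation between the mean daily sentiment and the same-day percentage price change. Note that this is a same-day association; testing whether sentiment *predicts* later moves requires lagged sentiment features (see the next steps at the end of the notebook).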

In [None]:
# Per-ticker correlation between mean daily sentiment and the same-day % price change
print("Correlation between sentiment and price change (%):\n")

corr_columns = ['sentiment_mean', 'price_change_pct']
for ticker in sorted(merged_df['ticker'].unique()):
    # Keep only this ticker's rows where both series have a value
    ticker_data = merged_df.loc[merged_df['ticker'] == ticker].dropna(subset=corr_columns)

    # A correlation needs at least two observations; skip tickers without them
    if len(ticker_data) <= 1:
        continue

    corr = ticker_data['sentiment_mean'].corr(ticker_data['price_change_pct'])
    print(f"{ticker:5s}: {corr:7.3f}  (n={len(ticker_data):3d} days)")

# Visualize sentiment vs price for all tickers: one scatter panel per ticker, two panels per row
plot_tickers = sorted(merged_df['ticker'].unique())
n_tickers = len(plot_tickers)
n_rows = (n_tickers + 1) // 2  # round up so an odd ticker count still gets a full row

fig, axes = plt.subplots(n_rows, 2, figsize=(14, 4 * n_rows))
axes = axes.flatten()

for ax, ticker in zip(axes, plot_tickers):
    ticker_data = merged_df[merged_df['ticker'] == ticker].dropna(
        subset=['sentiment_mean', 'price_change_pct']
    )
    sentiment_vals = ticker_data['sentiment_mean']
    return_vals = ticker_data['price_change_pct']

    ax.scatter(sentiment_vals, return_vals, alpha=0.6, s=30)
    ax.set_xlabel('Daily Avg Sentiment Score', fontsize=10)
    ax.set_ylabel('Daily Price Change %', fontsize=10)
    ax.set_title(f'{ticker}: Sentiment vs Price Change', fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Annotate the panel with the correlation coefficient (top-left, axes coordinates)
    corr = sentiment_vals.corr(return_vals)
    ax.text(
        0.05, 0.95, f'r={corr:.3f}',
        transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
        verticalalignment='top',
        fontsize=9,
    )

# Remove the panels left over after every ticker has been drawn
for spare_ax in axes[n_tickers:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

# Closing banner: recap what the merged dataset contains and list the follow-up work.
# All lines are collected first and emitted with a single print, one line per entry.
summary_lines = [
    "\n" + "=" * 70,
    "✅ DATA EXPLORATION COMPLETE!",
    "=" * 70,
    "\nYour dataset is ready for modeling:",
    f"  • {len(merged_df):,} daily records",
    f"  • {len(merged_df['ticker'].unique())} stock tickers",
    "  • Sentiment scores from NLP analysis",
    "  • Stock prices and daily returns",
    "  • Correlation analysis complete",
    "\nNext steps:",
    "  1. Add lag features (sentiment from previous days)",
    "  2. Add technical indicators (volatility, moving averages)",
    "  3. Train machine learning models (Linear, XGBoost, etc.)",
    "  4. Evaluate with proper time-series validation",
]
print(*summary_lines, sep="\n")