In [20]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [21]:
reddit_df = pd.read_csv('year_reddit_data.csv')
yf_df = pd.read_csv('year_yf_data.csv')

print(reddit_df.head())
print(yf_df.head())


   Unnamed: 0       id          created_utc ticker subreddit  \
0           0  1oyzy4v  2025-11-16 22:59:25   nvda    stocks   
1           1  1owabf1  2025-11-13 19:23:30   nvda    stocks   
2           2  1ow7lym  2025-11-13 17:43:31   nvda    stocks   
3           3  1ovqca4  2025-11-13 03:17:50   nvda    stocks   
4           4  1ovctdu  2025-11-12 18:15:41   nvda    stocks   

                 author                                              title  \
0            Denver-Ski  Peter Thielâ€™s latest 13F shows a full exit fro...   
1        SecretComposer  Do you have any less discussed positions that ...   
2    Necessary_Fold5478  Nvidia CEO reportedly said China will lead the...   
3  Apprehensive_Two1528  USD to CNY hits lowest level YTD. Apple is gon...   
4           One_Rub7972  CRWV has plummeted; is now a good time to buy ...   

   score  upvote_ratio  num_comments query   stock                 created_dt  \
0   1478          0.93           241  nvda  nvidia  2025-11-16 

# Reddit + Yahoo Finance preprocessing pipeline

This notebook prepares weekly sequences for the BiLSTM self-attention model.

Steps:
- Normalize schemas and dates on `reddit_df` and `yf_df`.
- Clean text (remove URLs, whitespace) and detect ticker mentions.
- Compute FinBERT embeddings and sentiment per post.
- Aggregate per ticker-day: engagement-weighted embedding and sentiment/engagement features.
- Join OHLCV from Yahoo Finance data.
- Build 5-day windows and next-week labels (direction/magnitude).
- Create a WeeklySentimentDataset and collate a batch.
- Run the batch through `LSTMClassifier` to verify shapes.

In [22]:
import re
from datetime import datetime, timedelta

from transformers import AutoTokenizer, AutoModelForSequenceClassification


def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"\n+", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def load_finbert(device):
    tok = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    mdl = AutoModelForSequenceClassification.from_pretrained(
        "ProsusAI/finbert", output_hidden_states=True
    ).to(device)
    mdl.eval()
    return tok, mdl


@torch.no_grad()
def finbert_embed_and_sentiment(texts, tokenizer, model, device, batch_size: int = 16):
    all_embeds = []
    all_scores = []
    for i in range(0, len(texts), batch_size):
        chunk = texts[i : i + batch_size]
        enc = tokenizer(chunk, padding=True, truncation=True, max_length=128, return_tensors="pt")
        enc = {k: v.to(device) for k, v in enc.items()}
        out = model(**enc)
        hidden = out.hidden_states[-1]
        mask = enc["attention_mask"].unsqueeze(-1)
        masked = hidden * mask
        lengths = mask.sum(dim=1).clamp_min(1)
        mean_pool = masked.sum(dim=1) / lengths
        logits = out.logits
        probs = torch.softmax(logits, dim=-1)
        score = probs[:, 2] - probs[:, 0]
        all_embeds.append(mean_pool.cpu())
        all_scores.append(score.cpu())
    return torch.cat(all_embeds, dim=0), torch.cat(all_scores, dim=0)

In [23]:
# Normalize schemas
reddit = reddit_df.copy()
yf = yf_df.copy()

# Reddit expected columns: ['post.id','post.created_utc','post.title','post.selftext','post.ups','post.downs','post.upvote_ratio','post.num_comments','post.score','Post.subreddit.display_name','Ticker_id']
# Map to simpler names if present
col_map = {
    'post.id': 'post_id',
    'post.created_utc': 'created_utc',
    'post.title': 'title',
    'post.selftext': 'selftext',
    'post.ups': 'ups',
    'post.downs': 'downs',
    'post.upvote_ratio': 'upvote_ratio',
    'post.num_comments': 'num_comments',
    'post.score': 'score',
    'Post.subreddit.display_name': 'subreddit',
    'Ticker_id': 'ticker'
}
reddit.rename(columns={k: v for k, v in col_map.items() if k in reddit.columns}, inplace=True)

# Parse datetime and clean text
if 'created_utc' in reddit.columns:
    reddit['created_utc'] = pd.to_datetime(reddit['created_utc'], utc=True, errors='coerce')
else:
    raise ValueError('created_utc column not found in reddit_df')

# Use only post titles (selftext removed from model inputs)
reddit['title'] = reddit.get('title', '').astype(str)
reddit['text'] = reddit['title'].map(clean_text)

# Ensure numeric engagement columns exist
for c, dtp, default in [
    ('ups', int, 0), ('downs', int, 0), ('num_comments', int, 0),
    ('score', int, 0), ('upvote_ratio', float, 0.0)
]:
    if c not in reddit:
        reddit[c] = default
    reddit[c] = reddit[c].fillna(default).astype(dtp)

# Ticker column
if 'ticker' not in reddit:
    if 'Ticker_id' in reddit_df.columns:
        reddit['ticker'] = reddit_df['Ticker_id'].astype(str)
    else:
        raise ValueError('ticker/Ticker_id column required in reddit_df')

# Yahoo Finance CSV has multi-level headers (field, ticker). Re-parse and pivot to long format.
# Example fields: Close, High, Low, Open, Volume
yf_path = 'year_yf_data.csv'
try:
    yf_multi = pd.read_csv(yf_path, header=[0, 1], index_col=0, skiprows=[2])
except Exception as e:
    raise RuntimeError(f'Failed to parse {yf_path} with multi-level headers: {e}')

# Ensure index is datetime-like (dates in the first column)
yf_multi.index.name = 'date'
try:
    yf_multi.index = pd.to_datetime(yf_multi.index, errors='coerce')
except Exception:
    pass

# Helper to convert a category to long format
def _to_long(cat: str) -> pd.DataFrame:
    if cat not in yf_multi.columns.get_level_values(0):
        return pd.DataFrame(columns=['date', 'ticker', cat])
    dfc = yf_multi[cat]
    long_df = dfc.stack().reset_index()
    long_df.columns = ['date', 'ticker', cat]
    return long_df

close_long = _to_long('Close')
vol_long = _to_long('Volume')

# Merge desired fields
if close_long.empty and vol_long.empty:
    yf = pd.DataFrame(columns=['date', 'ticker', 'Close', 'Volume'])
else:
    yf = close_long.merge(vol_long, on=['date', 'ticker'], how='outer')

# Finalize types
yf['date'] = pd.to_datetime(yf['date'], errors='coerce').dt.date
yf['ticker'] = yf['ticker'].astype(str)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# FinBERT embeddings + sentiment per post
print('Embedding posts (FinBERT)...')
fin_tok, fin_model = load_finbert(device)
embeds, sent = finbert_embed_and_sentiment(reddit['text'].tolist(), fin_tok, fin_model, device)
reddit['embed'] = [e.cpu().numpy() if isinstance(e, torch.Tensor) else e for e in embeds]
reddit['sentiment'] = sent.cpu().numpy()

# Daily aggregation per ticker-date
reddit['date'] = reddit['created_utc'].dt.tz_convert('US/Eastern').dt.date
agg_rows = []
for (tic, dt), g in reddit.groupby(['ticker','date'], sort=True):
    arr = np.stack(g['embed'].values, axis=0)
    ups = g['ups'].to_numpy()
    w = ups.astype(float)
    w = w / max(w.sum(), 1.0)
    day_embed = (arr * w[:, None]).sum(axis=0).astype(np.float32)
    row = {
        'ticker': tic,
        'date': dt,
        'embed': day_embed,
        'sentiment_mean': float(g['sentiment'].mean()),
        'sentiment_weighted': float((g['sentiment'] * (ups / max(ups.sum(), 1))).sum()),
        'total_upvotes': int(ups.sum()),
        'avg_upvotes': float(ups.mean()),
        'comment_engagement': float((g['num_comments'] / np.maximum(ups + g['downs'].to_numpy(), 1)).mean()),
    }
    agg_rows.append(row)
agg = pd.DataFrame(agg_rows)

# Merge OHLCV
merged = agg.merge(yf.rename(columns={'Ticker':'ticker'}), how='left', on=['ticker','date'])

# Build 2-day sequences and labels (next-week direction/magnitude)
seq_len = 5
sequences = []
lengths = []
labels_dir = []
labels_mag = []
meta = []

for tic, g in merged.sort_values('date').groupby('ticker'):
    g = g.reset_index(drop=True)
    # construct daily feature vectors
    def feat_vec(r):
        return np.concatenate([
            r['embed'],
            np.array([
                r.get('Close', np.nan),
                r.get('Volume', np.nan),
                r.get('sentiment_mean', 0.0),
                r.get('sentiment_weighted', 0.0),
                r.get('total_upvotes', 0.0),
                r.get('avg_upvotes', 0.0),
                r.get('comment_engagement', 0.0),
            ], dtype=np.float32)
        ]).astype(np.float32)

    feats = [feat_vec(r) for _, r in g.iterrows()]
    closes = g['Close'].to_numpy()

    if len(feats) < seq_len + 5:
        # need at least 5 future days to compute next-week movement
        continue

    # Use the last window we can build with following week
    for i in range(0, len(feats) - seq_len - 5 + 1):
        window = np.stack(feats[i:i+seq_len], axis=0)
        # Next week direction/magnitude from close prices
        c0 = closes[i+seq_len-1]
        c1 = closes[i+seq_len+2-1]
        if np.isnan(c0) or np.isnan(c1):
            continue
        ret = (c1 - c0) / max(abs(c0), 1e-6)
        direction = 1.0 if ret > 0 else 0.0
        magnitude = float(abs(ret))

        sequences.append(torch.tensor(window, dtype=torch.float32))
        lengths.append(seq_len)
        labels_dir.append(direction)
        labels_mag.append(magnitude)
        meta.append({'ticker': tic, 'start_index': i})

print(f'Built {len(sequences)} sequences across {merged["ticker"].nunique()} tickers')

Embedding posts (FinBERT)...
Built 0 sequences across 12 tickers


In [25]:
from dataset import WeeklySentimentDataset, collate_weekly
from model import LSTMClassifier

# Build dataset
if len(sequences) == 0:
    print('No sequences built. Check your input coverage and date ranges.')
else:
    ds = WeeklySentimentDataset(
        sequences=sequences,
        lengths=lengths,
        direction=torch.tensor(labels_dir),
        magnitude=torch.tensor(labels_mag),
        metadata=meta,
    )
    batch = collate_weekly([ds[i] for i in range(min(8, len(ds)))], pad_to=5)

    input_dim = sequences[0].shape[1]
    model = LSTMClassifier(input_dim=input_dim).to(device)
    out = model(batch['x'].to(device), batch['lengths'].to(device))
    {k: tuple(v.shape) for k, v in out.items() if isinstance(v, torch.Tensor)}

No sequences built. Check your input coverage and date ranges.
