In [1]:
import pandas as pd
import torch
import numpy as np
import os

In [2]:

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
START = "2015-01-01"
END   = "2025-10-01"

In [4]:
import pandas as pd

def load_clean(path, date_col=("date","published_at","datetime","time"),
                      text_col=("headline","title")):
    df = pd.read_csv(path)
    cols = {c.lower().strip(): c for c in df.columns}

    # grab the first matching date & text columns
    date_key = next((c for c in date_col if c in cols), None)
    text_key = next((c for c in text_col if c in cols), None)
    if not date_key or not text_key:
        raise ValueError(f"Missing date/headline in {path}")

    # parse date (no timezone mapping), keep only date part
    df["Date"] = pd.to_datetime(df[cols[date_key]], errors="coerce").dt.date
    df["Headline"] = df[cols[text_key]].astype(str)

    # drop bad rows & exact duplicates
    df = df.dropna(subset=["Date","Headline"]).drop_duplicates(subset=["Date","Headline"])

    return df[["Date","Headline"]]


In [5]:
# Replace with your actual file paths/column names
spx  = load_clean("Raw Datasets/sp500_headlines_2008_2024.csv")
fwe = load_clean("Raw Datasets/financial_news_events.csv")


In [6]:
# Union the sources and clip to 2019–2025
news = pd.concat([spx, fwe], ignore_index=True).drop_duplicates()
news = news[(news["Date"] >= pd.to_datetime(START).date()) &
            (news["Date"] <= pd.to_datetime(END).date())]
news = news.sort_values("Date").reset_index(drop=True)
news

Unnamed: 0,Date,Headline
0,2015-01-02,Dog blindness may lead to insights into human ...
1,2015-01-02,Fun With Wall Street's S&P 500 Targets
2,2015-01-02,"Stocks close first day of 2015 mixed: Dow up, ..."
3,2015-01-02,Elvis Presley’s ‘Flying Graceland’ up for sale
4,2015-01-05,JPM: S&P 500 Inflection Points Chart
...,...,...
17378,2025-08-14,"Global trade talks collapse, causing market tu..."
17379,2025-08-14,Energy company's dividend hike excites shareho...
17380,2025-08-14,Quarterly report shows surprising losses for r...
17381,2025-08-14,Gold prices surge as investors flee riskier as...


In [7]:
os.makedirs("Processed Datasets", exist_ok=True)
out_processed = "Processed Datasets/2015-2025_date_headline.csv"
news.to_csv(out_processed, index=False)

In [8]:
N = len(news)
print("Headlines:", N)

Headlines: 17383


In [9]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL = "ProsusAI/finbert"
tok = AutoTokenizer.from_pretrained(MODEL)                          
mdl = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device).eval()  

def finbert_scores_iter(texts, bs=64, max_len=96):
    """Yield prob blocks to avoid huge memory; shows progress; uses GPU if available."""
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), bs), total=(len(texts)+bs-1)//bs, desc="FinBERT"):
            batch = texts[i:i+bs]
            enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt").to(device)
            logits = mdl(**enc).logits
            yield torch.softmax(logits, dim=1).detach().cpu().numpy()  # [neg,neu,pos]


probs_blocks = list(finbert_scores_iter(news["Headline"].tolist(), bs=64))
probs = np.vstack(probs_blocks)

news["neg"], news["neu"], news["pos"] = probs[:,0], probs[:,1], probs[:,2]
news["score"] = news["pos"] - news["neg"]

FinBERT: 100%|██████████| 272/272 [00:05<00:00, 45.73it/s]


In [10]:
news["Date"] = pd.to_datetime(news["Date"], errors="coerce").dt.date
news = news.dropna(subset=["Date"])
news

Unnamed: 0,Date,Headline,neg,neu,pos,score
0,2015-01-02,Dog blindness may lead to insights into human ...,0.127928,0.013646,0.858427,0.730499
1,2015-01-02,Fun With Wall Street's S&P 500 Targets,0.093921,0.032640,0.873440,0.779519
2,2015-01-02,"Stocks close first day of 2015 mixed: Dow up, ...",0.077934,0.899513,0.022552,-0.055382
3,2015-01-02,Elvis Presley’s ‘Flying Graceland’ up for sale,0.075358,0.026304,0.898337,0.822979
4,2015-01-05,JPM: S&P 500 Inflection Points Chart,0.046683,0.071612,0.881705,0.835022
...,...,...,...,...,...,...
17378,2025-08-14,"Global trade talks collapse, causing market tu...",0.012807,0.926468,0.060725,0.047918
17379,2025-08-14,Energy company's dividend hike excites shareho...,0.553237,0.382237,0.064527,-0.488710
17380,2025-08-14,Quarterly report shows surprising losses for r...,0.175466,0.808205,0.016329,-0.159138
17381,2025-08-14,Gold prices surge as investors flee riskier as...,0.197134,0.720288,0.082579,-0.114555


In [11]:
sentiment_csv = (news.groupby("Date", as_index=False).agg(Sentiment=("score","mean"))) 
sentiment_csv

Unnamed: 0,Date,Sentiment
0,2015-01-02,0.569404
1,2015-01-05,0.715140
2,2015-01-06,0.394994
3,2015-01-07,-0.297627
4,2015-01-08,-0.242861
...,...,...
2429,2025-08-10,-0.152810
2430,2025-08-11,-0.246667
2431,2025-08-12,-0.220568
2432,2025-08-13,-0.190902


In [12]:
os.makedirs("Processed Datasets", exist_ok=True)
out_processed = "Processed Datasets/2015-2025_sentiment.csv"
sentiment_csv.to_csv(out_processed, index=False)