# Experiment Code Pipeline

## Setup and Data Collection

In [None]:
!pip install --upgrade notebook
!pip install python-dotenv
!pip install joblib
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install threadpoolctl
!pip install tensorflow
!pip install scikit-image
!pip install scikit-learn-intelex
!pip install keras


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import openai          
import transformers    
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.tsa.stattools as stt
import statsmodels.tsa.holtwinters as smt
import statsmodels.tsa.seasonal as smt
import statsmodels.tsa.arima.model as smt


In [None]:
from dotenv import load_dotenv # Ensuring OpenAI API Key is retrieved from .env file
load_dotenv()
import os
# Report only whether the key is present. Printing the key itself would store the
# secret in the notebook's saved output.
print("OPENAI_API_KEY loaded:", bool(os.getenv("OPENAI_API_KEY")))

## S&P 500 Index Data Retrieval

In [None]:
### Market data
### Literature lays out data suggestions and pipeline implementation

import pandas as pd
import yfinance as yf

def _flatten_price_columns(frame):
    """Return `frame` with single-level column names.

    If the columns are a MultiIndex, only the first level is kept; otherwise the
    frame is returned unchanged.
    NOTE(review): recent yfinance releases appear to return (price, ticker)
    MultiIndex columns even for one ticker - confirm against the installed version.
    """
    if isinstance(frame.columns, pd.MultiIndex):
        frame = frame.copy()
        frame.columns = frame.columns.get_level_values(0)
    return frame


def _with_date_key(frame):
    """Move the Date index into a 'YYYY-MM-DD' string column used as the merge key."""
    frame = frame.reset_index()
    frame['Date'] = pd.to_datetime(frame['Date']).dt.strftime('%Y-%m-%d')
    return frame


sp500 = _flatten_price_columns(yf.download("^GSPC", start="2015-01-01", end="2025-01-01"))

# Daily simple return computed from the Close column
sp500['Return'] = sp500['Close'].pct_change()
sp500 = _with_date_key(sp500)

# Fetching macro data with VIX:
vix = _flatten_price_columns(yf.download("^VIX", start="2015-01-01", end="2025-01-01"))
vix = vix[['Close']].rename(columns={'Close': 'VIX'})
vix = _with_date_key(vix)

# Merging the two DataFrames on 'Date'
merged_data = pd.merge(sp500, vix, on='Date', how='inner')
merged_data['Date'] = pd.to_datetime(merged_data['Date'])
# Date stays a regular column: the CSV is written with index=False and later
# cells read it back by column name. The first row is dropped here because its
# Return is NaN.
merged_data = merged_data.dropna()

merged_data.to_csv('sp500_vix_data.csv', index=False)


## New York Times Article Summary Scraping

### General Article Scraping

In [None]:
### NYT Article Summary Data
import os, time, json, sys
from datetime import date
from typing import List, Dict

import requests
import pandas as pd

# Configuration:
# API key is read from the environment; if NYT_API_KEY is unset the literal
# fallback string below is sent instead.
# NOTE(review): the fallback looks like a placeholder - confirm it is not meant
# to hold a real key.
NYT_API_KEY = os.getenv("NYT_API_KEY", "your_real_key")

# Inclusive month range handed to iter_months()
START_DATE  = date(2015, 1, 1)
END_DATE    = date(2025, 1, 1)
# Output CSV that append_rows() appends to
CSV_PATH    = "nyt_business_archive.csv"
REQS_PER_MIN = 5
SLEEP_SEC   = 60 / REQS_PER_MIN        # 12.0 s pause after each month keeps the crawl at 5 calls per minute
# JSON file holding the last fully processed month (see load_checkpoint / save_checkpoint)
CHECKPOINT  = "archive_checkpoint.json"
# URL template; formatted with year= and month= in archive_month()
ARCHIVE_URL = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"

def archive_month(year: int, month: int, max_retries: int = 5) -> Dict: # Wrapper handles 429 and retries
    """Fetch one month from NYT Archive API with rate-limit back-off.

    Args:
        year: Calendar year substituted into ARCHIVE_URL.
        month: Calendar month substituted into ARCHIVE_URL.
        max_retries: Maximum number of HTTP attempts.

    Returns:
        The decoded JSON body of the first response that is not HTTP 429.

    Raises:
        requests.HTTPError: for a non-429 error status (via raise_for_status).
        RuntimeError: when every attempt was answered with HTTP 429.
    """
    url    = ARCHIVE_URL.format(year=year, month=month)
    params = {"api-key": NYT_API_KEY}
    default_wait = 15

    for attempt in range(max_retries):
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.json()

        # Too Many Requests. The Retry-After header may be missing or may not be
        # a plain integer (it can be an HTTP-date), so never let int() abort the crawl.
        try:
            wait = max(0, int(resp.headers.get("Retry-After", default_wait)))
        except (TypeError, ValueError):
            wait = default_wait

        if attempt == max_retries - 1:
            break  # nothing left to retry, so do not sleep before failing
        print(f"429 received — sleeping {wait}s and retrying …")
        time.sleep(wait)

    raise RuntimeError(f"Failed to fetch {year}-{month:02} after {max_retries} retries")

def iter_months(start: date, end: date):
    """Yield the first day of every month from `start`'s month through `end`'s month, inclusive."""
    cursor = date(start.year, start.month, 1)
    while cursor <= end.replace(day=1):
        yield cursor
        # Step to the first day of the following month, rolling the year after December.
        if cursor.month == 12:
            cursor = date(cursor.year + 1, 1, 1)
        else:
            cursor = date(cursor.year, cursor.month + 1, 1)

def load_checkpoint():
    """Return the month stored under "last_month" in CHECKPOINT, or None when the file does not exist."""
    if not os.path.isfile(CHECKPOINT):
        return None
    with open(CHECKPOINT) as handle:
        state = json.load(handle)
    return date.fromisoformat(state["last_month"])

def save_checkpoint(d: date):
    """Overwrite CHECKPOINT with `d` (ISO format) stored under the "last_month" key."""
    serialized = json.dumps({"last_month": d.isoformat()})
    with open(CHECKPOINT, "w") as handle:
        handle.write(serialized)

def append_rows(rows: List[Dict], path=None):
    """Append `rows` to a CSV file, writing the header only when the file is new or empty.

    Args:
        rows: Records to append; each dict becomes one CSV row.
        path: Target file. Defaults to the module-level CSV_PATH.

    An empty `rows` list is a no-op. Writing an empty DataFrame would create the
    file without column names, after which later appends would never add a header.
    """
    if not rows:
        return
    target = CSV_PATH if path is None else path
    # A zero-byte file still needs its header.
    has_content = os.path.isfile(target) and os.path.getsize(target) > 0
    pd.DataFrame(rows).to_csv(
        target,
        mode="a",
        index=False,
        header=not has_content,
    )

def is_business(doc: Dict) -> bool:
    """Return True when "business" appears, case-insensitively, in the doc's section_name or news_desk."""
    # Missing or None values are treated as empty strings.
    fields = [(doc.get(key) or "").lower() for key in ("section_name", "news_desk")]
    return any("business" in value for value in fields)

def main():
    """Crawl the archive month by month, appending Business articles to the CSV.

    Months up to and including the checkpointed month are skipped. After each
    month the rows are appended, the checkpoint is updated, and the loop sleeps
    SLEEP_SEC seconds.
    """
    checkpoint = load_checkpoint()
    months = list(iter_months(START_DATE, END_DATE))
    if checkpoint:
        months = [month for month in months if month > checkpoint]
        print(f"Resuming after {checkpoint} — {len(months)} months left.")

    for month_start in months:
        print(f"Fetching {month_start:%Y-%m} …", end=" ", flush=True)
        payload = archive_month(month_start.year, month_start.month)
        docs = payload["response"]["docs"]

        rows = []
        for doc in docs:
            if not is_business(doc):
                continue
            rows.append({
                "Date": doc["pub_date"][:10],
                "Headline": doc["headline"]["main"],
                # Prefer the abstract; fall back to the snippet (or "" if absent)
                "Summary": doc.get("abstract") or doc.get("snippet", ""),
                "Section": doc.get("section_name") or doc.get("news_desk"),
                "URL": doc["web_url"],
            })

        append_rows(rows)
        save_checkpoint(month_start)
        print(f"kept {len(rows):3d} / {len(docs):3d} docs")
        time.sleep(SLEEP_SEC)

    print("All months processed. CSV downloaded")

# Run the crawl when this code executes as the main module.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # main() writes the checkpoint after each completed month, so a rerun
        # continues from the last saved month.
        print("\nInterrupted — progress saved, just rerun to continue.")

### Aggregated Sector Mapping Article Scraping

In [None]:
### Aggregate Sector Mapping for NYT Article Summaries
import os, time, json, sys, re
from datetime import date
from typing import List, Dict

import requests
import pandas as pd

path = "nyt_business_archive.csv"

# Header repair: if the archive CSV has five columns but its first row is not the
# expected header (no "Summary" column name), assign the column names and rewrite
# the file. A missing or empty file is skipped instead of raising.
try:
    df = pd.read_csv(path, header=None)  # no header to see all columns
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = None

if df is None:
    print(f"{path} is missing or empty — header check skipped.")
else:
    max_cols = df.shape[1]
    first_row_names = pd.read_csv(path, nrows=0).columns
    if max_cols == 5 and "Summary" not in first_row_names:
        df.columns = ["Date", "Headline", "Summary", "Section", "URL"]
        df.to_csv(path, index=False)
        print("✔ Added missing 'Summary' header and rewrote file.")
    else:
        print("No header problem detected — nothing changed.")

# Configuration:
# API key is read from the environment; the literal fallback is used when
# NYT_API_KEY is unset.
# NOTE(review): the fallback looks like a placeholder - confirm.
NYT_API_KEY  = os.getenv("NYT_API_KEY", "your_key")
# Inclusive month range handed to iter_months()
START_DATE   = date(2015, 1, 1)
END_DATE     = date(2025, 1, 1)
# Raw per-article CSV (appended to by append_rows, read back in main)
CSV_PATH     = "nyt_business_archive.csv"
# Output of aggregate_nyt(): one row per Date x Sector
AGG_PATH     = "nyt_aggregated_data.csv"
REQS_PER_MIN = 5
SLEEP_SEC    = 60 / REQS_PER_MIN       # 12 s → 5 req/min
# JSON file holding the last fully processed month
CHECKPOINT   = "archive_checkpoint.json"
# URL template; formatted with year= and month= in archive_month()
ARCHIVE_URL  = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"

def archive_month(year: int, month: int, retries=5) -> Dict: # NYT API call with 429 back-off to handle rate limits
    """Fetch one month of the NYT Archive API, retrying while the server answers 429.

    Args:
        year: Calendar year substituted into ARCHIVE_URL.
        month: Calendar month substituted into ARCHIVE_URL.
        retries: Maximum number of HTTP attempts.

    Returns:
        The decoded JSON body of the first response that is not HTTP 429.

    Raises:
        requests.HTTPError: for a non-429 error status (via raise_for_status).
        RuntimeError: when all attempts were answered with HTTP 429.
    """
    url, params = ARCHIVE_URL.format(year=year, month=month), {"api-key": NYT_API_KEY}
    fallback_wait = 15
    for attempt in range(1, retries + 1):
        r = requests.get(url, params=params, timeout=60)
        if r.status_code == 429:                       # Too Many Requests
            # The header can be absent or a non-integer (e.g. an HTTP-date);
            # fall back to a fixed wait instead of crashing on int().
            try:
                wait = max(0, int(r.headers.get("Retry-After")))
            except (TypeError, ValueError):
                wait = fallback_wait
            if attempt < retries:                      # no sleep before giving up
                print(f"⚠429 – sleeping {wait}s")
                time.sleep(wait)
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Failed after {retries} retries: {year}-{month:02}")

def iter_months(start: date, end: date): # Generator = iterates over months in a date range
    """Generate the first-of-month date for each month between `start` and `end`, both months included."""
    year, month = start.year, start.month
    while (month_start := date(year, month, 1)) <= end.replace(day=1):
        yield month_start
        # Advance one month; December rolls over to January of the next year.
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)

def load_checkpoint(): # Setting up checkpoint management for resuming crawls
    """Read the "last_month" date from CHECKPOINT; return None if the file is absent."""
    last_month = None
    if os.path.isfile(CHECKPOINT):
        with open(CHECKPOINT) as fp:
            stored = json.loads(fp.read())
        last_month = date.fromisoformat(stored["last_month"])
    return last_month

def save_checkpoint(d: date): # Saving the last processed month to a checkpoint file
    """Write `d` in ISO format to CHECKPOINT under the "last_month" key, replacing any previous content."""
    state = {"last_month": d.isoformat()}
    with open(CHECKPOINT, "w") as out:
        json.dump(state, out)

def append_rows(rows: List[Dict], path=None): # Rows to the CSV file, creating it if it doesn't exist
    """Append `rows` to a CSV file; the header is written only for a new or empty file.

    Args:
        rows: Records to append; each dict becomes one CSV row.
        path: Target file. Defaults to the module-level CSV_PATH.

    Nothing is written for an empty `rows` list. Writing an empty frame would
    create a header-less file, and every later append would then omit the column
    names as well.
    """
    if not rows:
        return
    destination = CSV_PATH if path is None else path
    needs_header = not (os.path.isfile(destination) and os.path.getsize(destination) > 0)
    pd.DataFrame(rows).to_csv(
        destination,
        mode="a",
        index=False,
        header=needs_header,
    )

def is_business(doc: Dict) -> bool: # Business filter
    """Tell whether the doc's section_name or news_desk contains "business" (case-insensitive)."""
    section = doc.get("section_name") or ""
    desk = doc.get("news_desk") or ""
    section_low, desk_low = section.lower(), desk.lower()
    if "business" in section_low:
        return True
    return "business" in desk_low

def crawl_nyt_archive(): # Main crawling function to fetch NYT archive data
    """Download each outstanding month, keep the Business docs, and append them to the CSV.

    Months at or before the checkpointed month are skipped. The checkpoint is
    rewritten after every month, followed by a SLEEP_SEC pause.
    """
    last_done = load_checkpoint()
    pending = list(iter_months(START_DATE, END_DATE))
    if last_done:
        pending = [month for month in pending if month > last_done]
        print(f"Resuming after {last_done} – {len(pending)} months left.")

    def to_row(doc):
        # Flatten one API doc into the five CSV columns.
        return {
            "Date": doc["pub_date"][:10],
            "Headline": doc["headline"]["main"],
            "Summary": doc.get("abstract") or doc.get("snippet", ""),
            "Section": doc.get("section_name") or doc.get("news_desk"),
            "URL": doc["web_url"],
        }

    for month_start in pending:
        print(f"Fetching {month_start:%Y-%m} …", end=" ", flush=True)
        response = archive_month(month_start.year, month_start.month)
        docs = response["response"]["docs"]

        rows = [to_row(doc) for doc in docs if is_business(doc)]

        append_rows(rows)
        save_checkpoint(month_start)
        print(f"kept {len(rows):3d} / {len(docs):3d}")
        time.sleep(SLEEP_SEC)

    print("NYT crawl complete.")

# Sector classification & aggregation
SECTOR_MAP: dict[str, list[str]] = {
    # Information Technology Sector:
    "Software & IT Services": [
        "software", "saas", "cloud", "it services", "consulting",
        "microsoft", "adobe", "oracle", "sap", "salesforce", "servicenow",
        "workday", "vmware", "accenture", "infosys", "tcs", "capgemini",
    ],
    "Hardware & Devices": [
        "hardware", "pc", "laptop", "smartphone", "iphone", "ipad",
        "dell", "hp", "lenovo", "asus", "acer", "logitech",
    ],
    "Semiconductors": [
        "chip", "chips", "semiconductor", "fab", "foundry",
        "intel", "amd", "nvidia", "qualcomm", "tsmc", "broadcom",
        "micron", "arm holdings", "sk hynix",
    ],
    "Internet & Social Media": [
        "google", "alphabet", "youtube", "search engine",
        "meta", "facebook", "instagram", "whatsapp", "threads",
        "twitter", "x corp", "snapchat", "tiktok", "reddit",
        "linkedin", "pinterest", "social media",
    ],
    # Communication & Media Sector:
    "Telecommunications": [
        "telecom", "5g", "wireless", "broadband",
        "verizon", "at&t", "t-mobile", "comcast", "charter",
        "vodafone", "telefonica", "bt group", "rogers", "singtel",
    ],
    "Media & Entertainment": [
        "media", "streaming", "disney", "espn", "hulu",
        "netflix", "warner bros", "hbo", "paramount", "peacock",
        "sony pictures", "universal", "box office", "cinema",
    ],
    # Consumer Sector:
    "Retail & E-Commerce": [
        "retail", "e-commerce", "amazon", "alibaba", "shopify",
        "ebay", "etsy", "walmart", "target", "costco", "kroger",
        "best buy", "flipkart", "mercado libre",
    ],
    "Consumer Goods & Apparel": [
        "nike", "adidas", "lululemon", "puma", "under armour",
        "apparel", "footwear", "luxury", "lvmh", "gucci", "burberry",
        "rolex", "hermes", "tapestry",
    ],
    "Food & Beverage": [
        "food", "beverage", "coca-cola", "pepsico", "nestlé",
        "restaurant", "fast food", "mcdonald", "starbucks",
        "yum brands", "kfc", "pizza hut", "chipotle",
        "kraft", "general mills", "heinz", "tyson foods",
    ],
    "Hospitality & Leisure": [
        "hotel", "marriott", "hilton", "hyatt", "airbnb",
        "booking.com", "expedia", "travel", "cruise", "carnival",
        "royal caribbean", "las vegas sands", "mgm resorts",
    ],
    "Automotive": [
        "automotive", "auto", "car", "vehicle", "ev",
        "tesla", "general motors", "ford", "stellantis",
        "volkswagen", "toyota", "nissan", "bmw", "mercedes",
        "hyundai", "kia", "rivian", "lucid",
    ],
    # Healthcare Sector:
    "Pharmaceuticals": [
        "pharma", "drug", "medicine", "vaccine", "fda",
        "pfizer", "moderna", "johnson & johnson", "merck",
        "novartis", "roche", "astrazeneca", "bayer", "gsk",
    ],
    "Biotechnology": [
        "biotech", "gene therapy", "crispr", "genomics",
        "illumina", "gilead", "amgen", "biogen", "regeneron",
        "vertex", "bluebird bio",
    ],
    "Medical Devices & Services": [
        "medical device", "medtech", "diagnostics", "surgical",
        "medtronic", "boston scientific", "abbott", "stryker",
        "philips healthcare", "siemens healthineers", "cardinal health",
        "hospital", "clinic", "healthcare services",
    ],
    # Energy & Utilities Sector:
    "Oil & Gas": [
        "oil", "gas", "petroleum", "upstream", "downstream",
        "exxon", "chevron", "bp", "shell", "totalenergies",
        "conocophillips", "aramco", "occidental", "slb",
    ],
    "Renewables & Clean Energy": [
        "renewable", "solar", "wind", "geothermal", "hydro",
        "clean energy", "green energy", "next era", "sunpower",
        "first solar", "enphase", "vestas", "siemens gamesa",
        "hydrogen", "electrolyzer", "fuel cell",
    ],
    "Utilities": [
        "utility", "power grid", "electricity", "water utility",
        "natural gas utility", "duke energy", "southern company",
        "dominion", "pg&e", "national grid", "aes",
    ],
    # Financials Sector:
    "Banks": [
        "bank", "commercial bank", "jpmorgan", "bank of america",
        "citigroup", "wells fargo", "goldman sachs", "morgan stanley",
        "u.s. bancorp", "hsbc", "barclays", "santander", "dbs",
    ],
    "Investment & Asset Management": [
        "asset manager", "blackrock", "vanguard", "fidelity",
        "state street", "schwab", "hedge fund", "private equity",
        "kkr", "carried interest", "mutual fund", "etf",
        "sovereign wealth fund",
    ],
    "Insurance": [
        "insurance", "insurer", "aig", "allstate", "progressive",
        "metlife", "prudential", "chubb", "berkshire hathaway insurance",
        "reinsurance", "lloyd's", "actuarial",
    ],
    "Fintech & Payments": [
        "fintech", "payment", "visa", "mastercard",
        "american express", "paypal", "block inc", "square",
        "stripe", "sofi", "robinhood", "buy now pay later",
        "klarna", "affirm", "ant financial",
    ],
    "Cryptocurrency & Blockchain": [
        "bitcoin", "ethereum", "crypto", "blockchain",
        "coinbase", "binance", "defi", "nft",
        "stablecoin", "mining rig", "hashrate",
    ],
    # Industrial Sector:
    "Aerospace & Defense": [
        "aerospace", "defense", "boeing", "airbus", "northrop",
        "lockheed martin", "raytheon", "bae systems", "general dynamics",
        "drones", "satellite", "nasa contract",
    ],
    "Transportation & Logistics": [
        "shipping", "freight", "logistics", "supply chain",
        "fedex", "ups", "dhl", "maersk", "csx", "union pacific",
        "delta airlines", "american airlines", "united airlines",
        "railroad", "port congestion",
    ],
    "Manufacturing & Machinery": [
        "manufacturing", "factory", "industrial", "caterpillar",
        "3m", "general electric", "siemens", "honeywell", "emerson",
        "robotics", "automation", "abb", "fanuc",
    ],
    "Construction & Engineering": [
        "construction", "engineering", "infrastructure",
        "bechtel", "fluor", "jacobs", "skanska", "kiewit",
        "turner construction", "architect", "building materials",
    ],
    "Chemicals & Specialty Materials": [
        "chemical", "chemicals", "specialty chemical",
        "dupont", "dow", "basf", "lyondellbasell", "air products",
        "eastman", "evonik", "synthetic rubber", "petrochemical",
    ],
    "Metals & Mining": [
        "mining", "metal", "steel", "aluminum", "copper",
        "iron ore", "rio tinto", "bhp", "vale", "newmont",
        "glencore", "lithium", "nickel", "rare earth",
    ],
    "Agriculture": [
        "agriculture", "farming", "crop", "soybean", "corn",
        "wheat", "cargill", "archer daniels midland", "bunge",
        "deere", "monsanto", "fertilizer", "nutrien", "potash",
    ],
    # Real Estate Sector:
    "Real Estate": [
        "real estate", "realtor", "reit", "property", "mortgage",
        "office vacancy", "housing market", "zillow", "redfin",
        "wework", "commercial property", "residential property",
        "industrial park", "logistics park",
    ],
    # ESG / Government / Education Sectors:
    "Environmental & ESG": [
        "esg", "sustainability", "carbon", "emissions",
        "carbon credit", "offset", "green bond", "climate risk",
        "cop28", "environmental regulation",
    ],
    "Government & Policy": [
        "government", "regulation", "legislation", "policy",
        "federal reserve", "congress", "white house",
        "eu commission", "trade tariff", "sanction", "geopolitics",
    ],
    "Education": [
        "education", "edtech", "university", "college", "school",
        "coursera", "edx", "udemy", "chegg", "student loan",
    ],
}

_SECTOR_PATTERN_CACHE = {}  # tuple(keywords) -> compiled regex, or None for an empty keyword list


def _keyword_pattern(keywords):
    """Return a cached regex that matches any keyword as a whole word or phrase."""
    key = tuple(keywords)
    if key not in _SECTOR_PATTERN_CACHE:
        if key:
            alternatives = "|".join(re.escape(kw.lower()) for kw in key)
            # (?<!\w) ... (?!\w) anchors the keyword at word boundaries; the optional
            # s/es keeps simple plurals ("banks", "drugs") matching as before.
            _SECTOR_PATTERN_CACHE[key] = re.compile(
                rf"(?<!\w)(?:{alternatives})(?:s|es)?(?!\w)"
            )
        else:
            _SECTOR_PATTERN_CACHE[key] = None
    return _SECTOR_PATTERN_CACHE[key]


def assign_sector(text: str, sector_map=None) -> str: # Function to assign sectors based on keywords in the summary text
    """Return the first sector whose keywords occur in `text`, else "General".

    Keywords are matched case-insensitively as whole words or phrases. Plain
    substring matching would put "carbon" under the keyword "car", "every" under
    "ev", "metal" under "meta", and so on.

    Args:
        text: Summary text to classify. Non-string or empty input yields "General".
        sector_map: Mapping of sector name to keyword list, checked in insertion
            order. Defaults to the module-level SECTOR_MAP.
    """
    if not isinstance(text, str) or not text:
        return "General"
    mapping = SECTOR_MAP if sector_map is None else sector_map
    text_low = text.lower()
    for sector, keywords in mapping.items():
        pattern = _keyword_pattern(keywords)
        if pattern is not None and pattern.search(text_low):
            return sector
    return "General"
  
def aggregate_nyt(df: pd.DataFrame) -> pd.DataFrame: # Aggregation function to combine headlines and summaries by Date and Sector
    """Collapse the articles to one row per (Date, Sector).

    Within each group the non-null Headline values are joined with " | ", and
    likewise the Summary values. Date and Sector come back as ordinary columns.
    """
    def join_text(values):
        # Drop missing entries, stringify the rest, join with the pipe separator.
        return " | ".join(values.dropna().astype(str))

    by_day_and_sector = df.groupby(["Date", "Sector"])
    combined = by_day_and_sector.agg({"Headline": join_text, "Summary": join_text})
    return combined.reset_index()

# Main function to run the entire process
def main():
    """Crawl (or resume) the NYT archive, tag each article with a sector, and write the aggregate.

    Steps: run crawl_nyt_archive(), load CSV_PATH, assign a sector from each
    Summary, aggregate per Date x Sector, and write AGG_PATH. If a global
    DataFrame `news_df` with an "Article" column exists, its text is also
    cleaned in place.
    """
    crawl_nyt_archive() # Crawl or resume NYT archive

    nyt_df = pd.read_csv(CSV_PATH) # Loading full Business CSV
    nyt_df["Headline"] = nyt_df["Headline"].astype(str)
    nyt_df["Summary"] = nyt_df["Summary"].fillna("").astype(str)

    nyt_df["Sector"] = nyt_df["Summary"].apply(assign_sector) # Assigning sectors

    nyt_aggregated = aggregate_nyt(nyt_df) # Aggregate is saved
    nyt_aggregated.to_csv(AGG_PATH, index=False)
    print(f"Aggregated file written: {AGG_PATH}")

    # Reuters clean-up, only if news_df exists AND has an "Article" column.
    # Checking for the name alone is not enough: a later cell in this notebook
    # defines a news_df without that column, so a re-run would raise KeyError.
    reuters_df = globals().get("news_df")
    if isinstance(reuters_df, pd.DataFrame) and "Article" in reuters_df.columns:
        reuters_df["Article"] = (
            reuters_df["Article"]
            .str.replace(r"By .*? \|", "", regex=True)
            .str.replace(r"\n+", " ", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
        )
        print("Reuters news_df cleaned.")

# Run crawl + sector aggregation when this code executes as the main module.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # crawl_nyt_archive() checkpoints after each month, so a rerun resumes
        # from the last saved month.
        print("\nInterrupted – progress saved, just rerun to continue.")

In [None]:
# Sanity check: reload both generated CSVs from disk and show their first rows.
nyt_aggregated = pd.read_csv("nyt_aggregated_data.csv") # Aggregated NYT data (one row per Date x Sector)
print(nyt_aggregated.head())

# Note: this rebinds `merged_data` to the version read back from CSV.
merged_data = pd.read_csv("sp500_vix_data.csv") # Merged S&P 500 and VIX data
print(merged_data.head())

## Sentiment Analysis Implementation

### Unified FinBERT and GPT-4 Fallback

In [None]:
### Hybrid: FinBERT + GPT-4 Sentiment Analysis Pipeline
### Follows implementation by ProsusAI https://github.com/ProsusAI/finBERT
#  FinBERT gives the primary sentiment score (pos-prob − neg-prob)
#  The FinBERT-only score is saved separately for benchmarking
#  GPT-4 is called as a fallback when FinBERT is effectively neutral,
#  i.e. when |score| < GPT4_CONFIDENCE_THRESHOLD
#  Carried out with Python 3.13

import os
import time 
from datetime import datetime
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd
import torch
import openai
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Toggle for the GPT fallback step in add_sentiment()
USE_GPT4_FALLBACK = True
# Rows whose |FinBERT score| is below this value are sent to the fallback model
GPT4_CONFIDENCE_THRESHOLD = 0.05                     
OPENAI_MODEL = "gpt-4o-mini" # GPT model choice supported by literature
# Upper bound on tokens generated per fallback call
OPENAI_MAX_TOKENS = 10

# Directory for all derived CSVs; created if it does not exist
DERIVED_PATH = "derived"
Path(DERIVED_PATH).mkdir(exist_ok=True)

GPT4_CHECKPOINT_EVERY = 250   # save after every 250 GPT-4 calls

# Fail fast when the fallback is enabled but no API key is available
if USE_GPT4_FALLBACK: # Add API key to .env in same directory
    if not os.getenv("OPENAI_API_KEY"):
        raise EnvironmentError("OPENAI_API_KEY not set.")
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the market data and the aggregated news, then lower-case the news column names
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])
news_df = pd.read_csv("nyt_aggregated_data.csv", parse_dates=["Date"])
news_df.rename(columns={"Summary": "summary", "Headline": "headline", "Sector": "sector"}, inplace=True)

# Keep only news rows whose date also appears in the market data
trading_days = set(market_df["Date"].dt.normalize()) # Aligning news dates with trading calendar
news_df = news_df[news_df["Date"].dt.normalize().isin(trading_days)].reset_index(drop=True)

# Load tokenizer and classifier; eval() puts the model in evaluation mode
FINBERT_MODEL = "ProsusAI/finbert" # FinBERT model - ProsusAI
tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL)
finbert = AutoModelForSequenceClassification.from_pretrained(FINBERT_MODEL)
finbert.eval()

@torch.inference_mode()
def finbert_score(texts: List[str], batch_size: int = 32) -> Tuple[List[float], List[np.ndarray]]:
    """Score texts with FinBERT.

    Args:
        texts: Input strings. None/NaN entries are scored as the empty string.
        batch_size: Number of texts tokenized and classified per forward pass.

    Returns:
        (scores, probs) where scores[i] = P(positive) - P(negative) and probs[i]
        is a length-3 array ordered [neutral, positive, negative].

    Raises:
        ValueError: if the model config does not declare all three labels.
    """
    # The logit column order is defined by the checkpoint's id2label mapping, so
    # look it up instead of assuming one. A hard-coded order mislabels the
    # probabilities whenever the checkpoint uses a different one.
    # NOTE(review): ProsusAI/finbert is believed to publish
    # 0=positive, 1=negative, 2=neutral - verify against its config.json.
    label_to_idx = {str(label).lower(): int(idx) for idx, label in finbert.config.id2label.items()}
    try:
        order = [label_to_idx[name] for name in ("neutral", "positive", "negative")]
    except KeyError as exc:
        raise ValueError(f"Unexpected FinBERT labels: {sorted(label_to_idx)}") from exc

    # A CSV round-trip turns empty summaries into NaN, which the tokenizer rejects.
    clean_texts = [
        "" if t is None or (isinstance(t, float) and t != t) else str(t)
        for t in texts
    ]

    scores, probs_all = [], []
    for start in range(0, len(clean_texts), batch_size):
        batch = clean_texts[start:start + batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256)
        logits = finbert(**enc).logits
        probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
        probs = probs[:, order]  # columns are now [neutral, positive, negative]
        scores.extend((probs[:, 1] - probs[:, 2]).tolist())
        probs_all.extend(probs)
    return scores, probs_all

def gpt4_sentiment_single(text, sector=None):
    """Call GPT-4 for sentiment fallback, return -1, 0, or 1.

    Args:
        text: Article summary to classify.
        sector: Optional sector name to mention in the prompt; the clause is
            omitted when not given.

    Returns:
        1.0 if the reply starts with "pos", -1.0 if it starts with "neg",
        otherwise 0.0. Any API error is printed and treated as neutral (0.0).
    """
    system_msg = "You are a financial news analyst."
    # The unformatted "{summary}" / "{sector}" placeholders used to be sent to the
    # model literally. The summary is appended below; the sector clause is filled
    # in only when a sector is supplied.
    sector_clause = (
        f", which is closely related to the {sector} industry," if sector else ""
    )
    # NOTE(review): the prompt asks for probability values and for a one-word
    # answer; only the leading label is parsed below.
    user_msg = (
        f"Classify the sentiment of the given NY Times news article summary{sector_clause} "
        "as positive for buy, negative for sell, or neutral for hold position, "
        "for the US Stock market and provide the probability values for your classification. "
        "Answer with just one word.\n\n"
        f"Summary: \"{text}\""
    )
    try:
        # Use the SDK's module-level client (configured through openai.api_key).
        # No `client` object is defined in this notebook, so the previous call
        # raised NameError, which the except below silently turned into 0.0.
        response = openai.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[{"role": "system", "content": system_msg},
                      {"role": "user", "content": user_msg}],
            max_tokens=OPENAI_MAX_TOKENS,
            temperature=0.0,
        )
        ans = response.choices[0].message.content.strip().lower()
    except Exception as e:
        # Deliberate best-effort: one failed call must not abort the whole run.
        print("GPT-4 error (treated as neutral):", e)
        return 0.0
    if ans.startswith("pos"):
        return 1.0
    if ans.startswith("neg"):
        return -1.0
    return 0.0

def add_sentiment(news_df, use_gpt4=False):
    """Attach FinBERT sentiment, optionally overridden by the GPT fallback, with checkpoints.

    Args:
        news_df: DataFrame with a "summary" column (and "Date").
        use_gpt4: When True, rows with FinBERT_confidence below
            GPT4_CONFIDENCE_THRESHOLD get their "Sentiment" replaced by
            gpt4_sentiment_single().

    Returns:
        (news_df, num_gpt4): the scored frame and the number of rows flagged for
        the fallback in this run (0 when a finished fallback file was loaded or
        the fallback is disabled).

    Checkpoints (under DERIVED_PATH):
        nyt_with_finbert_sentiment.csv - FinBERT results; loaded instead of rescoring.
        nyt_with_gpt4_fallback_sentiment.csv - written only when the fallback has
            finished; loaded instead of rerunning it.
        nyt_with_gpt4_fallback_sentiment.csv.partial - in-progress state with a
            per-row done flag, so an interrupted run resumes rather than being
            mistaken for a finished one.
    """
    finbert_path = os.path.join(DERIVED_PATH, "nyt_with_finbert_sentiment.csv") # Checkpoint 1: If FinBERT file exists, skip FinBERT and load
    if os.path.exists(finbert_path):
        print("Checkpoint: FinBERT sentiment file found. Loading and skipping FinBERT step.")
        news_df = pd.read_csv(finbert_path, parse_dates=["Date"])
    else:
        print("Scoring FinBERT…")
        start_finbert = time.time()
        fin_scores, fin_probs = finbert_score(news_df["summary"].tolist())
        end_finbert = time.time()
        news_df["FinBERT_score"] = fin_scores
        news_df["FinBERT_prob_neu"] = [p[0] for p in fin_probs]
        news_df["FinBERT_prob_pos"] = [p[1] for p in fin_probs]
        news_df["FinBERT_prob_neg"] = [p[2] for p in fin_probs]
        news_df["FinBERT_confidence"] = news_df["FinBERT_score"].abs()
        news_df["Sentiment"] = news_df["FinBERT_score"]  # Start with FinBERT
        print(f"FinBERT sentiment scored in {end_finbert - start_finbert:.2f} seconds.")
        news_df.to_csv(finbert_path, index=False)
        print(f"Checkpoint: FinBERT sentiment saved to {finbert_path}")

    num_gpt4 = 0
    if not use_gpt4:
        return news_df, num_gpt4

    gpt4_path = os.path.join(DERIVED_PATH, "nyt_with_gpt4_fallback_sentiment.csv")
    partial_path = gpt4_path + ".partial"
    done_col = "_gpt4_done"

    if os.path.exists(gpt4_path): # If the finished checkpoint exists, load and skip fallback
        print("Checkpoint: GPT-4 fallback file found. Loading and skipping fallback step.")
        news_df = pd.read_csv(gpt4_path, parse_dates=["Date"])
        return news_df, num_gpt4

    # Resume from an interrupted run if a partial checkpoint is present.
    if os.path.exists(partial_path):
        print(f"Checkpoint: partial GPT-4 fallback found at {partial_path}. Resuming.")
        news_df = pd.read_csv(partial_path, parse_dates=["Date"])
    if done_col in news_df.columns:
        done = news_df.pop(done_col).astype(bool)
    else:
        done = pd.Series(False, index=news_df.index)

    print("Running GPT-4 fallback…")
    low_conf_mask = news_df["FinBERT_confidence"] < GPT4_CONFIDENCE_THRESHOLD
    num_gpt4 = int(low_conf_mask.sum())
    pending = news_df.index[low_conf_mask & ~done]
    already_done = num_gpt4 - len(pending)
    print(f"News needing GPT-4 fallback: {num_gpt4} of {len(news_df)}")
    start_gpt4 = time.time()
    for calls_made, idx in enumerate(pending, start=1):
        news_df.at[idx, "Sentiment"] = gpt4_sentiment_single(news_df.at[idx, "summary"])
        done.at[idx] = True
        if calls_made % GPT4_CHECKPOINT_EVERY == 0: # Intermediate checkpoint
            # Written to the .partial file only, never to the final path.
            news_df.assign(**{done_col: done}).to_csv(partial_path, index=False)
            print(f"Checkpoint: Saved GPT-4 fallback at {already_done + calls_made} / {num_gpt4} GPT-4 calls.")
    end_gpt4 = time.time()
    print(f"GPT-4 fallback completed in {end_gpt4 - start_gpt4:.2f} seconds.")
    news_df.to_csv(gpt4_path, index=False)
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print(f"Checkpoint: Final GPT-4 fallback saved to {gpt4_path}")

    return news_df, num_gpt4
   
# Main pipeline: score the news, average per day, and left-join onto the market data.
Path(DERIVED_PATH).mkdir(exist_ok=True) # Main Pipeline
pipeline_start = time.time()
news_df, num_gpt4 = add_sentiment(news_df, use_gpt4=USE_GPT4_FALLBACK)
pipeline_end = time.time()
print(f"Total pipeline runtime: {pipeline_end - pipeline_start:.2f} seconds.")

# Calculating the daily sentiment aggregate (mean of "Sentiment" over all rows sharing a date)
sent_daily = (
    news_df.groupby(news_df["Date"].dt.normalize())["Sentiment"]
    .mean()
    .reset_index()
)
sent_daily_path = os.path.join(DERIVED_PATH, "daily_sentiment_aggregate.csv")
sent_daily.to_csv(sent_daily_path, index=False)

# Merging with market data for modeling (LSTM)
# how="left" keeps every market row; days without news get NaN Sentiment.
# The LSTM cell further down drops 'key_0' and 'Date_sent' columns if present in this CSV.
market_merge = pd.merge(
    market_df, 
    sent_daily, 
    left_on=market_df["Date"].dt.normalize(), 
    right_on=sent_daily["Date"].dt.normalize(), 
    how="left", 
    suffixes=('', '_sent')
)
final_out_path = os.path.join(DERIVED_PATH, "final_merged_for_lstm.csv")
market_merge.to_csv(final_out_path, index=False)
print(f"Checkpoint: Final merged LSTM-ready data saved to {final_out_path}")

# Summary of the files this section writes.
print(f"All CSVs saved to: {DERIVED_PATH}/")
print("Output files:")
print(" - nyt_with_finbert_sentiment.csv")
print(" - nyt_with_gpt4_fallback_sentiment.csv")
print(" - daily_sentiment_aggregate.csv")
print(" - final_merged_for_lstm.csv")


### FinBERT-only Sentiment Aggregate

In [None]:
# Benchmark variant: the same daily aggregation and merge, using the raw
# FinBERT_score column instead of the (possibly GPT-adjusted) Sentiment column.
finbert_only_path = os.path.join(DERIVED_PATH, "nyt_with_finbert_sentiment.csv") # Creating daily FinBERT-only sentiment aggregate for benchmarking
finbert_df = pd.read_csv(finbert_only_path, parse_dates=["Date"])

# Mean FinBERT_score per calendar day
daily_finbert_sent = (
    finbert_df.groupby(finbert_df["Date"].dt.normalize())["FinBERT_score"]
    .mean()
    .reset_index()
)
daily_finbert_sent_path = os.path.join(DERIVED_PATH, "daily_FinBERT_sentiment_aggregate.csv")
daily_finbert_sent.to_csv(daily_finbert_sent_path, index=False)

# Merging with market data for modeling (LSTM) FinBERT-only version, acts as a benchmark
# how="left" keeps every market row; days without news get NaN FinBERT_score.
market_merge_finbert = pd.merge(
    market_df,
    daily_finbert_sent,
    left_on=market_df["Date"].dt.normalize(),
    right_on=daily_finbert_sent["Date"].dt.normalize(),
    how="left",
    suffixes=('', '_sent')
)
final_out_finbert_path = os.path.join(DERIVED_PATH, "final_merged_FinBERT_for_lstm.csv")
market_merge_finbert.to_csv(final_out_finbert_path, index=False)

print(f"Checkpoint: FinBERT-only daily sentiment aggregate saved to {daily_finbert_sent_path}")
print(f"Checkpoint: Final merged FinBERT-only LSTM-ready data saved to {final_out_finbert_path}")

# Note: the GPT fallback above uses the chat-completions message format (a list of system/user messages) rather than the older Completion.create interface; depending on the installed OpenAI SDK version the exact call may still need adapting. The idea stands – send the article text and get a sentiment label. The prompt here is simplified; in practice, one could include examples or a role specification for consistency.

## LSTM Models

#### Implementation laid out by Jakob Aungiers https://github.com/jaungiers/LSTM-Neural-Network-for-Time-Series-Prediction

##### A Conda virtual environment running Python 3.11 is used for the remainder of this experiment

In [None]:
# Package installs for the LSTM part of the notebook (several packages are listed more than once).
!pip install ipykernel
!pip install numpy pandas scikit-learn tensorflow keras matplotlib
!pip install --upgrade notebook
!pip install python-dotenv
!pip install joblib
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install threadpoolctl
!pip install tensorflow
!pip install scikit-image
!pip install scikit-learn-intelex
!pip install keras

In [None]:
import numpy as np
import pandas as pd
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

### Hybrid (FinBERT+GPT) Sentiment LSTM Forecast

In [None]:
import numpy as np
import pandas as pd
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

# Load the merged market + hybrid-sentiment table written by the sentiment pipeline.
data_path = "derived/final_merged_for_lstm.csv"
df = pd.read_csv(data_path, parse_dates=["Date"])

df = df.drop(columns=[col for col in ['key_0', 'Date_sent'] if col in df.columns]) # Cleaning up columns

for col in df.columns: # Ensuring all columns except Date are numeric
    if col not in ['Date']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
# Rows with any NaN are removed here, including rows whose Sentiment is missing.
df = df.dropna().reset_index(drop=True)

# Feature engineering
# NOTE(review): exclude_cols is not referenced again in this cell; the model
# inputs are determined solely by feature_cols.
exclude_cols = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume'] # Remove prices and volume
feature_cols = ['Return', 'VIX', 'Sentiment']

N_LAGS = 2 # Lagged features added below
for lag in range(1, N_LAGS+1):
    df[f"Return_lag{lag}"] = df["Return"].shift(lag)
    df[f"Sentiment_lag{lag}"] = df["Sentiment"].shift(lag)
feature_cols += [f"Return_lag{lag}" for lag in range(1, N_LAGS+1)]
feature_cols += [f"Sentiment_lag{lag}" for lag in range(1, N_LAGS+1)]
# shift() leaves NaN in the first N_LAGS rows; drop them.
df = df.dropna().reset_index(drop=True)

# Train/val/test split by time
# train: Date <= TRAIN_END, val: TRAIN_END < Date <= VAL_END, test: Date > VAL_END
TRAIN_END = date(2021, 12, 31)
VAL_END = date(2022, 12, 31)
df["Date"] = pd.to_datetime(df["Date"])
df_train = df[df["Date"] <= pd.to_datetime(TRAIN_END)]
df_val = df[(df["Date"] > pd.to_datetime(TRAIN_END)) & (df["Date"] <= pd.to_datetime(VAL_END))]
df_test = df[df["Date"] > pd.to_datetime(VAL_END)]

# Scaling (IMPORTANT) 
# The scaler is fitted on the training split only and then applied to val/test.
# Targets (Return) are left unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(df_train[feature_cols])
X_val = scaler.transform(df_val[feature_cols])
X_test = scaler.transform(df_test[feature_cols])
y_train = df_train["Return"].values
y_val = df_val["Return"].values
y_test = df_test["Return"].values

# Creating LSTM Sequences
# Number of consecutive rows in each input window
SEQ_LEN = 5
def create_sequences(x, y, seq_len):
    xs, ys = [], []
    for i in range(len(x) - seq_len):
        xs.append(x[i : i + seq_len])
        ys.append(y[i + seq_len])
    return np.array(xs), np.array(ys)

X_train_seq, y_train_seq = create_sequences(X_train, y_train, SEQ_LEN)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, SEQ_LEN)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, SEQ_LEN)

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation and
# batch shuffling -- and therefore the metrics printed below -- repeat across runs.
import tensorflow as tf
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# LSTM Model: one 50-unit LSTM over (SEQ_LEN, n_features) windows, then a linear output.
model = Sequential([
    LSTM(50, input_shape=(SEQ_LEN, X_train_seq.shape[2])),
    Dense(1),
])
model.compile(optimizer="adam", loss="mse")
# Stop once val_loss has not improved for 10 epochs and restore the best weights seen.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train
print("Training LSTM …")
model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=16,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[early_stop],
    verbose=1,
)

print("Evaluating …") # Evaluation
pred_test = model.predict(X_test_seq).flatten()
rmse = np.sqrt(mean_squared_error(y_test_seq, pred_test))
print(f"Test RMSE: {rmse:.6f}")

# Directional accuracy: share of test targets whose sign (> 0) matches the forecast's.
actual_dir = (y_test_seq > 0)
pred_dir = (pred_test > 0)
acc = accuracy_score(actual_dir, pred_dir)
print(f"Directional accuracy: {acc:.2%}")

# The first SEQ_LEN test rows have no forecast (they only seed the first window),
# so they are dropped to line the frame up with pred_test.
df_out = df_test.iloc[SEQ_LEN:].copy().reset_index(drop=True)
df_out["Predicted_Return"] = pred_test
df_out.to_csv("derived/lstm_test_predictions.csv", index=False)

print(df_out[["Date", "Return", "Predicted_Return"]].head())
print("Final LSTM model summary:")
model.summary()

### FinBERT-only LSTM Forecast

In [None]:
import numpy as np
import pandas as pd
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

# Load the merged dataset used by the FinBERT-only LSTM; "Date" is parsed on read.
data_path = "derived/final_merged_FinBERT_for_lstm.csv"
df = pd.read_csv(data_path, parse_dates=["Date"])

# Discard leftover merge columns when they exist.
df = df.drop(columns=['key_0', 'Date_sent'], errors='ignore')

# A single pass coerces every non-Date column (prices, volume, VIX and the rest)
# to numeric; non-numeric entries such as a stray '^GSPC' header row become NaN
# and are removed by the dropna that follows.
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in df.columns if name != 'Date'
})
df = df.dropna().reset_index(drop=True)

# Feature Engineering
N_LAGS = 2  # lags added for the target and for the sentiment score
for lag in range(1, N_LAGS+1):
    df[f"Return_lag{lag}"] = df["Return"].shift(lag)
    df[f"FinBERT_score_lag{lag}"] = df["FinBERT_score"].shift(lag)
df = df.dropna().reset_index(drop=True)

# Train/val/test split
TRAIN_END = date(2021, 12, 31)
VAL_END = date(2022, 12, 31)

df["Date"] = pd.to_datetime(df["Date"])
train_cutoff = pd.to_datetime(TRAIN_END)
val_cutoff = pd.to_datetime(VAL_END)
df_train = df[df["Date"] <= train_cutoff]
df_val = df[(df["Date"] > train_cutoff) & (df["Date"] <= val_cutoff)]
df_test = df[df["Date"] > val_cutoff]

# Features and Scaling
FEATURES = (
    ['Close', 'High', 'Low', 'Open', 'Volume', 'VIX', 'FinBERT_score']
    + [f"Return_lag{lag}" for lag in range(1, N_LAGS+1)]
    + [f"FinBERT_score_lag{lag}" for lag in range(1, N_LAGS+1)]
)
X_train, X_val, X_test = (
    part[FEATURES].astype(np.float32).values for part in (df_train, df_val, df_test)
)
y_train, y_val, y_test = (part["Return"].values for part in (df_train, df_val, df_test))

# Scale Features: fit on the training rows only, then apply to validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Creating LSTM sequences (window=5)
SEQ_LEN = 5
def create_sequences(x, y, seq_len):
    """Build rolling windows of ``x`` paired with the value of ``y`` that follows each window.

    For every start ``s`` in ``range(len(x) - seq_len)`` the window is
    ``x[s : s + seq_len]`` and the target is ``y[s + seq_len]``.

    Returns:
        ``(xs, ys)`` as NumPy arrays; both are empty when there are no valid starts.
    """
    starts = range(len(x) - seq_len)
    xs = np.array([x[s : s + seq_len] for s in starts])
    ys = np.array([y[s + seq_len] for s in starts])
    return xs, ys
X_train_seq, y_train_seq = create_sequences(X_train, y_train, SEQ_LEN)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, SEQ_LEN)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, SEQ_LEN)

print(f"Train samples: {X_train_seq.shape[0]}, Val: {X_val_seq.shape[0]}, Test: {X_test_seq.shape[0]}")

# Seed Python, NumPy and the Keras backend in one call so that weight initialisation
# and batch shuffling -- and therefore the metrics printed below -- repeat across runs.
import keras
SEED = 42
keras.utils.set_random_seed(SEED)

# LSTM Model: one 50-unit LSTM over (SEQ_LEN, n_features) windows, then a linear output.
model = Sequential([
    LSTM(50, input_shape=(SEQ_LEN, X_train_seq.shape[2])),
    Dense(1),
])
model.compile(optimizer="adam", loss="mse")
# Stop once val_loss has not improved for 10 epochs and restore the best weights seen.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train
print("Training LSTM …")
model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=16,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[early_stop],
    verbose=1,
)

print("Evaluating …") # Evaluation
pred_test = model.predict(X_test_seq).flatten()
rmse = np.sqrt(mean_squared_error(y_test_seq, pred_test))
print(f"Test RMSE: {rmse:.6f}")

# Directional accuracy: share of test targets whose sign (> 0) matches the forecast's.
actual_dir = (y_test_seq > 0)
pred_dir = (pred_test > 0)
acc = accuracy_score(actual_dir, pred_dir)
print(f"Directional accuracy: {acc:.2%}")

# The first SEQ_LEN test rows have no forecast, so drop them to align with y_test_seq.
df_test = df_test.iloc[SEQ_LEN:].copy()
df_test["Predicted_Return"] = pred_test
df_test.to_csv("derived/lstm_FinBERT_only_test_predictions.csv", index=False)

print(df_test[["Date", "Return", "Predicted_Return"]].head())
print("Final LSTM model summary:")
model.summary()

#### Diagnostics

In [None]:
import pandas as pd

# Report the row count and the number of repeated dates in each prediction file.
prediction_files = (
    "derived/lstm_test_predictions.csv",
    "derived/lstm_FinBERT_only_test_predictions.csv",
)
for name in prediction_files:
    df = pd.read_csv(name, parse_dates=["Date"])
    dup = df["Date"].duplicated().sum()
    print(name, "rows:", len(df), "duplicates:", dup)

## Evaluation and Visualization

### Forecast Bias Test

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.stats.stattools import durbin_watson

# Test-set predictions saved by the two LSTM cells, plus the market/VIX table.
df_gpt = pd.read_csv("derived/lstm_test_predictions.csv", parse_dates=["Date"])
df_fbert = pd.read_csv("derived/lstm_FinBERT_only_test_predictions.csv", parse_dates=["Date"])
market = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Rename each source to its final column names, then left-join everything onto
# the hybrid model's dates.
hybrid_part = df_gpt[["Date", "Return", "Predicted_Return"]].rename(
    columns={"Return": "Market_Return", "Predicted_Return": "LSTM_GPT4"})
finbert_part = df_fbert[["Date", "Predicted_Return"]].rename(
    columns={"Predicted_Return": "LSTM_FinBERT"})
market_part = market[["Date", "Return", "VIX"]].rename(
    columns={"Return": "Market_True_Return"})

df = hybrid_part.merge(finbert_part, on="Date", how="left")
df = df.merge(market_part, on="Date", how="left")
# Keep only dates where the true return and both forecasts are available.
df = df.dropna(subset=["Market_True_Return", "LSTM_GPT4", "LSTM_FinBERT"])

# Handles: plain NumPy views used by the metric and test functions below.
y_true = df["Market_True_Return"].values
pred_gpt = df["LSTM_GPT4"].values
pred_fb = df["LSTM_FinBERT"].values

def signal_metrics(y, yhat): # Forecast-Evaluation Metrics
    """Return dict of point & trading metrics.

    Args:
        y: realised returns (NumPy array).
        yhat: forecasts aligned element-wise with ``y``.

    Returns:
        dict with keys ``MSE``, ``MAE``, ``Directional_Acc`` (share of periods
        where ``y > 0`` agrees with ``yhat > 0``), ``Sharpe`` (annualised with
        sqrt(252), for a strategy that is long when ``yhat > 0`` and short
        otherwise) and ``R2`` (R-squared of an OLS regression of ``y`` on
        ``yhat`` with a constant).
    """
    err = y - yhat
    mse = np.mean(err**2)
    mae = np.mean(np.abs(err))
    # Directional accuracy must come from this call's own arguments; relying on a
    # module-level `acc` would report whatever value a training cell left behind.
    acc = np.mean((y > 0) == (yhat > 0))
    strat_ret = np.where(yhat>0, 1, -1) * y # Trading signal is long if yhat>0, short otherwise
    # The 1e-9 guards against division by zero when the strategy return is constant.
    sharpe = np.mean(strat_ret)/(np.std(strat_ret)+1e-9)*np.sqrt(252)
    r2 = sm.OLS(y, sm.add_constant(yhat)).fit().rsquared
    return dict(MSE=mse, MAE=mae, Directional_Acc=acc,
                Sharpe=sharpe, R2=r2)

m_gpt = signal_metrics(y_true, pred_gpt)
m_fb  = signal_metrics(y_true, pred_fb)

def dm_test(e1, e2, h=1): # Diebold–Mariano (squared-error loss)
    """Two-sided Diebold–Mariano-style test on the loss differential ``e1 - e2``.

    Args:
        e1, e2: equal-length arrays of per-period losses (the caller passes
            squared forecast errors).
        h: forecast horizon. Autocovariances at lags ``1 .. h-1`` are added to
            the variance, so the default ``h=1`` uses the sample variance alone.

    Returns:
        ``(dm, p)``: the test statistic and its two-sided p-value under a
        standard normal distribution.
    """
    loss_diff = e1 - e2
    n_obs = len(loss_diff)
    # Sum of lagged autocovariances of the loss differential (empty for h=1).
    autocov_sum = 0
    for lag in range(1, h):
        autocov_sum = autocov_sum + np.cov(loss_diff[:-lag], loss_diff[lag:])[0, 1]
    long_run_var = np.var(loss_diff, ddof=1) + 2 * autocov_sum
    dm_stat = np.mean(loss_diff) / np.sqrt(long_run_var / n_obs)
    p_value = 2 * (1 - stats.norm.cdf(abs(dm_stat)))
    return dm_stat, p_value

# Compare the two models' squared-error losses.
dm_stat, dm_p = dm_test((y_true-pred_gpt)**2, (y_true-pred_fb)**2)

# Regime (VIX median) Sharpe comparison: split the sample at the median VIX.
df["VIX"] = pd.to_numeric(df["VIX"], errors="coerce")
vix_med = df["VIX"].median()
# Rows whose VIX is NaN satisfy neither comparison and so belong to neither regime.
regime_masks = {
    "High-VIX": df["VIX"] > vix_med,
    "Low-VIX": df["VIX"] <= vix_med,
}
reg = {
    lab: dict(
        GPT4=signal_metrics(y_true[regime], pred_gpt[regime])["Sharpe"],
        FinBERT=signal_metrics(y_true[regime], pred_fb[regime])["Sharpe"],
    )
    for lab, regime in regime_masks.items()
}

def bias_tests(y, yhat, label): # Forecast-Bias Section
    """Run forecast-bias diagnostics for one model.

    Two checks are performed on the forecast ``yhat`` of ``y``:
      * a one-sample t-test that the mean forecast error ``y - yhat`` is zero;
      * a Mincer–Zarnowitz regression ``y = alpha + beta * yhat`` with a joint
        F-test of ``alpha = 0`` and ``beta = 1`` (forecast unbiased).

    Args:
        y: realised values (NumPy array).
        yhat: forecasts aligned with ``y``.
        label: model name stored under the ``Model`` key.

    Returns:
        dict with the mean error, its t-statistic and p-value, the regression
        coefficients with their individual p-values, and the joint-test p-value.
    """
    forecast_error = y - yhat

    # Mean-error t-test
    t_stat, t_pval = stats.ttest_1samp(forecast_error, 0.0)

    # Mincer–Zarnowitz regression
    mz_fit = sm.OLS(y, sm.add_constant(yhat)).fit()

    # Joint restriction I * [alpha, beta]' = [0, 1]'
    restriction = (np.eye(2), np.array([0,1]))
    joint_test = mz_fit.f_test(restriction)

    coefs, coef_pvals = mz_fit.params, mz_fit.pvalues
    return dict(
        Model          = label,
        Mean_Error     = forecast_error.mean(),
        MeanErr_tstat  = t_stat,
        MeanErr_pval   = t_pval,
        MZ_alpha       = coefs[0],
        MZ_beta        = coefs[1],
        MZ_alpha_p     = coef_pvals[0],
        MZ_beta_p      = coef_pvals[1],
        MZ_F_pvalue    = float(joint_test.pvalue)
    )

bias_gpt = bias_tests(y_true, pred_gpt, "Hybrid (GPT-4)")
bias_fb  = bias_tests(y_true, pred_fb , "FinBERT-only")

from tabulate import tabulate

def _print_table(title, table, floatfmt):
    """Print a title line followed by the frame rendered as a GitHub-style table."""
    print(title)
    print(tabulate(table, headers="keys", tablefmt="github", floatfmt=floatfmt))

# Forecast-evaluation
eval_tbl = pd.DataFrame([m_gpt, m_fb], index=["Hybrid","FinBERT"])
_print_table("Forecast-Evaluation Metrics", eval_tbl, ".4f")

# DM-test summary
print("\nDiebold–Mariano statistic: {:.3f}   p-value: {:.3f}"
      .format(dm_stat, dm_p))

# Regime Sharpe
_print_table("Regime-dependent Sharpe", pd.DataFrame(reg).T, ".3f")

# Bias tests
bias_tbl = pd.DataFrame([bias_gpt, bias_fb]).set_index("Model")
_print_table("Forecast-Bias Diagnostics", bias_tbl, ".4f")

from sklearn.metrics import accuracy_score # computing directional accuracy for both models

print("Validation Accuracy")

# NOTE(review): `df` in this cell is built from the *test* prediction files, so
# these are test-set directional accuracies despite the "Validation" wording in
# the printed labels -- confirm the intended naming.
true_up = df["Market_True_Return"].values > 0          # true direction
val_acc_gpt = accuracy_score(true_up, df["LSTM_GPT4"].values > 0)
val_acc_fb = accuracy_score(true_up, df["LSTM_FinBERT"].values > 0)

print(f"Hybrid (GPT-4) Validation Accuracy:  {val_acc_gpt:.2%}")
print(f"FinBERT-only Validation Accuracy:    {val_acc_fb:.2%}")

### Evaluation Metrics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)
from scipy.stats import norm

# Test-set predictions saved by the two LSTM cells, plus the market/VIX table.
df_gpt = pd.read_csv("derived/lstm_test_predictions.csv", parse_dates=["Date"])
df_fbert = pd.read_csv("derived/lstm_FinBERT_only_test_predictions.csv", parse_dates=["Date"])
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Align by Date, not by row position: the two prediction files come from
# different source tables that are cleaned independently, so row i of one is not
# guaranteed to be the same trading day as row i of the other. A left join on
# the hybrid model's dates keeps one row per hybrid forecast and leaves NaN
# where the FinBERT file or the market table has no matching date (the masks
# built further down filter those rows out).
df = (
    df_gpt[["Date", "Return", "Predicted_Return"]]
    .rename(columns={"Return": "Market_Return", "Predicted_Return": "LSTM_GPT4"})
    .merge(
        df_fbert[["Date", "Predicted_Return"]]
        .rename(columns={"Predicted_Return": "LSTM_FinBERT"}),
        on="Date", how="left",
    )
    .merge(
        market_df[["Date", "Return", "VIX"]]
        .rename(columns={"Return": "Market_True_Return"}),
        on="Date", how="left",
    )
)
df["VIX"] = pd.to_numeric(df["VIX"], errors="coerce")

def trading_signal_returns(true_returns, predicted_returns): # Signal-based trading returns
    """Per-period return of a long/short rule driven by the forecast sign.

    The position is +1 where ``predicted_returns > 0`` and -1 everywhere else
    (including a forecast of exactly zero); the result is position times
    ``true_returns``.
    """
    go_long = predicted_returns > 0
    position = np.where(go_long, 1, -1)
    return position * true_returns

def compute_all_metrics(true_returns, predicted_returns, rolling_window=60): # Computing metrics for each model (with rolling mean/std)
    """Compute point-forecast, directional and long/short trading metrics for one model.

    Args:
        true_returns: realised returns (NumPy array).
        predicted_returns: forecasts aligned element-wise with ``true_returns``.
        rolling_window: window length used for the rolling Sharpe ratio and the
            rolling directional accuracy.

    Returns:
        dict holding scalar metrics (MSE, RMSE, MAE, direction accuracy,
        precision, recall, F1, ROC AUC, annualised Sharpe, cumulative return and
        the mean/std of the rolling series), the confusion matrix, and the
        rolling / per-period series themselves.

    Raises:
        ValueError: if either input is empty.
    """
    if len(true_returns) == 0 or len(predicted_returns) == 0:
        raise ValueError("Empty input arrays! Check your mask and input data.")
    mse = mean_squared_error(true_returns, predicted_returns)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_returns, predicted_returns)

    # Direction labels: 1 when the (true / predicted) return is strictly positive.
    true_up = (true_returns > 0).astype(int)
    pred_up = (predicted_returns > 0).astype(int)

    acc = accuracy_score(true_up, pred_up)
    prec = precision_score(true_up, pred_up, zero_division=0)
    rec = recall_score(true_up, pred_up, zero_division=0)
    f1 = f1_score(true_up, pred_up, zero_division=0)
    try:
        roc = roc_auc_score(true_up, predicted_returns)
    except ValueError:
        # ROC AUC is undefined for some inputs (e.g. only one direction present
        # in true_up); report NaN for those. Catching ValueError only keeps
        # unrelated failures and KeyboardInterrupt visible.
        roc = np.nan

    cm = confusion_matrix(true_up, pred_up)
    strat_returns = trading_signal_returns(true_returns, predicted_returns)
    cum_return = np.cumprod(1 + strat_returns)[-1] - 1 if len(strat_returns) > 0 else np.nan
    # The 1e-9 guards against division by zero when the strategy return is constant.
    sharpe = np.mean(strat_returns) / (np.std(strat_returns) + 1e-9) * np.sqrt(252)
    roll_sharpe = pd.Series(strat_returns).rolling(rolling_window).apply(
        lambda x: np.mean(x) / (np.std(x) + 1e-9) * np.sqrt(252), raw=True)
    roll_acc = pd.Series(pred_up == true_up).rolling(rolling_window).mean()
    roll_cum_return = (1 + pd.Series(strat_returns)).cumprod() - 1
    rolling_sharpe_mean, rolling_sharpe_std = roll_sharpe.mean(), roll_sharpe.std()
    rolling_acc_mean, rolling_acc_std = roll_acc.mean(), roll_acc.std()
    return {
        "MSE": mse, "RMSE": rmse, "MAE": mae,
        "Direction_Acc": acc, "Precision": prec, "Recall": rec, "F1": f1, "ROC_AUC": roc,
        "Sharpe": sharpe, "Cumulative_Return": cum_return,
        "Rolling_Sharpe_Mean": rolling_sharpe_mean, "Rolling_Sharpe_Std": rolling_sharpe_std,
        "Rolling_Acc_Mean": rolling_acc_mean, "Rolling_Acc_Std": rolling_acc_std,
        "Confusion_Matrix": cm,
        "Rolling_Sharpe": roll_sharpe,
        "Rolling_Acc": roll_acc,
        "Rolling_CumReturn": roll_cum_return,
        "Signal_Returns": strat_returns
    }

# Valid-row masks: a row is usable for a model when both the true market return
# and that model's forecast are present.
has_truth = df['Market_True_Return'].notnull()
mask_gpt = has_truth & df['LSTM_GPT4'].notnull()
mask_fbert = has_truth & df['LSTM_FinBERT'].notnull()

print("\nValid rows for GPT:", mask_gpt.sum())
print("Valid rows for FinBERT:", mask_fbert.sum())

if mask_gpt.sum() == 0 or mask_fbert.sum() == 0:
    raise ValueError("No valid data rows for at least one model. Check input data and merges!")

metrics_gpt = compute_all_metrics(
    df.loc[mask_gpt, 'Market_True_Return'].values,
    df.loc[mask_gpt, 'LSTM_GPT4'].values
)
metrics_finbert = compute_all_metrics(
    df.loc[mask_fbert, 'Market_True_Return'].values,
    df.loc[mask_fbert, 'LSTM_FinBERT'].values
)

# Table (include rolling mean/std): keep scalar entries only, i.e. skip the
# array / Series / list values that compute_all_metrics also returns.
_NON_SCALAR_TYPES = (np.ndarray, pd.Series, list)
comparison_table = pd.DataFrame({
    model_label: {
        key: value for key, value in model_metrics.items()
        if not isinstance(value, _NON_SCALAR_TYPES)
    }
    for model_label, model_metrics in (
        ("Hybrid (FinBERT+GPT-4)", metrics_gpt),
        ("FinBERT-only", metrics_finbert),
    )
})
print("\n===== Model Comparison Table =====\n")
print(comparison_table)
comparison_table.to_csv("model_performance_comparison.csv")

# 4. Regime split (by VIX): show model dominance in high/low volatility
vix_median = df["VIX"].median()
# Rows with a missing VIX fail the `>` comparison and are therefore labelled "Low_VIX".
df["VIX_regime"] = np.where(df["VIX"] > vix_median, "High_VIX", "Low_VIX")
def regime_metrics(regime):
    """Compute the full metric dict for both models on the rows of one VIX regime.

    Args:
        regime: value of ``df["VIX_regime"]`` to select ("High_VIX" or "Low_VIX").

    Returns:
        ``(gpt_metrics, finbert_metrics)`` as returned by ``compute_all_metrics``,
        each restricted to the regime rows that are also valid for that model.
    """
    in_regime = df["VIX_regime"] == regime
    gpt_rows = in_regime & mask_gpt
    fbert_rows = in_regime & mask_fbert
    gpt_metrics = compute_all_metrics(
        df.loc[gpt_rows, "Market_True_Return"].values,
        df.loc[gpt_rows, "LSTM_GPT4"].values,
    )
    finbert_metrics = compute_all_metrics(
        df.loc[fbert_rows, "Market_True_Return"].values,
        df.loc[fbert_rows, "LSTM_FinBERT"].values,
    )
    return gpt_metrics, finbert_metrics
for regime in ["High_VIX", "Low_VIX"]:
    gpt_metrics, finbert_metrics = regime_metrics(regime)
    print(f"\n=== {regime} Regime ===")
    print("Hybrid Sharpe:", gpt_metrics["Sharpe"], "Cumulative:", gpt_metrics["Cumulative_Return"])
    print("FinBERT-only Sharpe:", finbert_metrics["Sharpe"], "Cumulative:", finbert_metrics["Cumulative_Return"])

### Resulting Plots

In [None]:
import matplotlib.pyplot as plt
print(plt.style.available)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, roc_curve, r2_score
)

fbert = pd.read_csv("derived/lstm_FinBERT_only_test_predictions.csv", parse_dates=["Date"])
gpt4 = pd.read_csv("derived/lstm_test_predictions.csv", parse_dates=["Date"])
market = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Convert object (str) columns to numeric in both prediction frames, then drop
# incomplete rows in place.
for frame in (fbert, gpt4):
    value_cols = [name for name in frame.columns if name != "Date"]
    for name in value_cols:
        frame[name] = pd.to_numeric(frame[name], errors="coerce")
    frame.dropna(inplace=True)

# General settings
plt.style.use("seaborn-v0_8-darkgrid")
sns.set(font_scale=1.2)
window = 21  # 1 month rolling was chosen

# Merge for side-by-side plots: start from the FinBERT frame and attach the
# hybrid model's columns.
df = (
    fbert[["Date", "Return", "VIX", "FinBERT_score", "Predicted_Return"]]
    .copy()
    .rename(columns={"Predicted_Return": "FinBERT_Pred"})
)
# NOTE(review): the hybrid columns are attached by row position (.values), which
# assumes both cleaned frames hold the same dates in the same order -- confirm.
df["GPT4_Pred"] = gpt4["Predicted_Return"].values
df["GPT4_Sentiment"] = gpt4["Sentiment"].values if "Sentiment" in gpt4 else np.nan


# Forecast residuals (market return minus model forecast), shared by the
# residual, rolling-RMSE and autocorrelation figures below.
resid_fbert = df["Return"] - df["FinBERT_Pred"]
resid_gpt4 = df["Return"] - df["GPT4_Pred"]


# Actual vs LSTM Forecasts: Time Series Overlap
plt.figure(figsize=(17,7))
plt.plot(df["Date"], df["Return"], color='black', label='Market Return', linewidth=2)
plt.plot(df["Date"], df["FinBERT_Pred"], color='orange', label='FinBERT-only LSTM', alpha=0.8)
plt.plot(df["Date"], df["GPT4_Pred"], color='red', label='FinBERT+GPT-4 LSTM', alpha=0.7)
plt.ylabel("Return")
plt.title("Market Returns vs LSTM Model Forecasts")
plt.legend()
plt.show()


# Model Error: Residuals Over Time
plt.figure(figsize=(17,7))
plt.plot(df["Date"], resid_fbert, label="Error: FinBERT LSTM", color="orange", alpha=0.6)
plt.plot(df["Date"], resid_gpt4, label="Error: GPT-4 LSTM", color="dodgerblue", alpha=0.6)
plt.axhline(0, color="black", linewidth=1, linestyle=":")
plt.ylabel("Residual (Error)")
plt.title("Model Residuals: Market - Model Forecast")
plt.legend()
plt.show()


# Distribution of Forecasts: Histogram & KDE
plt.figure(figsize=(14,5))
sns.histplot(df["Return"], label="Market Return", color="black", kde=True, stat="density", bins=40)
sns.histplot(df["FinBERT_Pred"], label="FinBERT LSTM", color="orange", kde=True, stat="density", bins=40, alpha=0.5)
sns.histplot(df["GPT4_Pred"], label="GPT-4 LSTM", color="dodgerblue", kde=True, stat="density", bins=40, alpha=0.5)
plt.legend()
plt.title("Distribution of Market Returns vs LSTM Model Forecasts")
plt.show()


# Actual vs Predicted: Scatter Plots and Regression Fit
fig, axs = plt.subplots(1, 2, figsize=(16,6), sharey=True)
sns.regplot(x=df["Return"], y=df["FinBERT_Pred"], ax=axs[0], line_kws={"color": "orange"})
axs[0].set(title="FinBERT LSTM: Actual vs Predicted",
           xlabel="Actual Return", ylabel="Predicted Return")
sns.regplot(x=df["Return"], y=df["GPT4_Pred"], ax=axs[1], line_kws={"color": "dodgerblue"})
axs[1].set(title="GPT-4 LSTM: Actual vs Predicted", xlabel="Actual Return")
plt.show()


# Rolling Model RMSE (21d window)
def _window_rmse(errors):
    """Root-mean-square of the residuals inside one rolling window."""
    return np.sqrt(np.mean(errors**2))

rmse_fbert = resid_fbert.rolling(window).apply(_window_rmse)
rmse_gpt4 = resid_gpt4.rolling(window).apply(_window_rmse)
plt.figure(figsize=(17,6))
plt.plot(df["Date"], rmse_fbert, label="Rolling RMSE: FinBERT LSTM", color="orange")
plt.plot(df["Date"], rmse_gpt4, label="Rolling RMSE: GPT-4 LSTM", color="dodgerblue")
plt.ylabel("RMSE")
plt.title("21-Day Rolling RMSE: Model Performance Over Time")
plt.legend()
plt.show()


# ROC Curve: Directional Signal of Models
# The raw forecast is used as the score for the "return > 0" class.
from sklearn.metrics import roc_curve, auc
true_bin = (df["Return"] > 0).astype(int)
fpr_fbert, tpr_fbert, _ = roc_curve(true_bin, df["FinBERT_Pred"])
fpr_gpt4, tpr_gpt4, _ = roc_curve(true_bin, df["GPT4_Pred"])
auc_fbert = auc(fpr_fbert, tpr_fbert)
auc_gpt4 = auc(fpr_gpt4, tpr_gpt4)
plt.figure(figsize=(8,6))
plt.plot(fpr_fbert, tpr_fbert, label=f"FinBERT LSTM (AUC={auc_fbert:.2f})", color="orange")
plt.plot(fpr_gpt4, tpr_gpt4, label=f"GPT-4 LSTM (AUC={auc_gpt4:.2f})", color="dodgerblue")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve: Model Directional Prediction Accuracy")
plt.legend()
plt.show()


# Error Autocorrelation
from pandas.plotting import autocorrelation_plot
for residuals, plot_title in (
    (resid_fbert, "Error Autocorrelation: FinBERT LSTM"),
    (resid_gpt4, "Error Autocorrelation: GPT-4 LSTM"),
):
    plt.figure(figsize=(8,4))
    autocorrelation_plot(residuals)
    plt.title(plot_title)
    plt.show()


# Cumulative Return Plot
# Align by Date, not by row position: the two prediction frames come from
# different source tables that are cleaned independently, so row i of one is not
# guaranteed to be the same trading day as row i of the other. A left join on
# the hybrid model's dates keeps one row per hybrid forecast and leaves NaN
# where the FinBERT frame or the market table has no matching date.
df = (
    df_gpt[["Date", "Return", "Predicted_Return"]]
    .rename(columns={"Return": "Market_Return", "Predicted_Return": "LSTM_GPT4"})
    .merge(
        df_fbert[["Date", "Predicted_Return"]]
        .rename(columns={"Predicted_Return": "LSTM_FinBERT"}),
        on="Date", how="left",
    )
    .merge(
        market_df[["Date", "Return", "VIX"]]
        .rename(columns={"Return": "Market_True_Return"}),
        on="Date", how="left",
    )
)
df["VIX"] = pd.to_numeric(df["VIX"], errors="coerce")

def trading_signal_returns(true_returns, predicted_returns): # Signal-based trading returns
    """Return the per-period P&L of a sign-following long/short rule.

    The signal is +1 when the forecast is strictly positive and -1 otherwise;
    each realised return is multiplied by that signal.
    """
    forecast_is_positive = predicted_returns > 0
    signal = np.where(forecast_is_positive, 1, -1)  # Long/short signal
    strategy_returns = signal * true_returns
    return strategy_returns

# Valid-row masks: a row is usable for a model when both the true market return
# and that model's forecast are present.
mask_gpt = df['Market_True_Return'].notnull() & df['LSTM_GPT4'].notnull()
mask_fbert = df['Market_True_Return'].notnull() & df['LSTM_FinBERT'].notnull()

print("\nValid rows for GPT:", mask_gpt.sum())
print("Valid rows for FinBERT:", mask_fbert.sum())

if mask_gpt.sum() == 0 or mask_fbert.sum() == 0:
    raise ValueError("No valid data rows for at least one model. Check input data and merges!")

metrics_gpt = compute_all_metrics(
    df.loc[mask_gpt, 'Market_True_Return'].values,
    df.loc[mask_gpt, 'LSTM_GPT4'].values
)
metrics_finbert = compute_all_metrics(
    df.loc[mask_fbert, 'Market_True_Return'].values,
    df.loc[mask_fbert, 'LSTM_FinBERT'].values
)

# Long/short signals (+1 when the forecast is positive, -1 otherwise) and the
# resulting per-period strategy returns for both models.
gpt_signal = np.where(df.loc[mask_gpt, 'LSTM_GPT4'] > 0, 1, -1)
finbert_signal = np.where(df.loc[mask_fbert, 'LSTM_FinBERT'] > 0, 1, -1)

gpt_strat_returns = gpt_signal * df.loc[mask_gpt, 'Market_True_Return'].values
finbert_strat_returns = finbert_signal * df.loc[mask_fbert, 'Market_True_Return'].values

# Cumulative returns as running sums (additive, not compounded).
gpt_cum_return = np.cumsum(gpt_strat_returns)
finbert_cum_return = np.cumsum(finbert_strat_returns)
market_cum_return = np.cumsum(df.loc[mask_gpt, 'Market_True_Return'].values)

# Each curve gets the dates of the rows it was computed from, so the x and y
# lengths always match even if the two masks differ.
dates = df.loc[mask_gpt, 'Date'].reset_index(drop=True)
dates_fbert = df.loc[mask_fbert, 'Date'].reset_index(drop=True)

# A single figure is opened here; opening one earlier as well would leave an
# empty figure in the notebook output.
plt.figure(figsize=(12, 5))
plt.plot(dates, market_cum_return, label="Market Cumulative Return", color='black', linewidth=2)
plt.plot(dates, gpt_cum_return, label="Hybrid Cumulative Return", color='red')
plt.plot(dates_fbert, finbert_cum_return, label="FinBERT-only Cumulative Return", color='darkorange')
plt.title("Cumulative Strategy Returns Over Time")
plt.xlabel("Date")
plt.ylabel("Cumulative Return")
plt.legend()
plt.tight_layout()
plt.show()


# Rolling Directional Accuracy and Sharpe Ratio 
win = 63
def rolling_directional_acc(true_ret, pred_ret, win=win):
    """
    Rolling hit rate, in percent, of forecast sign versus realised sign.

    A period is a hit when sign(pred) == sign(true). The rate is averaged over
    a trailing window of `win` periods; shorter windows are used at the start
    of the series (min_periods=1), so no value is NaN.
    win=63 ≈ one quarter of trading days.
    """
    same_sign = np.sign(true_ret) == np.sign(pred_ret)
    hits = pd.Series(same_sign.astype(int))
    hit_rate = hits.rolling(win, min_periods=1).mean()
    return hit_rate * 100  # % scale

# Aligned vectors per model: each model is paired with the true returns and
# dates of its own valid rows, so forecast and truth always have equal length.
mkt_true     = df.loc[mask_gpt, "Market_True_Return"].values
pred_gpt     = df.loc[mask_gpt, "LSTM_GPT4"].values
mkt_true_fbt = df.loc[mask_fbert, "Market_True_Return"].values
pred_fbt     = df.loc[mask_fbert, "LSTM_FinBERT"].values
dates_da     = df.loc[mask_gpt, "Date"].values

da_gpt   = rolling_directional_acc(mkt_true, pred_gpt)
da_fbt   = rolling_directional_acc(mkt_true_fbt, pred_fbt)

dates_all     = df.loc[mask_gpt, "Date"].values
dates_all_fbt = df.loc[mask_fbert, "Date"].values

def rolling_sharpe(returns, win=win): # Defining sharpe_gpt and sharpe_fbert
    """
    Calculate rolling Sharpe ratio over a specified window.
    Returns annualised Sharpe ratio.

    `returns` is a pandas Series of per-period returns. The ratio is the rolling
    mean over the rolling population standard deviation (plus 1e-9 to avoid
    division by zero), scaled by sqrt(252). The first `win - 1` values are NaN.
    """
    mean_ret = returns.rolling(win).mean()
    std_ret  = returns.rolling(win).std(ddof=0)  # population std
    return (mean_ret / (std_ret + 1e-9)) * np.sqrt(252)  # annualised

# The Sharpe ratio is taken over the long/short *strategy returns*
# (sign of the forecast times the realised return), matching every other Sharpe
# figure in this notebook. Feeding it the forecast errors (true - predicted)
# would plot a scaled mean-error statistic, not a Sharpe ratio.
strat_gpt    = pd.Series(np.where(pred_gpt > 0, 1, -1) * mkt_true)
strat_fbt    = pd.Series(np.where(pred_fbt > 0, 1, -1) * mkt_true_fbt)
sharpe_gpt   = rolling_sharpe(strat_gpt)
sharpe_fbert = rolling_sharpe(strat_fbt)

# Trim the first (win-1) observations, where the rolling Sharpe is undefined
# and the rolling accuracy is based on a partial window.
trim = win - 1
dates_trim        = dates_all[trim:]
dates_trim_fbt    = dates_all_fbt[trim:]
sharpe_gpt_trim   = sharpe_gpt.iloc[trim:]
sharpe_fbt_trim   = sharpe_fbert.iloc[trim:]
da_gpt_trim       = da_gpt.iloc[trim:]
da_fbt_trim       = da_fbt.iloc[trim:]

plt.figure(figsize=(15,5)) # Rolling Sharpe (after trim)
plt.plot(dates_trim,     sharpe_gpt_trim,   label="Hybrid rolling Sharpe",  c="red")
plt.plot(dates_trim_fbt, sharpe_fbt_trim,   label="FinBERT rolling Sharpe", c="darkorange")
plt.axhline(0, ls='--', c='k')
plt.title(f"Rolling Sharpe Ratio ({win}-day Window)")
plt.ylabel("Sharpe Ratio (annualised)")
plt.xlabel("Date")
plt.legend(); plt.tight_layout(); plt.show()

plt.figure(figsize=(15,5)) # Rolling Directional-Accuracy (after trim)
plt.plot(dates_trim,     da_gpt_trim, label="Hybrid directional accuracy",  c="red")
plt.plot(dates_trim_fbt, da_fbt_trim, label="FinBERT directional accuracy", c="darkorange")
plt.axhline(50, ls='--', c='k', lw=0.8)       # coin-flip baseline
plt.ylim(0, 100)
plt.title(f"Rolling Directional Accuracy ({win}-day Window)")
plt.ylabel("Accuracy (%)")
plt.xlabel("Date")
plt.legend(); plt.tight_layout(); plt.show()


# Confusion Matrix
from sklearn.metrics import confusion_matrix

def plot_conf_mat(y_true, y_pred, title, ax):
    """Draw the up/down confusion matrix of forecast sign versus realised sign.

    Args:
        y_true: realised returns; a value > 0 counts as "Up".
        y_pred: forecasts; a value > 0 counts as an "Up" signal.
        title: axes title.
        ax: Matplotlib axes that receives the annotated heatmap.
    """
    direction_labels = ["Down", "Up"]
    counts = confusion_matrix(y_true > 0, y_pred > 0)
    sns.heatmap(
        counts, annot=True, fmt='d', cmap="Blues", cbar=False,
        xticklabels=direction_labels, yticklabels=direction_labels, ax=ax,
    )
    ax.set_xlabel("Model Signal")
    ax.set_ylabel("Market Direction")
    ax.set_title(title)

fig, axes = plt.subplots(1,2,figsize=(10,4))
# One panel per model, each restricted to that model's valid rows.
panels = (
    (mask_gpt, "LSTM_GPT4", "Hybrid"),
    (mask_fbert, "LSTM_FinBERT", "FinBERT-only"),
)
for panel_ax, (rows, pred_col, panel_title) in zip(axes, panels):
    plot_conf_mat(df.loc[rows, "Market_True_Return"],
                  df.loc[rows, pred_col],
                  panel_title, panel_ax)
plt.tight_layout(); plt.show()


# Turnover Function Vs Net-Long Ratio
def turnover(signal):
    """Mean of |change in signal| / 2 over consecutive entries.

    For a +1/-1 signal this is the average fraction of steps on which the
    direction flips.
    """
    step_changes = np.abs(np.diff(signal)) / 2
    return step_changes.mean()

# +1 / -1 trading signals on each model's valid rows.
sig_gpt   = np.where(df.loc[mask_gpt,  'LSTM_GPT4']   > 0, 1, -1)
sig_fbert = np.where(df.loc[mask_fbert,'LSTM_FinBERT']> 0, 1, -1)

turnover_gpt, turnover_fbert = turnover(sig_gpt), turnover(sig_fbert)
# Net-long ratio: share of periods on which the signal is long (+1).
net_long_gpt, net_long_fbert = (sig_gpt==1).mean(), (sig_fbert==1).mean()

print(f"Hybrid turnover:  {turnover_gpt:.2%}")
print(f"FinBERT turnover: {turnover_fbert:.2%}")
print(f"Hybrid net-long ratio:  {net_long_gpt:.2%}")
print(f"FinBERT net-long ratio: {net_long_fbert:.2%}")

# Bar chart of turnover against net-long ratio. The signals (sig_gpt, sig_fbert)
# and the turnover() helper defined just above in this cell are reused, since
# recomputing and redefining them would yield identical values.
turn = [turnover(sig_gpt), turnover(sig_fbert)]
netL = [(sig_gpt == 1).mean(), (sig_fbert == 1).mean()]   # net-long ratios

labels = ["Hybrid\n(FinBERT+GPT-4)", "FinBERT-only"]
x = np.arange(len(labels))

fig, ax1 = plt.subplots(figsize=(8, 5))
width = 0.35

# Left y-axis => Turnover
bars1 = ax1.bar(x - width/2, turn, width,
                color="steelblue", alpha=.85, label="Turnover")
ax1.set_ylabel("Turnover (% of days)", color="steelblue")
ax1.set_ylim(0, 1)
ax1.set_xticks(x)
ax1.set_xticklabels(labels, fontsize=11)
ax1.tick_params(axis='y', labelcolor="steelblue")

# Right y-axis => Net-Long exposure
ax2 = ax1.twinx()
bars2 = ax2.bar(x + width/2, netL, width,
                color="darkorange", alpha=.85, label="Net-Long Ratio")
ax2.set_ylabel("Net-Long Exposure", color="darkorange")
ax2.set_ylim(0, 1)
ax2.tick_params(axis='y', labelcolor="darkorange")

# Annotating bars: each value is written just above its bar, on the axes the
# bar belongs to.
for bar_axes, bar_group in ((ax1, bars1), (ax2, bars2)):
    for rect in bar_group:
        height = rect.get_height()
        bar_axes.text(rect.get_x() + rect.get_width()/2, height + 0.02,
                      f"{height:.2%}", ha='center', va='bottom', fontsize=9)

plt.title("Turnover vs Net-Long Ratio", pad=18)
plt.tight_layout()
plt.show()

## Practical Application

### Practical Application: Hybrid Method (Shorter Time Split)

In [None]:
import numpy as np
import pandas as pd
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

# Load the merged dataset used by the hybrid-sentiment LSTM; "Date" is parsed on read.
data_path = "derived/final_merged_for_lstm.csv"
df = pd.read_csv(data_path, parse_dates=["Date"])

# Clean up: discard leftover merge columns when they exist.
df = df.drop(columns=['key_0', 'Date_sent'], errors='ignore')

# Coerce every column except Date to numeric (unparseable cells become NaN),
# then keep only complete rows.
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in df.columns if name != 'Date'
})
df = df.dropna().reset_index(drop=True)

# Feature Engineering
exclude_cols = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume'] # price/volume columns; not referenced again in this cell
N_LAGS = 2 # number of lagged copies of Return and Sentiment
for lag in range(1, N_LAGS+1):
    df[f"Return_lag{lag}"] = df["Return"].shift(lag)
    df[f"Sentiment_lag{lag}"] = df["Sentiment"].shift(lag)
feature_cols = (
    ['Return', 'VIX', 'Sentiment']
    + [f"Return_lag{lag}" for lag in range(1, N_LAGS+1)]
    + [f"Sentiment_lag{lag}" for lag in range(1, N_LAGS+1)]
)
# shift() leaves NaN in the first N_LAGS rows; drop them.
df = df.dropna().reset_index(drop=True)

# Train/val/test split by time (later cut-offs than the main experiment above)
TRAIN_END = date(2022, 12, 31)
VAL_END = date(2024, 5, 31)
df["Date"] = pd.to_datetime(df["Date"])
train_cutoff = pd.to_datetime(TRAIN_END)
val_cutoff = pd.to_datetime(VAL_END)
df_train = df[df["Date"] <= train_cutoff]
df_val = df[(df["Date"] > train_cutoff) & (df["Date"] <= val_cutoff)]
df_test = df[df["Date"] > val_cutoff]

# Scaling: statistics are fitted on the training rows only and re-used
# unchanged for the validation and test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(df_train[feature_cols])
X_val, X_test = (scaler.transform(part[feature_cols]) for part in (df_val, df_test))
y_train, y_val, y_test = (part["Return"].values for part in (df_train, df_val, df_test))

# Creating LSTM Sequences
SEQ_LEN = 5
def create_sequences(x, y, seq_len):
    """Pair each ``seq_len``-row window of ``x`` with the ``y`` value right after it.

    The window starting at row ``first`` covers ``x[first : first + seq_len]``
    and is matched with ``y[first + seq_len]``; starts run over
    ``range(len(x) - seq_len)``.

    Returns:
        ``(inputs, targets)`` as NumPy arrays (empty when no window fits).
    """
    pairs = [
        (x[first : first + seq_len], y[first + seq_len])
        for first in range(len(x) - seq_len)
    ]
    inputs = [window for window, _ in pairs]
    targets = [target for _, target in pairs]
    return np.array(inputs), np.array(targets)

X_train_seq, y_train_seq = create_sequences(X_train, y_train, SEQ_LEN)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, SEQ_LEN)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, SEQ_LEN)

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation and
# batch shuffling -- and therefore the metrics printed below -- repeat across runs.
import tensorflow as tf
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# LSTM Model: one 50-unit LSTM over (SEQ_LEN, n_features) windows, then a linear output.
model = Sequential([
    LSTM(50, input_shape=(SEQ_LEN, X_train_seq.shape[2])),
    Dense(1),
])
model.compile(optimizer="adam", loss="mse")
# Stop once val_loss has not improved for 10 epochs and restore the best weights seen.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train
print("Training LSTM …")
model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=16,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[early_stop],
    verbose=1,
)

# Evaluation
print("Evaluating …")
pred_test = model.predict(X_test_seq).flatten()
rmse = np.sqrt(mean_squared_error(y_test_seq, pred_test))
print(f"Test RMSE: {rmse:.6f}")

# Directional accuracy: share of test targets whose sign (> 0) matches the forecast's.
actual_dir = (y_test_seq > 0)
pred_dir = (pred_test > 0)
acc = accuracy_score(actual_dir, pred_dir)
print(f"Directional accuracy: {acc:.2%}")

# The first SEQ_LEN test rows have no forecast (they only seed the first window),
# so they are dropped to line the frame up with pred_test.
df_out = df_test.iloc[SEQ_LEN:].copy().reset_index(drop=True)
df_out["Predicted_Return"] = pred_test
df_out.to_csv("derived/lstm_prac_test_predictions.csv", index=False)

print(df_out[["Date", "Return", "Predicted_Return"]].head())
print("Final LSTM model summary:")
model.summary()

### Practical Application: FinBERT-only Method (Shorter Time Split)

In [None]:
import numpy as np
import pandas as pd
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

# Load the merged FinBERT dataset; "Date" is parsed to datetime while reading.
data_path = "derived/final_merged_FinBERT_for_lstm.csv"
df = pd.read_csv(data_path, parse_dates=["Date"])

# Discard the merge leftovers when they are present.
df = df.drop(columns=['key_0', 'Date_sent'], errors='ignore')

# Coerce every column except 'Date' to numeric; entries that cannot be parsed
# become NaN. This single pass covers the price, volume and VIX columns too.
for col in df.columns:
    if col == 'Date':
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows that are complete after coercion.
df = df.dropna().reset_index(drop=True)

# Feature Engineering: lagged copies of the return and of the FinBERT score.
N_LAGS = 2
for lag in range(1, N_LAGS + 1):
    df[f"Return_lag{lag}"] = df["Return"].shift(lag)
    df[f"FinBERT_score_lag{lag}"] = df["FinBERT_score"].shift(lag)
# The first N_LAGS rows have no lagged values, so drop them.
df = df.dropna().reset_index(drop=True)

# Train/val/test split by calendar date:
#   train: Date <= TRAIN_END
#   val:   TRAIN_END < Date <= VAL_END
#   test:  Date > VAL_END
TRAIN_END = date(2022, 12, 31)
VAL_END = date(2024, 5, 31)

# The cutoffs are converted with pd.to_datetime so they compare against the
# datetime "Date" column.
df["Date"] = pd.to_datetime(df["Date"])
df_train = df[df["Date"] <= pd.to_datetime(TRAIN_END)]
df_val = df[(df["Date"] > pd.to_datetime(TRAIN_END)) & (df["Date"] <= pd.to_datetime(VAL_END))]
df_test = df[df["Date"] > pd.to_datetime(VAL_END)]

# Features and Scaling
# Model inputs: market columns, VIX, the FinBERT score, and the N_LAGS lagged
# returns / lagged FinBERT scores created above.
FEATURES = ['Close', 'High', 'Low', 'Open', 'Volume', 'VIX', 'FinBERT_score'] + \
           [f"Return_lag{lag}" for lag in range(1, N_LAGS+1)] + \
           [f"FinBERT_score_lag{lag}" for lag in range(1, N_LAGS+1)]
X_train = df_train[FEATURES].astype(np.float32).values
X_val = df_val[FEATURES].astype(np.float32).values
X_test = df_test[FEATURES].astype(np.float32).values
# Targets are the unscaled "Return" column of each split.
y_train = df_train["Return"].values
y_val = df_val["Return"].values
y_test = df_test["Return"].values

# Scale Features: the scaler is fitted on the training split only and the
# fitted transform is reused for validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Create LSTM sequences (window=5)
SEQ_LEN = 5
def create_sequences(x, y, seq_len):
    """Build sliding-window samples for a sequence model.

    Window ``i`` holds the rows ``x[i : i + seq_len]`` and is paired with the
    target ``y[i + seq_len]``, i.e. the value that immediately follows the
    window.

    Parameters
    ----------
    x : array-like
        Feature rows; the first axis is the row (time) axis.
    y : array-like
        Targets, indexed by the same row positions as ``x``.
    seq_len : int
        Number of consecutive rows per window.

    Returns
    -------
    tuple of np.ndarray
        ``(xs, ys)`` where ``xs`` has shape ``(n_windows, seq_len, ...)`` and
        ``ys`` has ``n_windows`` entries, with ``n_windows = len(x) - seq_len``.
        When ``x`` has too few rows for even one window, correctly shaped
        empty arrays are returned, so downstream code that reads
        ``xs.shape[2]`` still works.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n_windows = len(x) - seq_len
    if n_windows <= 0:
        # np.array([]) would be rank-1; keep the (window, feature) axes instead.
        empty_x = np.empty((0, seq_len) + x.shape[1:], dtype=x.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_x, empty_y
    xs = [x[i : i + seq_len] for i in range(n_windows)]
    ys = [y[i + seq_len] for i in range(n_windows)]
    return np.array(xs), np.array(ys)
# Each split is windowed on its own, so no window spans a split boundary.
X_train_seq, y_train_seq = create_sequences(X_train, y_train, SEQ_LEN)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, SEQ_LEN)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, SEQ_LEN)

print(f"Train samples: {X_train_seq.shape[0]}, Val: {X_val_seq.shape[0]}, Test: {X_test_seq.shape[0]}")

# LSTM Model: a single 50-unit LSTM layer feeding one Dense output unit,
# compiled with the Adam optimizer and mean-squared-error loss.
model = Sequential([
    LSTM(50, input_shape=(SEQ_LEN, X_train_seq.shape[2])),
    Dense(1),
])
model.compile(optimizer="adam", loss="mse")
# Early stopping monitors val_loss with patience=10 and restore_best_weights=True.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train
# NOTE(review): this cell sets no random seed, so trained weights and the
# reported metrics may differ between runs.
print("Training LSTM …")
model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=16,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[early_stop],
    verbose=1,
)

print("Evaluating …") # Evaluation: RMSE on the windowed test targets
pred_test = model.predict(X_test_seq).flatten()
rmse = np.sqrt(mean_squared_error(y_test_seq, pred_test))
print(f"Test RMSE: {rmse:.6f}")

actual_dir = (y_test_seq > 0) # Directional accuracy: agreement on the sign (> 0) of the return
pred_dir = (pred_test > 0)
acc = accuracy_score(actual_dir, pred_dir)
print(f"Directional accuracy: {acc:.2%}")

# create_sequences pairs window i with the target at row i + SEQ_LEN, so the
# first SEQ_LEN test rows have no prediction. Note that df_test is overwritten
# here with the trimmed copy.
df_test = df_test.iloc[SEQ_LEN:].copy()  # aligning with y_test_seq
df_test["Predicted_Return"] = pred_test
df_test.to_csv("derived/lstm_prac_FinBERT_only_test_predictions.csv", index=False)

print(df_test[["Date", "Return", "Predicted_Return"]].head())
print("Final LSTM model summary:")
model.summary()

### Evaluation Metrics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)
from scipy.stats import norm

# Test-set predictions of both models plus the market / VIX reference data.
df_gpt = pd.read_csv("derived/lstm_prac_test_predictions.csv", parse_dates=["Date"])
df_fbert = pd.read_csv("derived/lstm_prac_FinBERT_only_test_predictions.csv", parse_dates=["Date"])
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Align data by date. The two prediction files are joined on "Date" rather than
# by row position, so a FinBERT prediction can never be attached to the wrong
# day when the files differ in length or in the dates they cover. The outer
# join keeps dates present in only one file; the notnull masks built later
# exclude the incomplete rows per model. validate="one_to_one" makes pandas
# raise if either file contains a duplicated date, instead of silently
# multiplying rows.
df = (
    df_gpt[["Date", "Return", "Predicted_Return"]]
    .rename(columns={"Return": "Market_Return", "Predicted_Return": "LSTM_GPT4"})
    .merge(
        df_fbert[["Date", "Predicted_Return"]].rename(columns={"Predicted_Return": "LSTM_FinBERT"}),
        on="Date",
        how="outer",
        validate="one_to_one",
    )
    .sort_values("Date")
    .reset_index(drop=True)
)
# Attach the reference market return and VIX for each date.
df = df.merge(market_df[["Date", "Return", "VIX"]].rename(columns={"Return": "Market_True_Return"}), on="Date", how="left")
# Non-numeric VIX entries become NaN.
df["VIX"] = pd.to_numeric(df["VIX"], errors="coerce")

def trading_signal_returns(true_returns, predicted_returns):
    """Signal-based trading returns.

    Goes long (+1) where the predicted return is strictly positive and short
    (-1) everywhere else, then multiplies that position by the realised return.
    """
    go_long = predicted_returns > 0
    position = np.where(go_long, 1, -1)
    return position * true_returns

def compute_all_metrics(true_returns, predicted_returns, rolling_window=60):
    """Compute error, direction and trading metrics for one model's predictions.

    Parameters
    ----------
    true_returns : array-like
        Realised returns.
    predicted_returns : array-like
        Predicted returns, positionally aligned with ``true_returns``.
    rolling_window : int, default 60
        Window length (in observations) for the rolling Sharpe ratio and the
        rolling directional accuracy.

    Returns
    -------
    dict
        Scalars: ``MSE``, ``RMSE``, ``MAE``, ``Direction_Acc``, ``Precision``,
        ``Recall``, ``F1``, ``ROC_AUC`` (NaN when it cannot be computed),
        ``Sharpe`` (scaled by sqrt(252)), ``Cumulative_Return`` (compounded),
        ``Rolling_Sharpe_Mean``/``_Std``, ``Rolling_Acc_Mean``/``_Std``.
        Arrays/Series: ``Confusion_Matrix`` (always 2x2, label order
        [down, up]), ``Rolling_Sharpe``, ``Rolling_Acc``,
        ``Rolling_CumReturn``, ``Signal_Returns``.

    Raises
    ------
    ValueError
        If either input is empty.
    """
    # Work on plain float arrays so positional indexing ([-1]) is safe even if
    # a pandas Series with a non-default index is passed in.
    true_returns = np.asarray(true_returns, dtype=float)
    predicted_returns = np.asarray(predicted_returns, dtype=float)
    if len(true_returns) == 0 or len(predicted_returns) == 0:
        raise ValueError("Empty input arrays! Check your mask and input data.")

    # Regression errors.
    mse = mean_squared_error(true_returns, predicted_returns)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_returns, predicted_returns)

    # Direction labels: 1 when the return is strictly positive, else 0.
    true_up = (true_returns > 0).astype(int)
    pred_up = (predicted_returns > 0).astype(int)

    acc = accuracy_score(true_up, pred_up)
    prec = precision_score(true_up, pred_up, zero_division=0)
    rec = recall_score(true_up, pred_up, zero_division=0)
    f1 = f1_score(true_up, pred_up, zero_division=0)
    try:
        roc = roc_auc_score(true_up, predicted_returns)
    except ValueError:
        # roc_auc_score rejects inputs where only one class is present;
        # report NaN for that case but let any other error surface.
        roc = np.nan

    # Fix the label set so the matrix stays 2x2 even when a class is absent.
    cm = confusion_matrix(true_up, pred_up, labels=[0, 1])

    # Long/short strategy returns and the statistics derived from them.
    strat_returns = trading_signal_returns(true_returns, predicted_returns)
    cum_return = np.cumprod(1 + strat_returns)[-1] - 1
    # 1e-9 keeps the ratio finite when the strategy returns have zero variance.
    sharpe = np.mean(strat_returns) / (np.std(strat_returns) + 1e-9) * np.sqrt(252)
    roll_sharpe = pd.Series(strat_returns).rolling(rolling_window).apply(
        lambda x: np.mean(x) / (np.std(x) + 1e-9) * np.sqrt(252), raw=True)
    roll_acc = pd.Series(pred_up == true_up).rolling(rolling_window).mean()
    roll_cum_return = (1 + pd.Series(strat_returns)).cumprod() - 1
    rolling_sharpe_mean, rolling_sharpe_std = roll_sharpe.mean(), roll_sharpe.std()
    rolling_acc_mean, rolling_acc_std = roll_acc.mean(), roll_acc.std()
    return {
        "MSE": mse, "RMSE": rmse, "MAE": mae,
        "Direction_Acc": acc, "Precision": prec, "Recall": rec, "F1": f1, "ROC_AUC": roc,
        "Sharpe": sharpe, "Cumulative_Return": cum_return,
        "Rolling_Sharpe_Mean": rolling_sharpe_mean, "Rolling_Sharpe_Std": rolling_sharpe_std,
        "Rolling_Acc_Mean": rolling_acc_mean, "Rolling_Acc_Std": rolling_acc_std,
        "Confusion_Matrix": cm,
        "Rolling_Sharpe": roll_sharpe,
        "Rolling_Acc": roll_acc,
        "Rolling_CumReturn": roll_cum_return,
        "Signal_Returns": strat_returns
    }

# Data masks: a row is usable for a model only when both the reference market
# return and that model's prediction are present.
mask_gpt = df['Market_True_Return'].notnull() & df['LSTM_GPT4'].notnull()
mask_fbert = df['Market_True_Return'].notnull() & df['LSTM_FinBERT'].notnull()

print("\nValid rows for GPT:", mask_gpt.sum())
print("Valid rows for FinBERT:", mask_fbert.sum())

# Stop early with a clear message if either model has no usable rows.
if mask_gpt.sum() == 0 or mask_fbert.sum() == 0:
    raise ValueError("No valid data rows for at least one model. Check input data and merges!")

# Metrics for each model, computed on that model's own valid rows.
metrics_gpt = compute_all_metrics(
    df.loc[mask_gpt, 'Market_True_Return'].values,
    df.loc[mask_gpt, 'LSTM_GPT4'].values
)
metrics_finbert = compute_all_metrics(
    df.loc[mask_fbert, 'Market_True_Return'].values,
    df.loc[mask_fbert, 'LSTM_FinBERT'].values
)

# Table: one column per model, one row per metric. Array-like entries
# (confusion matrix, rolling series, signal returns) are left out.
comparison_table = pd.DataFrame({
    model_label: {
        metric_name: metric_value
        for metric_name, metric_value in model_metrics.items()
        if not isinstance(metric_value, (np.ndarray, pd.Series, list))
    }
    for model_label, model_metrics in (
        ("Hybrid (FinBERT+GPT-4)", metrics_gpt),
        ("FinBERT-only", metrics_finbert),
    )
})
print("Model Comparison Table", comparison_table, sep="\n")
comparison_table.to_csv("model_performance_comparison.csv")

# Confusion matrices for appendix: shown on screen, then written to CSV.
print("Confusion Matrices")
print("Hybrid (FinBERT+GPT-4):\n", metrics_gpt["Confusion_Matrix"])
print("FinBERT-only:\n", metrics_finbert["Confusion_Matrix"])
np.savetxt("hybrid_confusion_matrix.csv", metrics_gpt["Confusion_Matrix"], delimiter=",")
np.savetxt("finbert_confusion_matrix.csv", metrics_finbert["Confusion_Matrix"], delimiter=",")

# Regime split (by VIX) shows model dominance in high/low volatility.
vix_median = df["VIX"].median()
# Rows above the median VIX are "High_VIX", the rest "Low_VIX". A missing VIX
# compares False against the median, which would file the row under "Low_VIX";
# such rows are left unlabelled (NaN) instead, so they belong to neither regime.
df["VIX_regime"] = pd.Series(
    np.where(df["VIX"] > vix_median, "High_VIX", "Low_VIX"), index=df.index
).where(df["VIX"].notna())
def regime_metrics(regime):
    """Compute both models' metrics on the rows of one VIX regime.

    Parameters
    ----------
    regime : str
        Value of ``df["VIX_regime"]`` to select ("High_VIX" or "Low_VIX").

    Returns
    -------
    tuple of dict
        ``(gpt_metrics, finbert_metrics)`` as returned by
        ``compute_all_metrics``, each restricted to the regime rows that are
        also valid for the respective model.
    """
    idx = df["VIX_regime"] == regime
    gpt_metrics = compute_all_metrics(
        df.loc[idx & mask_gpt, "Market_True_Return"].values, df.loc[idx & mask_gpt, "LSTM_GPT4"].values
    )
    finbert_metrics = compute_all_metrics(
        df.loc[idx & mask_fbert, "Market_True_Return"].values, df.loc[idx & mask_fbert, "LSTM_FinBERT"].values
    )
    return gpt_metrics, finbert_metrics
for regime in ["High_VIX", "Low_VIX"]:
    gpt_metrics, finbert_metrics = regime_metrics(regime)
    print(f"\n=== {regime} Regime ===")
    print("Hybrid Sharpe:", gpt_metrics["Sharpe"], "Cumulative:", gpt_metrics["Cumulative_Return"])
    print("FinBERT-only Sharpe:", finbert_metrics["Sharpe"], "Cumulative:", finbert_metrics["Cumulative_Return"])

### Shortened Time-frame Plot

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

# Test-set predictions of both models plus the market / VIX reference data.
df_gpt = pd.read_csv("derived/lstm_prac_test_predictions.csv", parse_dates=["Date"])
df_fbert = pd.read_csv("derived/lstm_prac_FinBERT_only_test_predictions.csv", parse_dates=["Date"])
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Align data by date. The two prediction files are joined on "Date" rather than
# by row position, so a FinBERT prediction can never be attached to the wrong
# day when the files differ in length or in the dates they cover. The outer
# join keeps dates present in only one file; the notnull masks built later
# exclude the incomplete rows per model. validate="one_to_one" makes pandas
# raise if either file contains a duplicated date, instead of silently
# multiplying rows.
# "Market_Return" is the return stored with the hybrid predictions; the
# reference return from market_df is merged in below as "Market_True_Return".
df = (
    df_gpt[["Date", "Return", "Predicted_Return"]]
    .rename(columns={"Return": "Market_Return", "Predicted_Return": "LSTM_GPT4"})
    .merge(
        df_fbert[["Date", "Predicted_Return"]].rename(columns={"Predicted_Return": "LSTM_FinBERT"}),
        on="Date",
        how="outer",
        validate="one_to_one",
    )
    .sort_values("Date")
    .reset_index(drop=True)
)
# Attach the reference market return and VIX for each date.
df = df.merge(market_df[["Date", "Return", "VIX"]].rename(columns={"Return": "Market_True_Return"}), on="Date", how="left")
# Non-numeric VIX entries become NaN.
df["VIX"] = pd.to_numeric(df["VIX"], errors="coerce")

def trading_signal_returns(true_returns, predicted_returns):
    """Signal-based trading returns.

    Long/short signal: +1 where the predicted return is strictly positive,
    -1 everywhere else. The signal is multiplied by the realised return.
    """
    is_positive_call = predicted_returns > 0
    direction = np.where(is_positive_call, 1, -1)
    return direction * true_returns

# Data masks: a row is usable for a model only when both the reference market
# return and that model's prediction are present.
mask_gpt = df['Market_True_Return'].notnull() & df['LSTM_GPT4'].notnull()
mask_fbert = df['Market_True_Return'].notnull() & df['LSTM_FinBERT'].notnull()

print("\nValid rows for GPT:", mask_gpt.sum())
print("Valid rows for FinBERT:", mask_fbert.sum())

# Stop early with a clear message if either model has no usable rows.
if mask_gpt.sum() == 0 or mask_fbert.sum() == 0:
    raise ValueError("No valid data rows for at least one model. Check input data and merges!")

# NOTE: compute_all_metrics is not defined in this cell; it comes from the
# "Evaluation Metrics" cell, which therefore has to be run first.
metrics_gpt = compute_all_metrics(
    df.loc[mask_gpt, 'Market_True_Return'].values,
    df.loc[mask_gpt, 'LSTM_GPT4'].values
)
metrics_finbert = compute_all_metrics(
    df.loc[mask_fbert, 'Market_True_Return'].values,
    df.loc[mask_fbert, 'LSTM_FinBERT'].values
)

# Trading strategy returns for both models: long (+1) when the predicted return
# is positive, short (-1) otherwise, applied to the reference market return.
gpt_signal = np.where(df.loc[mask_gpt, 'LSTM_GPT4'] > 0, 1, -1)
finbert_signal = np.where(df.loc[mask_fbert, 'LSTM_FinBERT'] > 0, 1, -1)

gpt_strat_returns = gpt_signal * df.loc[mask_gpt, 'Market_True_Return'].values
finbert_strat_returns = finbert_signal * df.loc[mask_fbert, 'Market_True_Return'].values

# Cumulative returns. np.cumsum adds the simple daily returns (no compounding).
gpt_cum_return = np.cumsum(gpt_strat_returns)
finbert_cum_return = np.cumsum(finbert_strat_returns)
market_cum_return = np.cumsum(df.loc[mask_gpt, 'Market_True_Return'].values)

# Each curve is plotted against the dates selected by the mask it was computed
# with, so the FinBERT curve still plots when its valid rows differ from the
# hybrid model's (x and y would otherwise have different lengths).
dates = df.loc[mask_gpt, 'Date'].reset_index(drop=True)
finbert_dates = df.loc[mask_fbert, 'Date'].reset_index(drop=True)

# Plotting cumulative returns on a single figure.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(dates, market_cum_return, label="Market Cumulative Return", color='black', linewidth=2)
ax.plot(dates, gpt_cum_return, label="Hybrid Cumulative Return", color='red')
ax.plot(finbert_dates, finbert_cum_return, label="FinBERT-only Cumulative Return", color='darkorange')
ax.set_title("Cumulative Strategy Returns Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative Return")
ax.legend()
fig.tight_layout()
plt.show()