# Thesis 

#### Using a Python 3.13 kernel

### Setup and Data Collection

In [None]:
!pip install --upgrade notebook
!pip install python-dotenv
!pip install joblib
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install threadpoolctl
!pip install tensorflow
!pip install scikit-image
!pip install scikit-learn-intelex
!pip install keras


In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Only report whether the key is present. Printing the key itself would store
# the secret in the notebook's saved output.
print("OPENAI_API_KEY loaded:", bool(os.getenv("OPENAI_API_KEY")))

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import openai         
import transformers    
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.tsa.stattools as stt
import statsmodels.tsa.holtwinters as smt
import statsmodels.tsa.seasonal as smt
import statsmodels.tsa.arima.model as smt


## S&P 500 Index Data Retrieval

In [None]:
import pandas as pd
import yfinance as yf
import numpy as np

def _flatten_yf_columns(frame):
    """Return `frame` with single-level column names.

    Recent yfinance releases return a (field, ticker) column MultiIndex from
    `yf.download` even for one ticker. The code below expects plain names such
    as 'Close', so only the first level is kept when a MultiIndex is present.
    """
    if isinstance(frame.columns, pd.MultiIndex):
        frame = frame.copy()
        frame.columns = frame.columns.get_level_values(0)
    return frame


# SP500 data
sp500 = _flatten_yf_columns(yf.download("^GSPC", start="2015-01-01", end="2025-01-01"))
if sp500.empty:
    # yf.download can return an empty frame instead of raising on failure.
    raise RuntimeError("No S&P 500 data downloaded for ^GSPC")

# Calculating log returns: log(Close_t / Close_{t-1})
sp500['Return'] = np.log(sp500['Close'] / sp500['Close'].shift(1))

# Reset index for merging
sp500 = sp500.reset_index()
sp500['Date'] = pd.to_datetime(sp500['Date']).dt.strftime('%Y-%m-%d')

# VIX data
vix = _flatten_yf_columns(yf.download("^VIX", start="2015-01-01", end="2025-01-01"))
if vix.empty:
    raise RuntimeError("No VIX data downloaded for ^VIX")
# rename() returns a new frame, so no in-place mutation of a column slice.
vix = vix[['Close']].rename(columns={'Close': 'VIX'})
vix = vix.reset_index()
vix['Date'] = pd.to_datetime(vix['Date']).dt.strftime('%Y-%m-%d')

# Merging the two DataFrames on Date
merged_data = pd.merge(sp500, vix, on='Date', how='inner')
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

# Drop any rows with NA values (from log return calculation)
merged_data = merged_data.dropna()

merged_data.to_csv('sp500_vix_data.csv', index=False)

## New York Times Article Summary Scraping

### General Article Scraping

In [None]:
import os, time, json, sys
from datetime import date
from typing import List, Dict

import requests
import pandas as pd

# Configuration
# -- Credentials: read from the environment (or a .env file); placeholder otherwise.
NYT_API_KEY = os.getenv("NYT_API_KEY", "NYT_API_KEY_NOT_SET")

# -- Crawl window: iter_months also yields the month that contains END_DATE.
START_DATE = date(year=2015, month=1, day=1)
END_DATE = date(year=2025, month=1, day=1)

# -- NYT Archive endpoint, formatted with `year` and `month`.
ARCHIVE_URL = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"

# -- Output files.
CSV_PATH = "nyt_business_archive.csv"
CHECKPOINT = "archive_checkpoint.json"

# -- Throttling: 5 calls per minute means a 12 s pause between requests.
REQS_PER_MIN = 5
SLEEP_SEC = 60 / REQS_PER_MIN


# Wrapper that handles 429 and retries
def archive_month(year: int, month: int, max_retries: int = 5) -> Dict:
    """Fetch one month from the NYT Archive API, backing off on HTTP 429.

    Args:
        year: Calendar year substituted into ARCHIVE_URL.
        month: Calendar month substituted into ARCHIVE_URL.
        max_retries: Maximum number of requests to attempt.

    Returns:
        The decoded JSON body of the first response that is not a 429.

    Raises:
        requests.HTTPError: For any non-429 error status (via raise_for_status).
        RuntimeError: If every attempt was answered with 429.
    """
    url = ARCHIVE_URL.format(year=year, month=month)
    params = {"api-key": NYT_API_KEY}
    default_wait = 15  # seconds, used when Retry-After is missing or unusable

    for attempt in range(1, max_retries + 1):
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.json()

        if attempt == max_retries:
            break  # no retry follows, so sleeping would only waste time

        # Retry-After may be absent or an HTTP-date rather than a number of
        # seconds; fall back to the default instead of crashing on int().
        try:
            wait = max(0, int(resp.headers.get("Retry-After", default_wait)))
        except (TypeError, ValueError):
            wait = default_wait
        print(f"⚠️  429 received — sleeping {wait}s and retrying …")
        time.sleep(wait)

    raise RuntimeError(f"Failed to fetch {year}-{month:02} after {max_retries} retries")


def iter_months(start: date, end: date):
    """Yield the first day of each month from start's month through end's month.

    Both end points are inclusive at month granularity; nothing is yielded
    when start's month lies after end's month.
    """
    last_month = end.replace(day=1)
    year, month = start.year, start.month
    month_start = date(year, month, 1)
    while month_start <= last_month:
        yield month_start
        # Advance one month, rolling over into the next year after December.
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
        month_start = date(year, month, 1)


def load_checkpoint():
    """Return the date stored under "last_month" in CHECKPOINT, or None if the file is absent."""
    if not os.path.isfile(CHECKPOINT):
        return None
    with open(CHECKPOINT) as fp:
        state = json.load(fp)
    return date.fromisoformat(state["last_month"])


def save_checkpoint(d: date):
    """Overwrite CHECKPOINT with a JSON object holding d (ISO format) as "last_month"."""
    with open(CHECKPOINT, "w") as fp:
        fp.write(json.dumps({"last_month": d.isoformat()}))


def append_rows(rows: List[Dict]):
    """Append article rows to CSV_PATH, writing the header when the file is new or empty.

    An empty `rows` list is a no-op. Writing an empty frame would create the
    file without column names, and every later append would then skip the
    header because the file already exists.
    """
    if not rows:
        return

    needs_header = not os.path.isfile(CSV_PATH) or os.path.getsize(CSV_PATH) == 0
    pd.DataFrame(rows).to_csv(
        CSV_PATH,
        mode="a",
        index=False,
        header=needs_header,
    )


def is_business(doc: Dict) -> bool:
    """Return True when "business" appears (case-insensitively) in the doc's section or news desk."""
    lowered = [(doc.get(field) or "").lower() for field in ("section_name", "news_desk")]
    return any("business" in value for value in lowered)


def main():
    """Crawl the NYT archive month by month and append business articles to the CSV.

    Months up to and including the checkpointed one are skipped. After each
    month its rows are appended, the checkpoint is updated, and the loop
    sleeps SLEEP_SEC seconds.
    """

    def _as_row(doc):
        # Flatten one archive document into the CSV column layout.
        return {
            "Date": doc["pub_date"][:10],
            "Headline": doc["headline"]["main"],
            "Summary": doc.get("abstract") or doc.get("snippet", ""),
            "Section": doc.get("section_name") or doc.get("news_desk"),
            "URL": doc["web_url"],
        }

    last_done = load_checkpoint()
    pending = list(iter_months(START_DATE, END_DATE))
    if last_done:
        pending = [mth for mth in pending if mth > last_done]
        print(f"Resuming after {last_done} — {len(pending)} months left.")

    for month_start in pending:
        print(f"Fetching {month_start:%Y-%m} …", end=" ", flush=True)
        payload = archive_month(month_start.year, month_start.month)
        all_docs = payload["response"]["docs"]

        kept = [_as_row(doc) for doc in all_docs if is_business(doc)]

        append_rows(kept)
        save_checkpoint(month_start)
        print(f"kept {len(kept):3d} / {len(all_docs):3d} docs")
        time.sleep(SLEEP_SEC)  # pause between months to respect the rate limit

    print("✅ All months processed. CSV complete!")


# Run the crawl when executed as a script or notebook cell; Ctrl-C is reported, not raised.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterrupted — progress saved, just rerun to continue.")

### Aggregated Sector Mapping Article Scraping

In [None]:
import os, time, json, sys, re
from datetime import date
from typing import List, Dict

import requests
import pandas as pd

path = "nyt_business_archive.csv"

# One-off repair: if the archive CSV has five columns but no "Summary" header,
# assign the expected column names and rewrite the file.
if not os.path.isfile(path) or os.path.getsize(path) == 0:
    # On a first run the file does not exist yet (the crawl below creates it),
    # so there is nothing to repair and read_csv would raise.
    print("No archive CSV found yet — skipping header check.")
else:
    df = pd.read_csv(path, header=None)  # no header to see all columns
    max_cols = df.shape[1]

    if max_cols == 5 and "Summary" not in pd.read_csv(path, nrows=0).columns:
        df.columns = ["Date", "Headline", "Summary", "Section", "URL"]
        df.to_csv(path, index=False)
        print("✔ Added missing 'Summary' header and rewrote file.")
    else:
        print("No header problem detected — nothing changed.")

# Configuration:
# The API key must come from the environment (or a .env file). No real key is
# kept in the notebook; a literal key previously committed here should be
# treated as compromised and rotated.
NYT_API_KEY  = os.getenv("NYT_API_KEY", "NYT_API_KEY_NOT_SET")
START_DATE   = date(2015, 1, 1)
END_DATE     = date(2025, 1, 1)
CSV_PATH     = "nyt_business_archive.csv"    # raw business articles, one row each
AGG_PATH     = "nyt_aggregated_data.csv"     # one row per Date x Sector
REQS_PER_MIN = 5
SLEEP_SEC    = 60 / REQS_PER_MIN       # 12 s → 5 req/min
CHECKPOINT   = "archive_checkpoint.json"
ARCHIVE_URL  = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"


# NYT API call with 429 back-off to handle rate limits
def archive_month(year: int, month: int, retries=5) -> Dict:
    """Request one month of the NYT Archive API, waiting and retrying on HTTP 429.

    Args:
        year: Calendar year substituted into ARCHIVE_URL.
        month: Calendar month substituted into ARCHIVE_URL.
        retries: Maximum number of requests to attempt.

    Returns:
        The decoded JSON body of the first response that is not a 429.

    Raises:
        requests.HTTPError: For any non-429 error status (via raise_for_status).
        RuntimeError: If all attempts were answered with 429.
    """
    url, params = ARCHIVE_URL.format(year=year, month=month), {"api-key": NYT_API_KEY}
    fallback_wait = 15  # seconds, used when Retry-After is missing or unusable

    for attempt in range(retries):
        r = requests.get(url, params=params, timeout=60)
        if r.status_code != 429:
            r.raise_for_status()
            return r.json()

        if attempt == retries - 1:
            break  # last attempt: do not sleep when no request follows

        # Retry-After is not guaranteed to be an integer number of seconds.
        try:
            wait = max(0, int(r.headers.get("Retry-After", fallback_wait)))
        except (TypeError, ValueError):
            wait = fallback_wait
        print(f"⚠️  429 – sleeping {wait}s")
        time.sleep(wait)

    raise RuntimeError(f"Failed after {retries} retries: {year}-{month:02}")

# Generator over month starts in a date range (both end months included)
def iter_months(start: date, end: date):
    """Yield date(year, month, 1) for every month from start's month to end's month."""
    final_month = end.replace(day=1)
    cursor = date(start.year, start.month, 1)
    while cursor <= final_month:
        yield cursor
        # divmod(month, 12) gives (1, 0) for December, i.e. carry into next January.
        year_carry, month_zero_based = divmod(cursor.month, 12)
        cursor = date(cursor.year + year_carry, month_zero_based + 1, 1)

# Checkpoint management for resuming crawls
def load_checkpoint():
    """Read the last processed month from CHECKPOINT; return None when no file exists."""
    checkpoint_exists = os.path.isfile(CHECKPOINT)
    if checkpoint_exists:
        with open(CHECKPOINT) as handle:
            last_month_iso = json.load(handle)["last_month"]
        return date.fromisoformat(last_month_iso)
    return None

# Save the last processed month to a checkpoint file
def save_checkpoint(d: date):
    """Write {"last_month": <d in ISO format>} to CHECKPOINT, replacing any previous content."""
    with open(CHECKPOINT, "w") as handle:
        state = {"last_month": d.isoformat()}
        handle.write(json.dumps(state))

# Append rows to the CSV file, creating it if it doesn't exist
def append_rows(rows: List[Dict]):
    """Append article rows to CSV_PATH; the header is written only for a new or empty file.

    Nothing is written for an empty `rows` list. Appending an empty frame
    would create the file without column names, and later appends would then
    omit the header because the file already exists.
    """
    if not rows:
        return

    file_has_content = os.path.isfile(CSV_PATH) and os.path.getsize(CSV_PATH) > 0
    pd.DataFrame(rows).to_csv(
        CSV_PATH,
        mode="a",
        index=False,
        header=not file_has_content,
    )


# Business filter
def is_business(doc: Dict) -> bool:
    """Tell whether the doc's section name or news desk contains "business" (any case)."""
    section = doc.get("section_name") or ""
    desk = doc.get("news_desk") or ""
    section, desk = section.lower(), desk.lower()
    return any("business" in value for value in (section, desk))

# Main crawling function to fetch NYT archive data
def crawl_nyt_archive():
    """Fetch every pending month from the NYT archive and store its business articles.

    Months at or before the checkpoint are skipped. Each processed month is
    appended to the CSV and checkpointed before the loop sleeps SLEEP_SEC.
    """
    checkpoint = load_checkpoint()
    todo = list(iter_months(START_DATE, END_DATE))
    if checkpoint:
        todo = [mth for mth in todo if mth > checkpoint]
        print(f"Resuming after {checkpoint} – {len(todo)} months left.")

    for month_start in todo:
        print(f"Fetching {month_start:%Y-%m} …", end=" ", flush=True)
        response = archive_month(month_start.year, month_start.month)
        month_docs = response["response"]["docs"]

        business_rows = []
        for doc in month_docs:
            if not is_business(doc):
                continue
            business_rows.append(
                {
                    "Date": doc["pub_date"][:10],
                    "Headline": doc["headline"]["main"],
                    "Summary": doc.get("abstract") or doc.get("snippet", ""),
                    "Section": doc.get("section_name") or doc.get("news_desk"),
                    "URL": doc["web_url"],
                }
            )

        append_rows(business_rows)
        save_checkpoint(month_start)
        print(f"kept {len(business_rows):3d} / {len(month_docs):3d}")
        time.sleep(SLEEP_SEC)

    print("✅ NYT crawl complete.")


# Sector classification & aggregation
# SECTOR_MAP maps a sector label to the lowercase keywords that identify it.
# assign_sector() walks the sectors in the order written here and returns the
# first one that matches, so earlier entries take priority over later ones.
SECTOR_MAP: dict[str, list[str]] = {
    # Information Tech Sector:
    "Software & IT Services": [
        "software", "saas", "cloud", "it services", "consulting",
        "microsoft", "adobe", "oracle", "sap", "salesforce", "servicenow",
        "workday", "vmware", "accenture", "infosys", "tcs", "capgemini",
    ],
    "Hardware & Devices": [
        "hardware", "pc", "laptop", "smartphone", "iphone", "ipad",
        "dell", "hp", "lenovo", "asus", "acer", "logitech",
    ],
    "Semiconductors": [
        "chip", "chips", "semiconductor", "fab", "foundry",
        "intel", "amd", "nvidia", "qualcomm", "tsmc", "broadcom",
        "micron", "arm holdings", "sk hynix",
    ],
    "Internet & Social Media": [
        "google", "alphabet", "youtube", "search engine",
        "meta", "facebook", "instagram", "whatsapp", "threads",
        "twitter", "x corp", "snapchat", "tiktok", "reddit",
        "linkedin", "pinterest", "social media",
    ],
    # Communication & Media Sector:
    "Telecommunications": [
        "telecom", "5g", "wireless", "broadband",
        "verizon", "at&t", "t-mobile", "comcast", "charter",
        "vodafone", "telefonica", "bt group", "rogers", "singtel",
    ],
    "Media & Entertainment": [
        "media", "streaming", "disney", "espn", "hulu",
        "netflix", "warner bros", "hbo", "paramount", "peacock",
        "sony pictures", "universal", "box office", "cinema",
    ],
    # Consumer Sector:
    "Retail & E-Commerce": [
        "retail", "e-commerce", "amazon", "alibaba", "shopify",
        "ebay", "etsy", "walmart", "target", "costco", "kroger",
        "best buy", "flipkart", "mercado libre",
    ],
    "Consumer Goods & Apparel": [
        "nike", "adidas", "lululemon", "puma", "under armour",
        "apparel", "footwear", "luxury", "lvmh", "gucci", "burberry",
        "rolex", "hermes", "tapestry",
    ],
    "Food & Beverage": [
        "food", "beverage", "coca-cola", "pepsico", "nestlé",
        "restaurant", "fast food", "mcdonald", "starbucks",
        "yum brands", "kfc", "pizza hut", "chipotle",
        "kraft", "general mills", "heinz", "tyson foods",
    ],
    "Hospitality & Leisure": [
        "hotel", "marriott", "hilton", "hyatt", "airbnb",
        "booking.com", "expedia", "travel", "cruise", "carnival",
        "royal caribbean", "las vegas sands", "mgm resorts",
    ],
    "Automotive": [
        "automotive", "auto", "car", "vehicle", "ev",
        "tesla", "general motors", "ford", "stellantis",
        "volkswagen", "toyota", "nissan", "bmw", "mercedes",
        "hyundai", "kia", "rivian", "lucid",
    ],
    # Healthcare Sector:
    "Pharmaceuticals": [
        "pharma", "drug", "medicine", "vaccine", "fda",
        "pfizer", "moderna", "johnson & johnson", "merck",
        "novartis", "roche", "astrazeneca", "bayer", "gsk",
    ],
    "Biotechnology": [
        "biotech", "gene therapy", "crispr", "genomics",
        "illumina", "gilead", "amgen", "biogen", "regeneron",
        "vertex", "bluebird bio",
    ],
    "Medical Devices & Services": [
        "medical device", "medtech", "diagnostics", "surgical",
        "medtronic", "boston scientific", "abbott", "stryker",
        "philips healthcare", "siemens healthineers", "cardinal health",
        "hospital", "clinic", "healthcare services",
    ],
    # Energy & Utilities Sector:
    "Oil & Gas": [
        "oil", "gas", "petroleum", "upstream", "downstream",
        "exxon", "chevron", "bp", "shell", "totalenergies",
        "conocophillips", "aramco", "occidental", "slb",
    ],
    "Renewables & Clean Energy": [
        "renewable", "solar", "wind", "geothermal", "hydro",
        "clean energy", "green energy", "next era", "sunpower",
        "first solar", "enphase", "vestas", "siemens gamesa",
        "hydrogen", "electrolyzer", "fuel cell",
    ],
    "Utilities": [
        "utility", "power grid", "electricity", "water utility",
        "natural gas utility", "duke energy", "southern company",
        "dominion", "pg&e", "national grid", "aes",
    ],
    # Financials Sector:
    "Banks": [
        "bank", "commercial bank", "jpmorgan", "bank of america",
        "citigroup", "wells fargo", "goldman sachs", "morgan stanley",
        "u.s. bancorp", "hsbc", "barclays", "santander", "dbs",
    ],
    "Investment & Asset Management": [
        "asset manager", "blackrock", "vanguard", "fidelity",
        "state street", "schwab", "hedge fund", "private equity",
        "kkr", "carried interest", "mutual fund", "etf",
        "sovereign wealth fund",
    ],
    "Insurance": [
        "insurance", "insurer", "aig", "allstate", "progressive",
        "metlife", "prudential", "chubb", "berkshire hathaway insurance",
        "reinsurance", "lloyd's", "actuarial",
    ],
    "Fintech & Payments": [
        "fintech", "payment", "visa", "mastercard",
        "american express", "paypal", "block inc", "square",
        "stripe", "sofi", "robinhood", "buy now pay later",
        "klarna", "affirm", "ant financial",
    ],
    "Cryptocurrency & Blockchain": [
        "bitcoin", "ethereum", "crypto", "blockchain",
        "coinbase", "binance", "defi", "nft",
        "stablecoin", "mining rig", "hashrate",
    ],
    # Industrial Sector:
    "Aerospace & Defense": [
        "aerospace", "defense", "boeing", "airbus", "northrop",
        "lockheed martin", "raytheon", "bae systems", "general dynamics",
        "drones", "satellite", "nasa contract",
    ],
    "Transportation & Logistics": [
        "shipping", "freight", "logistics", "supply chain",
        "fedex", "ups", "dhl", "maersk", "csx", "union pacific",
        "delta airlines", "american airlines", "united airlines",
        "railroad", "port congestion",
    ],
    "Manufacturing & Machinery": [
        "manufacturing", "factory", "industrial", "caterpillar",
        "3m", "general electric", "siemens", "honeywell", "emerson",
        "robotics", "automation", "abb", "fanuc",
    ],
    "Construction & Engineering": [
        "construction", "engineering", "infrastructure",
        "bechtel", "fluor", "jacobs", "skanska", "kiewit",
        "turner construction", "architect", "building materials",
    ],
    "Chemicals & Specialty Materials": [
        "chemical", "chemicals", "specialty chemical",
        "dupont", "dow", "basf", "lyondellbasell", "air products",
        "eastman", "evonik", "synthetic rubber", "petrochemical",
    ],
    "Metals & Mining": [
        "mining", "metal", "steel", "aluminum", "copper",
        "iron ore", "rio tinto", "bhp", "vale", "newmont",
        "glencore", "lithium", "nickel", "rare earth",
    ],
    "Agriculture": [
        "agriculture", "farming", "crop", "soybean", "corn",
        "wheat", "cargill", "archer daniels midland", "bunge",
        "deere", "monsanto", "fertilizer", "nutrien", "potash",
    ],
    # Real Estate Sector:
    "Real Estate": [
        "real estate", "realtor", "reit", "property", "mortgage",
        "office vacancy", "housing market", "zillow", "redfin",
        "wework", "commercial property", "residential property",
        "industrial park", "logistics park",
    ],
    # ESG / Government / Education Sectors:
    "Environmental & ESG": [
        "esg", "sustainability", "carbon", "emissions",
        "carbon credit", "offset", "green bond", "climate risk",
        "cop28", "environmental regulation",
    ],
    "Government & Policy": [
        "government", "regulation", "legislation", "policy",
        "federal reserve", "congress", "white house",
        "eu commission", "trade tariff", "sanction", "geopolitics",
    ],
    "Education": [
        "education", "edtech", "university", "college", "school",
        "coursera", "edx", "udemy", "chegg", "student loan",
    ],
}

import re
from functools import lru_cache

# Keywords up to this length must match as a whole word (optionally plural).
# Short keywords are the ones that occur inside unrelated words, e.g. "ev" in
# "every", "car" in "care", "oil" in "turmoil".
_SHORT_KEYWORD_MAX_LEN = 4


@lru_cache(maxsize=None)
def _sector_pattern(keywords: tuple):
    """Compile one regex matching any keyword of a sector; None if there are no keywords.

    Every keyword must start at a word boundary. Short keywords must also end
    at one (a plural "s"/"es" is tolerated); longer keywords may be followed
    by more letters so stems such as "pharma" still match "pharmaceutical".
    """
    parts = []
    for kw in keywords:
        kw = kw.strip().lower()
        if not kw:
            continue
        if len(kw) <= _SHORT_KEYWORD_MAX_LEN:
            parts.append(re.escape(kw) + r"(?:s|es)?(?!\w)")
        else:
            parts.append(re.escape(kw))
    if not parts:
        return None
    return re.compile(r"(?<!\w)(?:" + "|".join(parts) + ")")


# Function to assign sectors based on keywords in the summary text
def assign_sector(text: str, sector_map=None) -> str:
    """Return the first sector whose keywords occur in `text`, else "General".

    Args:
        text: Article summary. Non-string or empty values give "General".
        sector_map: Optional mapping of sector name to keyword list; defaults
            to the module-level SECTOR_MAP. Sectors are tried in mapping order.

    Returns:
        The matching sector name, or "General" when nothing matches.
    """
    if not isinstance(text, str) or not text:
        return "General"

    mapping = SECTOR_MAP if sector_map is None else sector_map
    text_low = text.lower()
    for sector, keywords in mapping.items():
        pattern = _sector_pattern(tuple(keywords))
        if pattern is not None and pattern.search(text_low):
            return sector
    return "General"

# Aggregation function to combine headlines and summaries by Date and Sector
def aggregate_nyt(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse articles to one row per (Date, Sector).

    Within each group the non-null headlines, and separately the non-null
    summaries, are converted to strings and joined with " | ".
    """

    def _pipe_join(values):
        return " | ".join(values.dropna().astype(str))

    per_day_sector = df.groupby(["Date", "Sector"])
    combined = per_day_sector.agg({"Headline": _pipe_join, "Summary": _pipe_join})
    return combined.reset_index()

# Main function to run the entire process
def main():
    """Crawl the NYT archive, tag each article with a sector and write the aggregate CSV.

    Steps:
        1. Resume or finish the archive crawl (appends to CSV_PATH).
        2. Load CSV_PATH and assign a sector from each article's summary.
        3. Aggregate to one row per Date x Sector and write AGG_PATH.
        4. If a global `news_df` with an "Article" column exists, tidy that column.
    """
    # Resume NYT archive
    crawl_nyt_archive()

    # Loading the full Business CSV
    nyt_df = pd.read_csv(CSV_PATH)
    nyt_df["Headline"] = nyt_df["Headline"].astype(str)
    nyt_df["Summary"] = nyt_df["Summary"].fillna("").astype(str)

    # Assigning sectors (Summary is already NaN-free at this point)
    nyt_df["Sector"] = nyt_df["Summary"].apply(assign_sector)

    # Aggregate
    nyt_aggregated = aggregate_nyt(nyt_df)
    nyt_aggregated.to_csv(AGG_PATH, index=False)
    print(f"Aggregated file written: {AGG_PATH}")

    # Clean-up if news_df already exists. The column check matters: elsewhere
    # in this notebook `news_df` holds the NYT aggregate, which has no
    # "Article" column, and indexing it unconditionally raises KeyError.
    existing_news = globals().get("news_df")
    if isinstance(existing_news, pd.DataFrame) and "Article" in existing_news.columns:
        existing_news["Article"] = (
            existing_news["Article"]
            .str.replace(r"By .*? \|", "", regex=True)
            .str.replace(r"\n+", " ", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
        )
        print("Reuters news_df cleaned.")

# Main function when this script is executed directly
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterrupted – progress saved, just rerun to continue.")

In [None]:
# Preview the aggregated NYT articles (first five rows)
nyt_aggregated = pd.read_csv("nyt_aggregated_data.csv")
print(nyt_aggregated.head(5))

# Preview the merged S&P 500 / VIX market data (first five rows)
merged_data = pd.read_csv("sp500_vix_data.csv")
print(merged_data.head(5))


## Sentiment Analysis Implementation

### Unified FinBERT and GPT-4 Fallback

In [None]:
import os
import time 
from datetime import datetime
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd
import torch
import openai
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Config
USE_GPT4_FALLBACK = True
GPT4_CONFIDENCE_THRESHOLD = 0.05   # rows with |FinBERT score| below this go to GPT-4
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_MAX_TOKENS = 10

DERIVED_PATH = "derived"
Path(DERIVED_PATH).mkdir(exist_ok=True)

GPT4_CHECKPOINT_EVERY = 250   # save after every 250 GPT-4 calls

# Ensure OpenAI key only if fallback is active
if USE_GPT4_FALLBACK:
    _openai_api_key = os.getenv("OPENAI_API_KEY")
    if not _openai_api_key:
        raise EnvironmentError("OPENAI_API_KEY not set.")
    openai.api_key = _openai_api_key
    # gpt4_sentiment_single() calls `client.chat.completions.create`, so a
    # client object has to exist. Without it each call raised NameError.
    client = openai.OpenAI(api_key=_openai_api_key)

# Load market data
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])

# Load the aggregated NYT articles and switch to lowercase column names
news_df = pd.read_csv("nyt_aggregated_data.csv", parse_dates=["Date"]).rename(
    columns={"Summary": "summary", "Headline": "headline", "Sector": "sector"}
)
# Empty summaries come back from the CSV as NaN; keep the column all-string.
news_df["summary"] = news_df["summary"].fillna("").astype(str)

# Aligning news dates with trading calendar
trading_days = set(market_df["Date"].dt.normalize())
news_df = news_df[news_df["Date"].dt.normalize().isin(trading_days)].reset_index(drop=True)

# FinBERT model - ProsusAI
FINBERT_MODEL = "ProsusAI/finbert"

# Tokenizer and classifier are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL)
# .eval() returns the module itself after switching it to evaluation mode.
finbert = AutoModelForSequenceClassification.from_pretrained(FINBERT_MODEL).eval()

@torch.inference_mode()
def finbert_score(
    texts: List[str],
    batch_size: int = 32,
    model=None,
    text_tokenizer=None,
) -> Tuple[List[float], List[np.ndarray]]:
    """Score texts with FinBERT.

    The class order is read from `model.config.id2label` instead of being
    assumed. ProsusAI/finbert publishes 0=positive, 1=negative, 2=neutral, so
    fixed indices 1 and 2 would compute "negative minus neutral".

    Args:
        texts: Texts to score. Non-string entries (e.g. NaN) are scored as "".
        batch_size: Number of texts per forward pass.
        model: Sequence classifier; defaults to the module-level `finbert`.
        text_tokenizer: Tokenizer; defaults to the module-level `tokenizer`.

    Returns:
        (scores, probs): `scores[i]` is P(positive) - P(negative) for text i.
        `probs[i]` is a length-3 array ordered (neutral, positive, negative),
        the order add_sentiment() uses when it fills the FinBERT_prob_* columns.

    Raises:
        ValueError: If the model's labels lack neutral, positive or negative.
    """
    model = finbert if model is None else model
    text_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    label_to_idx = {
        str(label).lower(): int(idx) for idx, label in model.config.id2label.items()
    }
    try:
        order = [label_to_idx[name] for name in ("neutral", "positive", "negative")]
    except KeyError as exc:
        raise ValueError(
            f"Model labels {sorted(label_to_idx)} lack neutral/positive/negative"
        ) from exc

    scores, probs_all = [], []
    for i in range(0, len(texts), batch_size):
        # The tokenizer only accepts strings; NaN summaries become empty text.
        batch = [t if isinstance(t, str) else "" for t in texts[i:i + batch_size]]
        enc = text_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256)
        out = model(**enc)
        probs = torch.nn.functional.softmax(out.logits, dim=1).cpu().numpy()
        probs = probs[:, order]  # columns are now (neutral, positive, negative)
        batch_scores = probs[:, 1] - probs[:, 2]
        scores.extend(batch_scores.tolist())
        probs_all.extend(probs)
    return scores, probs_all

def _get_openai_client():
    """Return the notebook-level OpenAI `client`, creating and caching it on first use."""
    existing = globals().get("client")
    if existing is None:
        existing = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        globals()["client"] = existing
    return existing


def gpt4_sentiment_single(text, sector=None):
    """Classify one summary with the OpenAI chat model; return -1.0, 0.0 or 1.0.

    Args:
        text: Article summary to classify.
        sector: Optional sector name to mention in the prompt.

    Returns:
        1.0 if the reply starts with "pos", -1.0 if it starts with "neg",
        otherwise 0.0. API errors are reported and treated as neutral (0.0).
    """
    system_msg = "You are a financial news analyst."
    # The prompt used to contain the literal text "{summary}" and "{sector}"
    # (the string was not an f-string) and asked for probabilities while also
    # demanding a one-word answer. It now asks only for the single label that
    # is parsed below.
    sector_clause = (
        f", which is closely related to the {sector} industry," if sector else ""
    )
    user_msg = (
        f"Classify the sentiment of the given NY Times news article summary{sector_clause} "
        "as positive for buy, negative for sell, or neutral for hold position, "
        "for the US Stock market. "
        "Answer with just one word: positive, negative, or neutral.\n\n"
        f"Summary: \"{text}\""
    )
    try:
        response = _get_openai_client().chat.completions.create(
            model=OPENAI_MODEL,
            messages=[{"role": "system", "content": system_msg},
                      {"role": "user", "content": user_msg}],
            max_tokens=OPENAI_MAX_TOKENS,
            temperature=0.0,
        )
        ans = (response.choices[0].message.content or "").strip().lower()
    except openai.OpenAIError as e:
        # Only API-level failures are downgraded to neutral. A blanket
        # `except Exception` also hid the NameError from the undefined client,
        # which made every fallback call return 0.0.
        print("GPT-4 error (treated as neutral):", e)
        return 0.0
    if ans.startswith("pos"):
        return 1.0
    if ans.startswith("neg"):
        return -1.0
    return 0.0

def add_sentiment(news_df, use_gpt4=False):
    """Attach FinBERT sentiment to `news_df`, optionally relabelling weak rows via GPT-4.

    Checkpoints under DERIVED_PATH:
        * nyt_with_finbert_sentiment.csv: loaded instead of re-scoring if present.
        * nyt_with_gpt4_fallback_sentiment.csv: written only once the fallback
          has finished; loaded instead of re-running if present.
        * nyt_with_gpt4_fallback_sentiment.csv.partial: intermediate progress.
          It is kept apart from the final file so an interrupted run is
          resumed rather than mistaken for a completed one.

    Args:
        news_df: Frame with "Date" and "summary" columns.
        use_gpt4: Re-score rows whose FinBERT confidence is below
            GPT4_CONFIDENCE_THRESHOLD with gpt4_sentiment_single().

    Returns:
        (news_df, num_gpt4): the frame with a "Sentiment" column, and the
        number of low-confidence rows (0 when the fallback was skipped or
        loaded from its final checkpoint).
    """
    # Checkpoint 1: If FinBERT file exists, skip FinBERT and load
    finbert_path = os.path.join(DERIVED_PATH, "nyt_with_finbert_sentiment.csv")
    if os.path.exists(finbert_path):
        print("Checkpoint: FinBERT sentiment file found. Loading and skipping FinBERT step.")
        news_df = pd.read_csv(finbert_path, parse_dates=["Date"])
    else:
        print("Scoring FinBERT…")
        start_finbert = time.time()
        fin_scores, fin_probs = finbert_score(news_df["summary"].tolist())
        end_finbert = time.time()
        news_df["FinBERT_score"] = fin_scores
        news_df["FinBERT_prob_neu"] = [p[0] for p in fin_probs]
        news_df["FinBERT_prob_pos"] = [p[1] for p in fin_probs]
        news_df["FinBERT_prob_neg"] = [p[2] for p in fin_probs]
        news_df["FinBERT_confidence"] = news_df["FinBERT_score"].abs()
        news_df["Sentiment"] = news_df["FinBERT_score"]  # Start with FinBERT
        print(f"FinBERT sentiment scored in {end_finbert - start_finbert:.2f} seconds.")
        news_df.to_csv(finbert_path, index=False)
        print(f"Checkpoint: FinBERT sentiment saved to {finbert_path}")

    num_gpt4 = 0
    if not use_gpt4:
        return news_df, num_gpt4

    gpt4_path = os.path.join(DERIVED_PATH, "nyt_with_gpt4_fallback_sentiment.csv")
    partial_path = gpt4_path + ".partial"
    done_col = "GPT4_fallback_done"  # bookkeeping only; dropped from the final file

    # Checkpoint 2: a final file means the fallback already ran to completion
    if os.path.exists(gpt4_path):
        print("Checkpoint: GPT-4 fallback file found. Loading and skipping fallback step.")
        return pd.read_csv(gpt4_path, parse_dates=["Date"]), num_gpt4

    if os.path.exists(partial_path):
        print("Checkpoint: partial GPT-4 fallback file found. Resuming fallback step.")
        news_df = pd.read_csv(partial_path, parse_dates=["Date"])
        news_df[done_col] = news_df[done_col].fillna(False).astype(bool)
    else:
        news_df[done_col] = False

    print("Running GPT-4 fallback…")
    low_conf_mask = news_df["FinBERT_confidence"] < GPT4_CONFIDENCE_THRESHOLD
    num_gpt4 = int(low_conf_mask.sum())
    pending = news_df.index[low_conf_mask & ~news_df[done_col]]
    print(f"News needing GPT-4 fallback: {num_gpt4} of {len(news_df)}")
    if len(pending) < num_gpt4:
        print(f"Already scored in a previous run: {num_gpt4 - len(pending)}")

    start_gpt4 = time.time()
    for calls_made, idx in enumerate(pending, start=1):
        news_df.at[idx, "Sentiment"] = gpt4_sentiment_single(news_df.at[idx, "summary"])
        news_df.at[idx, done_col] = True
        # Intermediate checkpoint (written to the partial file, never the final one)
        if calls_made % GPT4_CHECKPOINT_EVERY == 0:
            news_df.to_csv(partial_path, index=False)
            print(f"Checkpoint: Saved GPT-4 fallback at {calls_made} / {len(pending)} GPT-4 calls.")
    end_gpt4 = time.time()
    print(f"GPT-4 fallback completed in {end_gpt4 - start_gpt4:.2f} seconds.")

    news_df = news_df.drop(columns=[done_col])
    news_df.to_csv(gpt4_path, index=False)
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print(f"Checkpoint: Final GPT-4 fallback saved to {gpt4_path}")

    return news_df, num_gpt4
   
# Main Pipeline
Path(DERIVED_PATH).mkdir(exist_ok=True)
pipeline_start = time.time()
news_df, num_gpt4 = add_sentiment(news_df, use_gpt4=USE_GPT4_FALLBACK)
pipeline_end = time.time()
print(f"Total pipeline runtime: {pipeline_end - pipeline_start:.2f} seconds.")

# Daily sentiment aggregate: mean article sentiment per calendar day
sent_daily = (
    news_df.groupby(news_df["Date"].dt.normalize())["Sentiment"]
    .mean()
    .reset_index()
)
sent_daily_path = os.path.join(DERIVED_PATH, "daily_sentiment_aggregate.csv")
sent_daily.to_csv(sent_daily_path, index=False)

# Merge with market data for modeling (LSTM).
# Joining on the shared "Date" column avoids the extra "key_0" / "Date_sent"
# columns that array-valued left_on/right_on keys add to the result.
market_merge = pd.merge(
    market_df.assign(Date=market_df["Date"].dt.normalize()),
    sent_daily,
    on="Date",
    how="left",
)
final_out_path = os.path.join(DERIVED_PATH, "final_merged_for_lstm.csv")
market_merge.to_csv(final_out_path, index=False)
print(f"Checkpoint: Final merged LSTM-ready data saved to {final_out_path}")

print(f"All CSVs saved to: {DERIVED_PATH}/")
print("Output files:")
print(" - nyt_with_finbert_sentiment.csv")
if USE_GPT4_FALLBACK:
    # This file is only produced when the fallback step actually runs.
    print(" - nyt_with_gpt4_fallback_sentiment.csv")
print(" - daily_sentiment_aggregate.csv")
print(" - final_merged_for_lstm.csv")



#### FinBERT-only Sentiment Aggregate

In [None]:
# Extra: Create daily FinBERT-only sentiment aggregate
finbert_only_path = os.path.join(DERIVED_PATH, "nyt_with_finbert_sentiment.csv")
finbert_df = pd.read_csv(finbert_only_path, parse_dates=["Date"])

# Mean FinBERT score per calendar day
daily_finbert_sent = (
    finbert_df.groupby(finbert_df["Date"].dt.normalize())["FinBERT_score"]
    .mean()
    .reset_index()
)
daily_finbert_sent_path = os.path.join(DERIVED_PATH, "daily_FinBERT_sentiment_aggregate.csv")
daily_finbert_sent.to_csv(daily_finbert_sent_path, index=False)

# Merge with market data for modeling (LSTM), FinBERT-only version.
# Joining on the shared "Date" column avoids the extra "key_0" / "Date_sent"
# columns that array-valued left_on/right_on keys add to the result.
market_merge_finbert = pd.merge(
    market_df.assign(Date=pd.to_datetime(market_df["Date"]).dt.normalize()),
    daily_finbert_sent,
    on="Date",
    how="left",
)
final_out_finbert_path = os.path.join(DERIVED_PATH, "final_merged_FinBERT_for_lstm.csv")
market_merge_finbert.to_csv(final_out_finbert_path, index=False)

print(f"Checkpoint: FinBERT-only daily sentiment aggregate saved to {daily_finbert_sent_path}")
print(f"Checkpoint: Final merged FinBERT-only LSTM-ready data saved to {final_out_finbert_path}")

In [None]:
import pandas as pd
import os

DERIVED_PATH = 'derived'

# Market data, with Date rendered as 'YYYY-MM-DD' strings so all frames share one key format
market_df = pd.read_csv("sp500_vix_data.csv", parse_dates=["Date"])
market_df['Date'] = pd.to_datetime(market_df['Date']).dt.strftime('%Y-%m-%d')

# --- FinBERT-only sentiment ---
finbert_daily_csv = os.path.join(DERIVED_PATH, "daily_FinBERT_sentiment_aggregate.csv")
finbert_sent_df = pd.read_csv(finbert_daily_csv)
finbert_sent_df['Date'] = pd.to_datetime(finbert_sent_df['Date']).dt.strftime('%Y-%m-%d')

# Left join keeps every market row; days without news get NaN sentiment
merged_finbert = market_df.merge(finbert_sent_df, on='Date', how='left')
merged_finbert.to_csv(
    os.path.join(DERIVED_PATH, "final_merged_FinBERT_for_lstm.csv"), index=False
)
print("Checkpoint: FinBERT-only LSTM-ready data saved.")

# --- Hybrid (GPT-4 fallback) sentiment ---
hybrid_daily_csv = os.path.join(DERIVED_PATH, "daily_sentiment_aggregate.csv")
hybrid_sent_df = pd.read_csv(hybrid_daily_csv)
hybrid_sent_df['Date'] = pd.to_datetime(hybrid_sent_df['Date']).dt.strftime('%Y-%m-%d')

merged_hybrid = market_df.merge(hybrid_sent_df, on='Date', how='left')
merged_hybrid.to_csv(
    os.path.join(DERIVED_PATH, "final_merged_for_lstm.csv"), index=False
)
print("Checkpoint: Hybrid LSTM-ready data saved.")


#Note: The OpenAI API usage shown above may need adapting for an actual GPT-4 model: chat models are called through a chat-completions interface (e.g. openai.ChatCompletion.create, with the input given as a list of messages) rather than the older Completion.create. The idea is the same either way – send the article text and get back a sentiment label. The prompt here is simplified; in practice, one could include examples or a more specific role description for consistency.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("nyt_aggregated_data.csv")

# Count sector frequencies (value_counts sorts by descending count)
sector_counts = df['Sector'].value_counts()
N = 12  # number of sectors shown individually
top_sectors = sector_counts.index[:N]
sizes = sector_counts.values[:N].tolist()
labels = top_sectors.tolist()

# "Other" combines all the rest
if len(sector_counts) > N:
    other_size = sector_counts.values[N:].sum()
    sizes.append(other_size)
    labels.append("Other")

# Colors
colors = plt.get_cmap('tab20').colors[:len(labels)]

fig, ax = plt.subplots(figsize=(13, 6))
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=None,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    pctdistance=0.75,
    wedgeprops=dict(width=0.38, edgecolor='w')  # width < 1 turns the pie into a donut
)

# Boxed labels connected to their wedges by thin leader lines
bbox_props = dict(boxstyle="round,pad=0.25", fc="white", ec="k", lw=0.7)

for p, label in zip(wedges, labels):
    # Mid-angle of the wedge and the matching point on the unit circle
    ang = (p.theta2 - p.theta1) / 2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))

    if label == "Other":
        # "Other" gets a fixed position on the far left with a straight connector
        label_x, label_y = -2.1, 0
        horizontalalignment = "right"

        ax.annotate(
            label,
            xy=(x, y), xytext=(label_x, label_y),
            horizontalalignment=horizontalalignment,
            fontsize=10,
            arrowprops=dict(arrowstyle="-", lw=1.8, color="gray", connectionstyle="arc3,rad=0"),
            bbox=bbox_props, va="center"
        )
    else:
        # Treat x == 0 as the right-hand side. A lookup keyed on np.sign(x)
        # has no entry for 0 and would raise KeyError in that case.
        side = 1 if x >= 0 else -1
        label_x = 1.44 * side
        label_y = 1.5 * y
        horizontalalignment = "left" if side > 0 else "right"
        connectionstyle = f"angle,angleA=0,angleB={ang}"
        ax.annotate(
            label,
            xy=(x, y), xytext=(label_x, label_y),
            horizontalalignment=horizontalalignment,
            fontsize=12,
            arrowprops=dict(arrowstyle="-", lw=1.3, color="gray", connectionstyle=connectionstyle),
            bbox=bbox_props, va="center"
        )

plt.setp(autotexts, size=10, color='black')
plt.title('NYT Aggregated Data – Sector Distribution', fontsize=20, fontweight='bold', pad=25)
plt.tight_layout()
plt.savefig("nyt_sector_donut_final.png", dpi=300)
plt.show()