# 01_load_data.ipynb

In [1]:
#Importing libraries
import os
import pandas as pd
import yfinance as yf
from fredapi import Fred
from pytrends.request import TrendReq
from dotenv import load_dotenv
from datetime import datetime

In [2]:
# Load API keys from a local .env file into the process environment
# (FRED_API_KEY is read later via os.getenv).
# load_dotenv is already imported in the imports cell, so it is not re-imported here.
load_dotenv()

True

In [3]:
# 📅 Date window used by the later download cells:
# a fixed start, and an end equal to the day the notebook is run (both 'YYYY-MM-DD' strings).
start_date = "2004-01-01"
end_date = format(datetime.today(), "%Y-%m-%d")

In [4]:
# 1️⃣ Load S&P 500 from Yahoo Finance (weekly bars)
# auto_adjust is passed explicitly: yfinance changed this argument's default to True,
# so pinning it keeps the downloaded prices independent of the installed yfinance version.
sp500 = yf.download(
    "^GSPC",
    start=start_date,
    end=end_date,
    interval="1wk",
    auto_adjust=True,
)
# Keep only the columns used downstream. reset_index() (not inplace) plus .copy() gives an
# independent frame, so the Date assignment below never writes into a slice of the raw download.
# NOTE(review): recent yfinance versions may return two-level (Price, Ticker) columns even for a
# single ticker — confirm the header layout of the saved CSV is what downstream readers expect.
sp500 = sp500.reset_index()[["Date", "Close", "Volume"]].copy()
# Shift each weekly bar's timestamp forward to the next Friday.
# NOTE(review): Week(weekday=4) moves a date that is already a Friday to the *following* Friday —
# confirm yfinance never stamps weekly bars on a Friday.
sp500["Date"] = pd.to_datetime(sp500["Date"]) + pd.offsets.Week(weekday=4)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [5]:
# 2️⃣ Load VIX Index (weekly bars)
# auto_adjust is passed explicitly: yfinance changed this argument's default to True,
# so pinning it keeps the downloaded prices independent of the installed yfinance version.
vix = yf.download(
    "^VIX",
    start=start_date,
    end=end_date,
    interval="1wk",
    auto_adjust=True,
)
# Keep Date and Close only; rename() returns a new frame, so the Date assignment below
# works on an independent object rather than a slice of the raw download.
vix = vix.reset_index()[["Date", "Close"]].rename(columns={"Close": "VIX_Close"})
# Shift each weekly bar's timestamp forward to the next Friday.
# NOTE(review): Week(weekday=4) moves a date that is already a Friday to the *following* Friday —
# confirm yfinance never stamps weekly bars on a Friday.
vix["Date"] = pd.to_datetime(vix["Date"]) + pd.offsets.Week(weekday=4)

[*********************100%***********************]  1 of 1 completed


In [6]:
# 3️⃣ Load Google Trends data
pytrends = TrendReq(hl='en-US', tz=360)
kw_list = ["stock market crash"]
# Build the timeframe from start_date / end_date so the Trends request covers the same
# window as the other downloads instead of a hard-coded end date that goes stale on re-run.
trends_timeframe = f"{start_date} {end_date}"
pytrends.build_payload(kw_list, cat=0, timeframe=trends_timeframe, geo='', gprop='')

# Download interest over time
google_sentiment = pytrends.interest_over_time()
if google_sentiment.empty:
    # Without this guard an empty response fails below with an opaque KeyError on "date".
    raise RuntimeError(
        f"Google Trends returned no data for {kw_list!r} over '{trends_timeframe}'"
    )
# Keep the timestamp and the single keyword column (any other returned columns are dropped).
google_sentiment = google_sentiment.reset_index()[["date", kw_list[0]]]
google_sentiment.columns = ["Date", "Google_Sentiment_Index"]

# Resample to weekly (Friday); weeks without their own observation repeat the last known value.
# NOTE(review): for a multi-year timeframe Google Trends presumably returns coarser-than-weekly
# points, so most weekly rows would be forward-filled copies — confirm this is intended.
google_sentiment["Date"] = pd.to_datetime(google_sentiment["Date"])
google_sentiment = google_sentiment.set_index("Date").resample("W-FRI").ffill().reset_index()

  df = df.fillna(False)


In [7]:
# 4️⃣ Load Unemployment data from FRED
# The API key is read from the environment variable FRED_API_KEY.
fred = Fred(api_key=os.getenv("FRED_API_KEY"))

# Fetch the UNRATE series and wrap it in a one-column frame with a datetime index.
unrate_source = fred.get_series('UNRATE').to_frame(name='Unemployment')
unrate_source.index = pd.to_datetime(unrate_source.index)

# Put the series on a Friday-anchored weekly grid, forward-filling the weeks in between,
# then expose the index as a regular "Date" column.
unrate_weekly = (
    unrate_source
    .resample("W-FRI")
    .ffill()
    .reset_index()
    .set_axis(["Date", "Unemployment"], axis=1)
)

# Drop everything before the start of the analysis window.
window_start = pd.to_datetime(start_date)
unrate = unrate_weekly[unrate_weekly["Date"] >= window_start]

In [8]:
# 5 Report the first and last date available in each dataset
print("📅 Dataset Date Ranges\n" + "-"*30)

# Labels are pre-padded so the date columns line up in the printed report.
labelled_frames = [
    ("S&P 500:          ", sp500),
    ("VIX:              ", vix),
    ("Unemployment:     ", unrate),
    ("Google Trends:    ", google_sentiment),
]
for label, frame in labelled_frames:
    dates = frame["Date"]
    print(f"{label}{dates.min().date()} → {dates.max().date()}")

📅 Dataset Date Ranges
------------------------------
S&P 500:          2004-01-02 → 2025-07-11
VIX:              2004-01-02 → 2025-07-11
Unemployment:     2004-01-02 → 2025-06-06
Google Trends:    2004-01-02 → 2025-07-04


In [9]:
# 6 Align all datasets to the latest date that every dataset covers.
# The cut-off is the earliest of the four end dates, computed from the data rather than
# hard-coded, so re-running the notebook later does not silently truncate to a stale date.
# (For the date ranges printed in the previous cell this evaluates to 2025-06-06.)
latest_date = min(
    sp500["Date"].max(),
    vix["Date"].max(),
    unrate["Date"].max(),
    google_sentiment["Date"].max(),
)
sp500 = sp500[sp500["Date"] <= latest_date]
vix = vix[vix["Date"] <= latest_date]
unrate = unrate[unrate["Date"] <= latest_date]
google_sentiment = google_sentiment[google_sentiment["Date"] <= latest_date]

In [10]:
# ✅ Save data to files for further reuse
raw_dir = "../data/raw"
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(raw_dir, exist_ok=True)
sp500.to_csv(f"{raw_dir}/sp500_data.csv", index=False)
vix.to_csv(f"{raw_dir}/vix_data.csv", index=False)
unrate.to_csv(f"{raw_dir}/unrate_data.csv", index=False)
google_sentiment.to_csv(f"{raw_dir}/google_trends.csv", index=False)