02_clean_data

In [1]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# ✅ Import mood index feature from utils
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the `utils`
# package referenced below can be resolved (presumably the project root —
# confirm). The membership check keeps sys.path free of duplicate entries
# when this cell is executed more than once in the same kernel.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
from utils.mood_features import get_mood_index

In [3]:
# ✅ Load datasets prepared in PR#1
# Each raw CSV is read with its "Date" column parsed to datetime, which is the
# key the frames are merged on further down.
sp500 = pd.read_csv("../data/raw/sp500_data.csv", parse_dates=["Date"])
# Path normalised: the original had a stray double slash ("../data//raw/...").
vix = pd.read_csv("../data/raw/vix_data.csv", parse_dates=["Date"])
unrate = pd.read_csv("../data/raw/unrate_data.csv", parse_dates=["Date"])
google_sentiment = pd.read_csv("../data/raw/google_trends.csv", parse_dates=["Date"])

In [4]:
# 🔗 Merge all datasets
# Start with an outer join of the S&P 500 and VIX frames on "Date" so that
# dates present in only one of the two sources are kept.
df = pd.merge(sp500, vix, on="Date", how="outer")

In [5]:
# 🧼 Flatten column levels if merged DataFrame has multi-index columns
# Each column tuple is joined with a space and trimmed, giving one flat label.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [" ".join(levels).strip() for levels in df.columns.to_flat_index()]

# Unemployment is outer-joined (keeps dates from either side); Google Trends
# is left-joined, so it never adds dates that are not already in `df`.
df = (
    df.merge(unrate, on="Date", how="outer")
      .merge(google_sentiment, on="Date", how="left")
)

In [6]:
# 🧼 Clean and compute features
# Map the ticker-suffixed column labels to the standard names used below.
column_aliases = {
    "Close ^GSPC": "Close",
    "Volume ^GSPC": "Volume",
    "VIX_Close ^VIX": "VIX_Close",
}

# Rename, order chronologically with a fresh 0..n-1 index, then discard rows
# missing any of the three required series. The index is not reset again
# after the drop, so it may contain gaps.
df = (
    df.rename(columns=column_aliases)
      .sort_values("Date")
      .reset_index(drop=True)
      .dropna(subset=["Close", "VIX_Close", "Unemployment"])
)

In [7]:
# Ensure columns are numeric
# errors="coerce" turns any value that cannot be parsed as a number into NaN.
numeric_cols = ["Close", "VIX_Close"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# The missing-value filter in the previous cell ran *before* this coercion,
# so NaNs created here would otherwise leak into the return calculations and
# the saved file. Drop those rows to restore the "no missing prices" invariant.
df = df.dropna(subset=numeric_cols)

In [8]:
# Calculate returns and changes
# Both features are row-over-row fractional changes of the respective level
# series; the first row has no predecessor and is therefore NaN.
df = df.assign(
    SP500_Returns=df["Close"].pct_change(),
    VIX_Change=df["VIX_Close"].pct_change(),
)

In [9]:
# ✅ Apply mood index from utils
# `get_mood_index` is imported from utils.mood_features; its implementation is
# not visible in this notebook. `df` is rebound to whatever it returns.
# NOTE(review): presumably it returns the same frame with a mood-index column
# added, built from the features computed above — confirm against the helper.
df = get_mood_index(df)

In [10]:
# Saving cleaned data
cleaned_path = Path("../data/cleaned/cleaned_data.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists first (no-op when it is already there).
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
df.to_csv(cleaned_path, index=False)