In [1]:
# Importing libraries
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# ✅ Load datasets prepared in PR#1
# Every raw file is read the same way: the "Date" column is parsed as datetimes.
# Order matters here: it must match the names on the left-hand side below.
raw_csv_paths = (
    "../data/raw/sp500_data.csv",
    "../data//raw/vix_data.csv",
    "../data/raw/unrate_data.csv",
    "../data/raw/google_trends.csv",
)
sp500, vix, unrate, google_sentiment = (
    pd.read_csv(csv_path, parse_dates=["Date"]) for csv_path in raw_csv_paths
)

In [3]:
# 🔗 Merge all datasets
# Start from the union of S&P 500 and VIX dates.
df = pd.merge(sp500, vix, on="Date", how="outer")

# 🧼 If the merge produced multi-level column labels, collapse each label
# tuple into a single space-joined string.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [' '.join(levels).strip() for levels in df.columns.values]

# Unemployment is joined as a union of dates; Google sentiment is only
# attached to dates that are already present (left join).
df = (
    df.merge(unrate, on="Date", how="outer")
      .merge(google_sentiment, on="Date", how="left")
)

In [4]:
# 🧼 Clean and compute features
# Map the ticker-suffixed column labels onto the standard names used below.
column_aliases = {
    "Close ^GSPC": "Close",
    "Volume ^GSPC": "Volume",
    "VIX_Close ^VIX": "VIX_Close",
}
required_columns = ["Close", "VIX_Close", "Unemployment"]

# Rename, order chronologically with a fresh 0..n-1 index, then keep only rows
# where every required column has a value.
# NOTE(review): after the outer merges this keeps only dates for which all
# three required columns are populated - confirm that is the intended sample.
df = (
    df.rename(columns=column_aliases)
      .sort_values("Date")
      .reset_index(drop=True)
      .dropna(subset=required_columns)
)

In [5]:
# Ensure columns are numeric
price_columns = ["Close", "VIX_Close"]
for column_name in price_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors="coerce")

# errors="coerce" turns unparseable entries into NaN. Drop those rows here so
# they do not flow into the return and mood-index calculations that follow.
df = df.dropna(subset=price_columns)

In [6]:
# Calculate returns and changes
# fill_method=None is passed explicitly: the implicit forward-fill default of
# pct_change is deprecated (pandas 2.1+), and a missing price should produce a
# missing return rather than be silently carried forward.
df["SP500_Returns"] = df["Close"].pct_change(fill_method=None)
df["VIX_Change"] = df["VIX_Close"].pct_change(fill_method=None)

In [7]:
# Define mood index function
def get_mood_index(df, vix_col='VIX_Close', google_col='Google_Sentiment_Index', unemp_col='Unemployment',
                   calm_threshold=0.4, panic_threshold=0.7):
    """Return a copy of ``df`` with normalised inputs, a mood index and a mood label.

    The three input columns are min-max scaled with ``MinMaxScaler`` and stored
    as ``VIX_Norm``, ``Google_Norm`` and ``Unemp_Norm``. ``Mood_Index`` is the
    row-wise mean of those three columns; pandas' ``mean`` skips NaN by
    default, so a row with some missing inputs is averaged over the inputs
    that are present. ``Mood_Zone`` labels each row from ``Mood_Index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three input columns. It is not modified.
    vix_col, google_col, unemp_col : str
        Names of the columns to normalise.
    calm_threshold : float, default 0.4
        Rows with ``Mood_Index`` strictly below this value are "Calm".
    panic_threshold : float, default 0.7
        Rows at or above ``calm_threshold`` and strictly below this value are
        "Cautious"; rows at or above it are "Panic".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``VIX_Norm``, ``Google_Norm``, ``Unemp_Norm``,
        ``Mood_Index`` and ``Mood_Zone`` added. ``Mood_Zone`` is missing (NaN)
        wherever ``Mood_Index`` is NaN.
    """
    norm_cols = ["VIX_Norm", "Google_Norm", "Unemp_Norm"]

    scaled = MinMaxScaler().fit_transform(df[[vix_col, google_col, unemp_col]])
    # Re-attach the caller's index so the column assignment below aligns row for row.
    scaled_frame = pd.DataFrame(scaled, columns=norm_cols, index=df.index)

    out = df.copy()
    out[norm_cols] = scaled_frame
    out["Mood_Index"] = out[norm_cols].mean(axis=1)

    mood = out["Mood_Index"]
    # Start from the highest zone and overwrite downwards; the later mask wins,
    # which reproduces the "Calm, else Cautious, else Panic" precedence.
    zones = pd.Series("Panic", index=out.index, dtype=object)
    zones = zones.mask(mood < panic_threshold, "Cautious")
    zones = zones.mask(mood < calm_threshold, "Calm")
    # NaN compares False against both thresholds and would otherwise be
    # reported as "Panic"; leave the label missing instead.
    out["Mood_Zone"] = zones.where(mood.notna())
    return out

In [8]:
# Apply mood index
# Pipe the frame through get_mood_index and rebind df to the returned frame.
df = df.pipe(get_mood_index)

In [9]:
# Saving cleaned data
cleaned_path = Path("../data/cleaned/cleaned_data.csv")
# to_csv does not create missing directories, so make sure the output folder
# exists first (it may be absent on a fresh checkout).
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)