Load the Libaraies

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from fairlearn.metrics import MetricFrame, selection_rate


In [4]:
# Load dataset
df = pd.read_csv("sp500_clean.csv", parse_dates=["date"])

# Feature Engineering
df["volatility"] = df["high"] - df["low"]
df["price_change"] = df["close"] - df["open"]
df["group_volume"] = pd.qcut(df["volume"], q=3, labels=["low", "medium", "high"])
df["group_volatility"] = pd.qcut(df["volatility"], q=3, labels=["low", "medium", "high"])
df["group_price_change"] = pd.qcut(df["price_change"], q=3, labels=["low", "medium", "high"])

# Binary Target
df["target"] = (df["close"].shift(-1) > df["close"]).astype(int)
df.dropna(inplace=True)

In [5]:
# Features
features = ["open", "high", "low", "volume"]
X = df[features]
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [6]:
# Base Model with Tuned Hyperparameters
model = LGBMClassifier(max_depth=5, num_leaves=31, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 532, number of negative: 474
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 1006, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.528827 -> initscore=0.115436
[LightGBM] [Info] Start training from score 0.115436


In [None]:
# SHAP Explainability
explainer = shap.TreeExplainer(model)
shap_vals = explainer.shap_values(X_test)[1] if isinstance(explainer.shap_values(X_test), list) else explainer.shap_values(X_test)
shap.summary_plot(shap_vals, X_test, feature_names=features, plot_type="bar")
