In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Random forest classification: predict whether the following day's volatility is high or low.

In [None]:
# Load the one-year dataset; the "date" column is parsed to datetime on read.
df_final = pd.read_csv("FINAL_1YEAR_DATA.csv", parse_dates=["date"])

# Sanity check: first rows, then (n_rows, n_columns).
for preview in (df_final.head(), df_final.shape):
    print(preview)

In [None]:
from sklearn.metrics import precision_score
# Binary classification target for the random forest.
# A row is labelled 1 when its Target is at or above the 70th percentile of
# the Target column, and 0 otherwise.
# An earlier variant split at the median instead (judged unrealistic); kept
# for reference:
#   threshold = df_final["Target"].median()
#   df_final["Target_binary"] = (df_final["Target"] > threshold).astype(int)
# NOTE(review): the cutoff is taken over every row, before the train/test
# split below - confirm this is intended.
threshold = df_final["Target"].quantile(0.7)
df_final["Target_binary"] = df_final["Target"].ge(threshold).astype(int)
# print(df_final["Target_binary"].value_counts(normalize=True))

# Feature columns fed to the classifiers.
features = [
    "RealizedVol_3d",
    "reddit_sentiment_lag1",
    "reddit_volume_lag1",
    "news_sentiment_lag1",
    "news_volume_lag1",
    "reddit_sentiment_missing",
    "news_missing",
]

# 75/25 shuffled split, stratified so both parts keep the same class ratio.
X, y = df_final[features], df_final["Target_binary"]
split_options = dict(test_size=0.25, shuffle=True, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Hyper-parameter search, disabled; kept for reference.
# NOTE(review): HalvingGridSearchCV is not among the imports visible in this
# notebook, so this block would need its import before being re-enabled.
# param_grid = {
#     'n_estimators': [100,200,300,350],
#     'max_depth': [4, 6, 8, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 3, 5],
#     'class_weight': [None, 'balanced'],
#     'max_features': ['sqrt','log2',.5]
# }
#
# rf = RandomForestClassifier(random_state=42, oob_score=True)
# grid_rf = HalvingGridSearchCV(
#     rf, param_grid,factor = 3,
#     cv=5, scoring='roc_auc', n_jobs=-1, verbose=2
# )
# grid_rf.fit(X_train, y_train)
# print("Best Random Forest params:", grid_rf.best_params_)
# best_rf = grid_rf.best_estimator_
# sent_model = best_rf

# Fixed configuration used for the sentiment + realized-vol model.
sent_rf_params = {
    "n_estimators": 300,
    "max_depth": 7,
    "min_samples_leaf": 1,
    "max_features": "log2",
    "random_state": 42,
}
sent_model = RandomForestClassifier(**sent_rf_params)
sent_model.fit(X_train, y_train)

# Probability of the positive class for every test row. The decision cutoff
# is the 70th percentile of those probabilities, so roughly the top 30% of
# test rows are flagged as positive.
rf_probs = sent_model.predict_proba(X_test)[:, 1]
prob_thresh = np.quantile(rf_probs, 0.7)
y_pred_rf_thresh = np.greater_equal(rf_probs, prob_thresh).astype(int)

# Scores kept for the model comparisons in later cells.
sentiment_rf_auc = roc_auc_score(y_test, rf_probs)
sentiment_rf_f1 = f1_score(y_test, y_pred_rf_thresh)

# Evaluation summary
print("Classification Report:\n", classification_report(y_test, y_pred_rf_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_thresh))
print("F1 of Positive Class:", sentiment_rf_f1)


# Trading-strategy check.
# "Signals" = number of test rows flagged as high volatility (the author's
# note records 151 for their run).
# "Precision" = share of those flagged rows whose true label is 1.

# A trade is entered when the predicted high-volatility probability reaches
# the cutoff.
X_test_sim = X_test.assign(
    vol_prob=rf_probs,
    true_label=y_test.values,
    signal=(rf_probs >= prob_thresh).astype(int),
)

# Number of signals raised
n_signals = X_test_sim["signal"].sum()
print(f"Trade Signals Triggered: {n_signals} out of {len(X_test_sim)} samples")

# Share of signals that were correct
precision = precision_score(X_test_sim["true_label"], X_test_sim["signal"])
print(f"Precision of Strategy: {precision:.3f}")

# The five rows with the highest predicted probability
print("\nTop 5 high-confidence trades:")
top_trades = (
    X_test_sim[["vol_prob", "true_label"]]
    .sort_values("vol_prob", ascending=False)
    .head(5)
)
print(top_trades)


In [None]:
# CLASSIFICATION ANALYSIS: accuracy and F1 on train vs. test as max_depth grows.
depths = list(range(1, 7))
train_scores, test_scores = [], []
f1_trains, f1_tests = [], []

for depth in depths:
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=depth,
        min_samples_leaf=5,
        max_features="log2",
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Each split is thresholded at the 70th percentile of its own
    # predicted probabilities.
    train_prob = model.predict_proba(X_train)[:, 1]
    test_prob = model.predict_proba(X_test)[:, 1]
    train_thr = np.quantile(train_prob, 0.7)
    test_thr = np.quantile(test_prob, 0.7)
    train_pred = (train_prob >= train_thr).astype(int)
    test_pred = (test_prob >= test_thr).astype(int)

    train_scores.append(accuracy_score(y_train, train_pred))
    test_scores.append(accuracy_score(y_test, test_pred))

    f1_tr = f1_score(y_train, train_pred)
    f1 = f1_score(y_test, test_pred)
    f1_trains.append(f1_tr)
    f1_tests.append(f1)

# Accuracy curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(depths, train_scores, label="Train Accuracy", marker="o")
ax.plot(depths, test_scores, label="Test Accuracy", marker="o")
ax.set_title("Train vs. Test Accuracy by Tree Depth")
ax.set_xlabel("Max Tree Depth")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid(True)
plt.show()

# F1 curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(depths, f1_trains, marker="o", label="Train F1")
ax.plot(depths, f1_tests, marker="x", label="Test F1")
ax.set_xlabel("Max Tree Depth")
ax.set_ylabel("Score")
ax.set_title("AVG Train/Test F1 Score vs Max Tree Depth")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()



In [None]:
# CLASSIFICATION ANALYSIS: feature importances of the fitted sentiment model.
# The importances are read from `sent_model` (the classifier whose F1/AUC are
# reported and compared elsewhere), not from `model`, which only holds
# whatever estimator the most recent depth-sweep loop left behind.
# `features` is the column list `sent_model` was fitted on, so the labels
# stay correct even if X_train is reassigned by another cell.
feature_importance = pd.Series(
    sent_model.feature_importances_, index=features
).sort_values(ascending=True)

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance.values, y=feature_importance.index)
plt.title("Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.grid(True)
plt.tight_layout()
plt.show()

# Author's interpretation: the classes are imbalanced (only the top 30% of
# volatility counts as y = 1), which makes F1/AUC harder to optimize. The
# model still appears to learn meaningful patterns even though the sentiment
# features carry very little weight; they seem to help with edge cases in
# the trees (a slight boost in ROC AUC).


In [None]:
# COMPARE CLASSIFICATION TO BASELINE GARCH MODEL
# Load the GARCH forecasts, keep the evaluation date range, and normalise
# tickers (upper case, "." replaced by "-") so they can be joined to df_final.
df_garch = pd.read_csv("garch_volatility_predictions.csv", parse_dates=["date"])
start, end = "2024-02-02", "2025-04-08"
mask_garch = (df_garch['date'] >= start) & (df_garch['date'] <= end)
df_garch = df_garch.loc[mask_garch].reset_index(drop=True)
df_garch["ticker"] = df_garch["ticker"].str.upper().str.replace(".", "-", regex=False)
# Column is in percent; convert to a fraction.
df_garch['garch_vol'] = df_garch['Rolling_GARCH_volatility %'].astype(float) / 100

# Re-attach the join keys to the test rows (X_test holds feature columns only).
X_test_full = X_test.copy()
X_test_full["date"] = df_final.loc[X_test.index, "date"].values
X_test_full["ticker"] = df_final.loc[X_test.index, "ticker"].values

# Merge with GARCH. A left join keeps the test rows in their original order,
# which is what keeps them aligned with y_test. `validate` makes the merge
# fail loudly if a (date, ticker) pair appears more than once in the GARCH
# file, since that would duplicate test rows and break the alignment.
X_test_full = X_test_full.merge(
    df_garch[['date', 'ticker', 'garch_vol']],
    on=["date", "ticker"],
    how="left",
    validate="many_to_one",
)
print(X_test_full[["date", "ticker", "garch_vol"]].head())

garch_vol = X_test_full['garch_vol']

# Test rows with no matching forecast come back as NaN. They are excluded
# from the GARCH metrics: a NaN would otherwise turn the quantile cutoff into
# NaN (every prediction becomes 0) and make roc_auc_score raise.
has_garch = garch_vol.notna().to_numpy()
if not has_garch.any():
    raise ValueError(
        "No GARCH forecasts matched the test rows; check the date range and ticker formatting."
    )
n_missing = int((~has_garch).sum())
if n_missing:
    print(
        f"Warning: {n_missing} of {len(X_test_full)} test rows have no GARCH forecast; "
        "GARCH metrics are computed on the remaining rows only."
    )

# Same decision rule as the random forest: flag the top 30% of forecasts.
garch_thresh = np.nanquantile(garch_vol, .7)
X_test_full["garch_pred"] = (garch_vol >= garch_thresh).astype(int)

# GARCH scores on the rows that have a forecast (all rows when none are missing).
garch_preds = X_test_full["garch_pred"]
y_test_garch = y_test.to_numpy()[has_garch]
garch_preds_valid = garch_preds.to_numpy()[has_garch]
garch_f1 = f1_score(y_test_garch, garch_preds_valid)
garch_auc = roc_auc_score(y_test_garch, garch_vol.to_numpy()[has_garch])
print(" Random Forest F1: ", sentiment_rf_f1)
print("Random Forest AUC: ", sentiment_rf_auc)
print("\n GARCH F1: ", garch_f1)
print('Garch AUC: ', garch_auc)


print("\n Random Forest Report:")
print(classification_report(y_test, y_pred_rf_thresh))

print("\n GARCH Report:")
print(classification_report(y_test_garch, garch_preds_valid))

# Precision matters when false alarms are costly (a wrong bet on volatility).
# Recall matters when missed events are costly (missing high volatility).
# F1 score: harmonic mean of precision and recall.




In [None]:
# Baseline random forest: same setup, but RealizedVol_3d is the only input.
baseline_cols = ["RealizedVol_3d"]
rf_base = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_leaf=1,
    max_features=0.5,
    random_state=42,
)
rf_base.fit(X_train[baseline_cols], y_train)

# Score the baseline with the same top-30%-of-probabilities decision rule.
base_rf_probs = rf_base.predict_proba(X_test[baseline_cols])[:, 1]
base_rf_thresh = np.quantile(base_rf_probs, 0.7)
base_rf_pred = (base_rf_probs >= base_rf_thresh).astype(int)
base_rf_auc = roc_auc_score(y_test, base_rf_probs)
base_rf_f1 = f1_score(y_test, base_rf_pred)

# One bar chart per metric, same three models on each.
# NOTE(review): the y-axis starts at 0.5, so any score below 0.5 would not be
# visible as a bar.
model_labels = ["GARCH", "RF (base)", "RF Base + Sentiment"]
metric_panels = (
    ([garch_f1, base_rf_f1, sentiment_rf_f1], "Binary Avg F1 Score"),
    ([garch_auc, base_rf_auc, sentiment_rf_auc], "Avg ROC-AUC Score"),
)
for bar_heights, y_label in metric_panels:
    plt.bar(model_labels, bar_heights)
    plt.ylabel(y_label)
    plt.title("Model Comparison: GARCH vs Random Forests")
    plt.ylim(0.5, 1)
    plt.show()

for caption, value in (
    ("Base RF f1: ", base_rf_f1),
    ("Base RF AUC: ", base_rf_auc),
    ("sent auc: ", sentiment_rf_auc),
    ("sent f1: ", sentiment_rf_f1),
):
    print(caption, value)
# Author's findings: both RF models beat GARCH, and sentiment did not improve
# the RF model. Reddit/news sentiment is noisy and sparse, and some tickers
# do not have enough signals (TODO: show a ticker distribution chart).


In [None]:
#WALK FORWARD TESTING RF SENTIMENT
def walk_forward_test(df, features, target_col, train_window=1000, test_window=100):
    """Evaluate a random forest on consecutive sliding windows of `df`.

    Rows are consumed in the order they appear in `df` (positional slicing),
    so the caller is responsible for sorting the frame beforehand. For each
    step a fresh classifier is fitted on `train_window` rows and scored on
    the `test_window` rows that immediately follow; the window then advances
    by `test_window` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    features : list of str
        Names of the feature columns.
    target_col : str
        Name of the binary target column.
    train_window : int, default 1000
        Number of rows used to fit the model at each step.
    test_window : int, default 100
        Number of rows scored at each step, and the step size.

    Returns
    -------
    results : list of dict
        One entry per window with keys "start", "accuracy" and "auc".
        "auc" is NaN for a window whose test labels contain a single class.
    preds_all : numpy.ndarray
        Thresholded 0/1 predictions, concatenated over all windows.
    actuals_all : numpy.ndarray
        True labels, concatenated over all windows.
    """
    results = []
    preds_all = []
    actuals_all = []

    # Last start index that still leaves room for one full train + test
    # window. The "+ 1" below makes that index inclusive; without it the
    # final window is dropped whenever the data length fits exactly.
    last_start = len(df) - train_window - test_window

    # Walk across the data
    for start in range(0, last_start + 1, test_window):
        train_end = start + train_window
        train = df.iloc[start:train_end]
        test = df.iloc[train_end:train_end + test_window]

        X_train, y_train = train[features], train[target_col]
        X_test, y_test = test[features], test[target_col]

        model = RandomForestClassifier(
            n_estimators=300,
            max_depth=7,
            min_samples_leaf=1,
            max_features=.5,
            random_state=42
        )
        model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)[:, 1]
        # Flag the top 30% of this window's predicted probabilities.
        thresh = np.quantile(probs, .7)
        preds = (probs >= thresh).astype(int)

        # Record performance. ROC AUC is undefined when the window's labels
        # hold a single class (roc_auc_score raises), so record NaN instead
        # of aborting the whole walk.
        acc = accuracy_score(y_test, preds)
        auc = roc_auc_score(y_test, probs) if y_test.nunique() > 1 else np.nan

        results.append({"start": start, "accuracy": acc, "auc": auc})

        preds_all.extend(preds)
        actuals_all.extend(y_test)

    return results, np.array(preds_all), np.array(actuals_all)

# walk_forward_test slices rows in positional order, so the frame has to be
# ordered by date for the windows to move forward in time. Sorting by ticker
# first would instead make each window step from one ticker to the next.
# Ticker is the secondary key so the order within a day is deterministic.
df_walk = df_final.sort_values(["date", "ticker"]).reset_index(drop=True)

results, preds_all, actuals_all = walk_forward_test(
    df_walk,
    features=features,
    target_col="Target_binary",
    train_window=2000,
    test_window=150
)



In [None]:
# One row per walk-forward window: start index, accuracy, ROC AUC.
results_df = pd.DataFrame(results)

fig, ax = plt.subplots(figsize=(10, 5))
for metric, label, marker in (("accuracy", "Accuracy", "o"), ("auc", "ROC AUC", "x")):
    ax.plot(results_df["start"], results_df[metric], label=label, marker=marker)
ax.set_xlabel("Start Index of Window")
ax.set_ylabel("Score")
ax.set_title("Walk-Forward Testing Results")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Author's observations:
# - Class imbalance / slight model overconfidence shows up at various
#   windows (accuracy > ROC AUC).
# - Accuracy varies widely from window to window, similar to market effects;
#   some periods show strong signal alignment (TODO: check whether these
#   coincide with earnings periods).



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Sentiment-related columns (everything in `features` except RealizedVol_3d).
sent_features = [
    "reddit_sentiment_lag1",
    "reddit_volume_lag1",
    "news_sentiment_lag1",
    "news_volume_lag1",
    "reddit_sentiment_missing",
    "news_missing",
]

# Three views of the data:
X_base = df_final[["RealizedVol_3d"]]  # baseline: 3-day realized vol only
X_All = df_final[features]             # realized vol + all sentiment columns
X_sent = df_final[sent_features]       # sentiment columns only

# Binary target: 1 when Target reaches `threshold` (the 0.7-quantile cutoff
# computed in an earlier cell).
y = df_final["Target"].ge(threshold).astype(int)

# 5-fold stratified CV, scoring both ROC AUC and F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"AUC": "roc_auc", "F1": "f1"}


def _make_rf():
    """Return a fresh random forest with the settings shared by every pipeline."""
    return RandomForestClassifier(
        n_estimators=300,
        max_depth=7,
        min_samples_leaf=1,
        max_features=0.5,
        random_state=42,
    )


# Pipeline 1: random forest on the raw input.
base_pipe = Pipeline([("rf", _make_rf())])

# Pipeline 2: standardise every input column, then random forest.
sent_pipe = Pipeline([("scaler", StandardScaler()), ("rf", _make_rf())])

# Pipeline 3: standardise the sentiment columns and reduce them to three
# principal components; RealizedVol_3d is passed through untouched.
sentiment_pca = Pipeline([("scale", StandardScaler()), ("pca", PCA(n_components=3))])
preproc = ColumnTransformer([
    ("sent_pca", sentiment_pca, sent_features),
    ("vol_passthrough", "passthrough", ["RealizedVol_3d"]),
])
pca_vol_pipe = Pipeline([("feature_setup", preproc), ("rf", _make_rf())])

# Cross-validate each pipeline with the same folds and the same two metrics.
cv_options = dict(cv=cv, scoring=scoring, return_train_score=False)
base_res = cross_validate(base_pipe, X_base, y, **cv_options)
sent_res = cross_validate(sent_pipe, X_All, y, **cv_options)
pca_vol_res = cross_validate(pca_vol_pipe, X_All, y, **cv_options)


def print_results(name, res):
    """Print per-fold and mean ROC-AUC and F1 from a cross-validation result.

    `name` is shown in the header line. `res` must map "test_AUC" and
    "test_F1" to arrays of per-fold scores.
    """
    fold_auc, fold_f1 = res["test_AUC"], res["test_F1"]
    print(f"=== {name} ===")
    # (per-fold label, mean label, scores, text appended after the mean)
    report_rows = (
        ("ROC‑AUC per fold:", "Mean ROC‑AUC:", fold_auc, ""),
        ("F₁ per fold:   ", "Mean F₁:   ", fold_f1, "\n"),
    )
    for fold_label, mean_label, fold_scores, tail in report_rows:
        print(fold_label, np.round(fold_scores, 3))
        print(mean_label, f"{fold_scores.mean():.3f}{tail}")


# Report the three cross-validated pipelines in a fixed order.
for report_name, report_res in (
    ("3DVol Only RF (Base)", base_res),
    ("Rf sentiment + Base", sent_res),
    ("PCA Sentiment + Base", pca_vol_res),
):
    print_results(report_name, report_res)



In [None]:
# ONE-HOT ENCODING ON TICKERS
# One indicator column per ticker, named "ticker_<value>".
ticker_dummies = pd.get_dummies(df_final["ticker"], prefix="ticker")
# Append the indicator columns to the original frame.
df_with_ticker = pd.concat([df_final, ticker_dummies], axis=1)

# Inputs: sentiment + baseline vol + ticker dummies
final_features_with_ticker = features + list(ticker_dummies.columns)

X = df_with_ticker[final_features_with_ticker]
# Label 1 when Target is at or above its 70th percentile.
y = (df_with_ticker["Target"] >= df_with_ticker["Target"].quantile(0.7)).astype(int)

model = RandomForestClassifier(
    n_estimators=350,
    max_depth=8,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42
)

# Score both metrics in one cross-validation pass: each fold's model is
# fitted once and evaluated with both scorers, instead of refitting every
# fold for each metric. `cv` uses a fixed random_state, so the folds (and
# therefore the scores) are the same as with two separate runs.
encode_cv = cross_validate(model, X, y, cv=cv, scoring={"AUC": "roc_auc", "F1": "f1"})
encode_scores = encode_cv["test_AUC"]
encode_scores2 = encode_cv["test_F1"]

print("Cross-validated AUC scores with ticker encoding:", encode_scores)
print("Mean AUC:", encode_scores.mean())
print("Cross-validated f1 scores with ticker encoding:", encode_scores2)
print("Mean F1: ", encode_scores2.mean())







In [None]:
# Hold out 20% of the ticker-encoded data, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Depth sweep for the final feature set (sentiment, realized 3-day vol and
# one-hot ticker columns). The label is 1 when Target is at or above its
# 70th percentile, 0 otherwise.
depths = list(range(1, 8))
train_acc, test_acc = [], []
f1_scores, auc_scores = [], []

for d in depths:
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=d,
        min_samples_leaf=5,
        max_features="sqrt",
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Each split is thresholded at the 70th percentile of its own
    # predicted probabilities.
    train_prob = model.predict_proba(X_train)[:, 1]
    test_prob = model.predict_proba(X_test)[:, 1]
    train_thr = np.quantile(train_prob, 0.7)
    test_thr = np.quantile(test_prob, 0.7)
    train_pred = (train_prob >= train_thr).astype(int)
    test_pred = (test_prob >= test_thr).astype(int)

    # Test-set F1 (thresholded) and ROC AUC (raw probabilities).
    f1 = f1_score(y_test, test_pred, zero_division=0)
    auc = roc_auc_score(y_test, test_prob)
    f1_scores.append(f1)
    auc_scores.append(auc)

    train_acc.append(accuracy_score(y_train, train_pred))
    test_acc.append(accuracy_score(y_test, test_pred))

# Plot train vs. test accuracy across the depth sweep.
# The title names the ticker encoding rather than PCA: the model inputs in
# this sweep are the raw features plus one-hot ticker columns, and no PCA
# step is applied to them.
plt.figure(figsize=(10, 6))
plt.plot(depths, train_acc, marker='o', label="Train Accuracy")
plt.plot(depths, test_acc, marker='x', label="Test Accuracy")
plt.title("Train vs Test Accuracy (Random Forest + Ticker Encoding)")
plt.xlabel("Tree Depth")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot test-set F1 and ROC AUC across the same depths.
plt.figure(figsize=(10, 6))
plt.plot(depths, f1_scores, marker="o", label="F1 Score")
plt.plot(depths, auc_scores, marker="x", label="ROC AUC")
plt.xlabel("Max Tree Depth")
plt.ylabel("Score")
plt.title("F1 Score and ROC AUC vs Max Tree Depth")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Fit final model on the training split of the ticker-encoded features.
rf_final = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)
rf_final.fit(X_train, y_train)

# Get importances, labelled with the training columns.
importances = rf_final.feature_importances_
feature_names = X_train.columns

# Keep only the 20 most important features so the chart stays readable.
feat_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False).head(20)

# The title names the ticker encoding rather than PCA: the model is fitted on
# raw features plus one-hot ticker columns, with no PCA step.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` - confirm against the installed seaborn version.
plt.figure(figsize=(10, 6))
sns.barplot(x="importance", y="feature", data=feat_df, palette="mako")
plt.title("Top 20 Feature Importances (Random Forest + Ticker Encoding)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Author's interpretation: one-hot encoding the tickers trades a slight AUC
# gain for weaker classification generalization (likely due to the varying
# sample size per ticker), and likely also due to the imbalanced threshold
# at the 0.7 quantile.