In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, Bidirectional, LSTM, GRU, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, brier_score_loss
)
import random
import warnings

# Reproducibility & Load Data
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
warnings.filterwarnings("ignore")

df = pd.read_csv("/content/drive/MyDrive/MRP/final_dataset.csv", parse_dates=["date"])
df = df.sort_values(["symbol", "date"]).reset_index(drop=True)

# Feature Engineering
for lag in (1, 3, 5):
    df[f"return_1d_lag{lag}"] = df.groupby("symbol")["return_1d"].shift(lag)

df["return_7d_mean"] = df.groupby("symbol")["return_1d"].transform(lambda x: x.rolling(7).mean())
df["return_7d_std"]  = df.groupby("symbol")["return_1d"].transform(lambda x: x.rolling(7).std())

df["sentiment_7d_mean"] = df.groupby("symbol")["avg_sentiment"].transform(lambda x: x.rolling(7).mean())
df["pos_sent_count_7d"] = df.groupby("symbol")["avg_sentiment"] \
    .transform(lambda x: x.rolling(7).apply(lambda a: (a > 0).sum(), raw=True))
df["neg_sent_count_7d"] = df.groupby("symbol")["avg_sentiment"] \
    .transform(lambda x: x.rolling(7).apply(lambda a: (a < 0).sum(), raw=True))

dow_ohe = pd.get_dummies(df["day_of_week"], prefix="dow", drop_first=True)
df = pd.concat([df, dow_ohe], axis=1)

df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# Build Sequence & Static Arrays
price_feats = ["adj close","log_volume","ma_10","vol_30","rsi_14","return_1d_lag1"]
news_feats  = ["avg_sentiment","avg_sentiment_confidence","sentiment_std_7"]
eng_feats   = [
    "return_1d_lag3","return_1d_lag5",
    "return_7d_mean","return_7d_std",
    "sentiment_7d_mean","pos_sent_count_7d","neg_sent_count_7d"
]
dow_feats   = [c for c in df.columns if c.startswith("dow_")]
static_feats = price_feats + news_feats + eng_feats + dow_feats

SEQ_LEN = 30
Xs, stat_X, ys, dates = [], [], [], []

for sym, grp in df.groupby("symbol"):
    grp = grp.sort_values("date").reset_index(drop=True)
    seq_vals  = grp[price_feats].values
    stat_vals = grp[static_feats].values
    lbls      = grp["target"].values
    dts       = grp["date"].values

    for i in range(SEQ_LEN, len(grp)):
        Xs.append(seq_vals[i-SEQ_LEN:i])
        stat_X.append(stat_vals[i])
        ys.append(lbls[i])
        dates.append(dts[i])

X        = np.stack(Xs).astype("float32")
static_X = np.stack(stat_X).astype("float32")
y        = np.array(ys, dtype="float32")
dates    = np.array(dates)

# Chronological Train / Val / Test Split
train_mask = dates <= np.datetime64("2021-12-31")
val_mask   = (dates >  np.datetime64("2021-12-31")) & (dates <= np.datetime64("2022-12-31"))
test_mask  = dates >  np.datetime64("2022-12-31")

X_train, X_val, X_test = X[train_mask], X[val_mask], X[test_mask]
s_train, s_val, s_test = static_X[train_mask], static_X[val_mask], static_X[test_mask]
y_train, y_val, y_test = y[train_mask], y[val_mask], y[test_mask]

print(f"Train: {len(y_train)} | Val: {len(y_val)} | Test: {len(y_test)}")

# Train CNN–BiLSTM–GRU Embedding Model
inp = Input(shape=(SEQ_LEN, X_train.shape[2]))
x   = Conv1D(32, 3, padding="same", activation="relu")(inp)
x   = Conv1D(32, 3, padding="same", activation="relu")(x)
x   = Bidirectional(LSTM(64, return_sequences=True))(x)
x   = GRU(32)(x)
embed = Dropout(0.2)(x)
out   = Dense(1, activation="sigmoid")(embed)

seq_model = Model(inputs=inp, outputs=out)
seq_model.compile(
    optimizer=Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

seq_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    callbacks=[EarlyStopping("val_loss", patience=3, restore_best_weights=True)],
    verbose=2
)

# Extract Embeddings
embed_model = Model(inputs=inp, outputs=embed)
emb_train   = embed_model.predict(X_train, batch_size=1024)
emb_val     = embed_model.predict(X_val,   batch_size=1024)
emb_test    = embed_model.predict(X_test,  batch_size=1024)

# Train LightGBM on Hybrid Features
train_feat = np.hstack([emb_train, s_train])
val_feat   = np.hstack([emb_val,   s_val])
test_feat  = np.hstack([emb_test,  s_test])

clf = lgb.LGBMClassifier(
    n_estimators=200,
    num_leaves=127,
    min_data_in_leaf=20,
    learning_rate=0.01,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

# use the 2022 validation set for early stopping
clf.fit(
    train_feat, y_train,
    eval_set=[(val_feat, y_val)],
    eval_metric="binary_logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=0)]
)

# Final Evaluation on 2023 Test Set
y_prob = clf.predict_proba(test_feat)[:, 1]
y_pred = (y_prob >= 0.35).astype(int)

print("\nFinal 2023 Test Performance:")
print(f"  Accuracy    : {accuracy_score(y_test,   y_pred):.4f}")
print(f"  Precision   : {precision_score(y_test,  y_pred):.4f}")
print(f"  Recall      : {recall_score(y_test,     y_pred):.4f}")
print(f"  F1 Score    : {f1_score(y_test,         y_pred):.4f}")
print(f"  ROC AUC     : {roc_auc_score(y_test, y_prob):.4f}")
print(f"  Brier Score : {brier_score_loss(y_test, y_prob):.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Train: 3451220 | Val: 641865 | Test: 571382
Epoch 1/20
3371/3371 - 70s - 21ms/step - accuracy: 0.5124 - loss: 0.6929 - val_accuracy: 0.4873 - val_loss: 0.6945
Epoch 2/20
3371/3371 - 65s - 19ms/step - accuracy: 0.5161 - loss: 0.6924 - val_accuracy: 0.4930 - val_loss: 0.6945
Epoch 3/20
3371/3371 - 65s - 19ms/step - accuracy: 0.5187 - loss: 0.6919 - val_accuracy: 0.4947 - val_loss: 0.6948
Epoch 4/20
3371/3371 - 65s - 19ms/step - accuracy: 0.5231 - loss: 0.6902 - val_accuracy: 0.4954 - val_loss: 0.6955
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.556131

Final 2023 Test Performance:
  Accuracy    : 0.6826
  Precision   : 0.6274
  Recall      : 0.9120
  F1 Score    : 0.7