In [1]:
# Make Google Drive reachable from this Colab runtime.
# The dataset read later in the notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, Bidirectional, LSTM, GRU,
    Dropout, Dense, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import random
import warnings

# Reproducibility & Load Data
# One seed value is pushed into the Python, NumPy and TensorFlow global RNGs.
SEED = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Every warning category is silenced from here on.
warnings.filterwarnings("ignore")

# Read the dataset (parsing "date" as datetime) and order rows by symbol, then
# by date, with a fresh 0..n-1 index.
df = pd.read_csv(
    "/content/drive/MyDrive/MRP/final_dataset.csv",
    parse_dates=["date"],
).sort_values(["symbol", "date"], ignore_index=True)

# Feature Engineering
# Every engineered column is derived from the per-symbol "return_1d" series,
# so the grouped view is built once and reused.
returns_by_symbol = df.groupby("symbol")["return_1d"]

# multi-lag returns: the value 1, 3 and 5 rows earlier within the same symbol
for lag in (1, 3, 5):
    df[f"return_1d_lag{lag}"] = returns_by_symbol.shift(lag)

# 7-row rolling mean and standard deviation of the return, per symbol
# (the window ends on, and includes, the current row)
df["return_7d_mean"] = returns_by_symbol.transform(lambda s: s.rolling(7).mean())
df["return_7d_std"] = returns_by_symbol.transform(lambda s: s.rolling(7).std())

# The grouped view still references the pre-dropna data; drop the handle.
del returns_by_symbol

# Remove rows holding a missing value in ANY column (this covers the warm-up
# rows created by the shifts/rolling windows) and renumber the index.
df = df.dropna().reset_index(drop=True)

# Define feature lists
# Column order here fixes the order of the feature axis in the model inputs.
base_cols    = ["adj close", "log_volume", "ma_10", "vol_30", "rsi_14"]
lag_cols     = ["return_1d_lag1", "return_1d_lag3", "return_1d_lag5"]
rolling_cols = ["return_7d_mean", "return_7d_std"]
price_feats  = base_cols + lag_cols + rolling_cols

# The static (per-sample) feature set is an independent copy of the same columns.
static_feats = list(price_feats)

SEQ_LEN = 60        # number of consecutive rows in each input window
TARGET  = "target"  # name of the label column

# Build sequences & static arrays
# For every symbol and every row i >= SEQ_LEN one sample is produced:
#   window : price_feats rows [i-SEQ_LEN, i)  (row i itself is excluded)
#   static : static_feats at row i
#   label  : TARGET at row i;  date: "date" at row i
# NOTE(review): the static row is the same row as the label, and
# return_7d_mean / return_7d_std include row i's own return_1d. If `target`
# describes row i's own move this leaks the label -- confirm that `target`
# is forward-looking.
Xs, stat_parts, y_parts, date_parts = [], [], [], []
for _, grp in df.groupby("symbol"):
    grp = grp.sort_values("date", ignore_index=True)

    # Cast to float32 once per symbol, BEFORE windowing. The windows below are
    # views into this matrix, so np.stack allocates the final float32 tensor
    # directly instead of a float64 tensor of twice the size that is then
    # down-cast. The stored values are identical either way.
    seq_vals  = grp[price_feats].to_numpy(dtype="float32")
    stat_vals = grp[static_feats].to_numpy(dtype="float32")
    lbls      = grp[TARGET].values
    dts       = grp["date"].values

    Xs.extend(seq_vals[i - SEQ_LEN : i] for i in range(SEQ_LEN, len(grp)))
    # Rows SEQ_LEN..end are exactly the rows the windows above are aligned to;
    # a symbol with <= SEQ_LEN rows contributes empty slices.
    stat_parts.append(stat_vals[SEQ_LEN:])
    y_parts.append(lbls[SEQ_LEN:])
    date_parts.append(dts[SEQ_LEN:])

if not Xs:
    raise ValueError(
        f"No symbol has more than SEQ_LEN={SEQ_LEN} rows; cannot build any sequence."
    )

X        = np.stack(Xs)                                # (N, SEQ_LEN, 10), float32
static_X = np.concatenate(stat_parts)                  # (N, 10), float32
y        = np.concatenate(y_parts).astype("float32")
dates    = np.concatenate(date_parts)

# Chronological Train/Val/Test split
# Samples are assigned by their label date:
#   train: up to and including 2021-12-31
#   val  : after 2021-12-31, up to and including 2022-12-31
#   test : after 2022-12-31
TRAIN_END = np.datetime64("2021-12-31")
VAL_END   = np.datetime64("2022-12-31")

train_mask = dates <= TRAIN_END
val_mask   = (dates > TRAIN_END) & (dates <= VAL_END)
test_mask  = dates > VAL_END


def _three_way(arr):
    """Return the (train, val, test) selections of `arr` under the date masks."""
    return arr[train_mask], arr[val_mask], arr[test_mask]


X_tr, X_va, X_te = _three_way(X)
s_tr, s_va, s_te = _three_way(static_X)
y_tr, y_va, y_te = _three_way(y)

print(f"Train: {len(y_tr)} | Val: {len(y_va)} | Test: {len(y_te)}")

# Train CNN–BiLSTM–GRU with multi-kernel & dilated convs
# NOTE(review): in the visible code the windows reach the network unscaled
# (raw "adj close" next to returns), and the logged loss stays near 0.693 --
# consider normalising the inputs; confirm no scaling happens upstream.
inp = Input(shape=(SEQ_LEN, len(price_feats)))

# three parallel convs over the same input: kernel 3, kernel 5, and kernel 3
# with dilation 2; each has 32 filters and keeps the sequence length
branch_specs = (
    {"kernel_size": 3},
    {"kernel_size": 5},
    {"kernel_size": 3, "dilation_rate": 2},
)
branches = [
    Conv1D(32, padding="same", activation="relu", **spec)(inp)
    for spec in branch_specs
]

# merge the branches on the channel axis, mix them with one more conv, then
# run a bidirectional LSTM (full sequence out) followed by a GRU (last state)
x = Concatenate()(branches)
x = Conv1D(64, 3, padding="same", activation="relu")(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GRU(32)(x)

# `emb` is the tensor reused later as the embedding; `out` is the sigmoid head
emb = Dropout(0.2)(x)
out = Dense(1, activation="sigmoid")(emb)

seq_model = Model(inputs=inp, outputs=out)
seq_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stop = EarlyStopping("val_loss", patience=3, restore_best_weights=True)
seq_model.fit(
    X_tr, y_tr,
    validation_data=(X_va, y_va),
    epochs=20,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=2,
)

# Extract embeddings
# A second Model over the same graph, ending at `emb` instead of the sigmoid
# head, is run over each split in train / val / test order.
embed_model = Model(inputs=inp, outputs=emb)
emb_tr, emb_va, emb_te = [
    embed_model.predict(split, batch_size=1024)
    for split in (X_tr, X_va, X_te)
]

# Train LightGBM on [embeddings ∥ static_feats]
# Each split's feature matrix is its embedding columns followed by its static columns.
train_feat, val_feat, test_feat = [
    np.concatenate([emb_part, static_part], axis=1)
    for emb_part, static_part in ((emb_tr, s_tr), (emb_va, s_va), (emb_te, s_te))
]

lgbm_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "random_state": SEED,
    "n_jobs": -1,
    "verbosity": -1,
}
clf = lgb.LGBMClassifier(**lgbm_params)

# The validation split is the evaluation set; training halts after 10 rounds
# without improvement on it.
stop_on_plateau = lgb.early_stopping(stopping_rounds=10)
clf.fit(
    train_feat, y_tr,
    eval_set=[(val_feat, y_va)],
    eval_metric="auc",
    callbacks=[stop_on_plateau],
)

# Calibration
# Sigmoid calibration is fitted on the validation split on top of the
# already-trained classifier (cv="prefit").
calibrator = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
calibrator.fit(val_feat, y_va)

# Threshold sweep on calibrated validation scores
# 41 evenly spaced cut-offs in [0.3, 0.7] are scored by F1. The first cut-off
# reaching the highest F1 wins; if no cut-off scores above 0 the defaults
# (0.5, 0) are kept.
val_probs = calibrator.predict_proba(val_feat)[:, 1]
thresholds = np.linspace(0.3, 0.7, 41)
f1_by_threshold = [
    f1_score(y_va, (val_probs >= cutoff).astype(int))
    for cutoff in thresholds
]

best_t, best_f1 = 0.5, 0
top = int(np.argmax(f1_by_threshold))  # argmax returns the first maximum
if f1_by_threshold[top] > best_f1:
    best_t, best_f1 = thresholds[top], f1_by_threshold[top]
print(f"Optimal τ on VAL: {best_t:.2f} (F1={best_f1:.4f})")

# Final evaluation on test set
# Calibrated positive-class probabilities are thresholded with the cut-off
# selected on the validation split.
test_probs = calibrator.predict_proba(test_feat)[:, 1]
test_pred = (test_probs >= best_t).astype(int)

# (label, thunk) pairs: each metric is computed right before it is printed.
# ROC AUC is scored on the probabilities, every other metric on hard predictions.
metric_table = (
    ("Accuracy", lambda: accuracy_score(y_te, test_pred)),
    ("Precision", lambda: precision_score(y_te, test_pred)),
    ("Recall", lambda: recall_score(y_te, test_pred)),
    ("F1 Score", lambda: f1_score(y_te, test_pred)),
    ("ROC AUC", lambda: roc_auc_score(y_te, test_probs)),
)

print("\n2023 Test Performance:")
for label, compute in metric_table:
    # labels are left-padded to 11 characters so the colons line up
    print(f"  {label:<11}: {compute():.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_te, test_pred))

Train: 3294804 | Val: 639385 | Test: 570805
Epoch 1/20
3218/3218 - 144s - 45ms/step - accuracy: 0.5111 - loss: 0.6931 - val_accuracy: 0.4907 - val_loss: 0.6941
Epoch 2/20
3218/3218 - 133s - 41ms/step - accuracy: 0.5149 - loss: 0.6926 - val_accuracy: 0.4952 - val_loss: 0.6936
Epoch 3/20
3218/3218 - 133s - 41ms/step - accuracy: 0.5162 - loss: 0.6924 - val_accuracy: 0.4964 - val_loss: 0.6940
Epoch 4/20
3218/3218 - 133s - 41ms/step - accuracy: 0.5191 - loss: 0.6919 - val_accuracy: 0.5042 - val_loss: 0.6942
Epoch 5/20
3218/3218 - 133s - 41ms/step - accuracy: 0.5245 - loss: 0.6896 - val_accuracy: 0.5056 - val_loss: 0.6936
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 15ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.819544	valid_