In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, Bidirectional, LSTM, GRU, Dropout, Dense
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import random
import warnings

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

warnings.filterwarnings("ignore")

# Load & Prepare Data
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = pd.read_csv(file_path, parse_dates=['date'])
df = df.sort_values(['symbol','date']).reset_index(drop=True)

# Compute 1-day lagged return and drop NaNs
df['return_1d_lag1'] = df.groupby('symbol')['return_1d'].shift(1)
df = df.dropna(subset=['return_1d_lag1']).reset_index(drop=True)

# Define Features & Parameters
price_feats  = ['adj close', 'log_volume', 'ma_10', 'vol_30', 'rsi_14', 'return_1d_lag1']
news_feats   = ['avg_sentiment', 'avg_sentiment_confidence', 'sentiment_std_7']
static_feats = price_feats + news_feats + ['day_of_week']
TARGET       = 'target'
SEQ_LEN      = 30

# Generate Sequences & Static Vectors
Xs, static_X, ys, dates = [], [], [], []
for sym, grp in df.groupby('symbol'):
    grp = grp.sort_values('date').reset_index(drop=True)
    arr    = grp[price_feats].values
    stat   = grp[static_feats].values
    labels = grp[TARGET].values
    dts    = grp['date'].values
    for i in range(SEQ_LEN, len(grp)):
        Xs.append(arr[i-SEQ_LEN:i])
        static_X.append(stat[i])
        ys.append(labels[i])
        dates.append(dts[i])

X         = np.stack(Xs).astype('float32')       # (N, SEQ_LEN, len(price_feats))
static_X  = np.stack(static_X).astype('float32') # (N, len(static_feats))
y         = np.array(ys, dtype='float32')
dates     = np.array(dates)

# Chronological Split
train_mask = dates <= np.datetime64('2021-12-31')
val_mask   = (dates > np.datetime64('2021-12-31')) & (dates <= np.datetime64('2022-12-31'))
test_mask  = dates > np.datetime64('2022-12-31')

X_train, X_val, X_test         = X[train_mask], X[val_mask], X[test_mask]
static_train, static_val, static_test = static_X[train_mask], static_X[val_mask], static_X[test_mask]
y_train, y_val, y_test         = y[train_mask], y[val_mask], y[test_mask]

print(f"Samples: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")

# Build CNN–BiLSTM–GRU Embedding Model
n_feats = X_train.shape[2]
inputs = Input(shape=(SEQ_LEN, n_feats))
x = Conv1D(32, 3, padding='same', activation='relu')(inputs)
x = Conv1D(32, 3, padding='same', activation='relu')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GRU(32)(x)
embed_output = Dropout(0.2)(x)  # embedding vector
outputs = Dense(1, activation='sigmoid')(embed_output)
model = Model(inputs, outputs)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)
model.summary()

# Train with Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=2
)

# Extract Embeddings
embed_model = Model(inputs, embed_output)
emb_train = embed_model.predict(X_train, batch_size=1024)
emb_val   = embed_model.predict(X_val,   batch_size=1024)
emb_test  = embed_model.predict(X_test,  batch_size=1024)

# Combine with Static Features
train_feat = np.hstack([emb_train, static_train])
val_feat   = np.hstack([emb_val,   static_val])
test_feat  = np.hstack([emb_test,  static_test])

# Train LightGBM on Hybrid Features
clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)
clf.fit(
    train_feat, y_train,
    eval_set=[(val_feat, y_val)],
    eval_metric='binary_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=20)
    ]
)


# Threshold optimization on the VALIDATION set
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Get probabilities on validation:
val_probs = clf.predict_proba(val_feat)[:,1]

# Sweep thresholds:
best_t, best_f1, best_p, best_r = 0, 0, 0, 0
for t in np.arange(0.1, 0.91, 0.01):
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1, best_t = f1, t
        best_p = precision_score(y_val, preds)
        best_r = recall_score(y_val, preds)

# Report your optimal threshold:
print(f"\nOptimal threshold on VAL: {best_t:.2f}")
print(f"  Precision: {best_p:.4f}")
print(f"  Recall   : {best_r:.4f}")
print(f"  F1 Score : {best_f1:.4f}")

# Evaluate on Test Set using that threshold
test_preds = (clf.predict_proba(test_feat)[:,1] >= best_t).astype(int)
test_probs = clf.predict_proba(test_feat)[:,1]

print("\nTest set performance at threshold "
      f"{best_t:.2f}:")
print(f"  Accuracy : {accuracy_score(y_test, test_preds):.4f}")
print(f"  Precision: {precision_score(y_test, test_preds):.4f}")
print(f"  Recall   : {recall_score(y_test, test_preds):.4f}")
print(f"  F1 Score : {f1_score(y_test, test_preds):.4f}")
print(f"  ROC AUC  : {roc_auc_score(y_test, test_probs):.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))

Samples: Train=3477497, Val=642247, Test=571443


Epoch 1/20
3396/3396 - 76s - 22ms/step - accuracy: 0.5124 - loss: 0.6929 - val_accuracy: 0.5021 - val_loss: 0.6933
Epoch 2/20
3396/3396 - 72s - 21ms/step - accuracy: 0.5162 - loss: 0.6923 - val_accuracy: 0.5052 - val_loss: 0.6938
Epoch 3/20
3396/3396 - 71s - 21ms/step - accuracy: 0.5194 - loss: 0.6918 - val_accuracy: 0.5071 - val_loss: 0.6942
Epoch 4/20
3396/3396 - 71s - 21ms/step - accuracy: 0.5234 - loss: 0.6902 - val_accuracy: 0.5024 - val_loss: 0.6949
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[LightGBM] [Info] Number of positive: 1776573, number of negative: 1700924
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.588822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10269
[LightGBM] [Info] Number of data points 