In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, Bidirectional, LSTM, GRU, Dropout, Dense
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import random
import warnings

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

warnings.filterwarnings("ignore")

# Load & Prepare Data
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = pd.read_csv(file_path, parse_dates=['date'])
df = df.sort_values(['symbol','date']).reset_index(drop=True)

# Compute 1-day lagged return and drop NaNs
df['return_1d_lag1'] = df.groupby('symbol')['return_1d'].shift(1)
df = df.dropna(subset=['return_1d_lag1']).reset_index(drop=True)

# Define Features & Parameters
price_feats  = ['adj close', 'log_volume', 'ma_10', 'vol_30', 'rsi_14', 'return_1d_lag1']
news_feats   = ['avg_sentiment', 'avg_sentiment_confidence', 'sentiment_std_7']
static_feats = price_feats + news_feats + ['day_of_week']
TARGET       = 'target'
SEQ_LEN      = 30

# Generate Sequences & Static Vectors
Xs, static_X, ys, dates = [], [], [], []
for sym, grp in df.groupby('symbol'):
    grp = grp.sort_values('date').reset_index(drop=True)
    arr    = grp[price_feats].values
    stat   = grp[static_feats].values
    labels = grp[TARGET].values
    dts    = grp['date'].values
    for i in range(SEQ_LEN, len(grp)):
        Xs.append(arr[i-SEQ_LEN:i])
        static_X.append(stat[i])
        ys.append(labels[i])
        dates.append(dts[i])

X         = np.stack(Xs).astype('float32')       # (N, SEQ_LEN, len(price_feats))
static_X  = np.stack(static_X).astype('float32') # (N, len(static_feats))
y         = np.array(ys, dtype='float32')
dates     = np.array(dates)

# Chronological Split
train_mask = dates <= np.datetime64('2021-12-31')
val_mask   = (dates > np.datetime64('2021-12-31')) & (dates <= np.datetime64('2022-12-31'))
test_mask  = dates > np.datetime64('2022-12-31')

X_train, X_val, X_test         = X[train_mask], X[val_mask], X[test_mask]
static_train, static_val, static_test = static_X[train_mask], static_X[val_mask], static_X[test_mask]
y_train, y_val, y_test         = y[train_mask], y[val_mask], y[test_mask]

print(f"Samples: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")

# Build CNN–BiLSTM–GRU Embedding Model
n_feats = X_train.shape[2]
inputs = Input(shape=(SEQ_LEN, n_feats))
x = Conv1D(32, 3, padding='same', activation='relu')(inputs)
x = Conv1D(32, 3, padding='same', activation='relu')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GRU(32)(x)
embed_output = Dropout(0.2)(x)  # embedding vector
outputs = Dense(1, activation='sigmoid')(embed_output)
model = Model(inputs, outputs)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)
model.summary()

# Train with Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=2
)

# Extract Embeddings
embed_model = Model(inputs, embed_output)
emb_train = embed_model.predict(X_train, batch_size=1024)
emb_val   = embed_model.predict(X_val,   batch_size=1024)
emb_test  = embed_model.predict(X_test,  batch_size=1024)

# Combine with Static Features
train_feat = np.hstack([emb_train, static_train])
val_feat   = np.hstack([emb_val,   static_val])
test_feat  = np.hstack([emb_test,  static_test])

# grid‐search over key LightGBM hyperparameters on your validation split
import itertools
import numpy as np
from sklearn.metrics import f1_score
import lightgbm as lgb

# Define the grid of hyperparameters to try
param_grid = {
    'num_leaves':       [31, 63, 127],
    'max_depth':        [-1, 10, 20],
    'min_data_in_leaf': [20, 50, 100],
    'learning_rate':    [0.01, 0.05, 0.1],
}
fixed_params = {
    'n_estimators': 200,
    'random_state': 42,
    'n_jobs':      -1,
    'verbosity':   -1,
}

best_f1 = -np.inf
best_params = None

# Loop over all combinations
for num_leaves, max_depth, min_data in itertools.product(
    param_grid['num_leaves'],
    param_grid['max_depth'],
    param_grid['min_data_in_leaf']
):
    for lr in param_grid['learning_rate']:
        params = {
            **fixed_params,
            'num_leaves':       num_leaves,
            'max_depth':        max_depth,
            'min_data_in_leaf': min_data,
            'learning_rate':    lr,
        }
        # Train on train_feat, early‐stop on val_feat
        clf_cv = lgb.LGBMClassifier(**params)
        clf_cv.fit(
            train_feat, y_train,
            eval_set=[(val_feat, y_val)],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(stopping_rounds=10)]
        )

        # Evaluate F1 on validation (threshold=0.5)
        val_preds = (clf_cv.predict_proba(val_feat)[:,1] >= 0.5).astype(int)
        f1 = f1_score(y_val, val_preds)
        print(f"leaves={num_leaves}, depth={max_depth}, min_leaf={min_data}, lr={lr} → F1={f1:.4f}")

        # Track best
        if f1 > best_f1:
            best_f1 = f1
            best_params = params

# Report the winner
print("\nBest hyperparameters on validation:")
print(best_params)
print(f"Best F1 on validation: {best_f1:.4f}")


# Train LightGBM on Hybrid Features
clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)
clf.fit(
    train_feat, y_train,
    eval_set=[(val_feat, y_val)],
    eval_metric='binary_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=20)
    ]
)


# Evaluate on Test Set
y_pred = clf.predict(test_feat)
y_prob = clf.predict_proba(test_feat)[:, 1]

print("\nHybrid CNN–BiLSTM–GRU → LightGBM Performance:")
print(f"  Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"  Precision: {precision_score(y_test, y_pred):.4f}")
print(f"  Recall   : {recall_score(y_test, y_pred):.4f}")
print(f"  F1 Score : {f1_score(y_test, y_pred):.4f}")
print(f"  ROC AUC  : {roc_auc_score(y_test, y_prob):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Samples: Train=3477497, Val=642247, Test=571443


Epoch 1/20
3396/3396 - 76s - 22ms/step - accuracy: 0.5122 - loss: 0.6929 - val_accuracy: 0.5062 - val_loss: 0.6930
Epoch 2/20
3396/3396 - 70s - 20ms/step - accuracy: 0.5163 - loss: 0.6923 - val_accuracy: 0.5036 - val_loss: 0.6939
Epoch 3/20
3396/3396 - 70s - 20ms/step - accuracy: 0.5191 - loss: 0.6919 - val_accuracy: 0.5087 - val_loss: 0.6938
Epoch 4/20
3396/3396 - 69s - 20ms/step - accuracy: 0.5235 - loss: 0.6899 - val_accuracy: 0.5048 - val_loss: 0.6951
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.694208
leaves=31, depth=-1, min_leaf=20, lr=0.01 → F1=0.6228
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[10]	valid_0's binary_logl