In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, Bidirectional, LSTM, GRU, Dropout, Dense
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, brier_score_loss
)
import random
import warnings

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
warnings.filterwarnings("ignore")

# Load & sort data
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = pd.read_csv(file_path, parse_dates=['date'])
df = df.sort_values(['symbol','date']).reset_index(drop=True)

# Feature Engineering
# Multi-lag returns
for lag in (1, 3, 5):
    df[f'return_1d_lag{lag}'] = df.groupby('symbol')['return_1d'].shift(lag)

# Rolling-window return stats (7-day mean & std)
df['return_7d_mean'] = df.groupby('symbol')['return_1d']\
                         .transform(lambda x: x.rolling(7).mean())
df['return_7d_std']  = df.groupby('symbol')['return_1d']\
                         .transform(lambda x: x.rolling(7).std())

# Rolling-window sentiment stats
df['sentiment_7d_mean'] = df.groupby('symbol')['avg_sentiment']\
                            .transform(lambda x: x.rolling(7).mean())
df['pos_sent_count_7d'] = df.groupby('symbol')['avg_sentiment']\
    .transform(lambda x: x.rolling(7).apply(lambda arr: (arr>0).sum(), raw=True))
df['neg_sent_count_7d'] = df.groupby('symbol')['avg_sentiment']\
    .transform(lambda x: x.rolling(7).apply(lambda arr: (arr<0).sum(), raw=True))

# One-hot encode day_of_week
dow_ohe = pd.get_dummies(df['day_of_week'], prefix='dow', drop_first=True)
df = pd.concat([df, dow_ohe], axis=1)

# Drop any rows with NA created by shifts/rollings
df = df.dropna().reset_index(drop=True)


# Define Features & Parameters
price_feats = [
    'adj close','log_volume','ma_10','vol_30','rsi_14','return_1d_lag1'
]
news_feats  = ['avg_sentiment','avg_sentiment_confidence','sentiment_std_7']
eng_feats   = [
    'return_1d_lag3','return_1d_lag5',
    'return_7d_mean','return_7d_std',
    'sentiment_7d_mean','pos_sent_count_7d','neg_sent_count_7d'
]
dow_feats   = [c for c in df.columns if c.startswith('dow_')]
static_feats = price_feats + news_feats + eng_feats + dow_feats

TARGET  = 'target'
SEQ_LEN = 30

# Prepare sequences & static arrays
Xs, stat_X, ys, dates = [], [], [], []
for sym, grp in df.groupby('symbol'):
    grp = grp.sort_values('date').reset_index(drop=True)
    seq_vals  = grp[price_feats].values
    stat_vals = grp[static_feats].values
    lbls      = grp[TARGET].values
    dts       = grp['date'].values
    for i in range(SEQ_LEN, len(grp)):
        Xs.append(seq_vals[i-SEQ_LEN:i])
        stat_X.append(stat_vals[i])
        ys.append(lbls[i])
        dates.append(dts[i])

X        = np.stack(Xs).astype('float32')
static_X = np.stack(stat_X).astype('float32')
y        = np.array(ys, dtype='float32')
dates    = np.array(dates)

# Chronological splits
train_mask = dates <= np.datetime64('2021-12-31')
val_mask   = (dates > np.datetime64('2021-12-31')) & (dates <= np.datetime64('2022-12-31'))
test_mask  = dates > np.datetime64('2022-12-31')

X_train, X_val, X_test = X[train_mask], X[val_mask], X[test_mask]
s_train, s_val, s_test = static_X[train_mask], static_X[val_mask], static_X[test_mask]
y_train, y_val, y_test = y[train_mask], y[val_mask], y[test_mask]

print(f"Samples: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")

# CNN–BiLSTM–GRU embedding model
n_feats = X_train.shape[2]
inp     = Input(shape=(SEQ_LEN, n_feats))
x       = Conv1D(32,3,padding='same',activation='relu')(inp)
x       = Conv1D(32,3,padding='same',activation='relu')(x)
x       = Bidirectional(LSTM(64,return_sequences=True))(x)
x       = GRU(32)(x)
embed   = Dropout(0.2)(x)
out     = Dense(1,activation='sigmoid')(embed)
seq_mod = Model(inp,out)
seq_mod.compile(loss='binary_crossentropy',optimizer=Adam(1e-3),metrics=['accuracy'])
seq_mod.fit(
    X_train,y_train,
    validation_data=(X_val,y_val),
    epochs=20,batch_size=1024,
    callbacks=[EarlyStopping('val_loss',patience=3,restore_best_weights=True)],
    verbose=2
)

# Extract embeddings & combine
emb_mod    = Model(inp, embed)
emb_tr, emb_val, emb_te = (
    emb_mod.predict(X_train,1024),
    emb_mod.predict(X_val,1024),
    emb_mod.predict(X_test,1024)
)
train_feat = np.hstack([emb_tr, s_train])
val_feat   = np.hstack([emb_val, s_val])
test_feat  = np.hstack([emb_te, s_test])

# LightGBM training
clf = lgb.LGBMClassifier(
    n_estimators=200, num_leaves=127, max_depth=-1,
    min_data_in_leaf=20, learning_rate=0.01,
    random_state=42, n_jobs=-1
)
clf.fit(
    train_feat, y_train,
    eval_set=[(val_feat,y_val)],
    eval_metric='binary_logloss',
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(period=20)]
)

# Calibration & threshold selection
cal = CalibratedClassifierCV(clf,method='sigmoid',cv='prefit')
cal.fit(val_feat,y_val)
uncal_p = clf.predict_proba(val_feat)[:,1]
calib_p = cal.predict_proba(val_feat)[:,1]
print("Val Brier uncal:", brier_score_loss(y_val,uncal_p))
print("Val Brier calib:", brier_score_loss(y_val,calib_p))

best_t,best_f1 = 0,0
for t in np.arange(0.1,0.91,0.01):
    p = (calib_p>=t).astype(int)
    f = f1_score(y_val,p)
    if f>best_f1:
        best_f1,best_t = f,t
print(f"Optimal τ={best_t:.2f}, F1={best_f1:.4f}")

# Final evaluation
tp = cal.predict_proba(test_feat)[:,1]
tpred = (tp>=best_t).astype(int)
print("Final Accuracy :",accuracy_score(y_test,tpred))
print("Final Precision:",precision_score(y_test,tpred))
print("Final Recall   :",recall_score(y_test,tpred))
print("Final F1       :",f1_score(y_test,tpred))
print("Final ROC AUC  :",roc_auc_score(y_test,tp))
print("Confusion Matrix:\n",confusion_matrix(y_test,tpred))

Samples: Train=3451220, Val=641865, Test=571382
Epoch 1/20
3371/3371 - 76s - 23ms/step - accuracy: 0.5126 - loss: 0.6929 - val_accuracy: 0.4886 - val_loss: 0.6945
Epoch 2/20
3371/3371 - 71s - 21ms/step - accuracy: 0.5162 - loss: 0.6924 - val_accuracy: 0.4894 - val_loss: 0.6948
Epoch 3/20
3371/3371 - 71s - 21ms/step - accuracy: 0.5190 - loss: 0.6919 - val_accuracy: 0.4939 - val_loss: 0.6948
Epoch 4/20
3371/3371 - 71s - 21ms/step - accuracy: 0.5232 - loss: 0.6900 - val_accuracy: 0.4931 - val_loss: 0.6955
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[LightGBM] [Info] Number of positive: 1762694, number of negative: 1688526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.504792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin