In [1]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, Bidirectional, LSTM, GRU, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, roc_auc_score
import random
import warnings

# Reproducibility & Load Data
# Seed Python's, NumPy's and TensorFlow's global RNGs (in that order) with one value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)
warnings.filterwarnings("ignore")  # suppresses every warning category from here on

# Read the CSV with `date` parsed as datetimes, then order rows by symbol and date
# and renumber the index from 0.
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = (
    pd.read_csv(file_path, parse_dates=['date'])
    .sort_values(['symbol', 'date'])
    .reset_index(drop=True)
)

# Feature Engineering
# Every rolling feature below uses a trailing window of this many ROWS per symbol
# (row-based, not calendar-based, despite the "7d" in the column names).
ROLL_WINDOW = 7


def _rolling_by_symbol(values, symbols, stat, window=ROLL_WINDOW):
    """Compute a trailing rolling statistic of `values` separately per symbol.

    Parameters
    ----------
    values : pd.Series
        Column to aggregate; must share its index with `symbols`.
    symbols : pd.Series
        Grouping key. Windows never extend from one symbol into another.
    stat : str
        Name of the pandas ``Rolling`` method to call ('mean', 'std', 'sum', ...).
    window : int
        Window length in rows. pandas' default ``min_periods`` (== window) applies,
        so the result is NaN until a full window of non-NaN values is available.

    Returns
    -------
    pd.Series
        Rolling statistic aligned to the index of `values`.
    """
    return values.groupby(symbols).transform(
        lambda s: getattr(s.rolling(window), stat)()
    )


# Lagged daily returns, shifted inside each symbol so values never cross symbols.
for lag in (1, 3, 5):
    df[f'return_1d_lag{lag}'] = df.groupby('symbol')['return_1d'].shift(lag)

df['return_7d_mean'] = _rolling_by_symbol(df['return_1d'], df['symbol'], 'mean')
df['return_7d_std']  = _rolling_by_symbol(df['return_1d'], df['symbol'], 'std')

df['sentiment_7d_mean'] = _rolling_by_symbol(df['avg_sentiment'], df['symbol'], 'mean')

# Count positive / negative sentiment rows in each window. Summing a 0/1 indicator
# with the built-in rolling sum replaces a per-window Python callback. Rows with
# missing sentiment are kept as NaN in the indicator so that, exactly as with
# rolling(...).apply(...), any window containing a NaN yields NaN.
sentiment = df['avg_sentiment']
has_sentiment = sentiment.notna()
for count_col, flag in (('pos_sent_count_7d', sentiment > 0),
                        ('neg_sent_count_7d', sentiment < 0)):
    indicator = flag.astype('float64').where(has_sentiment)
    df[count_col] = _rolling_by_symbol(indicator, df['symbol'], 'sum')

# One-hot encode the weekday (first level dropped). Float dummies keep a later
# `.values` on mixed feature columns numeric instead of object dtype.
dow_ohe = pd.get_dummies(df['day_of_week'], prefix='dow', drop_first=True, dtype=float)
df = pd.concat([df, dow_ohe], axis=1)

# Drops every row that has a NaN in ANY column (not only the engineered ones).
# NOTE(review): removed rows leave gaps inside a symbol's history, so code that
# treats consecutive rows as consecutive trading days will span those gaps —
# confirm this is acceptable or restrict with `subset=`.
df = df.dropna().reset_index(drop=True)

# Build Arrays
# Column groups: `price_feats` form the per-timestep sequence input; all four
# groups together form the per-sample static feature vector.
price_feats = ['adj close','log_volume','ma_10','vol_30','rsi_14','return_1d_lag1']
news_feats  = ['avg_sentiment','avg_sentiment_confidence','sentiment_std_7']
eng_feats   = ['return_1d_lag3','return_1d_lag5','return_7d_mean','return_7d_std',
               'sentiment_7d_mean','pos_sent_count_7d','neg_sent_count_7d']
dow_feats   = [c for c in df.columns if c.startswith('dow_')]
static_feats = price_feats + news_feats + eng_feats + dow_feats

TARGET  = 'target'
SEQ_LEN = 30

# For each symbol and each row i >= SEQ_LEN, one sample is produced:
#   X        <- price_feats of rows [i-SEQ_LEN, i)   (row i itself is NOT in the window)
#   static_X <- static_feats of row i
#   y        <- TARGET of row i
#   dates    <- date of row i
# NOTE(review): the static vector uses row i while the sequence stops at row i-1;
# confirm that `target` at row i refers to a period after row i's features.
#
# Each symbol is cast to float32 up front and windowed with a strided view, so the
# full float64 tensor is never materialised and no per-sample Python loop is needed.
# The lists hold one chunk per symbol (not one entry per sample).
Xs, stat_X, ys, dates = [], [], [], []
for _, grp in df.groupby('symbol'):
    grp = grp.sort_values('date')
    if len(grp) <= SEQ_LEN:
        # Not enough history for even one sample from this symbol.
        continue
    seq  = grp[price_feats].to_numpy(dtype='float32')
    stat = grp[static_feats].to_numpy(dtype='float32')
    lbl  = grp[TARGET].to_numpy(dtype='float32')
    dt   = grp['date'].values

    # sliding_window_view -> (n - SEQ_LEN + 1, n_feats, SEQ_LEN); window k covers rows
    # [k, k + SEQ_LEN). The last window would need a label at row n, so drop it, then
    # reorder axes to (samples, SEQ_LEN, n_feats).
    windows = np.lib.stride_tricks.sliding_window_view(seq, SEQ_LEN, axis=0)
    Xs.append(windows[:-1].transpose(0, 2, 1))
    stat_X.append(stat[SEQ_LEN:])
    ys.append(lbl[SEQ_LEN:])
    dates.append(dt[SEQ_LEN:])

if not Xs:
    raise ValueError(f"No symbol has more than SEQ_LEN={SEQ_LEN} rows; cannot build any sample")

X        = np.concatenate(Xs)      # (samples, SEQ_LEN, len(price_feats)), float32
static_X = np.concatenate(stat_X)  # (samples, len(static_feats)), float32
y        = np.concatenate(ys)      # (samples,), float32
dates    = np.concatenate(dates)   # (samples,), date of the labelled row

# Define Temporal Folds
# Expanding-window splits: each fold trains on everything up to `train_end`
# and validates on the calendar year that follows it.
fold_keys = ('train_end', 'val_start', 'val_end')
fold_windows = (
    ('2019-12-31', '2020-01-01', '2020-12-31'),
    ('2020-12-31', '2021-01-01', '2021-12-31'),
    ('2021-12-31', '2022-01-01', '2022-12-31'),
)
folds = [dict(zip(fold_keys, window)) for window in fold_windows]

results = []

# Decision thresholds scanned for the best F1 after calibration: 0.10, 0.15, ..., 0.90.
THRESHOLD_GRID = np.linspace(0.1, 0.9, 17)


def _best_f1_threshold(y_true, probs, grid=THRESHOLD_GRID):
    """Scan `grid` and return ``(best_f1, best_threshold)`` for `probs`.

    A sample is predicted positive when its probability is >= the threshold.
    Only a strictly higher F1 replaces the current best, so ties keep the lowest
    threshold; if no threshold scores above 0 the result is ``(0, 0)``.
    """
    best_f1, best_t = 0, 0
    for t in grid:
        f = f1_score(y_true, (probs >= t).astype(int))
        if f > best_f1:
            best_f1, best_t = f, t
    return best_f1, best_t


# Loop over folds
# NOTE(review): within each fold the same validation year is used for Keras early
# stopping, LightGBM early stopping, probability calibration, threshold selection
# AND the reported F1/AUC. The reported numbers are therefore optimistic; use a
# separate held-out period if an unbiased estimate is needed.
for fold in folds:
    te = np.datetime64(fold['train_end'])
    vs = np.datetime64(fold['val_start'])
    ve = np.datetime64(fold['val_end'])
    tr_mask = dates <= te
    va_mask = (dates >= vs) & (dates <= ve)
    if not tr_mask.any() or not va_mask.any():
        raise ValueError(f"Fold {fold} selects no training or no validation samples")

    X_tr, X_va = X[tr_mask], X[va_mask]
    s_tr, s_va = static_X[tr_mask], static_X[va_mask]
    y_tr, y_va = y[tr_mask], y[va_mask]

    # A fresh network is built on every iteration; reset Keras' global state first
    # so graphs/layers from earlier folds do not accumulate in memory.
    tf.keras.backend.clear_session()

    # Train CNN–BiLSTM–GRU embedding model
    # NOTE(review): the sequence features are fed to the network exactly as stored in
    # X; if they are not already normalised upstream, consider standardising them
    # with statistics computed on the training part of the fold only.
    inp = Input(shape=(SEQ_LEN, X_tr.shape[2]))
    x   = Conv1D(32,3,padding='same',activation='relu')(inp)
    x   = Conv1D(32,3,padding='same',activation='relu')(x)
    x   = Bidirectional(LSTM(64,return_sequences=True))(x)
    x   = GRU(32)(x)
    embed = Dropout(0.2)(x)
    out   = Dense(1,activation='sigmoid')(embed)
    seq_mod = Model(inp,out)
    seq_mod.compile(loss='binary_crossentropy',
                    optimizer=Adam(1e-3),
                    metrics=['accuracy'])
    seq_mod.fit(X_tr, y_tr,
                validation_data=(X_va, y_va),
                epochs=10,
                batch_size=1024,
                callbacks=[EarlyStopping('val_loss',patience=2,restore_best_weights=True)],
                verbose=0)

    # extract embeddings: a second model sharing the trained layers that stops at the
    # dropout output (32 values per sample) instead of the sigmoid head
    emb_mod = Model(inp, embed)
    emb_tr = emb_mod.predict(X_tr, batch_size=1024)
    emb_va = emb_mod.predict(X_va, batch_size=1024)

    # Train LightGBM on [sequence embedding | static features]
    train_feat = np.hstack([emb_tr, s_tr])
    val_feat   = np.hstack([emb_va, s_va])
    clf = lgb.LGBMClassifier(
        n_estimators=200, num_leaves=127, min_data_in_leaf=20,
        learning_rate=0.01, random_state=SEED, n_jobs=-1,
        verbosity=-1
    )
    clf.fit(train_feat, y_tr,
            eval_set=[(val_feat, y_va)],
            eval_metric='binary_logloss',
            callbacks=[
                lgb.early_stopping(stopping_rounds=10),
                lgb.log_evaluation(period=0)
            ])

    # Calibrate & Threshold
    # The already-fitted booster is wrapped (cv='prefit'); only the sigmoid mapping
    # is learned here, on the validation features.
    cal = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
    cal.fit(val_feat, y_va)
    probs_va = cal.predict_proba(val_feat)[:,1]

    best_f1, best_t = _best_f1_threshold(y_va, probs_va)

    auc = roc_auc_score(y_va, probs_va)
    results.append({
        'fold'     : fold,
        'val_F1'   : best_f1,
        'val_AUC'  : auc,
        'threshold': best_t
    })

# Summarize
# One row per fold, then the unweighted mean of each validation metric across folds.
df_results = pd.DataFrame(results)
mean_f1 = df_results['val_F1'].mean()
mean_auc = df_results['val_AUC'].mean()

print(df_results)
print(f"Average F1: {mean_f1:.4f}")
print(f"Average AUC: {mean_auc:.4f}")

[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step
[1m721/721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.574181
[1m2825/2825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.552167
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.556622
                                                fold    val_F1   val_AUC  \
0  {'train_end'