In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import random
import warnings

warnings.filterwarnings("ignore")

# Reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Load & sort
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = pd.read_csv(file_path, parse_dates=['date'])
df = df.sort_values(['symbol','date']).reset_index(drop=True)

# Create 1-day lagged return and drop NaNs
df['return_1d_lag1'] = df.groupby('symbol')['return_1d'].shift(1)
df = df.dropna(subset=['return_1d_lag1']).reset_index(drop=True)

# Features & target
price_feats = [
    'adj close', 'log_volume', 'ma_10',
    'vol_30', 'rsi_14', 'return_1d_lag1'
]
target = 'target'
seq_len = 30  # look-back window

# Generate sequences
X, y, seq_dates = [], [], []
for sym, grp in df.groupby('symbol'):
    grp = grp.sort_values('date').reset_index(drop=True)
    arr = grp[price_feats].values
    labels = grp[target].values
    dates = grp['date'].values
    for i in range(seq_len, len(grp)):
        X.append(arr[i-seq_len:i])
        y.append(labels[i])
        seq_dates.append(dates[i])
X = np.array(X)  # shape (n_samples, seq_len, n_features)
y = np.array(y)
seq_dates = np.array(seq_dates)

# Chronological split
train_mask = seq_dates <= np.datetime64('2021-12-31')
val_mask   = (seq_dates > np.datetime64('2021-12-31')) & (seq_dates <= np.datetime64('2022-12-31'))
test_mask  = seq_dates > np.datetime64('2022-12-31')

X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

print(f"Sequences: train={X_train.shape}, val={X_val.shape}, test={X_test.shape}")

# Build the LSTM model
model = Sequential([
    LSTM(64, input_shape=(seq_len, len(price_feats))),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)
model.summary()

# Train with early stopping
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    callbacks=[early_stop]
)

# Evaluate on test set
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)

print("\nLSTM Performance on Test Set:")
print(f"  Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"  Precision: {precision_score(y_test, y_pred):.4f}")
print(f"  Recall   : {recall_score(y_test, y_pred):.4f}")
print(f"  F1 Score : {f1_score(y_test, y_pred):.4f}")
print(f"  ROC AUC  : {roc_auc_score(y_test, y_pred_prob):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Sequences: train=(3477497, 30, 6), val=(642247, 30, 6), test=(571443, 30, 6)


Epoch 1/20
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 8ms/step - accuracy: 0.5122 - loss: 0.6945 - val_accuracy: 0.4992 - val_loss: 0.6935
Epoch 2/20
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8ms/step - accuracy: 0.5167 - loss: 0.6924 - val_accuracy: 0.4983 - val_loss: 0.6937
Epoch 3/20
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8ms/step - accuracy: 0.5177 - loss: 0.6923 - val_accuracy: 0.4959 - val_loss: 0.6939
Epoch 4/20
[1m3396/3396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 8ms/step - accuracy: 0.5180 - loss: 0.6921 - val_accuracy: 0.4999 - val_loss: 0.6937
[1m17858/17858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2ms/step

LSTM Performance on Test Set:
  Accuracy : 0.5119
  Precision: 0.5166
  Recall   : 0.4951
  F1 Score : 0.5056
  ROC AUC  : 0.5173
Confusion Matrix:
[[149895 133462]
 [145459 142627]]
