## Modeling

In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

# Read the preprocessed train and test frames back from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load
# pickle files that were produced by a trusted step of this project.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("train_df.pkl", "test_df.pkl")
)

In [20]:
# Build sequences

features = ["Open", "High", "Low", "Close", "Volume"]

def make_sequences(df, window=30, horizon=1, feature_cols=None):
    """
    Convert per-stock time-series rows into fixed-length samples for an RNN.

    Rows are grouped by the "Stock" column so that a window never spans two
    different stocks. Rows are consumed in the order they appear in `df`;
    nothing is sorted here, so each stock must already be in chronological
    order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Stock", "Return" and every column in `feature_cols`.
    window : int
        Number of consecutive rows in each input sample (default 30).
    horizon : int
        Offset used to pick the target row (default 1); see the alignment
        note below.
    feature_cols : list of str, optional
        Columns used as model inputs. Defaults to the module-level
        `features` list.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window, n_features)
    y : numpy.ndarray, shape (n_samples,)
        Both arrays keep these ranks even when no sample could be built.

    Raises
    ------
    ValueError
        If `window` < 1 or `horizon` < 0.

    Alignment
    ---------
    For loop position i the input is rows [i - window, i) and the target is
    "Return" at row i + horizon, i.e. `horizon + 1` rows after the last row
    inside the window.
    NOTE(review): if the intent is "`horizon` days after the last observed
    day", the target index should be i + horizon - 1. Confirm how "Return"
    is defined before changing this, since it shifts every label.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")

    cols = list(features if feature_cols is None else feature_cols)

    X, y = [], []

    # One pass over the frame; sort=False keeps stocks in first-appearance
    # order and groupby preserves the row order inside each group.
    for _, stock_rows in df.groupby("Stock", sort=False, observed=True):
        data = stock_rows[cols].to_numpy()
        target = stock_rows["Return"].to_numpy()

        # Stop early enough that target[i + horizon] is always in range.
        for i in range(window, len(stock_rows) - horizon):
            X.append(data[i - window:i])
            y.append(target[i + horizon])

    if not X:
        # np.array([]) would be 1-D; keep the documented ranks so callers
        # can still read X.shape[1] and X.shape[2].
        return np.empty((0, window, len(cols))), np.empty((0,))

    return np.array(X), np.array(y)

In [21]:
# Sequence settings shared by the train and test splits
WINDOW, HORIZON = 30, 1

# Slice each split into model inputs and their matching targets
X_train, y_train = make_sequences(train_df, window=WINDOW, horizon=HORIZON)
X_test, y_test = make_sequences(test_df, window=WINDOW, horizon=HORIZON)

# Show the resulting array shapes as a quick sanity check
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(144501, 30, 5) (144501,)
(640, 30, 5) (640,)


In [22]:
# Summarise the target distribution of each split: min, max, mean, std
for split_name, targets in (("y_train", y_train), ("y_test", y_test)):
    print(f"{split_name} min/max/mean/std:",
          targets.min(), targets.max(), targets.mean(), targets.std())

y_train min/max/mean/std: -0.9999960726058523 0.9999999999999998 -0.0004396380700705552 0.05328160435645003
y_test min/max/mean/std: -0.21628768829142098 0.38259110614382474 -0.005713482265587203 0.06286707668193339


In [23]:
# Report whether any input or target array contains NaN or infinite values
for array_name, array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{array_name} NaN:", np.isnan(array).any(),
          "inf:", np.isinf(array).any())

X_train NaN: False inf: False
y_train NaN: False inf: False
X_test NaN: False inf: False
y_test NaN: False inf: False


In [25]:
# Fixes randomness so results are reproducible
# (same initialization, same training behavior).
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# tf.random.set_seed alone leaves the first two unseeded, and Keras can draw
# default layer seeds from them, so runs could still differ.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Fail early with a readable message if the sequences are not the
# (samples, WINDOW, n_features) arrays the network below is built for.
assert X_train.ndim == 3 and X_train.shape[1] == WINDOW, (
    f"expected X_train shaped (samples, {WINDOW}, n_features), got {X_train.shape}"
)

# Number of input features per day (Open, High, Low, Close, Volume)
n_features = X_train.shape[2]   # should be 5

# NOTE(review): the inputs go into the LSTM exactly as stored in X_train.
# Confirm the preprocessing step already scales them; raw prices and volumes
# would be a likely cause of a loss that stops moving after the first epoch.

# Define a sequential neural network
model = models.Sequential([

    # Input shape:
    # WINDOW = 30 days
    # n_features = 5 features per day
    # So each sample is a 30x5 matrix
    layers.Input(shape=(WINDOW, n_features)),

    # First LSTM layer
    # 64 = number of memory units
    # return_sequences=True means:
    #   output a sequence of hidden states (one per day)
    #   so that the next LSTM can process them
    layers.LSTM(64, return_sequences=True),

    # Dropout randomly zeroes 30% of the activations during training
    # This helps reduce overfitting
    layers.Dropout(0.3),

    # Second LSTM layer
    # This one compresses the 30-day sequence into one vector
    # (only the final hidden state is returned)
    layers.LSTM(32),

    # More dropout for regularization
    layers.Dropout(0.3),

    # A small dense (fully-connected) layer
    # This learns nonlinear combinations of the LSTM output
    layers.Dense(16, activation="relu"),

    layers.Dropout(0.2),

    # Output layer:
    # One number = predicted future return
    layers.Dense(1)
])

# Compile the model:
# Adam optimizer = adaptive-learning-rate gradient descent
# MSE loss = regression loss for predicting numbers
# MAE is tracked as well because it is in the same units as the return
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
)

# Stop training when validation error stops improving for 5 epochs in a row,
# then roll the weights back to the best epoch seen
early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

# Train the model
# NOTE(review): validation_split holds out the LAST 10% of the arrays, taken
# before shuffling. If X_train was assembled stock by stock, that tail is the
# last few stocks rather than a later time period; confirm this is the
# validation set you want.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,   # 10% of training data for validation
    epochs=30,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate on the held-out test sequences
# (returns the loss first, then the compiled metrics in order)
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)

print("Test MSE:", test_loss)
print("Test MAE:", test_mae)

Epoch 1/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - loss: 0.0038 - mae: 0.0263 - val_loss: 8.3147e-04 - val_mae: 0.0190
Epoch 2/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0222 - val_loss: 8.3110e-04 - val_mae: 0.0190
Epoch 3/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3110e-04 - val_mae: 0.0190
Epoch 4/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3118e-04 - val_mae: 0.0190
Epoch 5/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3115e-04 - val_mae: 0.0190
Epoch 6/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3125e-04 - val_mae: 0.0190
Epoch 7/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m