## Modeling

In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers
from sklearn.metrics import mean_squared_error

# Load the preprocessed train/test DataFrames from disk.
# NOTE(security): pd.read_pickle unpickles arbitrary Python objects and can
# execute code while doing so -- only load .pkl files from a trusted source.
train_df = pd.read_pickle("train_df.pkl")
test_df  = pd.read_pickle("test_df.pkl")

In [16]:
# Build sequences

# Default per-day input columns fed to the model.
features = ["Open", "High", "Low", "Close", "Volume"]

def make_sequences(df, window=30, horizon=1, feature_cols=None, target_col="Return"):
    """
    Convert a per-stock time-series frame into fixed-length samples for an RNN.

    For every stock (rows sharing the same ``df["Stock"]`` value) and every
    position ``i`` in ``range(window, len(stock_rows) - horizon)`` one sample
    is emitted:

        X = feature rows [i - window, i)        -> shape (window, n_features)
        y = target_col value at row i + horizon

    Rows are used in the order they appear in ``df``; this function does not
    sort them (assumes each stock's rows are already chronological -- TODO
    confirm against the preprocessing step).

    NOTE(review): the last row inside the window is ``i - 1`` while the label
    is taken from row ``i + horizon``, i.e. ``horizon + 1`` rows later. If
    "horizon days after the last observed day" is what is intended, the label
    index should be ``i + horizon - 1``. Left unchanged here because reported
    results depend on it -- confirm against how ``Return`` is defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Stock" column, the feature columns and ``target_col``.
    window : int, default 30
        Number of consecutive past rows the model sees per sample (>= 1).
    horizon : int, default 1
        Offset added to ``i`` when picking the label row (>= 0).
    feature_cols : sequence of str, optional
        Columns used as inputs. Defaults to the module-level ``features``.
    target_col : str, default "Return"
        Column the labels are read from.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has shape (n_samples, window, n_features) and ``y`` has shape
        (n_samples,). When no stock has enough rows, ``X`` is still 3-D with
        shape (0, window, n_features) so that ``X.shape[2]`` keeps working.

    Raises
    ------
    ValueError
        If ``window < 1`` or ``horizon < 0``.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")

    cols = list(features) if feature_cols is None else list(feature_cols)

    X, y = [], []

    # One pass over the frame; sort=False keeps stocks in order of first
    # appearance, matching iteration over df["Stock"].unique().
    for _, s in df.groupby("Stock", sort=False):
        data = s[cols].values
        target = s[target_col].values

        for i in range(window, len(s) - horizon):
            X.append(data[i - window:i])
            y.append(target[i + horizon])

    if not X:
        # np.array([]) would be 1-D and break downstream X.shape[2] lookups.
        return np.empty((0, window, len(cols))), np.empty((0,))

    return np.array(X), np.array(y)

In [17]:
WINDOW = 30    # number of past rows in every input sample
HORIZON = 1    # label offset handed to make_sequences

# Turn both frames into (samples, WINDOW, n_features) inputs and 1-D targets.
X_train, y_train = make_sequences(train_df, window=WINDOW, horizon=HORIZON)
X_test, y_test = make_sequences(test_df, window=WINDOW, horizon=HORIZON)

# Quick shape sanity check: train first, then test.
for _inputs, _targets in ((X_train, y_train), (X_test, y_test)):
    print(_inputs.shape, _targets.shape)

(144501, 30, 5) (144501,)
(640, 30, 5) (640,)


In [18]:
# Summary statistics of the regression targets for both splits.
for _label, _targets in (("y_train", y_train), ("y_test", y_test)):
    print(f"{_label} min/max/mean/std:",
          _targets.min(), _targets.max(), _targets.mean(), _targets.std())

y_train min/max/mean/std: -0.9999960726058523 0.9999999999999998 -0.0004396380700705552 0.05328160435645003
y_test min/max/mean/std: -0.21628768829142098 0.38259110614382474 -0.005713482265587203 0.06286707668193339


In [19]:
# Report whether any array contains NaN or +/-inf values.
_arrays_to_check = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
for _label, _arr in _arrays_to_check.items():
    print(f"{_label} NaN:", np.isnan(_arr).any(), "inf:", np.isinf(_arr).any())

X_train NaN: False inf: False
y_train NaN: False inf: False
X_test NaN: False inf: False
y_test NaN: False inf: False


In [None]:
tf.keras.backend.clear_session()

# Seed Python's `random`, NumPy and TensorFlow in a single call.
# tf.random.set_seed alone only seeds TensorFlow, so weight initialisation,
# dropout masks or batch shuffling may still differ between runs.
# (GPU kernels can remain non-deterministic even with all seeds fixed.)
tf.keras.utils.set_random_seed(42)

# Number of input features per day (last axis of X_train)
n_features = X_train.shape[2]

# Define a sequential neural network
model = models.Sequential([

    # Input shape:
    # WINDOW rows per sample, n_features values per row,
    # so each sample is a (WINDOW x n_features) matrix
    layers.Input(shape=(WINDOW, n_features)),

    # First LSTM layer
    # 64 = number of memory units
    # return_sequences=True means:
    #   output a sequence of hidden states (one per time step)
    #   so that the next LSTM can process them
    layers.LSTM(64, return_sequences=True),

    # Dropout randomly zeroes 30% of the activations during training
    # to reduce overfitting
    layers.Dropout(0.3),

    # Second LSTM layer
    # Without return_sequences it emits only its final hidden state,
    # i.e. one 32-dim vector summarising the whole window
    layers.LSTM(32),

    # More dropout for regularization
    layers.Dropout(0.3),

    # A small dense (fully-connected) layer
    # This learns nonlinear combinations of the LSTM output
    layers.Dense(16, activation="relu"),

    layers.Dropout(0.2),

    # Output layer:
    # One number = predicted return
    layers.Dense(1)
])

# Compile the model:
# Adam optimizer, MSE as the regression loss, MAE tracked as an extra metric
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
)

# Stop training when validation loss has not improved for 5 epochs and
# roll back to the best weights seen so far
early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

# Train the model.
# NOTE(review): validation_split takes the LAST 10% of the samples, before
# shuffling. make_sequences appends samples stock by stock, so this slice
# consists of the last stock(s) rather than the most recent dates -- consider
# passing an explicit, time-based validation_data instead.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate on the held-out test sequences; returns [loss, mae]
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)

print("Test MSE:", test_loss)
print("Test MAE:", test_mae)

Epoch 1/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - loss: 0.0038 - mae: 0.0263 - val_loss: 8.3147e-04 - val_mae: 0.0190
Epoch 2/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0222 - val_loss: 8.3110e-04 - val_mae: 0.0190
Epoch 3/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3110e-04 - val_mae: 0.0190
Epoch 4/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3118e-04 - val_mae: 0.0190
Epoch 5/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3115e-04 - val_mae: 0.0190
Epoch 6/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.0031 - mae: 0.0221 - val_loss: 8.3125e-04 - val_mae: 0.0190
Epoch 7/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Different Prediction Horizons

def run_experiment(horizon, seed=42):
    """
    Train and evaluate a fresh two-layer LSTM for one prediction horizon.

    Sequences are built from the notebook-level ``train_df`` / ``test_df``
    with the global ``WINDOW``; the model is fitted with early stopping on
    ``val_loss`` and then evaluated on the test sequences.

    Parameters
    ----------
    horizon : int
        Forwarded to ``make_sequences`` as its ``horizon`` argument.
    seed : int, default 42
        Seed applied to Python's ``random``, NumPy and TensorFlow before the
        model is built.

    Returns
    -------
    (mse, mae) : tuple of float
        Test-set loss (MSE) and mean absolute error.
    """
    tf.keras.backend.clear_session()

    # tf.random.set_seed alone only seeds TensorFlow; set_random_seed also
    # seeds Python's `random` and NumPy so repeated runs start from the same
    # random state.
    tf.keras.utils.set_random_seed(seed)

    Xtr, ytr = make_sequences(train_df, WINDOW, horizon)
    Xte, yte = make_sequences(test_df, WINDOW, horizon)

    model = models.Sequential([
        layers.Input(shape=(WINDOW, Xtr.shape[2])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32),
        layers.Dropout(0.3),
        layers.Dense(16, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(
        optimizer=optimizers.Adam(1e-3),
        loss="mse",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )

    # Stop once val_loss has not improved for 5 epochs; keep the best weights
    early_stop = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True
    )

    # validation_split uses the last 10% of the samples (taken before
    # shuffling) as the validation set.
    model.fit(
        Xtr, ytr,
        validation_split=0.1,
        epochs=30,
        batch_size=256,
        callbacks=[early_stop],
        verbose=1
    )

    # evaluate() returns [loss, mae] because one extra metric was compiled in
    mse, mae = model.evaluate(Xte, yte, verbose=0)
    return mse, mae

# Run the experiment once per horizon and collect the test metrics,
# keyed by "Horizon_<h>".
results = {}

for h in (1, 5, 10):
    mse, mae = run_experiment(h)
    results["Horizon_" + str(h)] = dict(MSE=mse, MAE=mae)

# One row per horizon, columns MSE / MAE
pd.DataFrame.from_dict(results, orient="index")

Epoch 1/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 0.0032 - mae: 0.0251 - val_loss: 8.3121e-04 - val_mae: 0.0190
Epoch 2/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 0.0029 - mae: 0.0224 - val_loss: 8.3093e-04 - val_mae: 0.0190
Epoch 3/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 0.0028 - mae: 0.0222 - val_loss: 8.3096e-04 - val_mae: 0.0190
Epoch 4/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 0.0028 - mae: 0.0221 - val_loss: 8.3100e-04 - val_mae: 0.0190
Epoch 5/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - loss: 0.0028 - mae: 0.0221 - val_loss: 8.3117e-04 - val_mae: 0.0190
Epoch 6/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 0.0028 - mae: 0.0221 - val_loss: 8.3122e-04 - val_mae: 0.0190
Epoch 7/30
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

Unnamed: 0,MSE,MAE
Horizon_1,0.003991,0.047703
Horizon_5,0.004479,0.051994
Horizon_10,0.005132,0.05628


In [25]:
# Different Number of Stocks

def subset_stocks(df, n):
    """Return the rows of ``df`` belonging to its first ``n`` distinct stocks.

    "First" means order of first appearance in ``df["Stock"]``; the original
    row order and index of the kept rows are preserved.
    """
    tickers_in_order = df["Stock"].unique()
    keep_mask = df["Stock"].isin(tickers_in_order[:n])
    return df.loc[keep_mask]

# Test metrics per universe size, keyed "<n>_stocks" -> [mse, mae]
stock_results = {}

for n in [5, 10, 20]:
    tf.keras.backend.clear_session()

    # Restrict both splits to their first n stocks.
    # NOTE(review): the subset is chosen independently for train and test, so
    # the two only contain the same stocks if both frames list them in the
    # same order -- confirm.
    tr = subset_stocks(train_df, n)
    te = subset_stocks(test_df, n)

    Xtr, ytr = make_sequences(tr, WINDOW, 1)
    Xte, yte = make_sequences(te, WINDOW, 1)

    # tf.random.set_seed alone only seeds TensorFlow; set_random_seed also
    # seeds Python's `random` and NumPy so repeated runs start from the same
    # random state.
    tf.keras.utils.set_random_seed(42)

    model = models.Sequential([
        layers.Input(shape=(WINDOW, Xtr.shape[2])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32),
        layers.Dropout(0.3),
        layers.Dense(16, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss="mse",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )

    # Stop once val_loss has not improved for 5 epochs; keep the best weights
    early_stop = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True
    )

    model.fit(
        Xtr, ytr,
        validation_split=0.1,
        epochs=30,
        batch_size=256,
        callbacks=[early_stop],
        verbose=1
    )

    # evaluate() returns [loss, mae] (loss plus the compiled MAE metric), not
    # a single MSE value -- unpack it so the names match the contents.
    mse, mae = model.evaluate(Xte, yte, verbose=0)

    stock_results[f"{n}_stocks"] = [mse, mae]

stock_results

Epoch 1/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 0.0018 - mae: 0.0279 - val_loss: 6.2207e-04 - val_mae: 0.0165
Epoch 2/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0228 - val_loss: 6.2122e-04 - val_mae: 0.0165
Epoch 3/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0226 - val_loss: 6.2089e-04 - val_mae: 0.0165
Epoch 4/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0225 - val_loss: 6.2092e-04 - val_mae: 0.0165
Epoch 5/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0225 - val_loss: 6.2082e-04 - val_mae: 0.0165
Epoch 6/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0225 - val_loss: 6.2080e-04 - val_mae: 0.0165
Epoch 7/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

{'5_stocks': [0.003379339352250099, 0.04530029371380806],
 '10_stocks': [0.003212945070117712, 0.044040340930223465],
 '20_stocks': [0.004214080050587654, 0.049018822610378265]}

In [22]:
# (number of stocks, prediction horizon) pairs to evaluate together
configs = [
    (5, 1),
    (10, 5),
    (20, 10),
]

# NOTE: this rebinds `results` (a dict in the horizon experiment cell) to a
# list with one record per configuration.
results = []

for n_stocks, horizon in configs:
    tr = subset_stocks(train_df, n_stocks)
    te = subset_stocks(test_df, n_stocks)

    Xtr, ytr = make_sequences(tr, WINDOW, horizon)
    Xte, yte = make_sequences(te, WINDOW, horizon)

    tf.keras.backend.clear_session()
    # tf.random.set_seed alone only seeds TensorFlow; set_random_seed also
    # seeds Python's `random` and NumPy so repeated runs start from the same
    # random state.
    tf.keras.utils.set_random_seed(42)

    model = models.Sequential([
        layers.Input(shape=(WINDOW, Xtr.shape[2])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32),
        layers.Dropout(0.3),
        layers.Dense(16, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(
        optimizer=optimizers.Adam(1e-3),
        loss="mse",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )

    # Early stopping on val_loss (patience 5), restoring the best weights;
    # validation_split uses the last 10% of the samples, taken before shuffling.
    model.fit(
        Xtr, ytr,
        validation_split=0.1,
        epochs=30,
        batch_size=256,
        callbacks=[callbacks.EarlyStopping(
            monitor="val_loss",
            patience=5,
            restore_best_weights=True
        )],
        verbose=1
    )

    # evaluate() returns [loss, mae]
    mse, mae = model.evaluate(Xte, yte, verbose=0)

    results.append({
        "Stocks": n_stocks,
        "Horizon": horizon,
        "MSE": mse,
        "MAE": mae
    })

pd.DataFrame(results)

Epoch 1/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 0.0032 - mae: 0.0338 - val_loss: 6.7490e-04 - val_mae: 0.0176
Epoch 2/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.0012 - mae: 0.0235 - val_loss: 6.2417e-04 - val_mae: 0.0165
Epoch 3/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0230 - val_loss: 6.2282e-04 - val_mae: 0.0164
Epoch 4/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0227 - val_loss: 6.2214e-04 - val_mae: 0.0164
Epoch 5/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0226 - val_loss: 6.2108e-04 - val_mae: 0.0164
Epoch 6/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 0.0011 - mae: 0.0225 - val_loss: 6.2097e-04 - val_mae: 0.0164
Epoch 7/30
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

Unnamed: 0,Stocks,Horizon,MSE,MAE
0,5,1,0.003386,0.045362
1,10,5,0.003617,0.048338
2,20,10,0.005077,0.056044
