# RNN Stock Price Forecasting

This notebook demonstrates a complete pipeline for one-step-ahead stock price prediction using recurrent neural networks built with TensorFlow/Keras.

**Pipeline overview**

1. Setup and imports  
2. Load and explore the NYSE price dataset  
3. Prepare normalized rolling windows for training/validation/testing  
4. Train a stacked GRU model  
5. Evaluate predictions with plots and directional-accuracy metrics

Dataset: Kaggle "New York Stock Exchange" (`prices-split-adjusted.csv`). The code expects the file at `./prices-split-adjusted.csv/prices-split-adjusted.csv`, i.e. the downloaded archive extracted into a directory named after the CSV.


# 1. Setup and Imports

Configure libraries, random seeds, and global settings used throughout the notebook.


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Seed both NumPy and TensorFlow so repeated runs start from the same random state.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

# Percentage of the windowed samples held out for validation and for testing.
VALID_SPLIT_PCT, TEST_SPLIT_PCT = 10, 10

# Price columns fed to the model. The position of a name in this list is its
# column index in every array derived from these features.
FEATURES = ['open', 'high', 'low', 'close']

# Show where the notebook is running and a few entries of that directory.
cwd = os.getcwd()
print(f"Working directory: {cwd}")
print(f"Sample files: {os.listdir(cwd)[:5]}")


# 2. Load Data

Load daily prices from the Kaggle NYSE dataset, inspect the available tickers, and select one symbol to model. Adjust `ticker` as needed.


In [None]:
DATA_PATH = "prices-split-adjusted.csv/prices-split-adjusted.csv"

# Fail early with an actionable message if the dataset has not been extracted.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Expected to find {DATA_PATH}. Please download and extract the Kaggle dataset."
    )

df = pd.read_csv(DATA_PATH)

print(f"Rows: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")

symbols = sorted(df['symbol'].unique())
print(f"Number of tickers: {len(symbols)}")
print(f"Sample tickers: {symbols[:10]}")

ticker = "EQIX"

if ticker not in symbols:
    raise ValueError(f"Ticker {ticker} not found in dataset.")

# Put the ticker's rows in chronological order. The row index only reflects
# the order of lines in the CSV, so sort on the parsed `date` column whenever
# it is available and fall back to the index order otherwise.
ticker_rows = df.loc[df["symbol"] == ticker]
if "date" in ticker_rows.columns:
    ticker_rows = (
        ticker_rows.assign(date=pd.to_datetime(ticker_rows["date"]))
                   .sort_values("date", kind="stable")
    )
else:
    ticker_rows = ticker_rows.sort_index()

stock = ticker_rows[FEATURES].reset_index(drop=True)

# Fit the scaler on the leading (training) share of the series only, so the
# min/max of the validation and test periods do not leak into training.
# The later split is applied to windows rather than rows, so this row count
# approximates the training span. Values outside the fitted range can
# therefore fall outside [0, 1] after transformation.
train_rows = int(len(stock) * (100 - VALID_SPLIT_PCT - TEST_SPLIT_PCT) / 100)
if train_rows <= 0:
    raise ValueError("Split configuration leaves no data for fitting the scaler.")

scaler = MinMaxScaler()
scaler.fit(stock.iloc[:train_rows])
stock_scaled = pd.DataFrame(
    scaler.transform(stock),
    columns=FEATURES
)

stock_scaled.head()


# 3. Prepare Data

Create normalized rolling windows to build supervised sequences and split them chronologically into training, validation, and test sets.


In [None]:
def build_windows(values: np.ndarray, seq_len: int):
    """Create supervised (input, target) pairs from rolling windows.

    Every run of `seq_len` consecutive rows of `values` forms one window: its
    first `seq_len - 1` rows are the model input and its final row is the
    target.

    Args:
        values: Array shaped [time, features].
        seq_len: Window length including the target row; must be at least 2.

    Returns:
        Tuple `(x, y)` of float32 arrays with shapes
        `[n_windows, seq_len - 1, features]` and `[n_windows, features]`,
        where `n_windows = len(values) - seq_len + 1`.

    Raises:
        ValueError: If `seq_len` is below 2 or `values` has fewer than
            `seq_len` rows.
    """
    if seq_len < 2:
        raise ValueError("seq_len must be at least 2 (one input step plus the target).")
    if len(values) < seq_len:
        raise ValueError("Not enough observations to build training windows.")
    # The `+ 1` keeps the final window, which ends on the last observation.
    n_windows = len(values) - seq_len + 1
    windows = np.array(
        [values[start:start + seq_len] for start in range(n_windows)],
        dtype=np.float32,
    )
    x = windows[:, :-1, :]
    y = windows[:, -1, :]
    return x, y


def split_data(x: np.ndarray, y: np.ndarray, valid_pct: int, test_pct: int):
    """Cut the samples into consecutive train, validation and test segments.

    The validation and test sizes are `valid_pct` and `test_pct` percent of
    the number of samples in `x`, each rounded to an integer; whatever is
    left at the front becomes the training segment. Order is preserved, so
    the segments follow each other along the first axis.

    Returns:
        `(x_train, y_train, x_valid, y_valid, x_test, y_test)`.

    Raises:
        ValueError: If the requested percentages leave no training samples.
    """
    n_samples = x.shape[0]
    n_valid = int(round(valid_pct / 100 * n_samples))
    n_test = int(round(test_pct / 100 * n_samples))

    # Index where training ends and validation begins.
    valid_start = n_samples - n_valid - n_test
    if valid_start <= 0:
        raise ValueError("Split configuration leaves no data for training.")
    test_start = valid_start + n_valid

    train_part = slice(None, valid_start)
    valid_part = slice(valid_start, test_start)
    test_part = slice(test_start, None)
    return (
        x[train_part], y[train_part],
        x[valid_part], y[valid_part],
        x[test_part], y[test_part],
    )


# Window length: seq_len - 1 input steps followed by one target step.
seq_len = 20

x_all, y_all = build_windows(stock_scaled.values, seq_len)
(
    x_train, y_train,
    x_valid, y_valid,
    x_test, y_test,
) = split_data(x_all, y_all, VALID_SPLIT_PCT, TEST_SPLIT_PCT)

# Print the shape of every split as a quick sanity check.
split_arrays = (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_valid", x_valid),
    ("y_valid", y_valid),
    ("x_test", x_test),
    ("y_test", y_test),
)
for label, arr in split_arrays:
    print(f"{label}.shape = {arr.shape}")


In [None]:
# Overlay every scaled feature series on one axis, one colour per feature.
fig, ax = plt.subplots(figsize=(15, 5))
line_colors = ("red", "green", "blue", "black")
for column, line_color in zip(FEATURES, line_colors):
    ax.plot(stock_scaled[column].values, color=line_color, label=column)

ax.set_title(f"{ticker} - normalized feature history")
ax.set_xlabel("Trading days")
ax.set_ylabel("Scaled price")
ax.legend(loc="best")
fig.tight_layout()
plt.show()


# 4. Model Training

Train a two-layer GRU network to predict the next day's normalized prices, using early stopping on the validation loss to avoid overfitting.


In [None]:
# Two stacked GRU layers followed by a linear layer that emits one value per
# feature. The input shape is declared with an explicit Input object rather
# than the `input_shape=` layer argument, which newer Keras releases deprecate.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(seq_len - 1, len(FEATURES))),
    tf.keras.layers.GRU(
        units=200,
        activation="tanh",
        return_sequences=True  # pass the full sequence on to the second GRU
    ),
    tf.keras.layers.GRU(
        units=200,
        activation="tanh",
        return_sequences=False  # only the final state feeds the Dense head
    ),
    tf.keras.layers.Dense(len(FEATURES))
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse"
)

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    x_train, y_train,
    epochs=100,
    batch_size=50,
    validation_data=(x_valid, y_valid),
    callbacks=[early_stop],
    verbose=1,
    shuffle=False  # keep the samples in their original order
)

# Predictions for every split, used by the evaluation section.
y_train_pred = model.predict(x_train)
y_valid_pred = model.predict(x_valid)
y_test_pred = model.predict(x_test)


# 5. Evaluate Predictions

Visualise forecasts against actual normalized prices and report mean squared error, mean absolute error, and directional accuracy (how often the predicted sign of close minus open matches the actual sign).


In [None]:
# Column of the close price within FEATURES. Looked up by name so it cannot
# drift from the feature order (a literal 1 would select 'high').
target_feature_idx = FEATURES.index("close")

def directional_accuracy(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    open_idx: int = 0,
    close_idx: int = 3,
) -> float:
    """Return the share of samples whose predicted daily direction is correct.

    The direction of a sample is the sign of its close value minus its open
    value. A prediction counts as correct when its sign equals the sign of
    the true sample.

    Args:
        y_true: Array shaped [samples, features] with the actual values.
        y_pred: Array of the same shape with the predicted values.
        open_idx: Column holding the open price. The default matches the
            ['open', 'high', 'low', 'close'] ordering of FEATURES.
        close_idx: Column holding the close price, with the same default
            ordering.

    Returns:
        Fraction in [0, 1], or NaN when `y_true` is empty.
    """
    if len(y_true) == 0:
        return float("nan")
    true_direction = np.sign(y_true[:, close_idx] - y_true[:, open_idx])
    pred_direction = np.sign(y_pred[:, close_idx] - y_pred[:, open_idx])
    # Convert the NumPy scalar so the return value matches the annotation.
    return float(np.mean(true_direction == pred_direction))

# One (name, targets, predictions) triple per data split, in time order.
splits = list(zip(
    ("Train", "Validation", "Test"),
    (y_train, y_valid, y_test),
    (y_train_pred, y_valid_pred, y_test_pred),
))

# Map every split to its (start, end) position on a shared x-axis by laying
# the splits end to end.
timelines = {}
cursor = 0
for label, targets, _ in splits:
    timelines[label] = (cursor, cursor + len(targets))
    cursor += len(targets)

fig, ax = plt.subplots(figsize=(15, 5))

# Draw all actual series first, then all predictions, so the line order (and
# with it the colour cycle and legend order) stays targets-then-predictions.
for label, actual, _ in splits:
    first, last = timelines[label]
    if len(actual) > 0:
        ax.plot(
            range(first, last),
            actual[:, target_feature_idx],
            label=f"{label.lower()} target"
        )

for label, _, predicted in splits:
    first, last = timelines[label]
    if len(predicted) > 0:
        ax.plot(
            range(first, last),
            predicted[:, target_feature_idx],
            linestyle="--",
            label=f"{label.lower()} prediction"
        )

ax.set_title("Normalized close price: actual vs. predicted")
ax.set_xlabel("Time steps")
ax.set_ylabel("Scaled price")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

# Close-up of the test segment only; skipped when the test split is empty.
if len(y_test) > 0:
    test_start, test_end = timelines["Test"]
    test_steps = range(test_start, test_end)

    fig, ax = plt.subplots(figsize=(7, 4))
    test_series = (
        (y_test, "test target", "black"),
        (y_test_pred, "test prediction", "green"),
    )
    for series, series_label, series_color in test_series:
        ax.plot(
            test_steps,
            series[:, target_feature_idx],
            label=series_label,
            color=series_color,
        )
    ax.set_title("Test window: normalized close price")
    ax.set_xlabel("Time steps")
    ax.set_ylabel("Scaled price")
    ax.legend(loc="best")
    fig.tight_layout()
    plt.show()

# Per-split error metrics on the column selected by target_feature_idx, plus
# the directional accuracy computed from the full feature arrays.
for label, actual, predicted in splits:
    if len(actual) == 0:
        print(f"{label}: no samples available.")
    else:
        actual_col = actual[:, target_feature_idx]
        predicted_col = predicted[:, target_feature_idx]
        mse = mean_squared_error(actual_col, predicted_col)
        mae = mean_absolute_error(actual_col, predicted_col)
        dir_acc = directional_accuracy(actual, predicted)
        print(f"{label}: MSE={mse:.6f}, MAE={mae:.6f}, directional accuracy={dir_acc:.2%}")


## Next Steps

- Try alternative architectures such as deeper GRUs, LSTMs, or convolutional hybrids and compare validation metrics.
- Incorporate additional features (e.g., volume or technical indicators) after revisiting the preprocessing pipeline.
- Persist the fitted scaler and invert the predictions to interpret results in the original price units.
