In [1]:
import os
import random
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
# Set seed for reproductability
seed = 42
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

In [3]:
# Step 1: Load the historical data
data = pd.read_csv('historical_data.csv')

In [4]:
# Step 2: Filter the required columns and sort by date and ticker
data = data[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(['Ticker', 'Date'], inplace=True)

In [5]:
# Step 3: Encode the Ticker column
encoder = OneHotEncoder(sparse=False)
tickers_encoded = encoder.fit_transform(data[['Ticker']])



In [6]:
# Step 4: Add the encoded tickers as additional features
encoded_columns = [f'Ticker_{i}' for i in range(tickers_encoded.shape[1])]
encoded_df = pd.DataFrame(tickers_encoded, columns=encoded_columns, index=data.index)
data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)

In [7]:
# Step 5: Scale numerical features
scaler_features = MinMaxScaler(feature_range=(0, 1))
numerical_features = ['Open', 'High', 'Low', 'Volume']  # Exclude 'Close' from general scaling
data[numerical_features] = scaler_features.fit_transform(data[numerical_features])

# Separate scaler for 'Close' price
scaler_close = MinMaxScaler(feature_range=(0, 1))
data[['Close']] = scaler_close.fit_transform(data[['Close']])  # Scale only 'Close' separately

In [8]:
# Step 6: Preprocessing function
def preprocess_data(data, n_steps=10):
    X, y = [], []
    feature_columns = numerical_features + ['Close'] + encoded_columns  # Include 'Close' now

    for ticker in data['Ticker'].unique():
        ticker_data = data[data['Ticker'] == ticker]  # Ensure sequences stay within the same stock
        ticker_data = ticker_data[feature_columns].values

        for i in range(n_steps, len(ticker_data)):
            X.append(ticker_data[i - n_steps:i, :])  # Last n_steps rows as features
            y.append(ticker_data[i, -len(encoded_columns)-1])  # Predict 'Close' price

    return np.array(X), np.array(y)

In [9]:
# Step 7: Preprocess the data
X, y = preprocess_data(data, n_steps=10)

In [10]:
# Step 8: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Step 9: Build the LSTM model
model = Sequential([
    LSTM(50, activation="tanh", return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.15),
    LSTM(30, activation="tanh", return_sequences=True),
    Dropout(0.05),
    LSTM(20, activation="tanh", return_sequences=False),
    Dropout(0.01),
    Dense(1, activation="linear")
])

  super().__init__(**kwargs)


In [12]:
# Step 10: Compile the model
model.compile(optimizer='adam', loss='mse')

In [13]:
# Step 11: Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1)

Epoch 1/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - loss: 9.3873e-04 - val_loss: 1.7506e-04
Epoch 2/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 1.2243e-04 - val_loss: 9.7737e-05
Epoch 3/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 1.6359e-04 - val_loss: 3.4518e-05
Epoch 4/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 1.0430e-04 - val_loss: 6.5942e-05
Epoch 5/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 9.1930e-05 - val_loss: 2.9606e-05
Epoch 6/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 8.9725e-05 - val_loss: 3.2006e-05
Epoch 7/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - loss: 7.6743e-05 - val_loss: 2.7421e-05
Epoch 8/100
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - los

In [14]:
# Step 12: Make predictions
predictions = model.predict(X_test)

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [15]:
# Step 13: Inverse scale the predictions and actual values correctly
y_test_scaled = scaler_close.inverse_transform(y_test.reshape(-1, 1)).flatten()

predictions_scaled = scaler_close.inverse_transform(predictions.reshape(-1, 1)).flatten()

In [16]:
# Step 14: Evaluate the model
mae = mean_absolute_error(predictions_scaled, y_test_scaled)
mape = mean_absolute_percentage_error(predictions_scaled, y_test_scaled)

# Define a percentage threshold for accuracy
threshold_percentage = 5  # 5% tolerance
percentage_errors = np.abs((y_test_scaled - predictions_scaled) / y_test_scaled) * 100
acc = np.mean(percentage_errors <= threshold_percentage) * 100

# Alternative accuracy metric
acc2 = (1 - mape) * 100


In [17]:
# Step 15: Print performance metrics
print(f"Mean Absolute Error = {mae}")
print(f"Mean Absolute Percentage Error = {mape*100:.2f}%")
print(f"Accuracy with threshold = {acc:.2f}%")
print(f"Accuracy = {acc2:.2f}%")

Mean Absolute Error = 8.528863920876589
Mean Absolute Percentage Error = 14.55%
Accuracy with threshold = 44.97%
Accuracy = 85.45%
