In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
# Load the historical data
data = pd.read_csv('historical_data.csv')

In [4]:
# Filter the required columns and sort by date and ticker
data = data[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(['Ticker', 'Date'], inplace=True)

In [6]:
# Encode the Ticker column
encoder = OneHotEncoder(sparse=False)
tickers_encoded = encoder.fit_transform(data[['Ticker']])



In [8]:
# Add the encoded tickers as additional features
encoded_columns = [f'Ticker_{i}' for i in range(tickers_encoded.shape[1])]
encoded_df = pd.DataFrame(tickers_encoded, columns=encoded_columns, index=data.index)
data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)

In [10]:
# Scale numerical features (Open, High, Low, Close, Volume)
scaler = MinMaxScaler(feature_range=(0, 1))
numerical_features = ['Open', 'High', 'Low', 'Close', 'Volume']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [11]:
# Prepare data for LSTM
def preprocess_data(data, n_steps=10):
    X, y = [], []
    feature_columns = numerical_features + encoded_columns
    for ticker in data['Ticker'].unique():
        ticker_data = data[data['Ticker'] == ticker]
        ticker_data = ticker_data[feature_columns].values
        for i in range(n_steps, len(ticker_data)):
            X.append(ticker_data[i - n_steps:i, :])  # Last n_steps rows as features
            y.append(ticker_data[i, 3])  # Predict 'Close' price
    return np.array(X), np.array(y)

In [13]:
# Define the number of timesteps
n_steps = 10

In [14]:
# Preprocess the data
X, y = preprocess_data(data, n_steps=n_steps)

In [18]:
print(X[1])
print(y[1])

[[0.045787   0.04652913 0.04590719 ... 0.         0.         0.        ]
 [0.04625588 0.04830898 0.04682727 ... 0.         0.         0.        ]
 [0.04754531 0.04756867 0.04547844 ... 0.         0.         0.        ]
 ...
 [0.04104265 0.04205328 0.04042424 ... 0.         0.         0.        ]
 [0.04160407 0.04263937 0.04177307 ... 0.         0.         0.        ]
 [0.04190021 0.04246046 0.04210793 ... 0.         0.         0.        ]]
0.043157014098311686


In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Build the LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer for predicting 'Close' price
])

  super().__init__(**kwargs)


In [20]:
# Compile the model
model.compile(optimizer='adam', loss='mse')

In [21]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 9.1867e-04 - val_loss: 7.9593e-05
Epoch 2/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 1.9848e-04 - val_loss: 4.8957e-05
Epoch 3/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 1.5674e-04 - val_loss: 5.5863e-05
Epoch 4/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 1.3140e-04 - val_loss: 3.3574e-05
Epoch 5/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 1.0849e-04 - val_loss: 9.4034e-05
Epoch 6/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 1.1194e-04 - val_loss: 4.4070e-05
Epoch 7/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 8.6519e-05 - val_loss: 5.7141e-05
Epoch 8/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 1.0119e-0

In [22]:
# Evaluate the model on the test set
loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")


[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 5.3402e-05
Test Loss: 5.241185863269493e-05


In [23]:
# Make predictions
predictions = model.predict(X_test)

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [24]:
# Inverse scale the predictions and actual values
y_test_scaled = scaler.inverse_transform(
    np.concatenate((np.zeros((y_test.shape[0], 4)), y_test.reshape(-1, 1)), axis=1))[:, -1]
predictions_scaled = scaler.inverse_transform(
    np.concatenate((np.zeros((predictions.shape[0], 4)), predictions), axis=1))[:, -1]

In [40]:
mae = mean_absolute_error(predictions, y_test)
mape = mean_absolute_percentage_error(predictions, y_test)

# Define a percentage threshold for accuracy
threshold_percentage = 5  # 5% tolerance

# Calculate percentage errors
percentage_errors = np.abs((y_test_scaled - predictions_scaled) / y_test_scaled) * 100

# Count predictions within the threshold
#acc = np.mean(percentage_errors <= threshold_percentage) * 100

acc = 1 - mape


In [41]:
print(f"Mean Absolute Error = {mae}")
print(f"Mean Absolute Percentage Error = {mape* 100:.2f}%")
print(f"Accuracy = {acc * 100:.2f}%")

Mean Absolute Error = 0.004911069799739158
Mean Absolute Percentage Error = 16.37%
Accuracy = 83.63%
