<a href="https://colab.research.google.com/github/aidanjmaldonado/penny-stock-lstm/blob/main/penny_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [42]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import sqlite3
import requests
import sys
# from library.DataSetProcessor import DataSetProcessor
import matplotlib.pyplot as plt

In [43]:
# constants
SEQUENCE_LENGTH = 78 # 1 day long
PREDICTION_LENGTH = 78 # 1 day long
NUM_FEATURES = 3 # close, volume

# Create database

In [44]:
# download database from github repository
historical_url = "https://raw.githubusercontent.com/CSE-115-UCSC/penny-stock-lstm/main/historicaldata.db"
scrape_request = requests.get(historical_url)

try:
  # contingent on request status
  scrape_request = requests.get(historical_url)
  scrape_request.raise_for_status()

  # create local database from pull, name 'historicaldata.db'
  with open("historical.db", "wb") as db_file:
    db_file.write(scrape_request.content)
    
  print("Request to download database succeeded")


except:
  # report failed request status
  sys.stderr.write("Request to download database failed")

Request to download database succeeded


In [45]:
# connect to SQlite database
try:
    db = 'historical.db'
    sqliteConnection = sqlite3.connect(db)
    cursor = sqliteConnection.cursor()
    print(f'SQlite connected with {db}')

except:
    # report failed request status
    sys.stderr.write("Failed to connect to database")

SQlite connected with historical.db


In [46]:
# query all historical stock data from Database
try:
    query = f"SELECT * FROM all_historical;"
    cursor.execute(query)
    if cursor.fetchone() is None:
        raise Exception("No results")

    print(f"Success querying all historical")
    # turn SQlite Database into Pandas Dataframe
    data = pd.read_sql_query(query, sqliteConnection)

except:
    sys.stderr.write(f"Failed to select all historical")

Success querying all historical


In [47]:
# using the column 'time' (millisecond) add a new column 'dates' with datetime
dates = pd.to_datetime(data['time'], unit='ms')
dates = dates.dt.tz_localize('UTC').dt.tz_convert('US/Pacific')
dates = dates.dt.tz_localize(None)

# tickers array for checking sequence quality
tickers = data['ticker']

# Dataset Normalization

In [48]:
# normalized features (close, volume) table for training
normalized_data = pd.DataFrame(columns=['volume_weighted_average','volume', 'number_of_trades'])

In [49]:
data.head()

Unnamed: 0,id,ticker,volume,volume_weighted_average,open,close,high,low,time,number_of_trades
0,1,ACHR,269.0,3.0307,3.0301,3.03,3.0301,3.03,1657643400000,4
1,2,ACHR,2037.0,3.0362,3.04,3.04,3.04,3.035,1657643700000,35
2,3,ACHR,796.0,3.0364,3.04,3.035,3.04,3.035,1657644000000,17
3,4,ACHR,1795.0,3.0356,3.035,3.035,3.04,3.035,1657644300000,19
4,5,ACHR,10397.0,3.0286,3.035,3.025,3.035,3.0212,1657644600000,87


In [50]:
# create dictionary where each key is a stock ticker and the value is the table containing normalized data
data_by_ticker = {}
for ticker in data['ticker'].unique():
    data_by_ticker[ticker] = data[data['ticker'] == ticker].copy()
    data_by_ticker[ticker]['normalized_volume_weighted_average'] = data_by_ticker[ticker]['volume_weighted_average'] / data_by_ticker[ticker]['volume_weighted_average'].max() #Normalized closing price data

In [51]:
for key in data_by_ticker:
    # create a temporary DataFrame to hold the current data
    temp_df = pd.DataFrame({
        'volume_weighted_average': data_by_ticker[key]['normalized_volume_weighted_average'],
        'volume': data_by_ticker[key]['volume'],
        'number_of_trades': data_by_ticker[key]['number_of_trades']
    })
    
    # concatenate the temporary DataFrame to the normalized_data DataFrame
    normalized_data = pd.concat([normalized_data, temp_df], ignore_index=True)

# optionally, you can reset the index if needed
normalized_data.reset_index(drop=True, inplace=True)

  normalized_data = pd.concat([normalized_data, temp_df], ignore_index=True)


In [52]:
normalized_data.head()

Unnamed: 0,volume_weighted_average,volume,number_of_trades
0,0.389675,269.0,4
1,0.390383,2037.0,35
2,0.390408,796.0,17
3,0.390305,1795.0,19
4,0.389405,10397.0,87


# Train on all historical stock data, sequenced

In [53]:

"""Generate arrays filled with one-day-long sequences from the normalized dataset

Arguments:
    - data: stock dataset with 2 columns:
        - close prices normalized (0, 1) 
        - volume
    - dates: array of every milisecond timestamp converted to dateTime objects
    - tickers: array of every timestamp's corresponding ticker

Returns:
    - x: array of one-day-long sequences of the normalized dataset for training
    - y: array of one-day-long seuqences of the immediate day after for predicting

"""
def create_sequences(data, dates, tickers):
    
    # stores sequences to be returned
    xs, ys = [], []
    # index refers to the start of a day, therefore start of a 'sequence'
    index = 0 
     # keeps track of the number of valid sequences for debugging purposes
    count = 0

    # loop until the end of database, stopping 2 days in advance to make room for last 'context' day and it's corresponding 'prediction' day
    while index < len(data) - SEQUENCE_LENGTH - PREDICTION_LENGTH + 1:

        # Check if sequence is within a single day (day start == day end) and (ticker start == ticker end)
        if dates[index].date() == dates[index + SEQUENCE_LENGTH].date() and tickers[index] == tickers[index + SEQUENCE_LENGTH]:

            # append current day (index -> index+SEQ) to x, and append next day (index + SEQ -> index + SEQ + PRE) to y
            xs.append(data.iloc[index:index + SEQUENCE_LENGTH])  # Use past data for features
            ys.append(data.iloc[index + SEQUENCE_LENGTH:index + SEQUENCE_LENGTH + PREDICTION_LENGTH, 0])  # Only predict 'close' prices

            # move index to start of the next day
            index += SEQUENCE_LENGTH
            count += 1
        
        # move index to the start of the next 
        else: # Note: This is the discarding section, can be modified to be "imputed" via extending the last known close value until end of day.

            # move new_index to the start of the next day
            new_index = index
            while dates[new_index].date() == dates[new_index + 1].date():
                new_index += 1
            new_index += 1
            
            # once new_index reaches next morning, set index to match
            index = new_index

    # print the number of valid days found for debugging purposes, return arrays of sequences            
    print("Valid days:", count)
    return np.array(xs), np.array(ys)

# create sequences from normalized data
x, y = create_sequences(normalized_data, dates, tickers) #Creating the input and grouth truth data from create_sequences function

Valid days: 294


# Training Pipeline

In [54]:

# Split data into 80% / 20% training and testing groups
train_size = int(len(x) * 0.8)
x_train, x_test = x[:train_size], x[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Type adjustment string -> float
x_train = x_train.astype(np.float32)
y_train = y_train.astype(np.float32)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(SEQUENCE_LENGTH, NUM_FEATURES)))
# Need both layers because return_sequences will send its output to another LSTM layer which is required before sending to Dense layer 
model.add(LSTM(50))
 # Makes readable by NN, NN doesn't predict on sequences so it needs single dimension values
model.add(Dense(40, activation='relu'))
# Prevents overfitting
model.add(Dropout(0.1))
# Takes the results from the last LSTM layer and predicts the stock prices for PREDICTION_LENGTH steps ahead
model.add(Dense(PREDICTION_LENGTH)) 
#Compiles the model with an adam optimizer and a mean squared error loss function
model.compile(optimizer='adam', loss='mse') 

# Train the model with early stopping to prevent over fitting
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
model.fit(x_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/100


  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - loss: 0.1917 - val_loss: 0.0389
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.1463 - val_loss: 0.0366
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.1321 - val_loss: 0.0414
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.0984 - val_loss: 0.0440
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.0803 - val_loss: 0.0446
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 0.0779 - val_loss: 0.0375
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.0670 - val_loss: 0.0548
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 0.0601 - val_loss: 0.0431
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

<keras.src.callbacks.history.History at 0x31b1db310>

In [55]:
# Save the model weights to an external file
model.save('model.h5')

