# LSTM price prediction model

In [1]:
import pandas as pd
import numpy as np
import requests, time
from pathlib import Path
from datetime import datetime

from sklearn.metrics import mean_squared_error

from functions import *

%matplotlib inline

## Data fetch/import and cleanup

### Stocks history dataframe for training and validation

In [2]:
# load csv into dataframe
df = pd.read_csv(
    Path('../data/stocks_history.csv'),
    index_col='Unnamed: 0',
    infer_datetime_format=True,
    parse_dates=True
)
df.head()

Unnamed: 0,MSFT_open,MSFT_high,MSFT_low,MSFT_close,AMD_open,AMD_high,AMD_low,AMD_close,TSLA_open,TSLA_high,...,JNJ_low,JNJ_close,REGN_open,REGN_high,REGN_low,REGN_close,GILD_open,GILD_high,GILD_low,GILD_close
2010-06-29,24.13,24.2,23.11,23.31,7.93,7.93,7.41,7.48,19.0,25.0,...,58.68,59.24,23.67,23.95,22.86,22.98,35.4,35.61,34.74,34.97
2010-06-30,23.3,23.68,22.95,23.01,7.58,7.65,7.3,7.32,25.79,30.4192,...,58.94,59.06,23.05,23.47,22.32,22.32,34.83,35.13,34.26,34.28
2010-07-01,23.09,23.32,22.73,23.16,7.35,7.53,7.1,7.39,25.0,25.92,...,58.65,59.07,22.31,22.37,20.45,20.79,34.24,34.27,33.3,34.14
2010-07-02,23.36,23.48,23.05,23.27,7.45,7.48,7.02,7.17,23.0,23.1,...,58.85,59.08,21.06,21.88,20.75,21.61,34.38,35.16,34.18,34.87
2010-07-06,23.7,24.09,23.584,23.82,7.4,7.42,6.96,7.04,20.0,20.0,...,58.669,59.08,22.03,22.03,21.16,21.36,35.11,35.42,34.415,34.77


In [3]:
# drop all columns except closing prices
dropped_columns = [
    'MSFT_open',
    'MSFT_high',
    'MSFT_low',
    'AMD_open',
    'AMD_high',
    'AMD_low',
    'TSLA_open',
    'TSLA_high',
    'TSLA_low',
    'JNJ_open',
    'JNJ_high',
    'JNJ_low',
    'REGN_open',
    'REGN_high',
    'REGN_low',
    'GILD_open',
    'GILD_high',
    'GILD_low'
]
df.drop(columns=dropped_columns, inplace=True)
df.head()

Unnamed: 0,MSFT_close,AMD_close,TSLA_close,JNJ_close,REGN_close,GILD_close
2010-06-29,23.31,7.48,23.89,59.24,22.98,34.97
2010-06-30,23.01,7.32,23.83,59.06,22.32,34.28
2010-07-01,23.16,7.39,21.96,59.07,20.79,34.14
2010-07-02,23.27,7.17,19.2,59.08,21.61,34.87
2010-07-06,23.82,7.04,16.11,59.08,21.36,34.77


### AAPL dataframe for testing

In [4]:
# fetch stock data
AAPL_df = get_stock_closing_prices(symbols=['AAPL'], name='AAPL_history')

In [5]:
AAPL_df.head()

Unnamed: 0,AAPL_open,AAPL_high,AAPL_low,AAPL_close,AAPL_volume
2000-07-10,54.09,58.25,53.75,57.13,7103500
2000-07-11,57.0,59.25,55.44,56.94,6391000
2000-07-12,58.13,58.94,56.38,58.88,4025600
2000-07-13,58.5,60.63,54.75,56.5,7958200
2000-07-14,57.13,59.0,56.88,57.69,3397800


In [6]:
# drop all columns except closing prices
dropped_columns = [
    'AAPL_open',
    'AAPL_high',
    'AAPL_low',
    'AAPL_volume'
]
AAPL_df.drop(columns=dropped_columns, inplace=True)
AAPL_df.head()

Unnamed: 0,AAPL_close
2000-07-10,57.13
2000-07-11,56.94
2000-07-12,58.88
2000-07-13,56.5
2000-07-14,57.69


In [7]:
AAPL_df.count()

AAPL_close    5034
dtype: int64

In [8]:
# check for NaN's
AAPL_df.isna().sum()

AAPL_close    0
dtype: int64

In [9]:
# drop NaN's
AAPL_df.dropna(inplace=True)
AAPL_df.count()

AAPL_close    5034
dtype: int64

In [10]:
AAPL_df.describe()

Unnamed: 0,AAPL_close
count,5034.0
unique,4506.0
top,15.0
freq,6.0


### Model parameters to train and test

Based on iterations performed earlier for 1-layer, 2-layer and 3-layer LSTM models, we've come up with 12 candidate models to test and pick one for our dashboard (6 1-layer LSTM models, and 6 2-layer LSTM models)

In [11]:
# create dataframes with results from previous iterations
# (manually created as the data wasn't saved to a csv before)
candidates_1lstm = {
    'stock':['AMD_close','GILD_close','JNJ_close','MSFT_close','REGN_close','TSLA_close'],
    'window size':[20,1,25,30,30,30],
    'dropout fraction':[0.1,0.2,0.1,0.2,0.35,0.25],
    'epochs':[10,10,10,10,10,10],
    'batch size':[50,10,10,10,50,10],
    'mse':[2.866193,3.692729,6.503621,8.395475,207.228028,997.440286],
    'rmse':[1.692983,1.921647,2.550220,2.897495,14.395417,31.582278]
}
df_candidates_1lstm = pd.DataFrame(candidates_1lstm)

In [12]:
df_candidates_1lstm

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,AMD_close,20,0.1,10,50,2.866193,1.692983
1,GILD_close,1,0.2,10,10,3.692729,1.921647
2,JNJ_close,25,0.1,10,10,6.503621,2.55022
3,MSFT_close,30,0.2,10,10,8.395475,2.897495
4,REGN_close,30,0.35,10,50,207.228028,14.395417
5,TSLA_close,30,0.25,10,10,997.440286,31.582278


In [13]:
candidates_2lstm = {
    'stock':['AMD_close','GILD_close','JNJ_close','MSFT_close','REGN_close','TSLA_close'],
    'window size':[50,1,50,50,50,30],
    'dropout fraction':[0.15,0.1,0.2,0.1,0.25,0.3],
    'epochs':[10,10,10,10,10,10],
    'batch size':[1,10,10,1,10,10],
    'mse':[2.222121,2.532451,5.871770,7.640492,192.700920,883.447214],
    'rmse':[1.490678,1.591368,2.423173,2.764144,13.881676,29.722840]
}
df_candidates_2lstm = pd.DataFrame(candidates_2lstm)

In [14]:
df_candidates_2lstm

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,AMD_close,50,0.15,10,1,2.222121,1.490678
1,GILD_close,1,0.1,10,10,2.532451,1.591368
2,JNJ_close,50,0.2,10,10,5.87177,2.423173
3,MSFT_close,50,0.1,10,1,7.640492,2.764144
4,REGN_close,50,0.25,10,10,192.70092,13.881676
5,TSLA_close,30,0.3,10,10,883.447214,29.72284


## Model training and validation

In [18]:
# 1lstm training and validation
for index, row in df_candidates_1lstm.iterrows():
    
    feature_column = df.columns.get_loc(row['stock'])
    target_column = df.columns.get_loc(row['stock'])
    window_size = row['window size']
    dropout_fraction = row['dropout fraction']
    batch_size = row['batch size']
    epochs = row['epochs']
    name = str(Path('./models'))+row['stock']+'_1lstm_model'
    
    one_lstm(
        df=df,
        feature_column=feature_column,
        target_column=target_column,
        window_size=window_size,
        batch_size=batch_size,
        dropout_fraction=dropout_fraction,
        epoch=epochs,
        name=name
    )


TypeError: one_lstm() missing 1 required positional argument: 'split_pct'