# Prepare and download stock data

## Download stock data
Get Data of stocks out of the snp500 index from yfinance

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import os

from keras.utils import timeseries_dataset_from_array


from ta import add_all_ta_features # add all here select only needed one in training later


2024-05-11 17:21:12.204117: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 17:21:12.244598: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
projectDir = os.getcwd()
dataPath = os.path.join(projectDir, "data")


In [3]:
tickers = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
tickers = tickers.Symbol.to_list()

In [4]:
trainDataStart = "2015-01-01"
trainDataEnd = "2020-01-01"
valDataStart = "2020-01-02"
valDataEnd = "2022-01-01"
testDataStart = "2022-01-02"
testDataEnd = "2024-05-01"

movingAverageSizes = [5,15,200]

sequenceLength = 20

### Additional features

In [5]:
def shiftDateXDaysEarlier(dateStr, numDaysEarlier):
    date = dt.datetime.strptime(dateStr, "%Y-%m-%d")
    date = date - dt.timedelta(days=numDaysEarlier)
    return date.strftime("%Y-%m-%d")

In [6]:
def addMovingAverages(data, maSizes):
    for size in maSizes:
        data[f"MovingAvg_{size}"] = data["Close"].rolling(size).mean()
    return data
    

In [7]:
def addRSI(data):
    return data

In [9]:
def dataPreprocessing(data, startDate, movingAverageSizes):
    try:
        data = add_all_ta_features(data,open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)
        data = data[startDate:]
    except:
        return None,None
    # data = addMovingAverages(data, movingAverageSizes)
    # data = addCrossoverPoints(data)
    # data = addRSI(data)
    data = data.dropna()
    if not data.empty:
        featurelist = data.columns
        data = MinMaxScaler().fit_transform(data)
    else:
        return None, None
    return data, featurelist

### Download, preprocess data  

In [10]:
def getStockData(tickers, startDate, endDate, movingAverageSizes):
    shiftedStartDate = shiftDateXDaysEarlier(startDate, max(movingAverageSizes)*1.5)

    stockDataList = []
    featureList = []
    
    for ticker in tickers:
        data = yf.download(ticker,shiftedStartDate, endDate, auto_adjust=True, keepna=False, progress=False, threads=8)
        if not data.empty:
            data, features = dataPreprocessing(data, startDate, movingAverageSizes)
            
            if data is not None:
                stockDataList.append(data)
                if len(featureList) == 0:
                    featureList = features
    
    print(f"Number of stocks in dataset: {len(stockDataList)}")
    return stockDataList, featureList

Train Test Validation Split

In [11]:
# target one of 'Open', 'High', 'Low', 'Close', 'Volume', 'MovingAvg_5', 'MovingAvg_15', 'MovingAvg_200'
def getTimeWindowsFromStock(data, sequenceLength, features, target = "Close"):
    generatedTimeseries = timeseries_dataset_from_array(data, targets=None, batch_size=None, sequence_length=sequenceLength + 1)
    X = np.zeros((len(generatedTimeseries), sequenceLength, data.shape[1]))
    Y = np.zeros((len(generatedTimeseries)))
    for i, timeWindow in enumerate(generatedTimeseries):
        X[i,:,:] = timeWindow[:-1,:]
        Y[i] = timeWindow[-1, features.get_loc(target)]
    
    return X, Y

In [12]:
def sliceStockData(data, features):
    X = []
    Y = []

    for stock in data:
        XNew, YNew = getTimeWindowsFromStock(stock, sequenceLength, features)
        X.append(XNew)
        Y.append(YNew)

    # Combine along first axis    
    X = np.concatenate(X, axis=0)
    Y = np.concatenate(Y, axis=0)

    # add another dimension after last one
    X = np.expand_dims(X, axis=-1)
    Y = np.expand_dims(Y, axis=-1)  
    
    return X,Y

In [13]:
# list of preprocessed but not sliced stocks
trainData, features = getStockData(tickers, trainDataStart, trainDataEnd, movingAverageSizes)
valData, features = getStockData(tickers, valDataStart, valDataEnd, movingAverageSizes)
testData, features = getStockData(tickers, testDataStart, testDataEnd, movingAverageSizes)

  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2

1 Failed download:
['ABNB']: Exception("%ticker%: Data doesn't exist for startDate = 1394168400, endDate = 1577854800")
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  return bound(*args, **kwds)
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._p

Number of stocks in dataset: 492


  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] 

Number of stocks in dataset: 495


  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] = high2
  self._psar[i] 

IndexError: index 14 is out of bounds for axis 0 with size 9

In [None]:
x_train, y_train = sliceStockData(trainData, features)
x_val, y_val = sliceStockData(valData, features)
x_test, y_test = sliceStockData(testData, features)

2024-05-03 16:09:17.717744: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:18.542403: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:19.520208: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:20.509898: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:21.563933: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:22.552786: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-03 16:09:23.498023: W tensorflow/core/framework/local_rendezvous.cc:404] L

In [None]:
np.save(os.path.join(dataPath,"x_train.npy"), x_train)
np.save(os.path.join(dataPath,"y_train.npy"), y_train)
np.save(os.path.join(dataPath,"x_val.npy"), x_val)
np.save(os.path.join(dataPath,"y_val.npy"), y_val)
np.save(os.path.join(dataPath,"x_test.npy"), x_test)
np.save(os.path.join(dataPath,"y_test.npy"), y_test)

### Example loading all the data into tensorflow datasets

In [216]:
batchSize = 64

x_trainLoaded = np.load(os.path.join(dataPath,"x_train.npy"))
y_trainLoaded = np.load(os.path.join(dataPath,"y_train.npy"))
x_valLoaded = np.load(os.path.join(dataPath,"x_val.npy"))
y_valLoaded = np.load(os.path.join(dataPath,"y_val.npy"))
x_testLoaded = np.load(os.path.join(dataPath,"x_test.npy"))
y_testLoaded = np.load(os.path.join(dataPath,"y_test.npy"))

trainDataset = tf.data.Dataset.from_tensor_slices((x_trainLoaded, y_trainLoaded)).batch(batchSize)
valDataset = tf.data.Dataset.from_tensor_slices((x_valLoaded, y_valLoaded)).batch(batchSize)
testDataset = tf.data.Dataset.from_tensor_slices((x_testLoaded, y_testLoaded)).batch(batchSize)

In [228]:
print(f"Shape: {x_trainLoaded.shape}")
print(f"Shape: {y_trainLoaded.shape}")
print(f"Shape: {x_valLoaded.shape}")
print(f"Shape: {y_valLoaded.shape}")
print(f"Shape: {x_testLoaded.shape}")
print(f"Shape: {y_testLoaded.shape}")

Shape: (593810, 20, 8, 1)
Shape: (593810, 1)
Shape: (239052, 20, 8, 1)
Shape: (239052, 1)
Shape: (279125, 20, 8, 1)
Shape: (279125, 1)
