In [23]:
import pandas as pd
import numpy as np
import datetime as dt
import os


In [8]:
# Ticker symbols available for processing.
tickers = [
    "SPY",
    "QQQ",
    "DIA",
]

In [76]:
# Build a rolling-window feature matrix from 1-minute bars of one ticker.
#
# Pipeline:
#   1. load the raw CSV and split the timestamp into date and time-of-day
#   2. pivot to a (time-of-day x date) grid of 'high' prices, forward-fill gaps
#      and keep only the extended-hours session
#   3. transpose so each row is one trading day
#   4. keep every `time_step`-th minute column
#   5. for each day d, lay days d, d+1, ..., d+window_size-1 side by side
#   6. subtract the last in-session price of day d-1 from every value in row d
#   7. keep every `window_stride`-th window and write the result to CSV

# we are using extended hours trading data, which runs from 4:00 am - 8:00 pm
time_start = dt.time(4, 0, 0)
time_end = dt.time(20, 0, 0)
window_size = 15    # in days, i.e. window_size = 15 means each window spans 15 days
window_stride = 1   # in days, i.e. window_stride = 1 means the window start moves 1 day at a time
time_step = 60      # in minutes, i.e. time_step = 10 keeps stock data every 10 minutes

if window_size < 1 or window_stride < 1 or time_step < 1:
    raise ValueError("window_size, window_stride and time_step must all be >= 1")

# load data
ticker = tickers[0]
stock_df = pd.read_csv(f"tickers_raw/{ticker}_1min.csv", parse_dates=['time'])

# separate date and time-of-day
stock_df['date'] = stock_df['time'].dt.date
stock_df['time'] = stock_df['time'].dt.time

# pivot data by day to clean data: one row per minute, one column per date
stock_df = stock_df.pivot(index='time', columns='date', values='high')
stock_df = stock_df.ffill()  # fill in NaN by propagating the last valid observation forward in time
in_session = (stock_df.index >= time_start) & (stock_df.index <= time_end)
stock_df = stock_df[in_session]  # keep only data inside the session bounds

# transform data so that each row is a different day
stock_df = stock_df.T
num_days = stock_df.shape[0]

# retain the last in-session price of every day (taken before the minute
# columns are thinned out); it is used below to normalize the following day.
# The final `window_size` days are never a "previous day" of a complete window.
# TODO: an analogous opening-price series is needed once targets are added.
stock_df_close = stock_df.iloc[:-(window_size), -1].copy()

# drop columns to be within correct time_step frequency
keep_cols = range(0, stock_df.shape[1], time_step)
stock_df = stock_df.iloc[:, keep_cols]
num_mins = stock_df.shape[1]

# create rolling window: block k holds the data of the day k rows further down,
# so row d ends up as [day d | day d+1 | ... | day d+window_size-1].
# shift(-k) pulls later days up; rows without a k-th following day become NaN.
day_blocks = [stock_df.shift(-day_offset) for day_offset in range(window_size)]
stock_df = pd.concat(day_blocks, axis=1)

# drop the last window_size-1 rows, whose windows run past the end of the data.
# An explicit end position is used because a slice ending at -0 would be empty
# when window_size == 1.
stock_df = stock_df.iloc[:max(num_days - (window_size - 1), 0), :]

# normalize data using closing price of previous trading day:
# the first day has no previous day, so it is dropped, and the remaining rows
# line up positionally with the closes of days 0 .. n-1.
stock_df = stock_df.iloc[1:, :]
stock_df_close.index = stock_df.index
stock_df = stock_df.sub(stock_df_close, axis=0)

# move the window start by `window_stride` days between consecutive rows
stock_df = stock_df.iloc[::window_stride, :]

# TODO: add targets (e.g. 1 if the next open is >= the last value of the window, else 0)

os.makedirs("tickers_processed", exist_ok=True)  # make sure the output folder exists
stock_df.to_csv(f"tickers_processed/{ticker}_no_targets.csv", index=True)


In [14]:
# Show how many columns (features per row) the processed frame has.
# NOTE(review): the execution counters in this notebook are out of order, so the
# recorded output may not correspond to the current preprocessing settings - re-run to confirm.
print(len(stock_df.columns))

961


In [10]:
# labels data as up or down

# file_path = "tickers"
# files = os.listdir(file_path)

# for file in files:
#     ticker = file.split(".")[0]
#     security_df = pd.read_csv(f"{file_path}/{file}", parse_dates=['date'])
#     security_df = security_df[['date', 'adj_close']].set_index('date')
#     security_df_diff = security_df.iloc[1:] - security_df.iloc[:-1].values
#     security_df_diff.columns = ['diff']
#     security_df_diff = security_df_diff.applymap(lambda x: 1 if x >= 0 else 0)
#     security_df_diff.to_csv(f"tickers_bool/{ticker}_bool.csv", index=True,)
#     break