In [1]:
import numpy as np
import pandas_datareader.data as web
import pandas as pd
import pandas_ta as ta
import pandas_market_calendars as mcal
from dotenv import load_dotenv
from datetime import datetime, timedelta
import matplotlib
import matplotlib.pyplot as plt
import yfinance as yf
import os

from sklearn import preprocessing

In [2]:
features = pd.read_csv('RoboStockFeatures.csv')
prices = pd.read_csv('RoboStockPrices.csv')

In [3]:
while features.isnull().values.any(): # check for missing data
    features.fillna(method='ffill',inplace=True)

while prices.isnull().values.any():
    prices.fillna(method='ffill',inplace=True)

In [4]:
features[features.isna().any(axis=1)]

Unnamed: 0,date,symbol,f01,f02,f03,f04,f05,f06,f07,f08,...,VIDYA_14,VTXP_14,VTXM_14,VWAP_D,VWMA_10,WCP,WILLR_14,WMA_10,ZL_EMA_10,ZS_30


In [5]:
prices[prices.isnull().any(axis=1)]

Unnamed: 0,date,symbol,open,high,low,close,volume


In [6]:
exclude_cols = ['month_'+str(num) for num in list(range(1,1+12))] + ['f20'] + ['date'] + ['symbol']
std_scalar = preprocessing.StandardScaler()
df = std_scalar.fit_transform(features[features.columns.difference(exclude_cols)])
df = pd.DataFrame(df,columns=features.columns.difference(exclude_cols))
df[exclude_cols] = features[exclude_cols].copy()

In [7]:
NUM_OF_YEARS_PRED = 1/2 # num of year to predict
NUM_OF_YEARS_DATA = 5 # num of years of data to use

In [8]:
from tqdm import tqdm
import os.path as path

def prep_data(X,y,target_col_name='close',n_past_steps=365,n_future_steps=30):
    X_sequence = []
    y_sequence = []

    X_filename = path.join(os.getcwd(), 'X_sequence.dat')
    if path.exists(X_filename):
        os.remove(X_filename)
    y_filename = path.join(os.getcwd(), 'Y_sequence.dat')
    if path.exists(y_filename):
        os.remove(y_filename)

    length = (X.groupby('symbol').ngroups)
    cols = int(length)
    for group_x, group_y, i in zip(X.groupby('symbol'), y.groupby('symbol'),tqdm(range(length),desc="Preparing data",ncols=cols)):
        
        if group_x[0] == group_y[0]:
            
            X = group_x[1].drop(['date','symbol'],axis=1).values
            y = group_y[1][target_col_name].values

            for i in range(n_past_steps, len(X) - n_future_steps + 1):                 
                X_sequence.append(X[i - n_past_steps:i, :])
                y_sequence.append(y[i:i + n_future_steps])
        else:
            print(group_x[0]," ",group_y[0]," SKIPPED")
            
    X_shape = (len(X_sequence),len(X_sequence[0]),len(X_sequence[0][0]))
    y_shape = (len(y_sequence),len(y_sequence[0]))
    
    X_mem = np.memmap(X_filename, mode='w+',shape=(X_shape),dtype='float64')
    y_mem = np.memmap(y_filename, mode='w+',shape=(y_shape),dtype='float64')

    for i in tqdm(range(len(X_sequence)),desc="Flushing X data",ncols=cols):
        X_mem[i] = X_sequence[i]
    X_mem.reshape(X_shape)
    X_mem.flush()
    
    for i in tqdm(range(len(y_sequence)),desc="Flushing y data",ncols=cols):
        y_mem[i] = y_sequence[i]
    y_mem.reshape(y_shape)
    y_mem.flush()

    print("X Shape : ",(len(X_sequence),len(X_sequence[0]),len(X_sequence[0][0])))
    print("Y Shape: ",(len(y_sequence),len(y_sequence[0])))

    shapes = {"X":X_shape, "y":y_shape}
    np.save('shapes.npy',shapes)

    print("DONE")
    del X_filename, y_filename
    
    return X_sequence, y_sequence

In [9]:
X,Y = prep_data(df,prices,target_col_name='close',n_past_steps=int(365*NUM_OF_YEARS_DATA),n_future_steps=int(365*NUM_OF_YEARS_PRED)) # Use past 5*365 days/data point to predict next 1*365 days of "target_col_name"(close)

Preparing data:  99%|███████████████████████████████████████████████████████████████▍| 119/120 [00:00<00:00, 122.27it/s]
Flushing X data: 100%|████████████████████████████████████████████████████████████| 66120/66120 [19:08<00:00, 57.59it/s]
Flushing y data: 100%|████████████████████████████████████████████████████████| 66120/66120 [00:00<00:00, 137763.61it/s]


X Shape :  (66120, 1825, 308)
Y Shape:  (66120, 182)
DONE


In [8]:
shapes = np.load('shapes.npy',allow_pickle=True)
X_shape = shapes.item()['X']
y_shape = shapes.item()['y']

In [9]:
X1 = np.memmap('X_sequence.dat', dtype='float64', mode='r+', shape=X_shape)
Y1 = np.memmap('Y_sequence.dat', dtype='float64', mode='r+', shape=y_shape)

In [10]:
for i in range(3):
    print(X1[i].shape)
    print(X1[i])
    print()

(1825, 308)
[[-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
   1.        ]
 [-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
   1.        ]
 [-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
  -1.        ]
 ...
 [-0.04418512 -0.04125727 -0.0405759  ...  0.          0.
   1.        ]
 [-0.0441725  -0.04124297 -0.04056139 ...  0.          0.
  -1.        ]
 [-0.04418331 -0.04121695 -0.04052937 ...  0.          0.
   1.        ]]

(1825, 308)
[[-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
   1.        ]
 [-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
  -1.        ]
 [-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
  -1.        ]
 ...
 [-0.0441725  -0.04124297 -0.04056139 ...  0.          0.
  -1.        ]
 [-0.04418331 -0.04121695 -0.04052937 ...  0.          0.
   1.        ]
 [-0.04415542 -0.04116639 -0.04047528 ...  0.          0.
   1.        ]]

(1825, 308)
[[-0.0452168  -0.04198384 -0.04124833 ...  0.          0.
  -1.        ]
 

In [11]:
for i in range(3):
    print(Y1[i].shape)
    print(Y1[i])
    print()

(182,)
[39.59664917 38.90419006 40.18181992 40.02577209 40.29885483 38.53358078
 38.16297531 38.71888351 39.67466736 38.85542297 35.27612686 34.93477631
 35.24686813 35.59797287 34.28133392 35.64673233 38.0459404  38.6896286
 39.96725082 39.01147079 37.76311493 37.8801384  36.80733109 37.66557312
 37.04138565 36.49523544 36.45622635 36.03685379 35.39315414 34.48613739
 32.75013351 31.28720474 32.2722435  33.07197189 34.17405319 35.4516716
 34.91526794 35.19809723 36.60251236 35.92956543 36.04659653 36.98286438
 35.56871414 35.3346405  34.48613739 36.51473618 36.36844635 34.73971939
 34.27158356 33.686409   32.13570404 30.83856964 34.08627701 36.17338943
 36.30992508 36.14413071 37.99716568 37.32422638 37.28520966 38.39704514
 39.15776825 38.42630386 39.38208389 37.79236603 37.79236603 39.70392227
 39.52837753 42.78583527 43.55630875 44.03420639 44.13172913 42.34695053
 40.58168793 40.4646492  36.00758743 36.77806473 40.09403992 41.08884048
 41.02056503 40.39637375 43.03940964 42.961391