In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import datetime
from sklearn.preprocessing import robust_scale
from tqdm.auto import tqdm

In [None]:
# Number of consecutive rows (time steps) that make up one input sequence.
T = 15

# Rows dated strictly before this limit are used for training; rows on or
# after it are used for validation.
date_limit_train_validation = datetime.datetime(2016, 5, 1)


def _relative_to_close(column):
    """Return a feature function: `column` divided by the same row's Close, minus 1."""
    def feature(df):
        return df[column] / df["Close"] - 1
    return feature


def _row_over_row_change(column):
    """Return a feature function: fractional change of `column` from the previous row."""
    def feature(df):
        return df[column].pct_change()
    return feature


def _moving_average_gap(window):
    """Return a feature function: `window`-row rolling mean of Adj Close over Adj Close, minus 1."""
    def feature(df):
        adj_close = df["Adj Close"]
        return (adj_close.rolling(window).mean() / adj_close) - 1
    return feature


def _raw_volume(df):
    """Return the Volume column unchanged."""
    return df["Volume"]


# Feature name -> function taking the price DataFrame and returning one value
# per row. Insertion order defines the column order of the model input.
features = {
    "preprocessed_open": _relative_to_close("Open"),
    "preprocessed_high": _relative_to_close("High"),
    "preprocessed_low": _relative_to_close("Low"),
    "preprocessed_close": _row_over_row_change("Close"),
    "preprocessed_adj_close": _row_over_row_change("Adj Close"),
    "preprocessed_5-day": _moving_average_gap(5),
    "preprocessed_10-day": _moving_average_gap(10),
    "preprocessed_15-day": _moving_average_gap(15),
    "preprocessed_20-day": _moving_average_gap(20),
    "preprocessed_25-day": _moving_average_gap(25),
    "preprocessed_30-day": _moving_average_gap(30),
    "preprocessed_volume": _raw_volume,
}



In [None]:
def labelling(df, up_threshold=0.55/100, down_threshold=-0.50/100):
    """Label each row by the next row's Adj Close movement and drop neutral rows, in place.

    The movement of a row is ``Adj Close[next row] / Adj Close[this row] - 1``.
    A ``"label"`` column is added with:

    * ``1``  when the movement is greater than ``up_threshold``
    * ``-1`` when the movement is lower than ``down_threshold``
    * ``0``  otherwise (this includes the last row, whose movement is NaN)

    Rows labelled ``0`` are then removed from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``"Adj Close"`` column. It is modified in place.
        "Next row" is positional, so rows must already be in chronological
        order for the label to mean "next day".
    up_threshold : float, optional
        Movement strictly above this value is labelled ``1``.
    down_threshold : float, optional
        Movement strictly below this value is labelled ``-1``.

    Returns
    -------
    None
    """
    # Keep the movement in a local Series instead of a scratch "temp" column,
    # so a caller column that happens to be named "temp" is never clobbered.
    next_row_movement = (df["Adj Close"].shift(-1) / df["Adj Close"]) - 1

    df["label"] = 0
    df.loc[next_row_movement > up_threshold, "label"] = 1
    df.loc[next_row_movement < down_threshold, "label"] = -1

    # NaN compares False against both thresholds, so the final row stays 0
    # and is removed together with the other neutral rows.
    df.drop(df[df["label"] == 0].index, inplace=True)
def generate_sequences(df, features_columns, seq_len=None):
    """Cut a DataFrame into overlapping fixed-length windows of features.

    Window ``i`` covers rows ``i .. i + seq_len - 1``; its target is the
    ``"label"`` value of the window's last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``features_columns`` plus ``"label"``.
    features_columns : iterable of str
        Feature column names, in the order they should appear on the last
        axis of the returned array. Any iterable is accepted (for example
        ``dict.keys()``).
    seq_len : int, optional
        Number of rows per window. Defaults to the module-level ``T``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_windows, seq_len, n_features)`` and ``y`` with
        shape ``(n_windows,)``, where ``n_windows = max(len(df) - seq_len + 1, 0)``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len is None:
        seq_len = T
    if seq_len < 1:
        raise ValueError("seq_len must be >= 1, got %r" % (seq_len,))

    feature_names = list(features_columns)
    X_stock_array = np.asarray(df[feature_names])
    y_stock_array = np.asarray(df["label"])

    # "+ 1" so the window that ends on the very last row is included too.
    n_windows = len(df) - seq_len + 1
    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empty
        # arrays so callers can still concatenate them with other stocks.
        empty_X = np.empty((0, seq_len, len(feature_names)), dtype=X_stock_array.dtype)
        empty_y = np.empty((0,), dtype=y_stock_array.dtype)
        return empty_X, empty_y

    # (n_windows, seq_len) integer array: row i holds the row positions
    # i, i + 1, ..., i + seq_len - 1.
    window_positions = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]

    _X = X_stock_array[window_positions]
    _y = y_stock_array[window_positions[:, -1]]
    return _X, _y
def shuffled_X_y(X, y):
    """Return X and y reordered by one shared random permutation.

    Both inputs must have the same length along their first axis; this is
    checked with an assertion. The permutation is drawn from NumPy's global
    random state, and the inputs are indexed with it (``X[order]``), so they
    are expected to support integer-array indexing.
    """
    sample_count = len(X)
    assert sample_count == len(y)

    order = np.random.permutation(sample_count)
    X_shuffled = X[order]
    y_shuffled = y[order]
    return X_shuffled, y_shuffled

In [None]:
raw_data_path = "data/stocknet-dataset/price/raw/*.csv"
# glob returns matches in arbitrary order; sort so stocks are always
# processed in the same order.
raw_data_pathes = sorted(glob.glob(raw_data_path))
if not raw_data_pathes:
    # Fail here with a clear message rather than with a TypeError further down.
    raise FileNotFoundError(f"No raw price files match {raw_data_path!r}")

feature_columns = list(features.keys())

# Per-stock arrays are collected in lists and concatenated once after the
# loop; concatenating inside the loop re-copies the accumulated data on
# every iteration.
X_train_parts, y_train_parts = [], []
X_validation_parts, y_validation_parts = [], []

# Iterate through each stock RAW data
for path in tqdm(raw_data_pathes):
    stock_df = pd.read_csv(path, parse_dates=["Date"], index_col="Date")

    ## Preprocessing
    # Sort chronologically BEFORE anything else: labelling() uses shift(-1)
    # and the features use pct_change()/rolling(), all of which are
    # positional and only meaningful on date-ordered rows.
    stock_df = stock_df.sort_index().dropna()

    ### Apply features functions
    # Features are computed while every trading day is still present, i.e.
    # before labelling() removes the neutral rows; otherwise row-over-row
    # changes and rolling means would span gaps left by the removed days.
    # NOTE(review): robust_scale is fitted on the whole stock history, so
    # validation rows influence the scaling of training rows - confirm this
    # is acceptable.
    for feature_key in feature_columns:
        stock_df[feature_key] = robust_scale(features[feature_key](stock_df))

    ## Generate labels (adds "label" and drops neutral rows in place)
    labelling(stock_df)
    # Drop the warm-up rows whose pct_change/rolling features are NaN.
    stock_df = stock_df.dropna()
    # NOTE(review): sequences built below are taken over the remaining rows,
    # so a window can still skip the dates of removed neutral rows.

    train_stock_df = stock_df[stock_df.index < date_limit_train_validation]
    validation_stock_df = stock_df[stock_df.index >= date_limit_train_validation]

    ## Generate sequences
    X_stock_train, y_stock_train = generate_sequences(df=train_stock_df, features_columns=feature_columns)
    X_stock_validation, y_stock_validation = generate_sequences(df=validation_stock_df, features_columns=feature_columns)

    X_train_parts.append(X_stock_train)
    y_train_parts.append(y_stock_train)
    X_validation_parts.append(X_stock_validation)
    y_validation_parts.append(y_stock_validation)

X_train = np.concatenate(X_train_parts, axis=0)
y_train = np.concatenate(y_train_parts, axis=0)
X_validation = np.concatenate(X_validation_parts, axis=0)
y_validation = np.concatenate(y_validation_parts, axis=0)

# Shuffle X and y
X_train, y_train = shuffled_X_y(X_train, y_train)
X_validation, y_validation = shuffled_X_y(X_validation, y_validation)
X_train.shape, y_train.shape, X_validation.shape, y_validation.shape

In [None]:
# Sequence classifier: one LSTM layer that returns only its final output,
# followed by a 3-way softmax.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(64, return_sequences=False))
model.add(tf.keras.layers.Dense(3, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# The sparse categorical loss expects class indices starting at 0, so the
# stored labels are shifted up by one.
# NOTE(review): labelling() removes rows labelled 0, so after the shift only
# indices 0 and 2 should occur and output unit 1 is never a target - confirm
# whether a 2-class head was intended.
train_targets = y_train + 1
validation_targets = y_validation + 1

model.fit(
    X_train,
    train_targets,
    batch_size=1024,
    epochs=10,
    validation_data=(X_validation, validation_targets),
)