In [1]:
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import numpy as np

from src.common.analysis_and_plots import Visualize as V
from src.features.build_features import FeatureEngineering as FE
from src.common.globals import G
from src.common.globals import split_train_valid_test, get_naive_forecast, calc_errors, save_errors_to_table
from src.data.get_data import CSVsLoader
from src.common.logs import setup_logging, log_model_info

import logging
import os
from datetime import datetime

logger = setup_logging(logger_name=__name__,
                        console_level=logging.INFO, 
                        log_file_level=logging.INFO)

PROJECT_PATH = G.get_project_root()
DATA_DIR_PROCESSED = os.path.join(PROJECT_PATH, r'data\03_processed\daily_full')

config = {
    'AV': {
        'key': '',
        'ticker': 'MSFT',
        'outputsize': 'full',
        'key_adjusted_close': 'Adj Close',
        'key_volume': 'Volume',
    },
    'data': {
        'test_size': 0.05,
    }, 
    'model': {
        'name': 'LSTM', 
        'window': 20,
        'batch_size' : 32,
        'shuffle_buffer_size' : 5600, # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle
        'epochs' : 100,
        'optimizer': tf.keras.optimizers.Adam(),
        'loss': tf.keras.losses.Huber(),
    },
}

In [2]:
def label_column_to_end(df, last_column):
    ''' 
    Changes positions of columns in df to put the target column at the end

    Args:
        df (pandas dataframe) - dataframe to change
        last_column (string) - name of the column to put at the end
        
    Returns:
        df (pandas dataframe) - dataframe with the target column at the end
    '''
    cols = df.columns.tolist()
    cols.remove(last_column)
    cols.append(last_column)
    return df[cols]

In [3]:
def windowed_dataset_X(df, window_size, batch_size, shuffle_buffer, verbose=True):
    # change the position of the target column to the end
    df = label_column_to_end(df, 'Adj Close')

    X_df = df.iloc[:, :-1]
    y_df = df.iloc[:, -1:]
    for col in X_df.columns:
        scaler = MinMaxScaler()
        X_df[col] = scaler.fit_transform(X_df[col].values.reshape(-1,1))
    
    # Creating X and y
    X = X_df.values
    y = y_df.values
    if verbose:
        print('---------------------------------X,y shape-------------------------------------')
        print (f'X.shape: {X.shape}, y.shape: {y.shape}')
        print('-'*100)


    # Generate a TF Dataset from the series values
    dataset = tf.data.Dataset.from_tensor_slices(X)
    if verbose:
        print('--------------------------from_tensor_slices--------------------------')
        for element in dataset:
            print(element)
            break
        print('-'*100)

    # Window the data but only take those with the specified size
    # And add + 1 to the window size to account for the label, which we will separate later
    dataset = dataset.window(window_size, shift=1, drop_remainder=True)
    if verbose:
        print('-------------------------------window-----------------------------------')
        for window in dataset:
            print(type(window))
            print(list(window.as_numpy_iterator()))
            break
        print('-'*100)
    
    # Flatten the windows by putting its elements in a single batch
    dataset = dataset.flat_map(lambda window: window.batch(window_size))
    if verbose:
        print('--------------------------------flat_map--------------------------------')
        for window in dataset:
            print(window)
            break
        print('-'*100)

    # # Shuffle the windows
    # dataset = dataset.shuffle(shuffle_buffer)

    # Create batches of windows
    # dataset = dataset.batch(batch_size).prefetch(1)
    # if verbose:
    #     print('--------------------------------batch-----------------------------------')
    #     for x in dataset:
    #         print(x.numpy().shape)
    #         print(x.numpy())
    #         break
    #     print('-'*100)

    return dataset

# -----------------------------Data----------------------------------------
df = CSVsLoader(ticker=config['AV']['ticker'], directory=DATA_DIR_PROCESSED)

test_size_int = int(len(df) * config['data']['test_size'])
df_train = df.iloc[:-test_size_int].copy(deep=True)
df_test = df.iloc[-test_size_int:].copy(deep=True)

for col in df.columns:
    df_train[f'{col} - 1'] = df_train[col].shift(1)
    df_test[f'{col} - 1'] = df_test[col].shift(1)
df_train = df_train.dropna()

#Drop Original columns is any left 
df_train = df_train.drop(columns=['Volume'])

2023-09-20 15:39:24 - src.data.get_data - INFO - Loaded "..\P7-Trading-Bot\data\03_processed\daily_full\MSFT-daily-full.csv". Number data points 5995. From "1999-11-01 00:00:00" to "2023-08-29 00:00:00"


In [4]:
def windowed_dataset_y(df, window_size, batch_size, shuffle_buffer, verbose=True):
    # change the position of the target column to the end
    df = label_column_to_end(df, 'Adj Close')

    # Creating X and y
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1:].values
    if verbose:
        print('---------------------------------X,y shape-------------------------------------')
        print (f'X.shape: {X.shape}, y.shape: {y.shape}')
        print('-'*100)
    
    # Generate a TF Dataset from the series values
    dataset = tf.data.Dataset.from_tensor_slices(y)
    if verbose:
        print('--------------------------from_tensor_slices--------------------------')
        for element in dataset:
            print(element)
            break
        print('-'*100)

    # calculate number of points we need to cut to make series evenly divisible by window_size
    remainder = window_size - 1

    # Remove the reminder elements from the end of dataset
    dataset = dataset.take(len(y) - remainder)
    if verbose:
        print('--------------------------------take len(y)-reminder--------------------------------')
        for window in dataset:
            print(window)
            break
        print('-'*100)
    
    return dataset

In [5]:
train_dataset_X = windowed_dataset_X(df_train, 
                                    window_size=config['model']['window'], 
                                    batch_size=config['model']['batch_size'], 
                                    shuffle_buffer=config['model']['shuffle_buffer_size'],
                                    verbose=True)
print('Lenght of X =', len(list(train_dataset_X.as_numpy_iterator())))

---------------------------------X,y shape-------------------------------------
X.shape: (5695, 2), y.shape: (5695, 1)
----------------------------------------------------------------------------------------------------
--------------------------from_tensor_slices--------------------------
tf.Tensor([0.05339881 0.0355088 ], shape=(2,), dtype=float64)
----------------------------------------------------------------------------------------------------
-------------------------------window-----------------------------------
<class 'tensorflow.python.data.ops.dataset_ops._VariantDataset'>
[array([0.05339881, 0.0355088 ]), array([0.05358007, 0.02960297]), array([0.05304583, 0.0280377 ]), array([0.05280732, 0.03634458]), array([0.05262606, 0.04995357]), array([0.05108055, 0.19832283]), array([0.05005975, 0.08379013]), array([0.04839022, 0.04928543]), array([0.05077526, 0.04919076]), array([0.05036504, 0.0322219 ]), array([0.04827574, 0.03022788]), array([0.04857148, 0.04055322]), array([0.04

In [6]:
train_dataset_y = windowed_dataset_y(df_train, 
                                     window_size=config['model']['window'], 
                                     batch_size=config['model']['batch_size'], 
                                     shuffle_buffer=config['model']['shuffle_buffer_size'], 
                                     verbose=True)
print('Lenght of y =', len(list(train_dataset_y.as_numpy_iterator())))

---------------------------------X,y shape-------------------------------------
X.shape: (5695, 2), y.shape: (5695, 1)
----------------------------------------------------------------------------------------------------
--------------------------from_tensor_slices--------------------------
tf.Tensor([28.80933532], shape=(1,), dtype=float64)
----------------------------------------------------------------------------------------------------
--------------------------------take len(y)-reminder--------------------------------
tf.Tensor([28.80933532], shape=(1,), dtype=float64)
----------------------------------------------------------------------------------------------------
Lenght of y = 5676


In [7]:
train_dataset = tf.data.Dataset.zip((train_dataset_X, train_dataset_y))
train_dataset = train_dataset.shuffle(config['model']['shuffle_buffer_size'])
train_dataset = train_dataset.batch(config['model']['batch_size']).prefetch(1)

for x, y in train_dataset:
    print (f'x.shape: {x.numpy().shape}, y.shape: {y.numpy().shape}')
    break

x.shape: (32, 20, 2), y.shape: (32, 1)


In [8]:
input_shape = train_dataset.element_spec[0].shape
print(f'Input shape: {input_shape}')

Input shape: (None, None, 2)


In [9]:
# -----------------------------Model Architecture--------------------------
model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(64, return_sequences=True, input_shape=(None,2)),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1),
        ],
    name=config['model']['name'])

model._name = f"{model._name}_{str(model.count_params())}_{datetime.now().strftime('%Y-%m-%d--%H-%M')}"
log_model_info(config, model, logger)

2023-09-20 15:39:27 - __main__ - INFO - AV_key: , AV_ticker: MSFT, AV_outputsize: full, AV_key_adjusted_close: Adj Close, AV_key_volume: Volume, 
2023-09-20 15:39:27 - __main__ - INFO - data_test_size: 0.05, 
2023-09-20 15:39:27 - __main__ - INFO - ----------------------------------------------------------------------
2023-09-20 15:39:27 - __main__ - INFO - model_name: LSTM
2023-09-20 15:39:27 - __main__ - INFO - model_window: 20
2023-09-20 15:39:27 - __main__ - INFO - model_batch_size: 32
2023-09-20 15:39:27 - __main__ - INFO - model_shuffle_buffer_size: 5600
2023-09-20 15:39:27 - __main__ - INFO - model_epochs: 100
2023-09-20 15:39:27 - __main__ - INFO - model_optimizer: <keras.src.optimizers.adam.Adam object at 0x000001C4F1299A50>
2023-09-20 15:39:27 - __main__ - INFO - model_loss: <keras.src.losses.Huber object at 0x000001C4F12B38D0>
2023-09-20 15:39:27 - __main__ - INFO - Model: "LSTM_42113_2023-09-20--15-39"
2023-09-20 15:39:27 - __main__ - INFO - ________________________________

In [10]:
# Set the training parameters and train the model
model.compile(loss=config['model']['loss'], 
            optimizer=config['model']['optimizer'], 
            metrics=["mae"],
            )    

# Train the model
history = model.fit(train_dataset, epochs=config['model']['epochs'])

# Plot MAE and Loss
mae=history.history['mae']
loss=history.history['loss']
zoom = int(len(mae) * 0.9)
V.plot_series(x=range(config['model']['epochs'])[-zoom:],
                y=(mae[-zoom:],loss[-zoom:]),
                model_name=config['model']['name'],
                title='MAE_and_Loss',
                xlabel='MAE',
                ylabel='Loss',
                legend=['MAE', 'Loss']
            )

# Save the model
FE.model_save(model, logger)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100

In [None]:
def model_forecast(model, df, window_size, batch_size):
    # calculate actual prediction size 
    prediction_size = len(series) - window_size
    print('prediction_size', prediction_size)

    # calculate number of points we need to add to make series evenly divisible by window_size
    remainder = prediction_size % window_size
    remainder = window_size - remainder if remainder > 0 else 0
    print('remainder', remainder)

    # get last prediction_size + remainder points from series
    series = series[-(prediction_size + remainder):]
    print('Paddded series to predict on', series)

    # Generate a TF Dataset from the series values
    dataset = tf.data.Dataset.from_tensor_slices(series)
    print('--------------------------from_tensor_slices--------------------------')
    for i,element in enumerate(dataset):
        print(element)
        print(i)
    print('-'*100)

    # Batch the data to window size
    dataset = dataset.batch(window_size)
    print('--------------------------------Batch to window size--------------------------------')
    for i,x in enumerate(dataset):
        print(i,x)
    print('-'*100)


    # Create batches of windows
    dataset = dataset.batch(batch_size).prefetch(1)
    print('--------------------------------Batch to Batch size-----------------------------------')
    for x in dataset:
        print(x)
        break
    print('-'*100)
  
    # Get predictions on the entire dataset
    forecast = model.predict(dataset)
    print('--------------------------------forecast-----------------------------------')
    for i,x in enumerate(forecast):
        if i > 1:
            break
        print(i,x)
    print('-'*100)

    forecast = forecast.squeeze()

    # flatten the forecast array
    forecast = forecast.flatten()
    # get the last prediction_size points
    forecast = forecast[-prediction_size:]

    return forecast

In [None]:
# -----------------------------Predictions---------------------------------
forecast_series = df.iloc[-test_size_int - config['model']['window']:]

results = model_forecast(model=model, 
                            series=forecast_series, 
                            window_size=config['model']['window'], 
                            batch_size=config['model']['batch_size'])
print(results.shape)
# unnormalize the data
# results = scaler.inverse_transform(results.reshape(-1,1))

V.plot_series(  x=df_test.index, 
                y=(df_test['Adj Close'], results),
                model_name=config['model']['name'])

In [None]:
# -----------------------Calculate Errors----------------------------------
naive_forecast = get_naive_forecast(df).iloc[-len(df_test['Adj Close']):]
rmse, mae, mape, mase = calc_errors(df_test['Adj Close'], results, naive_forecast)
save_errors_to_table(config['model']['name'], {'rmse': rmse, 'mae': mae, 'mape': mape, 'mase': mase})