# LSTM


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

import tensorflow as tf
tf.get_logger().setLevel('CRITICAL')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import warnings
warnings.filterwarnings("ignore")

from datetime import timedelta, datetime
import itertools
import json
import matplotlib.pyplot as plt
import mlflow

import numpy as np
import os
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import yaml
from time import time
import pickle
from tqdm import tqdm

from sklearn.model_selection import ParameterSampler
from scipy.stats.distributions import uniform, randint
import mlflow
from glob import glob
from tensorflow.keras.callbacks import EarlyStopping


import tensorflow as tf
tf.__version__


# Get the current project path (where you open the notebook)
# and go up two levels to get the project path
current_dir = Path.cwd()
proj_path = current_dir.parent.parent


# make the code in src available to import in this notebook
import sys
sys.path.append(os.path.join(proj_path,'src'))

from metrics import *
from utils import *
from scalers import * 

# Catalog contains all the paths related to datasets
with open(os.path.join(proj_path, 'conf/catalog.yml'), "r") as f:
    catalog = yaml.safe_load(f)['olist']
    
# Params contains all of the dataset creation parameters and model parameters
with open(os.path.join(proj_path, 'conf/params.yml'), "r") as f:
    params = yaml.safe_load(f)


In [3]:
def set_model(n_layers:int=1, init_units:int=50, n_unit_strategy:str='stable', 
              dropout_p:float=0.1, num_timesteps: int=8, num_series: int=5, 
              lr:float=0.001, optimizer:str='adam', loss:str='mape'):
    '''
    # Gives flexibility
    # Explore depth vs width of the RNN model
    https://stackoverflow.com/questions/59072728/what-is-the-rule-to-know-how-many-lstm-cells-and-how-many-units-in-each-lstm-cel
    # there are no rule of thumb, but here the decrease strategy will devide by i the number of units at each layer.
    
    Args
        num_timesteps: the number of lags in the dataframes
        num_series: the number of time series, specify 1 if univariate
        n_unit_strategy: two options, stable and decrease. decrease will decrease the  
    '''
    model = Sequential()
    if n_unit_strategy == 'stable':
        n_units = [init_units] * n_layers
    if n_unit_strategy == 'decrease':
        n_units = [max(int(init_units/i),1) for i in range(1,n_layers+1)]
    
    if n_layers > 1:
        model.add(LSTM(n_units[0], return_sequences=True, input_shape=(num_timesteps, num_series)))
        model.add(Dropout(dropout_p)) # Prevent overfitting
        for i in range(1, n_layers):
            model.add(LSTM(n_units[i], return_sequences=True))
            model.add(Dropout(dropout_p)) 
        model.add(LSTM(n_units[-1]))
        model.add(Dropout(dropout_p))
    
    else:
        model.add(LSTM(n_units[0], return_sequences=False, input_shape=(num_timesteps, num_series)))
        model.add(Dropout(dropout_p)) # Prevent overfitting

    model.add(Dense(units=1))
    
    if optimizer == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
    if optimizer == 'rmsprop':
        opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
    
    model.compile(optimizer=opt, loss=loss)

    return model

  and should_run_async(code)


In [4]:
model_a = set_model(n_layers=3, n_unit_strategy='stable')
model_a.summary()

  tensor_proto.tensor_content = nparray.tostring()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 8, 50)             11200     
_________________________________________________________________
dropout (Dropout)            (None, 8, 50)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8, 50)             20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 50)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 8, 50)             20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 8, 50)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                2

In [5]:
def prepare_data(df, lag_units, list_features:list=['UNITS']):
    """
    For every feature in the list of features, create lagged features
    for a specified number of lagged units. The lenght must be the
    same for all of the features. After creating the new features,
    it filters on those and adds them to a list that will be 
    reshaped in the correct format.
    
    We go from a dataframe version:
    
    UNITS    | FEATURE_A | FEATURE_B |   ...
    ---------|-----------|-----------|-------
    
    For every feature:
        
        Create the lagged features:

        UNITS    | FEATURE_A | FEATURE_A-lag-1 |   ...
        ---------|-----------|-----------------|-------  

        Filter on each lagged feature and inverse columns:

        FEATURE_A-lag-N | ... |  FEATURE_A-lag-1
        ----------------|-----|------------------
        
        Convert to numpy as reshape by adding a dimension
        (nrows, nlags) -> (nrows, nlags, 1)
        
    Concatenate each "feature-specific matrix" on the last dimension
    
    [(nrows, lagged-features-A, 1),
     (nrows, lagged-features-B, 1),
                  ...
     (nrows, lagged-features-X, 1)]
     
     Results in: 
     
     (nrows, nlags, N)
    
    
    Note: We need to specify the column 'UNITS' in the list of features.
    
    """
    
    # Initialize a list
    list_df = []
    
    for feature in list_features:
        # create new features that will have the convention lag-<col_name>-<lagged unit>
        make_lag_features(df, lag_units, col_name=feature, prefix_name=f'lag-{feature}',inplace=True)
        # filter columns that were just created
        # we are only appending lag something, so not actual features
        dummy = df.filter(like=f'lag-{feature}')
        # invert columns left to right (lag-1 ... lag-N) -> (lag-N ... lag-1)
        dummy = dummy[dummy.columns[::-1]]
        # add an extra dimension
        dummy = dummy.values.reshape(dummy.shape[0], dummy.shape[1], 1)
        # add the matrice to a list where it will get concatenated at the end
        list_df.append(dummy)

    # matrices in list_df must have shape (n_rows, n_cols, 1)
    # so we can concatenate on the last axis
    x_features = np.concatenate(list_df, axis=2)
    y_feature = np.array(df['payment_value'].values)

    return x_features, y_feature, df['order_approved_at'].values

In [6]:
%%time

# Step 1: Read data and convert string to proper datetime objects
merged_data = pd.read_csv(os.path.join(proj_path, 
                                       catalog['output_dir']['dir'], 
                                       catalog['output_dir']['transactions']))

merged_data['order_approved_at'] = pd.to_datetime(merged_data['order_approved_at'])

# Step2: Create date folds
date_ranges = make_dates(params['olist']['experiment_dates'])


for prod_cat in params['olist']['product_categories']:
    print(f'Processing product category: {prod_cat}')
    
    # Initialize mlflow tracking
    create_folder(os.path.join(proj_path, 'mlruns'))
    #mlflow.set_tracking_uri(os.path.join(proj_path, 'mlruns'))
    mlflow.set_tracking_uri(os.path.join('../../','mlruns'))
    mlflow.set_experiment(prod_cat)
    
    start_timer = time()
    all_predictions = []
    all_y_true = []
    all_hyperparameters = []
    # Iterate over each period, unpack tuple in each variable.
    # in each of the period, we will find the best set of parameters,
    # which will represent the time-series cross validation methodology
    for _, train_start, train_end, valid_start, valid_end, test_start, test_end in date_ranges.itertuples():
        print(f'  - - Processing range {str(train_start.date())} to {str(test_end.date())}')

        # This is the only different between experiment 2 and 3
#         list_features = ['UNITS','VISITS','HHS','FEATURE','DISPLAY']
        list_features = ['payment_value']

        # Filter data here
        # Prepare the dataset for one UPC and one store, currently no-scaling
        filtered_data = merged_data[(merged_data['product_category_name']==prod_cat)][['payment_value','order_approved_at']].copy()
        
        # Initialize some of the parameters
        ws = params['lstm']['window_size']
        
        # Prepare data, pass in all of the data, it will filter it
        data, label, dates = prepare_data(df=filtered_data, 
                                          lag_units=ws, 
                                          list_features=list_features)
        
        # Because we are working with numpy now, it's easier to select the indexes
        # Will need date indexes 
        date_series = pd.Series(dates)
        
        train_series_idx = date_series[(date_series>=train_start) &
                                       (date_series<=train_end)].index
        valid_series_idx = date_series[(date_series>=valid_start) &
                                       (date_series<=valid_end)].index
        test_series_idx = date_series[(date_series>=test_start) &
                                      (date_series<=test_end)].index
        
        train_data = data[train_series_idx].copy()
        train_label = label[train_series_idx].copy()
        valid_data = data[valid_series_idx].copy()
        valid_label = label[valid_series_idx].copy()
        test_data = data[test_series_idx].copy()
        test_label = label[test_series_idx].copy()
        
        # The scaling of 0 to 1 creates issue with the loss function, as the minimum will be 0 and maximum 1. 
        # a bad solution was to drop the row that has 0, but doesn't seem to help. 
        # another solution is to log the values
        
        scalers = {}
        # The last two features are already bounded 0 to 1, we subtract 2 from the shape which represent 'FEATURE', 'DISPLAY'
        # for feature_col in range(train_data.shape[-1]-2):
        for feature_col in range(train_data.shape[-1]):
            scalers[feature_col] = NormalizeScalerDf()
            train_data[:,:,feature_col] = scalers[feature_col].fit_transform(train_data[:,:,feature_col])
            valid_data[:,:,feature_col] = scalers[feature_col].transform(valid_data[:,:,feature_col])
            test_data[:,:,feature_col] = scalers[feature_col].transform(test_data[:,:,feature_col])
        scalers['label'] = NormalizeScalerDf()
        train_label = scalers['label'].fit_transform(train_label)
        valid_label = scalers['label'].transform(valid_label)
        test_label = scalers['label'].transform(test_label)

        # Replace NaN's with 0's which are caused by the lagged values. This was done after the scaling to avoid issues
        train_data = np.nan_to_num(train_data)
        valid_data = np.nan_to_num(valid_data)
        test_data = np.nan_to_num(test_data)
        
        # Initialize some of the parameters
        dp = params['lstm']['dropout']
        units_strategy = params['lstm']['units_strategy']
        optimizers = params['lstm']['optimizers']
        losses = params['lstm']['loss']
        num_series = train_data.shape[2]   

        # Step 3: Define a random search for these parameters, for hyperparameter tuning
#         random_number_generator = np.random.RandomState(0) 
        param_grid = {'lr': uniform(loc=0.0001, scale=0.01),
                      'init_units': randint(low=5, high= 100),
                      'total_layers': randint(low=1, high=2)}
        param_list = list(ParameterSampler(param_grid, n_iter=params['lstm']['search_iter'], random_state=3))
        
        # Perform hyperparameter search
        res = []
        for param_dict in param_list:
            lr = param_dict['lr']
            init_units = param_dict['init_units']
            tot_lay = param_dict['total_layers']
        
            # Early Stopping will allow us to trigger when to stop training the model
            # The stopping criteria will be when the validation loss doesn't decrease for 
            # two consecutive steps. This prevents us from overfitting the model. 
            earlyStop=EarlyStopping(monitor="val_loss", verbose=0, mode='min', patience=2)
            # Initialize the model and train
            lstm_model = set_model(n_layers=tot_lay, init_units=init_units,
                                   n_unit_strategy=units_strategy, dropout_p=dp,
                                   num_timesteps=ws, num_series=num_series,
                                   lr=lr, optimizer=optimizers, loss=losses)
            history = lstm_model.fit(train_data, train_label, validation_data=(valid_data, valid_label), 
                                     epochs=100, verbose=0, shuffle=False, batch_size=1, callbacks=[earlyStop])

            # _ = lstm_model.predict(np.expand_dims(valid_data[0],axis=0))[0]
            # There are many warnings when making predictions, thus I use a comprehension loop
            # val_predictions = np.array([lstm_model.predict(np.expand_dims(dd,axis=0))[0] for dd in valid_data])
            val_predictions = lstm_model.predict(valid_data)
            
            y_pred = scalers['label'].inverse_transform(val_predictions)
            y_true = scalers['label'].inverse_transform(valid_label)

            val_mape = mean_absolute_percentage_error(y_true, y_pred)

            param_res = {'learning_rate':lr,
                         'init_units':init_units,
                         'n_layers':tot_lay,
                         'val_mape':val_mape}

            res.append(param_res)
        
        # Select the optimal hyper-parameters
        best_params = pd.DataFrame(res).sort_values(by='val_mape', ascending=True).iloc[0]
        
        lstm_model = set_model(n_layers=int(best_params['n_layers']),
                               init_units=best_params['init_units'],
                               n_unit_strategy=units_strategy,
                               dropout_p=dp,
                               num_timesteps=ws,
                               num_series=num_series,
                               lr=best_params['learning_rate'],
                               optimizer=optimizers,
                               loss=losses)

        earlyStop=EarlyStopping(monitor="val_loss", verbose=0, mode='min', patience=2)
        history = lstm_model.fit(train_data, train_label, validation_data=(valid_data, valid_label),
                                 epochs=100, verbose=0, shuffle=False, batch_size=1, callbacks=[earlyStop])

        test_predictions = lstm_model.predict(test_data) 
        
        y_pred = scalers['label'].inverse_transform(test_predictions)
        y_true = scalers['label'].inverse_transform(test_label)
        
        all_y_true.extend(y_true)
        all_predictions.extend(y_pred)
        all_hyperparameters.append(best_params)
        
    df_filtered = merged_data[(merged_data['product_category_name'] == prod_cat) &
                              (merged_data['order_approved_at'] >= params['olist']['experiment_dates']['test_start']) & 
                              (merged_data['order_approved_at'] <= params['olist']['experiment_dates']['test_end'])].copy()
    
    save_data = pd.DataFrame({'y_true': all_y_true,
                              'y_pred': np.array(all_predictions).flatten(),
                              'dates': df_filtered['order_approved_at']})
        
    test_metrics = get_metrics(all_y_true, all_predictions)

    fdir = os.path.join(proj_path, catalog['results']['dir'])
    fname = os.path.join(fdir, f'lstm_exp1_{prod_cat}.csv')
    create_folder(fdir)
    save_data.to_csv(fname)
    
    fdir_hparam = os.path.join(proj_path, catalog['extra']['dir'], catalog['extra']['lstm_hyperparams'])
    fname_hparam = os.path.join(fdir_hparam, f'lstm_exp1_{prod_cat}.pickle')
    create_folder(fdir_hparam)
    
    with open(fname_hparam, "wb") as filehandler:
        pickle.dump(all_hyperparameters, filehandler)

    duration_min = int((time() - start_timer) // 60)
    
    with mlflow.start_run():
        mlflow.log_param('model','lstm')
        mlflow.log_param('experiment','exp1')
        mlflow.log_params(params['lstm'])
        mlflow.log_metrics(test_metrics)
        mlflow.log_artifact(fname)
        mlflow.log_metric('time', duration_min)
        mlflow.log_artifact(fname_hparam)
        mlflow.log_params({'g_cat_state':False,
                           'g_cat_br':False,
                           'g_cc_state':False,
                           'g_cc_br':False})
        

Processing product category: bed_bath_table
  - - Processing range 2017-01-01 to 2018-01-28
  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

  tensor_proto.tensor_content = nparray.tostring()


Processing product category: health_beauty
  - - Processing range 2017-01-01 to 2018-01-28
  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Processing product category: sports_leisure
  - - Processing range 2017-01-01 to 2018-01-28


  tensor_proto.tensor_content = nparray.tostring()


  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Processing product category: furniture_decor
  - - Processing range 2017-01-01 to 2018-01-28


  tensor_proto.tensor_content = nparray.tostring()


  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Processing product category: housewares
  - - Processing range 2017-01-01 to 2018-01-28


  tensor_proto.tensor_content = nparray.tostring()


  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Processing product category: watches_gifts
  - - Processing range 2017-01-01 to 2018-01-28


  tensor_proto.tensor_content = nparray.tostring()


  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Processing product category: telephony
  - - Processing range 2017-01-01 to 2018-01-28


  tensor_proto.tensor_content = nparray.tostring()


  - - Processing range 2017-01-29 to 2018-02-25
  - - Processing range 2017-02-26 to 2018-03-25
  - - Processing range 2017-03-26 to 2018-04-22
  - - Processing range 2017-04-23 to 2018-05-20
  - - Processing range 2017-05-21 to 2018-06-17
  - - Processing range 2017-06-18 to 2018-07-15
  - - Processing range 2017-07-16 to 2018-08-12
Wall time: 10h 6min 30s
