In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
import mlflow
import pickle
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [5]:
# Load the merged dataset. Only the Date and Sales columns are needed for the
# univariate LSTM, so read just those two.
# parse_dates=True (with no index_col) only asks pandas to parse the index and
# leaves the Date column as plain strings; naming the column explicitly gives
# real timestamps, so the daily index is a proper DatetimeIndex.
data = pd.read_csv('merged.csv', usecols=['Date', 'Sales'], parse_dates=['Date'])

# Collapse to one row per day: the mean of Sales over all rows sharing that
# date. groupby sorts its keys, so the result is in chronological order.
data = data.groupby("Date").agg({'Sales': 'mean'})

data

Unnamed: 0_level_0,Sales
Date,Unnamed: 1_level_1
2013-01-01,76.857271
2013-01-02,6050.476233
2013-01-03,5577.322870
2013-01-04,5832.756054
2013-01-05,5178.539910
...,...
2015-07-27,8694.100448
2015-07-28,7742.983857
2015-07-29,7326.593722
2015-07-30,7510.558744


In [6]:
# Standardize the daily sales (zero mean, unit variance) with StandardScaler.
# The Sales column is selected explicitly: fitting on the whole frame only
# works while Sales is the sole column, so re-running this cell after
# 'DataScaled' has been added would fit two columns and break the assignment.
# NOTE(review): the scaler is fit on the full series, including the rows that
# are later held out for validation -- consider fitting on the training rows only.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(data[['Sales']])
# fit_transform returns a 2-D (n_rows, 1) array; store its single column.
data['DataScaled'] = scaled_array[:, 0]

In [7]:
# Number of past observations in each input window.
WINDOW_SIZE = 48

# Number of epochs passed to model.fit.
EPOCHS = 200

# Total number of rows in the scaled series.
SIZE = data['DataScaled'].shape[0]

# Everything except the last 2 * WINDOW_SIZE rows. This single value is used
# both as the train/validation split index and as the batch size of the
# windowed datasets.
BATCH_SIZE = SIZE - 2 * WINDOW_SIZE

In [8]:
# Chronological split at row BATCH_SIZE: everything before it is training
# data, everything from it onwards is validation data.

# Dates (the frame's index), reshaped to a single column.
DateTrain = np.reshape(data.index.values[:BATCH_SIZE], (-1, 1))
DateValid = np.reshape(data.index.values[BATCH_SIZE:], (-1, 1))

# Scaled sales values as float32.
XTrain = data.DataScaled.values[:BATCH_SIZE].astype('float32')
XValid = data.DataScaled.values[BATCH_SIZE:].astype('float32')

# Report the shapes of the date series.
print("Shape of the training set date series: ", DateTrain.shape)
print("Shape of the validation set date series: ", DateValid.shape)
print()

# Report the shapes of the sales series. The values come from the
# StandardScaler column 'DataScaled' -- no logarithm is applied anywhere in
# this notebook -- so the labels say "standardized" rather than "logarithm".
print("Shape of the training set standardized sales series: ", XTrain.shape)
print("Shape of the validation set standardized sales series: ", XValid.shape)


Shape of the training set date series:  (846, 1)
Shape of the validation set date series:  (96, 1)

Shape of the training set logarithm of sales series:  (846,)
Shape of the validation set logarithm of sales series:  (96,)


In [9]:
# Fix TensorFlow's global random seed so repeated runs are comparable.
tf.random.set_seed(1234)

# Append a trailing axis of length 1 to the training values, turning the 1-D
# series into a column of single-value observations.
series = tf.expand_dims(XTrain, -1)

# Show the resulting shape.
print("Shape after adding an extra dimension: ", series.shape)

Shape after adding an extra dimension:  (846, 1)


In [10]:
# Build a tf.data.Dataset that yields the rows of `series` one at a time,
# so each dataset element is a single observation.
dataset = tf.data.Dataset.from_tensor_slices(series)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(1,), dtype=tf.float32, name=None)>

In [11]:
# Group consecutive elements into sliding windows of WINDOW_SIZE + 1 items
# (WINDOW_SIZE inputs plus the value to predict), advancing one element at a
# time; drop_remainder=True discards trailing windows that would be shorter.
dataset = dataset.window(WINDOW_SIZE + 1, shift=1, drop_remainder=True)

In [12]:
# Small demonstration of window(): the integers 0..9 grouped into windows of
# 5 that slide forward by one element, keeping only full windows.
datasetEx = (
    tf.data.Dataset.from_tensor_slices(tf.range(10))
    .window(5, shift=1, drop_remainder=True)
)
for window in datasetEx:
    print([item.numpy() for item in window])

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]


In [13]:
# Each window is itself a small dataset; batching it with its full length and
# flattening the result yields one tensor of WINDOW_SIZE + 1 values per window.
dataset = dataset.flat_map(lambda window: window.batch(WINDOW_SIZE + 1))

In [14]:
# Split every window into (inputs, target): all values but the last are the
# inputs, and the final value (kept as a length-1 slice) is the target.
dataset = dataset.map(lambda window: (window[:-1], window[-1:]))

In [15]:
# Collect the (inputs, target) pairs into batches of up to BATCH_SIZE
# elements and prefetch with a buffer size of 1.
dataset = dataset.batch(BATCH_SIZE).prefetch(1)

In [16]:
def windowed_dataset(series, window_size=WINDOW_SIZE, batch_size=BATCH_SIZE):
    """Turn a 1-D series into a batched dataset of (inputs, target) windows.

    A trailing axis of length 1 is added to ``series``, which is then cut into
    sliding windows of ``window_size + 1`` consecutive values (shift of one,
    incomplete windows dropped). Each window is split into its first
    ``window_size`` values (the inputs) and its last value (the target, kept
    as a length-1 slice).

    Args:
        series: 1-D sequence of values to window.
        window_size: Number of input values per sample.
        batch_size: Maximum number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, target)`` batches with a prefetch
        buffer of 1.
    """
    span = window_size + 1
    expanded = tf.expand_dims(series, axis=-1)

    def to_tensor(window):
        # A window is a nested dataset; batching it whole gives one tensor.
        return window.batch(span)

    def split_inputs_target(chunk):
        return chunk[:-1], chunk[-1:]

    return (
        tf.data.Dataset.from_tensor_slices(expanded)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(to_tensor)
        .map(split_inputs_target)
        .batch(batch_size)
        .prefetch(1)
    )


In [17]:
# Build the windowed training and validation datasets, using the default
# window_size and batch_size of windowed_dataset for both.
DatasetTrain = windowed_dataset(XTrain)
DatasetVal = windowed_dataset(XValid)

# Model Training


In [18]:
# Two stacked LSTM layers followed by a single-unit dense output.
# The first LSTM returns its full output sequence so the second LSTM can
# consume it; the second returns only its final output, which the Dense layer
# maps to one predicted value.
model = Sequential([
    LSTM(8, input_shape=[None, 1], return_sequences=True),
    LSTM(4, input_shape=[None, 1]),
    Dense(1),
])
model.compile(optimizer='adam', loss="huber_loss")





In [19]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 8)           320       
                                                                 
 lstm_1 (LSTM)               (None, 4)                 208       
                                                                 
 dense (Dense)               (None, 1)                 5         
                                                                 
Total params: 533 (2.08 KB)
Trainable params: 533 (2.08 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Select the "LSTM" MLflow experiment and turn on TensorFlow autologging
# before fitting, so the training run below is recorded by MLflow.
mlflow.set_experiment("LSTM")
mlflow.tensorflow.autolog()
# Train for EPOCHS epochs with DatasetVal as validation data; `history` holds
# the object returned by fit.
history = model.fit(DatasetTrain, epochs=EPOCHS, validation_data=DatasetVal, verbose=1)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2024/02/05 19:19:37 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'edf5fa2a615b499bbf3f53fe1f08579c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
2024/02/05 19:19:38 INFO mlflow.types.utils: MLflow 2.9.0 introduces model signature with new data types for lists and dictionaries. For input such as Dict[str, Union[scalars, List, Dict]], we infer dictionary values types as `List -> Array` and `Dict -> Object`. 
2024/02/05 19:19:38 INFO mlflow.types.utils: MLflow 2.9.0 introd

Epoch 1/200

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 7

INFO:tensorflow:Assets written to: C:\Users\VARUN\AppData\Local\Temp\tmpa4lf5sf1\model\data\model\assets


In [27]:
from time import gmtime, strftime

In [28]:
# Timestamp (UTC, from gmtime) embedded in the output path.
time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# NOTE(review): despite the ".pkl" suffix this does not look like a pickle
# file -- this cell's recorded output reports assets written underneath the
# path, i.e. the model is saved as a directory. Confirm what consumers expect
# before renaming.
model.save(f'../models/LSTM_sales-{time}.pkl')


INFO:tensorflow:Assets written to: ../models/LSTM_sales-2024-02-05-13-57-34.pkl\assets


INFO:tensorflow:Assets written to: ../models/LSTM_sales-2024-02-05-13-57-34.pkl\assets
