# Import Necessary Modules

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, tqdm_notebook

import warnings 
warnings.simplefilter('ignore')

np.set_printoptions(suppress=True) # supress scientific notation

In [135]:
# Read in raw data.
final_data = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/final_data.pickle')

In [3]:
print('Preview of final_data:')
display(final_data.head())

Preview of final_data:


Unnamed: 0,store_dept,date_x,sales_chg_rt_x,sales_chg_rt_2,sales_chg_rt_y,store_label_0,store_label_1,store_label_2,store_label_3,store_label_4,...,date_x_month_8,date_x_month_9,date_x_month_10,date_x_month_11,date_x_month_12,date_x_wom_1,date_x_wom_2,date_x_wom_3,date_x_wom_4,date_x_wom_5
53,10_1,2011-02-11,0.441881,-0.033033,0.683508,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
54,10_1,2011-02-18,0.228364,0.209259,-0.265154,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
55,10_1,2011-02-25,-0.573973,0.441881,-0.324569,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
56,10_1,2011-03-04,0.112619,0.228364,0.088423,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
57,10_1,2011-03-11,-0.013872,-0.573973,-0.005243,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
print('The completed discription of columns:')
final_data.info()

The completed discription of columns:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 299790 entries, 53 to 476332
Data columns (total 34 columns):
store_dept         299790 non-null object
date_x             299790 non-null datetime64[ns]
sales_chg_rt_x     299790 non-null float64
sales_chg_rt_2     299790 non-null float64
sales_chg_rt_y     299790 non-null float64
store_label_0      299790 non-null uint8
store_label_1      299790 non-null uint8
store_label_2      299790 non-null uint8
store_label_3      299790 non-null uint8
store_label_4      299790 non-null uint8
dept_label_0       299790 non-null uint8
dept_label_1       299790 non-null uint8
dept_label_2       299790 non-null uint8
dept_label_3       299790 non-null uint8
dept_label_5       299790 non-null uint8
dept_label_6       299790 non-null uint8
dept_label_7       299790 non-null uint8
date_x_month_1     299790 non-null uint8
date_x_month_2     299790 non-null uint8
date_x_month_3     299790 non-null uint8
date_x_month_4

In [136]:
final_data = final_data[final_data['sales_chg_rt_2'] != np.inf]
final_data = final_data[final_data['sales_chg_rt_y'] != np.inf]
final_data = final_data[final_data['sales_chg_rt_x'] != np.inf]

In [133]:
final_data.max()

store_dept                        9_98
date_x             2012-10-26 00:00:00
sales_chg_rt_x                   12550
sales_chg_rt_2                   12550
sales_chg_rt_y                 7652.69
store_label_0                        1
store_label_1                        1
store_label_2                        1
store_label_3                        1
store_label_4                        1
dept_label_0                         1
dept_label_1                         1
dept_label_2                         1
dept_label_3                         1
dept_label_5                         1
dept_label_6                         1
dept_label_7                         1
date_x_month_1                       1
date_x_month_2                       1
date_x_month_3                       1
date_x_month_4                       1
date_x_month_5                       1
date_x_month_6                       1
date_x_month_7                       1
date_x_month_8                       1
date_x_month_9           

# Normalize Input Features

There exist inf in some features, thus in order to avoid exploding gradient, I will normalize all features first.

In [138]:
from sklearn.preprocessing import MinMaxScaler

# standardize numerical feature
final_data_2 = final_data.drop(labels=['date_x','store_dept'], axis=1)
scaler = MinMaxScaler(feature_range=(0,1)) # force values limited to (0,1)
scaler.fit(final_data_2)

final_data_std = pd.DataFrame(scaler.transform(final_data_2),
                        index=final_data_2.index.values, 
                        columns=[col + '_std' for col in final_data_2.columns.values])

In [139]:
final_data = pd.concat([final_data[['store_dept','date_x']],final_data_std], axis=1)

# Create Training, Validation and Testing Dataset

Noted that the splitting point for training data matches the same splitting timing for training dataset when clustering time series. That is on date `2011-12-31`.

In [140]:
train_data = pd.DataFrame()
validation_data = pd.DataFrame()
test_data = pd.DataFrame()

# Create train and test dataset for each store_dept.
for k, group in final_data.groupby('store_dept'):
    train_g = group[group['date_x'] <= '2011-12-31'] # match the criterion for clustering time series
    remain = group[group['date_x'] > '2011-12-31']
    remain_half_row = remain.shape[0] // 2
    train_data = train_data.append(train_g)
    validation_data = validation_data.append(remain.iloc[:remain_half_row,:]) # half of remaining rows are validation
    test_data = test_data.append(remain.iloc[remain_half_row:,:]) # half of remaining rows are testing dataset

In [142]:
print('The Size of Training Dataset:')
display(train_data.shape)

The Size of Training Dataset:


(156545, 34)

In [143]:
print('The Size of Validation Dataset:')
display(validation_data.shape)

The Size of Validation Dataset:


(69949, 34)

In [144]:
print('The Size of Testing Dataset:')
display(test_data.shape)

The Size of Testing Dataset:


(73277, 34)

In [145]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156545 entries, 53 to 476289
Data columns (total 34 columns):
store_dept             156545 non-null object
date_x                 156545 non-null datetime64[ns]
sales_chg_rt_x_std     156545 non-null float64
sales_chg_rt_2_std     156545 non-null float64
sales_chg_rt_y_std     156545 non-null float64
store_label_0_std      156545 non-null float64
store_label_1_std      156545 non-null float64
store_label_2_std      156545 non-null float64
store_label_3_std      156545 non-null float64
store_label_4_std      156545 non-null float64
dept_label_0_std       156545 non-null float64
dept_label_1_std       156545 non-null float64
dept_label_2_std       156545 non-null float64
dept_label_3_std       156545 non-null float64
dept_label_5_std       156545 non-null float64
dept_label_6_std       156545 non-null float64
dept_label_7_std       156545 non-null float64
date_x_month_1_std     156545 non-null float64
date_x_month_2_std     156545 non-nu

# Create Target and Features

In [146]:
train_y = train_data['sales_chg_rt_x_std']
train_x = train_data.drop(labels='sales_chg_rt_x_std', axis=1)

# Build up MLP Model

In [147]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

In [148]:
def hidden_init(layer):
    '''
    Provide fan in (the number of input units) of each hidden layer
    as the component of normalizer.
    :param
        layer: hidden layer
    :return
        (-lim, lim): tuple of min and max value for uniform distribution
    '''

    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [149]:
# Set up MLP model.
class MLP(nn.Module):
    def __init__(self, input_size, output_size, fc1_units=128, fc2_units=64):
        super().__init__()
        self.fc1 = nn.Linear(input_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, output_size)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)
    
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x

In [150]:
from math import floor

def get_batch(df, target, batch_size=32):
    """Helper function to get batch of data for training in Pytorch.
    
    @params
        df: scipy csr matrix
        target: pandas series
    
    """
    n_batch = floor(df.shape[0]/batch_size)
    
    complete_df = df.iloc[:n_batch*batch_size,:]
    y_target = target[:n_batch*batch_size]

    for i in range(0, n_batch*batch_size, batch_size):
        batch_feature = complete_df.iloc[i:i+batch_size, :]
        batch_y_target = y_target[i:i+batch_size]
        
        yield batch_feature, batch_y_target

In [157]:
# Hyper-parameters.
input_size = train_x.shape[1] - 2
output_size = 1
learning_rate = 0.02
batch_size = 32

epochs = 15
step = 0
print_every = 2500

In [160]:
mlp_model = MLP(input_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=learning_rate)

In [161]:
# train
for e in range(epochs):
    running_loss = 0
    generator = get_batch(df=train_x, target=train_y, batch_size=batch_size)
    for feature, y_true in generator:
        step += 1
        optimizer.zero_grad()
        
        feature = feature.drop(labels=['store_dept','date_x'], axis=1) # drop non-numeric columns
        
        feature = torch.tensor(np.array(feature), dtype=torch.float)
        y_true = torch.tensor(np.array(y_true), dtype=torch.float)
        
        output = mlp_model(feature)
        loss = criterion(output, y_true)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(mlp_model.parameters(), 1)
        optimizer.step()
        
        running_loss += loss.item()
        
        if step % print_every == 0:
            print('Epoch: {}/{}...'.format(e+1, epochs),
                 'Loss: {:.9f}'.format(running_loss/print_every))
            
            running_loss = 0

Epoch: 1/15... Loss: 0.000120591
Epoch: 1/15... Loss: 0.000019274
Epoch: 2/15... Loss: 0.000027100
Epoch: 2/15... Loss: 0.000112223
Epoch: 3/15... Loss: 0.000025700
Epoch: 3/15... Loss: 0.000012083
Epoch: 4/15... Loss: 0.000027632
Epoch: 4/15... Loss: 0.000012228
Epoch: 5/15... Loss: 0.000027877
Epoch: 5/15... Loss: 0.000012035
Epoch: 6/15... Loss: 0.000029514
Epoch: 6/15... Loss: 0.000010516
Epoch: 7/15... Loss: 0.000030041
Epoch: 7/15... Loss: 0.000010977
Epoch: 8/15... Loss: 0.000030358
Epoch: 8/15... Loss: 0.000010883
Epoch: 9/15... Loss: 0.000031964
Epoch: 9/15... Loss: 0.000010448
Epoch: 10/15... Loss: 0.000032553
Epoch: 10/15... Loss: 0.000010036
Epoch: 11/15... Loss: 0.000034587
Epoch: 11/15... Loss: 0.000008756
Epoch: 12/15... Loss: 0.000034747
Epoch: 13/15... Loss: 0.000000244
Epoch: 13/15... Loss: 0.000035046
Epoch: 14/15... Loss: 0.000000487
Epoch: 14/15... Loss: 0.000034900
Epoch: 15/15... Loss: 0.000003600
Epoch: 15/15... Loss: 0.000031865


In [105]:
train_x.max()

store_dept                        9_98
date_x             2011-12-30 00:00:00
sales_chg_rt_2                     inf
sales_chg_rt_y                     inf
store_label_0                        1
store_label_1                        1
store_label_2                        1
store_label_3                        1
store_label_4                        1
dept_label_0                         1
dept_label_1                         1
dept_label_2                         1
dept_label_3                         1
dept_label_5                         1
dept_label_6                         1
dept_label_7                         1
date_x_month_1                       0
date_x_month_2                       1
date_x_month_3                       1
date_x_month_4                       1
date_x_month_5                       1
date_x_month_6                       1
date_x_month_7                       1
date_x_month_8                       1
date_x_month_9                       1
date_x_month_10          

31