In [1]:
import pandas as pd          
import numpy as np          # For mathematical calculations 
import matplotlib.pyplot as plt  # For plotting graphs 
from datetime import datetime    # To access datetime 
from pandas import Series        # To work on series 
%matplotlib inline 
import warnings   
import seaborn as sns
import itertools
import math
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


from math import sqrt 
from matplotlib import pyplot

# univariate mlp example
from numpy import array
from keras.models import Sequential
from keras.layers import Dense


from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from keras.layers import LSTM

Using TensorFlow backend.


In [2]:
data=pd.read_csv("data.csv") 
data_original=data.copy()

#### Convert date column into date format

In [3]:
data['calendar_date'] = pd.to_datetime(data.calendar_date,format='%m/%d/%Y') 
data.index = data['calendar_date']
data.sort_index(inplace=True)

### There are some negative volume, revenue
+ It could be due to goods return
+ It might be data error, some numbers are mistakely with negative sign

Assumption: I consider it as an error. Therefore, I would change the sign of negative numbers

Before that let's look at number of rows with negative values corresponding each column

In [4]:
data[data<0].count()

product_id          0
calendar_date    1340
volume_sold         1
revenue             1
cost                1
vdm1                1
vwm1                6
vmm1                6
vmm2                0
vmm13               0
rdm1                1
rwm1                6
rmm1                5
rmm2                0
rmm13               0
cdm1                1
cwm1                6
cmm1                6
cmm2                0
cmm13               0
mpdm1               0
mpwm1               0
mpmm1               0
mpmm2               0
mpmm13              0
stock_level         2
retail_price        0
woy                 0
dtype: int64

In [5]:
data['volume_sold']=data['volume_sold'].abs()
data['revenue']=data['revenue'].abs()
data['cost']=data['cost'].abs()
data['vdm1']=data['vdm1'].abs()
data['vwm1']=data['vwm1'].abs()
data['vmm1']=data['vmm1'].abs()
data['rdm1']=data['rdm1'].abs()
data['rwm1']=data['rwm1'].abs()
data['rmm1']=data['rmm1'].abs()
data['cdm1']=data['cdm1'].abs()
data['cwm1']=data['cwm1'].abs()
data['cmm1']=data['cmm1'].abs()
data['stock_level']=data['stock_level'].abs()

### Helper Functions

In [6]:
def model_quality(observed,predictions,model):
    print("__"*50,"\n")
    print('The RMSE of the '+str(model)+' Regression is',sqrt(mean_squared_error(observed,predictions)))
    print("__"*50,"\n")
    
    print('The MSE of the '+str(model)+' Regression is',mean_squared_error(observed,predictions))
   
    print("__"*50,"\n")
    print('The MAE of the '+str(model)+' Regression is',mean_absolute_error(observed,predictions))

    
    print("__"*50,"\n")
    print("Sum of Original Sales for next 28 days: ",observed.sum())
    print("Sum of Predicted Sales for next 28 days: ",predictions.sum())
    print('The difference between model original and predictions values of the '+str(model)+' Regression is',observed.sum() - predictions.sum())

    
    print("__"*50,"\n")

In [7]:
def pre_process_data(data,product,size_test_data,product_list):
    
    data=data.dropna()
    
    data_model_df=data[data['product_id']==product]
    
    for j in range(len(product_list)):
        if product_list[j]!=product:
            data_model_df[str(product_list[j])+'_comp1_price']=data[data['product_id']==product_list[j]]['retail_price']
            data_model_df['Item_comp_price_diff_'+str(product_list[j])]=data[data['product_id']==product]['retail_price'] - data_model_df[str(product_list[j])+'_comp1_price']
    features_col=list(set(data_model_df.columns.tolist())-set(["product_id","calendar_date","revenue","cost","stock_level","woy","vdm1","vwm1","vmm1","rdm1","rwm1","rmm1","cdm1","cwm1","cmm1","mpdm1","mpwm1","mpmm1"]))
    
    
    df_train=data_model_df[data_model_df["calendar_date"]<="2017-10-31"][features_col] 
    df_test=data_model_df[data_model_df["calendar_date"]>"2017-10-31"][features_col]
   
    df_test=df_test.head(size_test_data)
    
    return df_train,df_test

In [8]:
def fit_features_scaler(df_train):
    
    scaler = StandardScaler()
    scaler.fit(df_train.drop('volume_sold', axis=1))
        
    return scaler
    
def transform_features(df, scaler):
   
    df_features = df.drop('volume_sold', axis=1)
    df_scaled = pd.DataFrame(scaler.transform(df_features), columns=df_features.columns)
       
    
    return df_scaled

In [9]:
def plot_series(df_train,df_test,Prediction):
        
        ts = df_train['volume_sold'] 
        ts1=ts.append(df_test['volume_sold'])
        ts2=ts.append(df_test['predictions'])
        
        plt.figure(figsize=(16,8)) 
        plt.plot(ts2, 'r',label='Predictions')
        plt.plot(ts1,'g',label='Original Series') 
        plt.title('Time Series') 
        plt.xlabel("Time(year-month)") 
        plt.ylabel("Item sold count") 
        plt.legend(loc='best')
        plt.show()

In [10]:
def weekend(row):
    if row.dayofweek == 5 or row.dayofweek == 6:
        return 1
    else:
        return 0 

In [11]:
# Based on assumption that we have mean_retail_price (retail_price on next 1 week)
def price_diff_retail_price_with_max_price_in_previous_periods(df):
    
    df['Retail_price_diff_max_price_in_last_2month']=df['retail_price'] -df['mpmm2']/28
    df['Retail_price_diff_max_price_in_last_year_13month']=df['retail_price'] -df['mpmm13']/28
    
    return df

# Features

#### Time Features

In [12]:
data['month']=data.calendar_date.dt.month 
data['day']=data.calendar_date.dt.day

In [13]:
temp2 = data['calendar_date'].apply(weekend) 
data['weekend']=temp2

#### Difference in current retail price with previous maximum prices

In [14]:
data=price_diff_retail_price_with_max_price_in_previous_periods(data)

# Modeling

### MLP

In [15]:
product_list=list(data['product_id'].unique())

for i in range(len(list(product_list))):
    print("______________")
    print("Product: "+str(product_list[i]))

    
    df_train,df_test=pre_process_data(data,product_list[i],28,product_list) 

    # scale features
    feature_scaler = fit_features_scaler(df_train)
    feature_train_scaled = transform_features(df_train, feature_scaler)
    feature_test_scaled = transform_features(df_test, feature_scaler)
    
    X=feature_train_scaled.as_matrix()
    y=df_train['volume_sold'].as_matrix()
    
    # model design
    model = Sequential()
    model.add(Dense(100, activation='relu', input_dim=X.shape[1]))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    
    # fit model
    model.fit(X, y, epochs=500, verbose=0)
    
    # demonstrate prediction
    x_test = feature_test_scaled.as_matrix()
    y_input = df_test['volume_sold'].as_matrix()
    
    yhat = model.predict(x_test, verbose=0)
    
    model_quality(y_input,yhat.flatten(),'MLP')
   

    

______________
Product: 13701


W0825 10:24:25.961333 139879973164800 deprecation_wrapper.py:119] From /home/bhbhan/.conda/envs/newron_clone/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0825 10:24:25.992407 139879973164800 deprecation_wrapper.py:119] From /home/bhbhan/.conda/envs/newron_clone/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0825 10:24:25.995949 139879973164800 deprecation_wrapper.py:119] From /home/bhbhan/.conda/envs/newron_clone/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0825 10:24:26.035867 139879973164800 deprecation_wrapper.py:119] From /home/bhbhan/.conda/envs/newron_clone/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated

____________________________________________________________________________________________________ 

The RMSE of the MLP Regression is 574.6362965616187
____________________________________________________________________________________________________ 

The MSE of the MLP Regression is 330206.87332605256
____________________________________________________________________________________________________ 

The MAE of the MLP Regression is 419.95759955474307
____________________________________________________________________________________________________ 

Sum of Original Sales for next 28 days:  16510
Sum of Predicted Sales for next 28 days:  5853.206
The difference between model original and predictions values of the MLP Regression is 10656.7939453125
____________________________________________________________________________________________________ 

______________
Product: 26104
__________________________________________________________________________________________________

#### LSTM

In [17]:
product_list=list(data['product_id'].unique())

for i in range(len(list(product_list))):
    print("______________")
    print("Product: "+str(product_list[i]))

    
    df_train,df_test=pre_process_data(data,product_list[i],28,product_list) 

    # scale features
    feature_scaler = fit_features_scaler(df_train)
    feature_train_scaled = transform_features(df_train, feature_scaler)
    feature_test_scaled = transform_features(df_test, feature_scaler)
    
    X=feature_train_scaled.as_matrix()
    y=df_train['volume_sold'].as_matrix()
    
    X = X.reshape((X.shape[0], X.shape[1], 1))
    
    # model design
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], 1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    
    # fit model
    model.fit(X, y, epochs=20, verbose=0)
    
    # demonstrate prediction
    x_test = feature_test_scaled.as_matrix()  
    x_test = x_test.reshape((x_test.shape[0],x_test.shape[1], 1))
    
    y_input = df_test['volume_sold'].as_matrix()
    
    yhat = model.predict(x_test, verbose=0)
    
    model_quality(y_input,yhat.flatten(),'LSTM')


______________
Product: 13701


W0825 10:25:18.607578 139879973164800 deprecation.py:323] From /home/bhbhan/.conda/envs/newron_clone/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


____________________________________________________________________________________________________ 

The RMSE of the LSTM Regression is 728.118287486699
____________________________________________________________________________________________________ 

The MSE of the LSTM Regression is 530156.2405725633
____________________________________________________________________________________________________ 

The MAE of the LSTM Regression is 570.0844658187458
____________________________________________________________________________________________________ 

Sum of Original Sales for next 28 days:  16510
Sum of Predicted Sales for next 28 days:  547.63495
The difference between model original and predictions values of the LSTM Regression is 15962.365051269531
____________________________________________________________________________________________________ 

______________
Product: 26104
______________________________________________________________________________________________