## Demand Prediction using Machine Learning

In [18]:
import pandas as pd
from datetime import datetime, timedelta
import calendar
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,mean_absolute_error
import lightgbm as lgb
from sklearn.svm import SVR
import seaborn as sns
%matplotlib inline

In [2]:
df_initial = pd.read_pickle('./DemandDataFile', compression='infer')
df_region = pd.read_pickle('./RegionDataFile', compression='infer')
df_initial = pd.merge(df_initial, df_region, how='inner', right_on=['CITY_NAME'], left_on=['CITY'])
df_initial = df_initial.drop(['CITY_NAME'], axis=1)

df_initial = df_initial[~df_initial['PRODUCT_NAME'].str.contains("Small Flyers|Large Flyers|Meter Bubble Wrap|Bundle of 50 Boxes|Wrap", na=False)]
df_initial.rename(columns = {'ORDER_DATE':'DATE'},inplace = True)
df_initial.sort_values('DATE',ascending=True, inplace = True)
df_initial.DATE = pd.to_datetime(df_initial['DATE'])

df_reviews = pd.read_csv('./ProductReviews.csv')
df_initial = pd.merge(df_initial, df_reviews, how='left', right_on=['COD_SKU_CONFIG'], left_on=['SKU'])
df_initial = df_initial.drop(['COD_SKU_CONFIG'], axis=1)

In [3]:
df_fraud = pd.read_csv('./FradulentOrders.csv',dtype={'ORDER_NR': str})

df_initial = df_initial[~df_initial.COD_ORDER_NR.isin(df_fraud.ORDER_NR.tolist())]

df_initial['WareHouse'] = 'Null'
df_initial.loc[:,"WareHouse"][df_initial['REGION_NAME'].isin(['Sindh','Balochistan'])] = 'Karachi'
df_initial.loc[:,"WareHouse"][~df_initial['REGION_NAME'].isin(['Sindh','Balochistan'])] = 'Lahore'
#df_initial['WareHouse'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [4]:
def isHoliday(x):
    if x in df_hday18.Date.values:
        return 1
    else:
        return 0

In [5]:
df_hday18 = pd.read_csv('./Holidays2018.csv')
df_hday18.Date = pd.to_datetime(df_hday18['Date'])


In [6]:
#df_initial[df_initial.IsHoliday == 1].head()
df_initial.shape

(2683504, 23)

In [7]:
df_initial['MedianPrice'] = df_initial.groupby('SKU')['UNIT_PRICE'].transform('median')
df_initial['MedianPrice'] = pd.to_numeric(df_initial['MedianPrice'])

In [8]:
df_initial['CatConcat'] = df_initial[['PRODUCT_NAME','BRAND_NAME','CATEGORY_LEVEL_1', 'CATEGORY_LEVEL_2','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4']].apply(lambda x: ' | '.join(x.str.strip()), axis=1)
df_initial.shape

(2683504, 25)

In [9]:
def compute_shift(df,dateCol,groupCol):
    df['group_no'] = df.groupby([groupCol]).ngroup()
    tmp = df[[dateCol,'Quantity','group_no']].set_index(['group_no',dateCol])\
                                          .unstack('group_no')\
                                          .resample('D').asfreq()
    tmp1 = tmp.shift(1).fillna(0).astype(int).stack('group_no')['Quantity'].rename('D1')
    tmp2 = tmp.shift(2).fillna(0).astype(int).stack('group_no')['Quantity'].rename('D2')
    tmp3 = tmp.shift(3).fillna(0).astype(int).stack('group_no')['Quantity'].rename('D3')
    tmp4 = tmp.shift(4).fillna(0).astype(int).stack('group_no')['Quantity'].rename('D4')
    tmp5 = tmp.shift(5).fillna(0).astype(int).stack('group_no')['Quantity'].rename('D5')
    
    df = df.join(tmp1, on=[dateCol,'group_no'])
    df = df.join(tmp2, on=[dateCol,'group_no'])
    df = df.join(tmp3, on=[dateCol,'group_no'])
    df = df.join(tmp4, on=[dateCol,'group_no'])
    df = df.join(tmp5, on=[dateCol,'group_no'])
    
    df.drop(axis=1, columns=['group_no'], inplace = True)
    return df

In [10]:
def log_inf(x):
    if x>0:
        return np.log(x) 
    else:
        return np.log(1) 

def is_bundle(x):
    if 'Bundle' in x or 'Pack' in x or '+' in x:
        return 1
    else:
        return 0


In [11]:
def product_Gender(x):
    if 'Men' in x:
        return 'MEN'
    elif 'Woman' in x:
        return 'WOMAN'
    else:
        return 'NEUTRAL'
    
def is_GroceryOrBaby(x):
    if 'Grocer' in x or 'Baby' in x:
        return 1
    else:
        return 0
    

def is_PrevWeekHoliday(x):
    if len(tempHolidayWeek[tempHolidayWeek  == x].values) > 0 :
        return True
    else:
        return False

In [12]:
import time
def prepareDataFrame(wareHouse):
    train_df = df_initial[['SKU','DATE','WareHouse','Quantity','MedianPrice','PRODUCT_NAME','CatConcat']][df_initial.WareHouse == wareHouse]
    
    
    train_df['IsBundle'] = train_df['PRODUCT_NAME'].map(is_bundle)
    train_df['ProductGender'] = train_df['CatConcat'].map(product_Gender)
    train_df['IsGrocery'] = train_df['CatConcat'].map(is_GroceryOrBaby)
    
    #train_df = train_df[(train_df.SKU == 'HP770OT03D0JKNAFAMZ') | (train_df.SKU == 'SH069FA039PJONAFAMZ')]
    train_df = train_df.groupby(by=['SKU','DATE','WareHouse','MedianPrice','IsBundle','IsGrocery','ProductGender','PRODUCT_NAME','CatConcat'], as_index=False)['Quantity'].sum()
    train_df.sort_values('DATE',ascending=True, inplace = True)
    train_df.DATE = pd.to_datetime(train_df['DATE'])
    train_df = train_df.set_index('DATE')

    
    #Gettign the SKUs whcih were not demanded on the start date 
    startDate = '2017-12-01'
    temp = train_df.reset_index().groupby('SKU').first()
    temp.drop(temp[temp.DATE == startDate].index, inplace=True)

    # replacing date to the Min Start date & Quantity Demand to None
    temp['DATE'] = pd.to_datetime(startDate)
    temp['Quantity'] = 0
    if temp.index.name == 'SKU':
        temp.reset_index(inplace = True)
    
    temp = temp.set_index('DATE')
    
    train_df = train_df.append(temp)
    train_df.reset_index(inplace=True)
    train_df['WEEKDAY'] = train_df['DATE'].apply(lambda x:calendar.day_name[x.weekday()])
    train_df['MONTH'] = train_df['DATE'].apply(lambda x:calendar.month_abbr[x.month])
    train_df['IsHoliday'] = [isHoliday(x) for x in train_df['DATE'].values]
    
    train_df['YEAR'] = train_df['DATE'].apply(lambda x:x.year)
    train_df['YEAR'] = train_df['YEAR'].apply(str)
    
    train_df['WEEKNO'] = train_df['DATE'].dt.week
    #train_df['WEEKNO'] = train_df['WEEKNO'].apply(str)
    
    
    
   
    train_df = train_df.groupby(by=['SKU','YEAR','WEEKNO','WareHouse','MedianPrice','IsGrocery','IsBundle','ProductGender','PRODUCT_NAME','CatConcat'], as_index=False)['Quantity','IsHoliday'].sum()
   
    return train_df

In [13]:
tempKhi = prepareDataFrame('Karachi')
tempKhi = tempKhi.sort_values(by=['SKU','YEAR', 'WEEKNO'], ascending=True)
tempKhi['WEEKNO'] = tempKhi['WEEKNO'].apply(int)
tempKhi.fillna(0, inplace=True)
print(tempKhi.shape)
#train_df = temp.copy()

tempLhr = prepareDataFrame('Lahore')
tempLhr = tempLhr.sort_values(by=['SKU','YEAR', 'WEEKNO'], ascending=True)
tempLhr['WEEKNO'] = tempLhr['WEEKNO'].apply(int)
tempLhr.fillna(0, inplace=True)
print(tempLhr.shape)

(722498, 12)
(1112549, 12)


In [14]:
tempLhr['Last_Week_Sales'] = tempLhr.groupby(['SKU'])['Quantity'].shift(1)[(tempLhr.WEEKNO == tempLhr.WEEKNO.shift(1) + 1)]
tempLhr['Last_Week_Diff'] = tempLhr.groupby(['SKU'])['Last_Week_Sales'].diff(1)[(tempLhr.WEEKNO == tempLhr.WEEKNO.shift(1) + 1)]
#tempLhr[(tempLhr.SKU == '00301FA025DPKNAFAMZ')]

In [15]:
tempKhi['Last_Week_Sales'] = tempKhi.groupby(['SKU'])['Quantity'].shift()[(tempKhi.WEEKNO == tempKhi.WEEKNO.shift() + 1)]
tempKhi['Last_Week_Diff'] = tempKhi.groupby(['SKU'])['Last_Week_Sales'].diff(1)[(tempKhi.WEEKNO == tempKhi.WEEKNO.shift(1) + 1)]
#tempKhi[(tempKhi.SKU == '00301FA025DPKNAFAMZ')]
#train_df[train_df.SKU == '00301FA025DPKNAFAMZ']

In [16]:
temp = pd.concat([tempKhi, tempLhr])
tempHolidayWeek = temp[temp.IsHoliday == 1]['WEEKNO'].unique()

tempHolidayWeek = (pd.Series(tempHolidayWeek + 1))
temp['PrevWeekHoliday'] = temp.WEEKNO.apply(lambda x : len(tempHolidayWeek[tempHolidayWeek  == x].values) > 0)

#since we added 1 above so we are subtracting 2
tempHolidayWeek = (pd.Series(tempHolidayWeek - 2))
temp['NextWeekHoliday'] = temp.WEEKNO.apply(lambda x : len(tempHolidayWeek[tempHolidayWeek  == x].values) > 0)

In [122]:
temp.fillna(0, inplace=True)
temp.head()

Unnamed: 0,SKU,YEAR,WEEKNO,WareHouse,MedianPrice,IsGrocery,IsBundle,ProductGender,PRODUCT_NAME,CatConcat,...,IsHoliday,Last_Week_Sales,Last_Week_Diff,PrevWeekHoliday,NextWeekHoliday,Karachi,Lahore,MEN,NEUTRAL,WOMAN
0,00301FA025DPKNAFAMZ,2017,48,Karachi,399.0,0,0,MEN,Unisex Style Baseball Cap - Black,Unisex Style Baseball Cap - Black | 0092 store...,...,1,0.0,0.0,0,0,1,0,1,0,0
1,00301FA025DPKNAFAMZ,2018,18,Karachi,399.0,0,0,MEN,Unisex Style Baseball Cap - Black,Unisex Style Baseball Cap - Black | 0092 store...,...,0,0.0,0.0,0,0,1,0,1,0,0
2,00301FA025DPKNAFAMZ,2018,19,Karachi,399.0,0,0,MEN,Unisex Style Baseball Cap - Black,Unisex Style Baseball Cap - Black | 0092 store...,...,0,1.0,0.0,1,0,1,0,1,0,0
3,00301FA025DPKNAFAMZ,2018,21,Karachi,399.0,0,0,MEN,Unisex Style Baseball Cap - Black,Unisex Style Baseball Cap - Black | 0092 store...,...,0,0.0,0.0,0,0,1,0,1,0,0
4,00301FA0QSN4YNAFAMZ,2017,48,Karachi,600.0,0,0,NEUTRAL,Blue Golden Tulip Brooch For Women,Blue Golden Tulip Brooch For Women | 0092 stor...,...,1,0.0,0.0,0,0,1,0,0,1,0


In [None]:
fig, ax = plt.subplots(figsize=(13, 6))
#bins = np.arange(0,60,5) , use bins=bins in hist function below for smaller values
train_df['Quantity'].hist(ax=ax, bottom=0.1)

#formatter = FuncFormatter(log_10_product)
ax.set_yscale('log')


In [None]:
#train_df[train_df['Quantity'] > 500].PRODUCT_NAME.value_counts()

### Visualization

In [None]:
test = df_initial.groupby(by=['DATE','SKU','CATEGORY_LEVEL_1'], as_index=False)['Quantity'].sum()

In [None]:
   
g = sns.FacetGrid(test[test.Quantity < 1000], col="CATEGORY_LEVEL_1")
g = g.map(plt.hist, "Quantity", log=True)

### BenchMark Model (Predict Demand as Avergae of last N days demand)

In [None]:
test_df = train_df[(train_df.DATE >= '2018-05-01') & (train_df.Quantity <= 100)]

In [None]:
test_df['PredictedDemand'] = np.int64((test_df.D1+test_df.D2+test_df.D3+test_df.D4+test_df.D5)/5)
#test_df.loc[:,'PredictedDemand'] = test_df['PredictedDemand'].apply(lambda x : log_inf(x))
test_df.head()

In [None]:
import math
print("MSE: ",mean_squared_error(test_df.Quantity, test_df.PredictedDemand),
      "RMSE: ",math.sqrt(mean_squared_error(test_df.Quantity, test_df.PredictedDemand))
     )


### ML model Data Prepration

In [19]:
#train_df.drop(axis=1, columns=['Karachi','Lahore'], inplace = True)
if not {'MEN', 'NEUTRAL','Karachi','Lahore'}.issubset(temp.columns):
    dummyWareHouse = pd.get_dummies(temp['WareHouse']).astype(int)
    dummyProductGender = pd.get_dummies(temp['ProductGender']).astype(int)
    temp = pd.concat([temp,dummyWareHouse], axis = 1)
    temp = pd.concat([temp,dummyProductGender], axis = 1)
    temp.PrevWeekHoliday = temp.PrevWeekHoliday.astype(int)
    temp.NextWeekHoliday = temp.NextWeekHoliday.astype(int)


#### **Train Test Data Split

In [136]:
colList = ['WEEKNO','Lahore','IsHoliday','IsBundle','MEN','NEUTRAL','WOMAN','MedianPrice','PrevWeekHoliday','Last_Week_Sales','Last_Week_Diff','Quantity']
testWeeks = [19,20,21,22]
train_df = temp
X = train_df[~train_df.WEEKNO.isin(testWeeks)][colList]
#X.MedianPrice = X.MedianPrice.map(log_inf)
#Y_orig = train_df[~train_df.WEEKNO.isin(testWeeks)][['Quantity']]
#Y = Y_orig.Quantity.map(log_inf)
Y = X.iloc[:,-1]
X = X.iloc[:,0:X.shape[1]-1]


In [137]:
X_test = train_df[train_df.WEEKNO.isin(testWeeks)][colList]
#X_test.MedianPrice = X_test.MedianPrice.map(log_inf)
#Y_test_orig = train_df[train_df.WEEKNO.isin(testWeeks)][['Quantity']]
#Y_test = Y_test_orig.Quantity.map(log_inf)
Y_test = X_test.iloc[:,-1]
#X_test_SKUs = train_df[(train_df.DATE >= '2018-05-01') & (train_df.Quantity <= 200)][['SKU']]
X_test = X_test.iloc[:,0:X_test.shape[1]-1]

#### **Decision Tree Regression Model

In [138]:
from sklearn.tree import DecisionTreeRegressor
regr = DecisionTreeRegressor(max_depth=8,min_samples_split=7,min_samples_leaf=7)
regr.fit(X, Y)
y_pred = regr.predict(X_test)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )


MSE:  500.78440176534644 RMSE:  22.378212657970394


In [139]:
hold = pd.DataFrame()
# hold['Actual'] = np.exp(Y_test)
# hold['Predicted'] = np.exp(y_pred)

hold['Actual'] = Y_test
hold['Predicted'] = y_pred

print('Total Actual Demand : ',hold.Actual.sum(),'\nTotal Predicted Demand : ',hold.Predicted.sum())

hold[hold.Actual > 50][['Actual','Predicted']].head(25)

Total Actual Demand :  490434 
Total Predicted Demand :  457393.0996375618


Unnamed: 0,Actual,Predicted
1398,66,19.604167
9106,68,36.460879
21513,71,26.032882
27847,89,71.5
27849,62,36.460879
44374,54,1.521114
48272,61,71.5
48273,60,58.552058
51425,76,36.460879
51427,58,19.877159


In [71]:
from sklearn.tree import export_graphviz
import os
import subprocess

def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [None]:
features = ['WeekDayNo','Lahore','IsBundle','MEN','NEUTRAL']
visualize_tree(regr, features)

In [None]:

main_list = list(set(X_test.SKU.unique())-set(X.SKU.unique()))
len(main_list)



### XGBOOST Model

In [140]:
from xgboost import XGBRegressor

from xgboost.sklearn import XGBRegressor  
import scipy.stats as st

one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(1, 20)

params = {  
    "n_estimators": st.randint(2, 20),
    "max_depth": st.randint(2, 10),
    "learning_rate": st.uniform(0.01, 0.1),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

xgbreg = XGBRegressor(params=params) 

xgbreg.fit(X, Y)
y_pred = xgbreg.predict(X_test)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

MSE:  493.6423750102816 RMSE:  22.218064159829083


### POLYNOMIAL REGRESSION

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# create a Linear Regressor   
lin_regressor = LinearRegression()

# pass the order of your polynomial here  
poly = PolynomialFeatures(2)

# convert to be used further to linear regression
X_transform = poly.fit_transform(X)

# fit this to Linear Regressor
lin_regressor.fit(X_transform,Y.Quantity) 

# get the predictions
y_pred = lin_regressor.predict(X_test)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )


In [None]:
temp = X_test
temp['Pred'] = y_pred
temp['Act'] = Y_test

temp[temp.Act >= 10]

### GridSearch For Best Model 

In [111]:
from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(xgbreg, params, n_jobs=1)  
gs.fit(X, Y)


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear',
       params={'n_esti..._state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f72423bdb70>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f72445125c0>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f724462e828>, 'colsample_bytree...3c9ba8>, 'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f72443c9ba8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=Tru

In [92]:
#gs.best_estimator_

In [112]:
y_pred = gs.best_estimator_.predict(X_test)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

MSE:  1509.2202431302985 RMSE:  38.84868393047953


### LGBM MODEL

In [116]:
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 8,
    'learning_rate': 0.01,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'metric': 'l1',
    'num_threads': 4
}
MAX_ROUNDS = 200

lgb_train = lgb.Dataset(X, Y)
lgb_test = lgb.Dataset(X_test, Y_test, reference=lgb_train)

gbm = lgb.train(
       params, lgb_train, num_boost_round=MAX_ROUNDS,
       valid_sets=lgb_test, early_stopping_rounds=50, verbose_eval=50
   )
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's l1: 2.35096
[100]	valid_0's l1: 2.34144
[150]	valid_0's l1: 2.35066
Early stopping, best iteration is:
[112]	valid_0's l1: 2.33982
MSE:  1508.388165523654 RMSE:  38.837973241708355
