# Business Case #5 - Retail - Demand Forecasting

## Authors:
#### Débora Santos (m20200748),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

In [3]:
#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import seaborn as sns
from datetime import datetime
from itertools import product
from math import ceil
%matplotlib inline 
from collections import Counter
import matplotlib.cm as cm


#Models
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from xgboost import plot_importance



import warnings
warnings.filterwarnings("ignore")

# Seeting seaborn style
sns.set()

In [8]:
def downcast1(df, verbose=True):
    
    """
    Funciton to reduce the memory used of a particular dataframe by downcasting to a less memory-intensive data type.
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
# import dataset in csv
df_group = pd.read_csv('df_demand_group.csv')

In [5]:
df_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29318460 entries, 0 to 29318459
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Year_Week   int64  
 1   Quantity    float64
 2   Store_ID    int64  
 3   Product_ID  int64  
dtypes: float64(1), int64(3)
memory usage: 894.7 MB


In [6]:
df_group

Unnamed: 0,Year_Week,Quantity,Store_ID,Product_ID
0,20161,5.0,100,1000
1,20161,2.0,102,1000
2,20161,9.0,103,1000
3,20161,7.0,104,1000
4,20161,3.0,106,1000
...,...,...,...,...
29318455,201944,2.0,310,998
29318456,201944,2.0,333,998
29318457,201944,4.0,410,998
29318458,201944,1.0,65,998


In [9]:
df_group = downcast1(df_group)

68.7% compressed


In [16]:
lista = list(set(df_group['Year_Week']))

In [18]:
# Create a dataframe of the Cartesian Product of the unique stores and unique products for each week

from itertools import product

train = []

for i in lista:
    
    stores = df_group.loc[df_group["Year_Week"] == i, "Store_ID"].unique()
    
    products = df_group.loc[df_group["Year_Week"] == i, "Product_ID"].unique()
    
    train.append(np.array(list(product(*[[i],stores, products]))))
    
index_feats = ["Year_Week", "Store_ID", "Product_ID"]

train = pd.DataFrame(np.vstack(train), columns=index_feats)

In [22]:
# Create the column showing how many of each product have been sold in each week

group = df_group.groupby(index_feats).agg({"Quantity": "sum"})
group = group.reset_index()

train = pd.merge(train, group, on=index_feats, how="left")
train

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,
2,201728,1,1002,4.0
3,201728,1,1003,
4,201728,1,1004,
...,...,...,...,...
120361742,201727,363,993,6.0
120361743,201727,363,994,
120361744,201727,363,995,4.0
120361745,201727,363,997,


In [27]:
# Use garbage collection to minimise memory usage

import gc

del group

gc.collect()

52

In [28]:
df_forecast = train.fillna(0)
df_forecast.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,0.0
2,201728,1,1002,4.0
3,201728,1,1003,0.0
4,201728,1,1004,0.0


In [29]:
df_forecast = downcast1(df_forecast)

35.7% compressed


In [30]:
def add_mean_feats(df, mean_feats, index_features, agg_col="Quantity", agg_func="mean"):
    
    """
    Function to automatically create new features showing the mean of quantity grouped by the specified columns.
    """
    
    if len(index_features) == 2:
        feature_name = index_features[1] + f"_{agg_col}_{agg_func}"
    else: 
        feature_name = index_features[1] + "_" + index_features[2] + f"_{agg_col}_{agg_func}"
        
    group = df.groupby(index_features).agg({agg_col:agg_func}).reset_index().rename(columns={agg_col:feature_name})
    
    df = pd.merge(df, group, on=index_features, how="left")
    
    df = downcast1(df)
    
    mean_feats.append(feature_name)
    
    del group
    gc.collect()
    
    return df, mean_feats

In [31]:
prod_mean_features = []

df_forecast, prod_mean_features = add_mean_feats(df_forecast, prod_mean_features, ["Year_Week","Product_ID"])

df_forecast

15.4% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean
0,201728,1,1000,2,1.828851
1,201728,1,1001,0,0.056235
2,201728,1,1002,4,1.303178
3,201728,1,1003,0,2.026895
4,201728,1,1004,0,2.009780
...,...,...,...,...,...
120361742,201727,363,993,6,17.122250
120361743,201727,363,994,0,1.100245
120361744,201727,363,995,4,2.586797
120361745,201727,363,997,0,0.716381


In [32]:
df_forecast, prod_mean_features = add_mean_feats(df_forecast, prod_mean_features, ["Year_Week","Product_ID", 'Store_ID'])

df_forecast

0.0% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean
0,201728,1,1000,2,1.828851,2
1,201728,1,1001,0,0.056235,0
2,201728,1,1002,4,1.303178,4
3,201728,1,1003,0,2.026895,0
4,201728,1,1004,0,2.009780,0
...,...,...,...,...,...,...
120361742,201727,363,993,6,17.122250,6
120361743,201727,363,994,0,1.100245,0
120361744,201727,363,995,4,2.586797,4
120361745,201727,363,997,0,0.716381,0


In [34]:
store_mean_features = []
df_forecast, store_mean_features = add_mean_feats(df_forecast, store_mean_features, ["Year_Week", 'Store_ID'])

df_forecast

12.5% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean
0,201728,1,1000,2,1.828851,2,2.163561
1,201728,1,1001,0,0.056235,0,2.163561
2,201728,1,1002,4,1.303178,4,2.163561
3,201728,1,1003,0,2.026895,0,2.163561
4,201728,1,1004,0,2.009780,0,2.163561
...,...,...,...,...,...,...,...
120361742,201727,363,993,6,17.122250,6,0.802512
120361743,201727,363,994,0,1.100245,0,0.802512
120361744,201727,363,995,4,2.586797,4,0.802512
120361745,201727,363,997,0,0.716381,0,0.802512


In [37]:
def add_lags(df, lag_features, index_features, lag_feature, lags=[1,2,3], clip=False):
    
    """
    Function to automatically create lag features based on the columns specified.
    """
    
    df_temp = df[index_features + [lag_feature]].copy()
    
    for i in lags:
        
        feat_name = lag_feature + "_lag" + str(i)
        df_temp.columns = index_features + [feat_name]
        df_temp["Year_Week"] += i
        df = pd.merge(df, df_temp.drop_duplicates(), on=index_features, how="left")
        df[feat_name] = df[feat_name].fillna(0)
        
        if clip:
            lag_feats_to_clip.append(feat_name)
            
    df = downcast1(df)
    del df_temp
    gc.collect()
    
    return df, lag_feats_to_clip

In [38]:
lag_feats_to_clip = []
index_features = ["Year_Week", 'Store_ID', "Product_ID"]

df_forecast, lag_feats_to_clip = add_lags(df_forecast, lag_feats_to_clip, index_features, "Quantity", clip=True)

34.6% compressed


In [39]:
df_forecast.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3
0,201728,1,1000,2,1.828851,2,2.163561,0,0,0
1,201728,1,1001,0,0.056235,0,2.163561,0,0,0
2,201728,1,1002,4,1.303178,4,2.163561,2,5,4
3,201728,1,1003,0,2.026895,0,2.163561,0,0,0
4,201728,1,1004,0,2.00978,0,2.163561,0,2,2


In [41]:
# Now use the lists that have saved previously in creating the mean features to create additional lags

for item in prod_mean_features:
    
    df_forecast, lag_feats_to_clip = add_lags(df_forecast, lag_feats_to_clip, index_features, item, clip=True)

0.0% compressed
25.7% compressed


In [42]:
for item in store_mean_features:
    
    df_forecast, lag_feats_to_clip = add_lags(df_forecast, lag_feats_to_clip, index_features, item, clip=True)

0.0% compressed


In [43]:
df_forecast

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3,Product_ID_Quantity_mean_lag1,Product_ID_Quantity_mean_lag2,Product_ID_Quantity_mean_lag3,Product_ID_Store_ID_Quantity_mean_lag1,Product_ID_Store_ID_Quantity_mean_lag2,Product_ID_Store_ID_Quantity_mean_lag3,Store_ID_Quantity_mean_lag1,Store_ID_Quantity_mean_lag2,Store_ID_Quantity_mean_lag3
0,201728,1,1000,2,1.828851,2,2.163561,0,0,0,1.672372,1.657702,1.687042,0,0,0,1.985345,1.992302,1.791754
1,201728,1,1001,0,0.056235,0,2.163561,0,0,0,0.046455,0.039120,0.034230,0,0,0,1.985345,1.992302,1.791754
2,201728,1,1002,4,1.303178,4,2.163561,2,5,4,1.124694,1.144254,1.293398,2,5,4,1.985345,1.992302,1.791754
3,201728,1,1003,0,2.026895,0,2.163561,0,0,0,1.929095,1.782396,1.762836,0,0,0,1.985345,1.992302,1.791754
4,201728,1,1004,0,2.009780,0,2.163561,0,2,2,1.826406,1.515892,1.684597,0,2,2,1.985345,1.992302,1.791754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120361742,201727,363,993,6,17.122250,6,0.802512,6,9,14,17.691931,17.674816,21.574572,6,9,14,0.945927,0.811174,0.800000
120361743,201727,363,994,0,1.100245,0,0.802512,1,0,1,1.063570,1.105134,1.298288,1,0,1,0.945927,0.811174,0.800000
120361744,201727,363,995,4,2.586797,4,0.802512,2,0,2,2.388753,2.396088,3.420538,2,0,2,0.945927,0.811174,0.800000
120361745,201727,363,997,0,0.716381,0,0.802512,0,0,0,0.691932,0.594132,0.655257,0,0,0,0.945927,0.811174,0.800000


In [44]:
# Create feature showing mean of the three lags

df_forecast["Quantity_lag3_mean"] = df_forecast[["Quantity_lag1", "Quantity_lag2", "Quantity_lag3"]].mean(axis=1)

In [45]:
df_forecast[lag_feats_to_clip + ["Quantity_lag3_mean", "Quantity"]] =  df_forecast[lag_feats_to_clip + ["Quantity_lag3_mean", "Quantity"]].clip(0,20)

In [47]:
df_forecast["lag_grad_1"] = df_forecast["Quantity_lag1"] / df_forecast["Quantity_lag2"]
df_forecast["lag_grad_1"] = df_forecast["lag_grad_1"].replace([np.inf, -np.inf], np.nan).fillna(0)

In [49]:
df_forecast["lag_grad_2"] = df_forecast["Quantity_lag2"] / df_forecast["Quantity_lag3"]
df_forecast["lag_grad_2"] = df_forecast["lag_grad_2"].replace([np.inf, -np.inf], np.nan).fillna(0)

In [50]:
df_forecast = downcast1(df_forecast)
df_forecast.info()

21.6% compressed
<class 'pandas.core.frame.DataFrame'>
Int64Index: 120361747 entries, 0 to 120361746
Data columns (total 22 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   Year_Week                               int32  
 1   Store_ID                                int16  
 2   Product_ID                              int16  
 3   Quantity                                int8   
 4   Product_ID_Quantity_mean                float32
 5   Product_ID_Store_ID_Quantity_mean       int16  
 6   Store_ID_Quantity_mean                  float32
 7   Quantity_lag1                           int8   
 8   Quantity_lag2                           int8   
 9   Quantity_lag3                           int8   
 10  Product_ID_Quantity_mean_lag1           float32
 11  Product_ID_Quantity_mean_lag2           float32
 12  Product_ID_Quantity_mean_lag3           float32
 13  Product_ID_Store_ID_Quantity_mean_lag1  int8   
 14  Product_ID_St

In [51]:
df_forecast.to_csv(os.path.join("df_demand_complete.csv"), index=False)