# Business Case #5 - Retail - Demand Forecasting

## Authors:
#### Débora Santos (m20200748),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

In [1]:
#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import seaborn as sns
from datetime import datetime
from itertools import product
from math import ceil
%matplotlib inline 
from collections import Counter
import matplotlib.cm as cm


#Models
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Use garbage collection to minimise memory usage
import gc

import warnings
warnings.filterwarnings("ignore")

# Seeting seaborn style
sns.set()

In [2]:
def downcast1(df, verbose=True):
    
    """
    Funciton to reduce the memory used of a particular dataframe by downcasting to a less memory-intensive data type.
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
#import dataset in csv
df_group = pd.read_csv('df_demand_group.csv')

In [4]:
df_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29318460 entries, 0 to 29318459
Data columns (total 4 columns):
 #   Column      Dtype
---  ------      -----
 0   Year_Week   int64
 1   Quantity    int64
 2   Store_ID    int64
 3   Product_ID  int64
dtypes: int64(4)
memory usage: 894.7 MB


In [5]:
df_group.head()

Unnamed: 0,Year_Week,Quantity,Store_ID,Product_ID
0,20161,5,100,1000
1,20161,2,102,1000
2,20161,9,103,1000
3,20161,7,104,1000
4,20161,3,106,1000


In [6]:
#reduce memory usage
df_group = downcast1(df_group)

68.7% compressed


In [7]:
#Create a list with unique yearweek
lista = list(set(df_group['Year_Week']))

In [8]:
# Create a dataframe of the Cartesian Product of the unique stores and unique products for each week

from itertools import product

train = []

for i in lista:
    
    stores = df_group.loc[df_group["Year_Week"] == i, "Store_ID"].unique()
    
    products = df_group.loc[df_group["Year_Week"] == i, "Product_ID"].unique()
    
    train.append(np.array(list(product(*[[i],stores, products]))))
    
index_feats = ["Year_Week", "Store_ID", "Product_ID"]

train = pd.DataFrame(np.vstack(train), columns=index_feats)

In [9]:
# Create the column showing how many of each product have been sold in each week

group = df_group.groupby(index_feats).agg({"Quantity": "sum"})
group = group.reset_index()

train = pd.merge(train, group, on=index_feats, how="left")
train

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,
2,201728,1,1002,4.0
3,201728,1,1003,
4,201728,1,1004,
...,...,...,...,...
120361742,201727,363,993,6.0
120361743,201727,363,994,
120361744,201727,363,995,4.0
120361745,201727,363,997,


In [10]:
# Use garbage collection to minimise memory usage

import gc

del group

gc.collect()

30

In [11]:
#Replace nams in the rows  by 0
train = train.fillna(0)

In [12]:
train.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,0.0
2,201728,1,1002,4.0
3,201728,1,1003,0.0
4,201728,1,1004,0.0


In [13]:
#Split the test set and store it for posterior utilization
test = train[train['Year_Week']>=201938]

In [14]:
test.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
46118840,201938,17,1,1.0
46118841,201938,17,1000,3.0
46118842,201938,17,1001,0.0
46118843,201938,17,1002,5.0
46118844,201938,17,1003,1.0


In [15]:
#Split the train set
train = train[train['Year_Week']<201938]

In [16]:
#Create a copy of test set without the column quantity
test2 = test.drop('Quantity',axis = 1)
test2

Unnamed: 0,Year_Week,Store_ID,Product_ID
46118840,201938,17,1
46118841,201938,17,1000
46118842,201938,17,1001
46118843,201938,17,1002
46118844,201938,17,1003
...,...,...,...
109318742,201940,221,993
109318743,201940,221,994
109318744,201940,221,995
109318745,201940,221,997


In [17]:
#Bring back the test set with the zero values in the column quantity 
df = pd.concat([train, test2], ignore_index=True, keys=index_feats)

In [18]:
df.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,0.0
2,201728,1,1002,4.0
3,201728,1,1003,0.0
4,201728,1,1004,0.0


In [19]:
#Check if it worked
df[df['Year_Week']==201938]

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
115659474,201938,17,1,
115659475,201938,17,1000,
115659476,201938,17,1001,
115659477,201938,17,1002,
115659478,201938,17,1003,
...,...,...,...,...
116330229,201938,220,993,
116330230,201938,220,994,
116330231,201938,220,995,
116330232,201938,220,997,


In [20]:
#copy the data frame and replace nams
df_forecast = df.fillna(0)
df_forecast.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201728,1,1000,2.0
1,201728,1,1001,0.0
2,201728,1,1002,4.0
3,201728,1,1003,0.0
4,201728,1,1004,0.0


In [21]:
#Reduce memory
df_forecast = downcast1(df_forecast)

50.0% compressed


In [22]:
def add_mean_feats(df, mean_feats, index_features, agg_col="Quantity", agg_func="mean"):
    
    """
    Function to automatically create new features showing the mean of quantity grouped by the specified columns.
    """
    
    if len(index_features) == 2:
        feature_name = index_features[1] + f"_{agg_col}_{agg_func}"
    else: 
        feature_name = index_features[1] + "_" + index_features[2] + f"_{agg_col}_{agg_func}"
        
    group = df.groupby(index_features).agg({agg_col:agg_func}).reset_index().rename(columns={agg_col:feature_name})
    
    df = pd.merge(df, group, on=index_features, how="left")
    
    df = downcast1(df)
    
    mean_feats.append(feature_name)
    
    del group
    gc.collect()
    
    return df, mean_feats

In [23]:
#Add means of quantity sold by product by week
prod_mean_features = []

df_forecast, prod_mean_features = add_mean_feats(df_forecast, prod_mean_features, ["Year_Week","Product_ID"])

df_forecast

15.4% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean
0,201728,1,1000,2,1.828851
1,201728,1,1001,0,0.056235
2,201728,1,1002,4,1.303178
3,201728,1,1003,0,2.026895
4,201728,1,1004,0,2.009780
...,...,...,...,...,...
120361742,201940,221,993,0,0.000000
120361743,201940,221,994,0,0.000000
120361744,201940,221,995,0,0.000000
120361745,201940,221,997,0,0.000000


In [24]:
#Add means of quantity sold by product by store by week

df_forecast, prod_mean_features = add_mean_feats(df_forecast, prod_mean_features, ["Year_Week","Product_ID", 'Store_ID'])

df_forecast

0.0% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean
0,201728,1,1000,2,1.828851,2
1,201728,1,1001,0,0.056235,0
2,201728,1,1002,4,1.303178,4
3,201728,1,1003,0,2.026895,0
4,201728,1,1004,0,2.009780,0
...,...,...,...,...,...,...
120361742,201940,221,993,0,0.000000,0
120361743,201940,221,994,0,0.000000,0
120361744,201940,221,995,0,0.000000,0
120361745,201940,221,997,0,0.000000,0


In [25]:
#Add means of quantity sold by store by week
store_mean_features = []
df_forecast, store_mean_features = add_mean_feats(df_forecast, store_mean_features, ["Year_Week", 'Store_ID'])

df_forecast

12.5% compressed


Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean
0,201728,1,1000,2,1.828851,2,2.163561
1,201728,1,1001,0,0.056235,0,2.163561
2,201728,1,1002,4,1.303178,4,2.163561
3,201728,1,1003,0,2.026895,0,2.163561
4,201728,1,1004,0,2.009780,0,2.163561
...,...,...,...,...,...,...,...
120361742,201940,221,993,0,0.000000,0,0.000000
120361743,201940,221,994,0,0.000000,0,0.000000
120361744,201940,221,995,0,0.000000,0,0.000000
120361745,201940,221,997,0,0.000000,0,0.000000


In [26]:
df_forecast.to_csv(os.path.join("df_forecast.csv"), index=False)