# Business Case #5 - Retail - Demand Forecasting

## Authors:
#### Débora Santos (m20200748),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

In [1]:
#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import seaborn as sns
from datetime import datetime
from itertools import product
from math import ceil
%matplotlib inline 
from collections import Counter
import matplotlib.cm as cm


#Models
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Use garbage collection to minimise memory usage
import gc

import warnings
warnings.filterwarnings("ignore")

# Seeting seaborn style
sns.set()

In [2]:
def downcast1(df, verbose=True):
    
    """
    Funciton to reduce the memory used of a particular dataframe by downcasting to a less memory-intensive data type.
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
#import dataset in csv
df = pd.read_csv('df_demand.csv')

In [4]:
df.head()

Unnamed: 0,ProductName_ID,Point-of-Sale_ID,Date,Quantity
0,ProductName_649,POS_1,2017-03-04,2.0
1,ProductName_649,POS_1,2016-05-02,4.0
2,ProductName_649,POS_1,2016-10-24,2.0
3,ProductName_649,POS_1,2017-10-13,2.0
4,ProductName_649,POS_1,2017-10-14,2.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90748395 entries, 0 to 90748394
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   ProductName_ID    object 
 1   Point-of-Sale_ID  object 
 2   Date              object 
 3   Quantity          float64
dtypes: float64(1), object(3)
memory usage: 2.7+ GB


In [6]:
df = downcast1(df)

18.7% compressed


#### Clean data

In [7]:
df1 = df.copy()

### Feature Engineering

In [8]:
#Chage the date type
df1['Date'] = df1['Date'].astype('datetime64[ns]')

In [9]:
#Create a column week of year
df1['WeekofYear'] = df1.Date.dt.weekofyear.astype(str)

In [10]:
#Create a column for year
df1['Year'] = df1.Date.dt.year.astype(str)

In [11]:
#Create a column that agregate the info week and year 
df1['Year_Week']= df1['Year'] + df1['WeekofYear'] 

In [12]:
#Change the type to integer
df1['Year_Week']= df1['Year_Week'].astype(int)

In [13]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90748395 entries, 0 to 90748394
Data columns (total 7 columns):
 #   Column            Dtype         
---  ------            -----         
 0   ProductName_ID    object        
 1   Point-of-Sale_ID  object        
 2   Date              datetime64[ns]
 3   Quantity          int16         
 4   WeekofYear        object        
 5   Year              object        
 6   Year_Week         int32         
dtypes: datetime64[ns](1), int16(1), int32(1), object(4)
memory usage: 3.9+ GB


In [14]:
#Drop date variable
df1.drop(['Date','WeekofYear','Year'], axis=1, inplace=True)

In [15]:
df1 = downcast1(df1)

0.0% compressed


In [16]:
df1.head()

Unnamed: 0,ProductName_ID,Point-of-Sale_ID,Quantity,Year_Week
0,ProductName_649,POS_1,2,20179
1,ProductName_649,POS_1,4,201618
2,ProductName_649,POS_1,2,201643
3,ProductName_649,POS_1,2,201741
4,ProductName_649,POS_1,2,201741


In [17]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90748395 entries, 0 to 90748394
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   ProductName_ID    object
 1   Point-of-Sale_ID  object
 2   Quantity          int16 
 3   Year_Week         int32 
dtypes: int16(1), int32(1), object(2)
memory usage: 1.9+ GB


In [19]:
#Groupby the dataframe to create a sum per year of week per product and point of sale
df_group = pd.DataFrame(df1.groupby(['Year_Week','ProductName_ID','Point-of-Sale_ID'])['Quantity'].sum())

In [20]:
#rest index
df_group.reset_index(inplace = True)

In [21]:
#Transform the colum point of sale id in only numbers (remove string)
df_group[['Position','Store_ID']] =(df_group['Point-of-Sale_ID']).str.split("_",expand = True)
df_group.drop(['Point-of-Sale_ID','Position'], axis=1, inplace=True)
df_group.head()

Unnamed: 0,Year_Week,ProductName_ID,Quantity,Store_ID
0,20161,ProductName_1000,5,100
1,20161,ProductName_1000,2,102
2,20161,ProductName_1000,9,103
3,20161,ProductName_1000,7,104
4,20161,ProductName_1000,3,106


In [22]:
#Transform the colum product id in only numbers (remove string)
df_group[['Product','Product_ID']] =(df_group['ProductName_ID']).str.split("_",expand = True)
df_group.drop(['ProductName_ID','Product'], axis=1, inplace=True)
df_group.head()

Unnamed: 0,Year_Week,Quantity,Store_ID,Product_ID
0,20161,5,100,1000
1,20161,2,102,1000
2,20161,9,103,1000
3,20161,7,104,1000
4,20161,3,106,1000


In [23]:
df_group.to_csv(os.path.join("df_demand_group.csv"), index=False)