<h1><b>M5 Forecasting: Feature Engineering</b></h1>

# **Contents**

<h3> 1. Reading the Data</h3> 

<h3> 2. Data Preprocessing </h3> 

<h3> 3. Feature Engineering </h3> 

<h3> 4. References</h3> 

In [1]:
#import dependencies
import datetime
import gc
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

warnings.filterwarnings('ignore')

# 1. Reading the Data

In [2]:
# Load the three input tables from the working directory.
# The calendar's 'date' column is parsed to datetime on read so the
# .dt accessor can be used on it later.
df_cal = pd.read_csv('calendar.csv', parse_dates=['date'])
df_price = pd.read_csv('sell_prices.csv')
df_sales = pd.read_csv('sales_train_evaluation.csv')

# 2. Data Preprocessing

## 2.1 Data Manipulation 
- Sales Data
    - Convert the day column names to numeric 
    
    - Add day columns for evaluation period (1942 to 1969) 
- Calendar Data
    - Convert day column values from object to integer dtype 
- Price Data
    - Create id feature combining store and item id  

In [3]:
#Sales Data
# Keep the first six (identifier) column labels and relabel the day columns
# with plain integers 1..1941.
id_labels = df_sales.columns[:6].tolist()
train_day_labels = [*range(1, 1942)]
df_sales.columns = id_labels + train_day_labels

# Append zero-filled placeholder columns for days 1942..1969.
for eval_day in range(1942, 1970):
    df_sales[eval_day] = 0

In [4]:
#Calendar Data
def _day_number(label):
    """Return the integer after the first underscore of a day label."""
    return int(label.split("_")[1])

df_cal["d"] = df_cal["d"].map(_day_number).astype('int16')

In [5]:
#Price Data
# Build "<item_id>_<store_id>_evaluation" so price rows carry an 'id' column.
item_store = df_price["item_id"].str.cat(df_price["store_id"], sep="_")
df_price["id"] = item_store + "_evaluation"

# 3. Feature Engineering

## 3.1 Feature Creation
- Basic Time Based Features
    - Derive quarter, week and day features from date
    - Create a binary feature which identifies weekend days

In [6]:
datetimeFeat = ["quarter", "week", "day"]

# Values for each calendar feature, taken from the parsed 'date' column.
# Series.dt.week was removed in pandas 2.0; dt.isocalendar().week is its
# replacement and yields the ISO week number.
date_parts = df_cal['date'].dt
datetime_values = {
    "quarter": date_parts.quarter,
    "week": date_parts.isocalendar().week,
    "day": date_parts.day,
}

for feat in datetimeFeat:
    df_cal[feat] = datetime_values[feat].astype('int8')

# 'day' here is the day of the month; rename it to say so.
df_cal = df_cal.rename(columns={'day': 'day_of_month'})

# 1 when the weekday name is Saturday or Sunday, 0 otherwise.
df_cal['is_weekend'] = df_cal["weekday"].isin(['Saturday', 'Sunday']).astype('int8')

## 3.2 Reduce Memory Usage
- Define a downcast function which can be leveraged at any stage 
- Downcast the dataframes to reduce memory usage

In [7]:
#refer - https://stackoverflow.com/questions/1658714/how-to-get-the-range-of-valid-numpy-data-types
def downcast(data):
    """Shrink the columns of ``data`` to smaller dtypes, in place.

    - object columns: the column labelled ``'date'`` is parsed with
      ``pd.to_datetime``; every other object column becomes ``category``.
    - signed integer columns: cast to the narrowest of int8/int16/int32 whose
      range contains both the column minimum and maximum; otherwise unchanged.
    - float64 columns: cast to float16 or float32 when both the minimum and
      maximum lie strictly inside that type's finite range; otherwise
      unchanged. Narrower floats carry less precision.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient reassignment.
    """
    for col in data.columns:
        dtype = data[col].dtype

        if dtype == object:
            if col == 'date':
                data[col] = pd.to_datetime(data[col])
            else:
                data[col] = data[col].astype('category')

        # kind == 'i' matches every signed numpy integer width, so the check
        # does not depend on what the platform's default ``int`` happens to be.
        elif isinstance(dtype, np.dtype) and dtype.kind == 'i':
            lowest, highest = data[col].min(), data[col].max()
            for candidate in ('int8', 'int16', 'int32'):
                limits = np.iinfo(candidate)
                # Both ends must fit: checking only the maximum lets large
                # negative values wrap around silently on the cast.
                if limits.min <= lowest and highest <= limits.max:
                    # Skip the write when the column already has this dtype.
                    if dtype != np.dtype(candidate):
                        data[col] = data[col].astype(candidate)
                    break

        elif dtype == float:
            lowest, highest = data[col].min(), data[col].max()
            for candidate in ('float16', 'float32'):
                limits = np.finfo(candidate)
                # An all-NaN or infinite column fails both comparisons and
                # therefore stays float64.
                if limits.min < lowest and highest < limits.max:
                    data[col] = data[col].astype(candidate)
                    break

    return data

In [8]:
%%time
df_sales = downcast(df_sales)
df_cal = downcast(df_cal)
df_price = downcast(df_price)

CPU times: user 58.6 s, sys: 1min 18s, total: 2min 16s
Wall time: 2min 16s


## 3.3 Encode Categorical Features
- Save category codes in dictionary
- Perform feature encoding on data

In [13]:
#Save category codes
sales_catFeat = ['id','item_id','dept_id','cat_id','store_id','state_id']
cal_catFeat = ['event_name_1','event_type_1','event_name_2','event_type_2']


def _category_code_map(column):
    """Map every value of a categorical column to its integer category code."""
    return dict(zip(column, column.cat.codes))


# One {value: code} mapping per categorical feature, keyed by feature name.
category_code_maps = {}
for feat in sales_catFeat:
    category_code_maps[feat] = _category_code_map(df_sales[feat])
for feat in cal_catFeat:
    category_code_maps[feat] = _category_code_map(df_cal[feat])

# Also expose each mapping as a dict_<feature> variable, as earlier versions
# of this cell did, without resorting to exec().
globals().update({f'dict_{feat}': mapping for feat, mapping in category_code_maps.items()})

# Persist the mappings; create the target folder on a fresh checkout and
# close every file handle once its pickle has been written.
os.makedirs('saved_dicts', exist_ok=True)
for feat, mapping in category_code_maps.items():
    with open(f'saved_dicts/dict_{feat}.pkl', 'wb') as handle:
        pickle.dump(mapping, handle)

In [11]:
#Feature encoding
# Encode the price ids with the *sales* id categories so the same id string
# maps to the same integer in both frames (the later merge joins on 'id').
# Encoding each frame with its own category table only agrees when both hold
# exactly the same set of ids. Price ids absent from the sales data get -1.
# This must run before df_sales['id'] is replaced by its codes.
sales_id_categories = df_sales['id'].cat.categories
df_price['id'] = df_price['id'].cat.set_categories(sales_id_categories).cat.codes

for col in sales_catFeat:
    df_sales[col] = df_sales[col].cat.codes

for col in cal_catFeat:
    df_cal[col] = df_cal[col].cat.codes

In [12]:
%%time
df_sales = downcast(df_sales)
df_cal = downcast(df_cal)
df_price = downcast(df_price)

CPU times: user 72.7 ms, sys: 787 µs, total: 73.5 ms
Wall time: 72.6 ms


## 3.4 Data Transformation & Feature Creation
- Convert the sales data from wide to long format
- Create lag and rolling window based statistical features on sales
- Merge the dataframes
- Create price lag and price change features 

In [13]:
#Melting the sales dataframe
# Identifier columns are the ones with string labels; the integer-labelled
# day columns are unpivoted into ('d', 'sales') pairs.
id_columns = [label for label in df_sales.columns if isinstance(label, str)]
df_long = df_sales.melt(id_vars=id_columns, var_name='d', value_name='sales')
df_long.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,14370,1437,3,1,0,0,1,0
1,14380,1438,3,1,0,0,1,0
2,14390,1439,3,1,0,0,1,0


In [14]:
#'d' comes out of the melt as object dtype, so inspect it and convert it back to int16
# info() prints its report itself and returns None, so it is not wrapped in print()
df_long[['d']].info()
df_long['d'] = pd.to_numeric(df_long['d']).astype('int16')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60034810 entries, 0 to 60034809
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   d       object
dtypes: object(1)
memory usage: 458.0+ MB
None


In [15]:
#Create features based on historical sales data
lags = [28, 30, 31, 35, 42, 49, 56, 63, 70, 77]
windows = [7, 14, 28, 30, 45, 60, 90, 120]

# lag_<n>: the 'sales' value n rows earlier within the same id.
for lag in tqdm(lags):
    shifted_sales = df_long.groupby(["id"])['sales'].shift(lag)
    df_long[f'lag_{lag}'] = shifted_sales.astype('float16')

# rmean_28_<w>: mean of the last w values of lag_28 within the same id.
for window in tqdm(windows):
    lag_28_by_id = df_long.groupby(["id"])["lag_28"]
    rolling_mean = lag_28_by_id.transform(lambda series: series.rolling(window).mean())
    df_long[f"rmean_28_{window}"] = rolling_mean.astype('float16')

100%|██████████████████████████████████████████████████████████████████████████| 10/10 [00:36<00:00,  3.60s/it]
100%|████████████████████████████████████████████████████████████████████████████| 8/8 [03:51<00:00, 28.94s/it]


#### Common Questions
Why did we create sales lags of 28 days or more?
- We only create sales lags of 28 days or more because shorter lags would cause data leakage. Our forecasting horizon is 28 days. Suppose we created a 7-day sales lag: for every day after the first 7 days of the horizon, that lag would be computed from the sales of a day that itself falls inside the horizon. The whole purpose of the model is to forecast 28 days into the future, and in a real scenario those future sales are not available, so any lag shorter than 28 days cannot be computed at prediction time.

Why did we create rolling mean feature based on lag_28 feature? 
- A rolling mean is the average of sales over a fixed-size time window that keeps sliding forward. We build the rolling means on the lag_28 feature for the same reason given in the previous answer: anything that reaches closer than 28 days to the forecast date would cause data leakage, so we compute the rolling means over the closest lag feature available.

In [16]:
#Merge dataframes
# Attach the calendar columns (minus 'weekday') to every row by day number.
df_long = df_long.merge(df_cal.drop(columns=['weekday']), how='left', on='d')
# Attach the price columns by id and week; store_id/item_id are dropped from
# the price frame before merging because df_long already carries them.
df_long = df_long.merge(df_price.drop(columns=['store_id', 'item_id']), how='left', on=['id', 'wm_yr_wk'])

In [17]:
#Create features based on historical price data
# price_shift_t1: the sell_price one row earlier within the same id.
df_long['price_shift_t1'] = df_long.groupby(['id'])['sell_price'].shift(1)
# price_change_t1: relative change of sell_price versus that previous row.
price_delta = df_long['sell_price'].sub(df_long['price_shift_t1'])
df_long['price_change_t1'] = price_delta.div(df_long['price_shift_t1'])

In [18]:
#look up which 'd' value corresponds to 1st Jan'14
df_cal.loc[df_cal['date'] == '2014-01-01', 'd']

1068    1069
Name: d, dtype: int16

In [19]:
#keep only rows from 'd' = 1069 onwards and renumber the index
from_cutoff = df_long['d'] >= 1069
df_long = df_long.loc[from_cutoff].reset_index(drop=True)

In [20]:
#list the columns that still contain null values, with their null counts
null_counts = df_long.isnull().sum()
null_counts[null_counts > 0]

sell_price         939011
price_shift_t1     943562
price_change_t1    943562
dtype: int64

In [21]:
#replace null values with 0 in the price-derived columns
for price_col in ('sell_price', 'price_shift_t1', 'price_change_t1'):
    df_long[price_col] = df_long[price_col].replace(np.nan, 0)

#the lag columns can now be stored as 'int16', since the initial NaN rows were dropped
lag_columns = [f"lag_{days}" for days in (28, 30, 31, 35, 42, 49, 56, 63, 70, 77)]
for lag_col in lag_columns:
    df_long[lag_col] = df_long[lag_col].astype('int16')

In [25]:
#preview the first five rows of the final dataframe
df_long.head(5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,lag_28,lag_30,...,snap_CA,snap_TX,snap_WI,quarter,week,day_of_month,is_weekend,sell_price,price_shift_t1,price_change_t1
0,14370,1437,3,1,0,0,1069,1,1,1,...,1,1,0,1,1,1,0,8.257812,8.257812,0.0
1,14380,1438,3,1,0,0,1069,0,0,0,...,1,1,0,1,1,1,0,3.970703,3.970703,0.0
2,14390,1439,3,1,0,0,1069,0,0,0,...,1,1,0,1,1,1,0,0.0,0.0,0.0
3,14400,1440,3,1,0,0,1069,2,1,3,...,1,1,0,1,1,1,0,4.640625,4.640625,0.0
4,14410,1441,3,1,0,0,1069,1,1,1,...,1,1,0,1,1,1,0,3.080078,3.080078,0.0


## 3.5 Save the preprocessed data

In [26]:
# Shrink the dtypes of the final long-format frame before it is written out.
df_long = downcast(df_long)

In [27]:
# Persist the preprocessed frame as a pickle in the working directory.
df_long.to_pickle("preprocessed_data.pkl")

# 4. References

- https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/