In [0]:
#!pip install optuna

In [2]:
import numpy as np 
import pandas as pd 
import pickle
from math import ceil
import functools

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


 
# Step 1: Feature engineering for hierarchical learning

---


First step: feature engineering on the total sales of the category a product belongs to 

In [3]:
# Starting point: first run Antoine's "data_prep" notebook, which does the simple feature engineering and produces "grid_df.pkl"

In [4]:
# Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join the columns of ``df2`` onto ``df1`` while keeping ``df1``'s dtypes.

    Only the key columns of ``df1`` go through ``merge``; the columns that
    ``df2`` contributes are then attached to ``df1`` with a column-wise
    ``pd.concat``, so the existing columns of ``df1`` are never rebuilt by the
    merge.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame. Its rows, row order and index are preserved in the result.
    df2 : pandas.DataFrame
        Right frame providing the new columns. It must contain every column
        named in ``merge_on`` and at most one row per key combination.
    merge_on : str or sequence of str
        Name(s) of the key column(s) present in both frames.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all columns of ``df1`` followed by the non-key
        columns of ``df2``. Rows of ``df1`` without a match get NaN in the
        new columns.

    Raises
    ------
    ValueError
        If ``df2`` holds several rows for the same key, because the left merge
        would then produce more rows than ``df1`` and the new columns could no
        longer be lined up with it.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(merge_on, str):
        merge_on = [merge_on]
    else:
        merge_on = list(merge_on)

    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')

    if len(merged_gf) != len(df1):
        raise ValueError(
            'merge_by_concat: df2 has duplicated keys for {}; the merge produced '
            '{} rows for {} rows in df1'.format(merge_on, len(merged_gf), len(df1))
        )

    # merge() hands back a fresh 0..n-1 index while pd.concat(axis=1) aligns on
    # index labels. A left merge keeps the row order of df1, so re-using df1's
    # index lines the new columns up row by row even when df1 was filtered,
    # sorted or otherwise carries a non-default index.
    merged_gf.index = df1.index

    new_columns = [col for col in merged_gf.columns if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)


In [5]:
# Load the prepared grid written by the "data_prep" notebook (see the note above).
# NOTE(review): unpickling can execute arbitrary code, so only load files
# produced by this project.
df = pd.read_pickle("data/interim/grid_df.pkl")

In [6]:
# Total sales per (store, department, day); the result is a Series indexed by
# those three keys.
# NOTE(review): sum() skips NaN, so a group whose sales are all missing totals
# 0 rather than NaN - confirm this is what the downstream features expect.
count_store_dept_by_date = df.groupby(['store_id','dept_id', 'd'])['sales'].sum()

In [7]:
# Flatten the grouped Series into a regular frame: the three group keys become
# ordinary columns next to the summed 'sales' column.
df_store_dept = count_store_dept_by_date.to_frame().reset_index()

In [8]:
# Give the summed column its own name so it cannot be confused with the
# per-row 'sales' column of df once both live in the same frame.
df_store_dept.rename(columns = {'sales':'agg_sales_store_dept'}, inplace = True)

In [9]:
# Merge the aggregated total sales per store department back into the dataframe

In [10]:
# Attach the store/department daily total to every row of df by left-joining
# on (store_id, dept_id, d); df3 is df plus the 'agg_sales_store_dept' column.
df3 = merge_by_concat(df, df_store_dept,['store_id', 'dept_id', 'd'])

In [11]:
# Share (in percent) of the store/department daily total that this row's sales
# represent.
# NOTE(review): when the store/department total is 0 the division is 0/0 and
# the share is NaN - confirm whether such rows should be 0 instead.
# NOTE(review): the share is computed from same-day 'sales'; verify it is not
# fed to a model as an input feature when 'sales' is the target.
df3['product_share']=df3['sales']/df3['agg_sales_store_dept']*100.

In [12]:
# grid_df is a second name for the same object as df3 (no copy is made), so the
# in-place edits applied to grid_df afterwards are visible through df3 as well.
grid_df = df3

In [13]:
#Feature engineering

In [14]:
# Calendar features derived from 'date'.
# Every statement in this cell changes grid_df in place and the cell ends by
# dropping 'date', so it can only be run once per freshly built grid_df.

# delete some cols first (we're going to recreate some cleaner ones)
grid_df.drop(columns=['wm_yr_wk', 'weekday', 'wday', 'month', 'year'], inplace=True)

# Make some features from date
grid_df['dow'] = grid_df['date'].dt.dayofweek.astype(np.int8)  # 0 = Monday ... 6 = Sunday
grid_df['dom'] = grid_df['date'].dt.day.astype(np.int8)
grid_df['month'] = grid_df['date'].dt.month.astype(np.int8)
# ISO week number. `Series.dt.week` was deprecated in pandas 1.1 and removed in
# pandas 2.0; `dt.isocalendar().week` is its documented replacement.
grid_df['week'] = grid_df['date'].dt.isocalendar().week.astype(np.int8)
# Week of month (1..5): same value as ceil(dom / 7), computed with vectorised
# integer arithmetic instead of a Python-level apply over every row.
grid_df['wom'] = ((grid_df['dom'] - 1) // 7 + 1).astype(np.int8)
grid_df['quarter'] = grid_df['date'].dt.quarter.astype(np.int8)
grid_df['year'] = grid_df['date'].dt.year.astype(np.int16)

# And other ones
# True for Saturday/Sunday (dow 5 and 6), stored as a two-level category
grid_df['is_week_end'] = (grid_df['dow'] >= 5).astype('category')
# 1-based running count of rows per id, in the current row order
# NOTE(review): this is "days since first row" only if rows are sorted by date within each id - confirm
grid_df['age'] = (grid_df.groupby('id').cumcount() + 1).astype(np.int16)

# delete date
grid_df.drop(columns=['date'], inplace=True)

In [15]:
# one hot encode cat_id, store_id
# The dummy dtype is pinned to uint8 so the indicator columns are 0/1 integers
# on every pandas version (pandas >= 2.0 would otherwise create bool columns).
grid_df = pd.get_dummies(data=grid_df, columns=['cat_id', 'store_id'], dtype=np.uint8)

In [16]:
def _narrowest_float(values, c_min, c_max):
    """Pick the smallest float dtype that stores ``values`` without damage.

    float16 keeps only about three significant digits, so it is chosen only
    when every value survives the cast unchanged. float32 is accepted as soon
    as the value range fits. Otherwise the current dtype is kept.
    """
    half = np.finfo(np.float16)
    if half.min <= c_min and c_max <= half.max:
        as_half = values.astype(np.float16)
        # NaN never compares equal to itself, so missing values are let through explicitly.
        if ((as_half == values) | np.isnan(values)).all():
            return np.float16

    single = np.finfo(np.float32)
    if single.min <= c_min and c_max <= single.max:
        return np.float32

    return values.dtype


def reduce_mem_usage(df):
    """Reduce the memory footprint of ``df`` by narrowing its column dtypes.

    The frame is modified in place and also returned. Per column:

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness whose (inclusive) range holds the column's min
      and max;
    * float columns are cast to float16 only if no value changes, else to
      float32 if the range fits, else left as they are;
    * every other dtype (category, bool, datetime, nullable extension types,
      ...) is left untouched, as are numeric columns without any non-missing
      value.

    Memory usage before and after is printed in MB.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable integers, ...) are not plain
        # numpy dtypes; min()/max() may not even be defined for them
        # (unordered categoricals raise), so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only true numeric columns are downcast; bool and datetime-like
        # columns are already compact and must keep their meaning.
        if col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-missing column: nothing to base a decision on.
            continue

        if col_type.kind == 'f':
            df[col] = df[col].astype(_narrowest_float(df[col].values, c_min, c_max))
        else:
            lowest, highest = int(c_min), int(c_max)
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


In [18]:
# Quick visual check of the first rows after the one-hot encoding step.
grid_df.head(5)

Unnamed: 0,id,item_id,dept_id,state_id,d,sales,event_name_1,event_type_1,event_name_2,event_type_2,...,store_id_CA_1,store_id_CA_2,store_id_CA_3,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,897,0.0,,,,,...,1,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,898,0.0,,,,,...,1,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,899,0.0,,,,,...,1,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,900,0.0,,,,,...,1,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,901,0.0,,,,,...,1,0,0,0,0,0,0,0,0,0


In [19]:
# Save the engineered grid as a pickle so later steps can reload it quickly.
# NOTE(review): reduce_mem_usage is defined above but is not applied to grid_df
# in the cells shown here - confirm whether it should run before saving.
grid_df.to_pickle('data/interim/grid_df_gho.pkl')