In [0]:
#!pip install optuna

In [11]:
import numpy as np 
import pandas as pd 
import pickle
from math import ceil
import functools

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


  import pandas.util.testing as tm


In [12]:
# Mount Google Drive inside the Colab runtime and switch the working directory to the mount point.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

# Base directory on Drive; every pickle this notebook reads or writes is addressed relative to it.
gdir = '/gdrive/My Drive/m5data/'

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


 
# Step 1: Feature engineering for hierarchical learning

---


First step: feature engineering on the total sales of the category a product belongs to

In [0]:
# Starting point: run Antoine's "data_prep" notebook first; it performs the simple feature engineering and produces "grid_df.pkl"

In [0]:
# Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    """Left-join the extra columns of ``df2`` onto ``df1`` without touching ``df1``'s dtypes.

    Only the key columns of ``df1`` go through ``merge``; the resulting new
    columns are then glued onto ``df1`` with ``concat``, so the existing
    columns of ``df1`` keep their dtypes.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame; every row and its index are preserved.
    df2 : pandas.DataFrame
        Lookup frame. Must have at most one row per ``merge_on`` key.
    merge_on : list of str
        Key columns present in both frames.

    Returns
    -------
    pandas.DataFrame
        ``df1`` plus the non-key columns of ``df2``, indexed like ``df1``.

    Raises
    ------
    ValueError
        If ``df2`` has duplicate keys, which would change the number of rows.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
    if len(merged_gf) != len(df1):
        raise ValueError(
            'merge_by_concat: df2 has duplicate keys for {}; the left join '
            'returned {} rows for {} input rows'.format(merge_on, len(merged_gf), len(df1))
        )
    # `merge` returns a fresh RangeIndex while `concat(axis=1)` aligns on index:
    # restore df1's index (a left join keeps the left row order) so rows line up.
    merged_gf.index = df1.index
    new_columns = [col for col in merged_gf.columns if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)


In [0]:
# Load the base grid from Drive.
# NOTE: unpickling can execute arbitrary code - only load pickles you produced yourself.
grid_pickle_path = gdir + "grid_df.pkl"
df = pd.read_pickle(grid_pickle_path)

In [0]:
# Sum of `sales` for every (store, department, day) combination.
store_dept_day_keys = ['store_id', 'dept_id', 'd']
count_store_dept_by_date = df.groupby(store_dept_day_keys)['sales'].sum()

In [0]:
# Turn the grouped Series into a flat frame: the group keys become ordinary columns next to `sales`.
df_store_dept = count_store_dept_by_date.reset_index()

In [0]:
# Give the aggregated column its own name so it does not collide with the per-product `sales`.
aggregated_column_names = {'sales': 'agg_sales_store_dept'}
df_store_dept.rename(columns=aggregated_column_names, inplace=True)

In [0]:
# Merge the aggregated total sales per store and department into the dataframe

In [0]:
# Attach the per-(store, department, day) total to every row of the grid.
df3 = merge_by_concat(
    df,
    df_store_dept,
    merge_on=['store_id', 'dept_id', 'd'],
)

In [0]:
# Share (in percent) of each product in the total of its store/department for that day.
# When the whole department sold nothing the ratio is 0/0 = NaN, which would end up
# as a NaN training label; the share is defined as 0 on those days instead.
# Rows whose own `sales` value is missing keep a NaN share.
dept_total = df3['agg_sales_store_dept']
no_dept_sales = dept_total.eq(0) & df3['sales'].notna()
df3['product_share'] = (df3['sales'] / dept_total * 100.).mask(no_dept_sales, 0.)

In [0]:
# `grid_df` is a second name for the same DataFrame object as `df3` (no copy is made),
# so the in-place edits applied to `grid_df` further down also change `df3`.
grid_df = df3

In [0]:
#Feature engineering

In [0]:
# delete some cols first (we're going to recreate some cleaner ones)
grid_df.drop(columns=['wm_yr_wk', 'weekday', 'wday', 'month', 'year'], inplace=True)

# Make some features from date
date_parts = grid_df['date'].dt
grid_df['dow'] = date_parts.dayofweek.astype(np.int8)
grid_df['dom'] = date_parts.day.astype(np.int8)
grid_df['month'] = date_parts.month.astype(np.int8)
# `.dt.week` was deprecated in pandas 1.1 and removed in 2.0; prefer the ISO calendar
# accessor when the installed pandas has it (both return the ISO week number).
if hasattr(date_parts, 'isocalendar'):
    iso_week = date_parts.isocalendar().week
else:
    iso_week = date_parts.week
grid_df['week'] = iso_week.astype(np.int8)
# week of month: days 1-7 -> 1, 8-14 -> 2, ... (vectorised instead of a per-row lambda)
grid_df['wom'] = np.ceil(grid_df['dom'] / 7).astype(np.int8)
grid_df['quarter'] = date_parts.quarter.astype(np.int8)
grid_df['year'] = date_parts.year.astype(np.int16)

# And other ones
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday
grid_df['is_week_end'] = (grid_df['dow'] >= 5).astype('category')
# running row number (starting at 1) of each `id`, in the frame's current row order
grid_df['age'] = (grid_df.groupby('id').cumcount() + 1).astype(np.int16)

# delete date
grid_df.drop(columns=['date'], inplace=True)

In [0]:
# One-hot encode the category and store identifiers: each listed column is
# replaced by one indicator column per distinct value.
one_hot_columns = ['cat_id', 'store_id']
grid_df = pd.get_dummies(grid_df, columns=one_hot_columns)

In [0]:
# Persist the engineered grid so that it can be reloaded instead of recomputed.
engineered_grid_path = gdir + 'grid_df_gho.pkl'
grid_df.to_pickle(engineered_grid_path)

 
# Step 2: Hierarchical learning

---


In [0]:
#start of hierarchical learning

In [0]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
import optuna.integration.lightgbm as lgb

In [0]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the smallest dtype that holds their values.

    * ``object`` columns become ``category``.
    * Signed / unsigned integer columns are narrowed within their own family
      (int8..int64, uint8..uint64); the type limits themselves are allowed.
    * Float columns become float16 (optional), float32 or float64.
    * Every other dtype (bool, category, datetime, pandas extension dtypes) is
      left untouched, so flags are not turned into floats and columns without
      a usable min/max do not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target. float16 keeps only about three significant
        decimal digits; pass ``False`` to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy integer / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing smaller fits (also for all-NaN columns,
            # whose min/max are NaN and therefore fail every comparison).
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # inclusive bounds: a column spanning exactly -128..127 still fits int8
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [0]:
#load prepared data

In [0]:
# Reload the engineered grid saved at the end of step 1.
# NOTE: unpickling can execute arbitrary code - only load pickles you produced yourself.
prepared_grid_path = gdir + 'grid_df_gho.pkl'
df = pd.read_pickle(prepared_grid_path)

In [35]:
# Quick look at the first rows to check the columns that were reloaded.
df.head()

Unnamed: 0,id,item_id,dept_id,state_id,d,sales,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,agg_sales_store_dept,product_share,dow,dom,month,week,wom,quarter,year,is_week_end,age,cat_id_FOODS,cat_id_HOBBIES,cat_id_HOUSEHOLD,store_id_CA_1,store_id_CA_2,store_id_CA_3,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,897,0.0,,,,,0,1,0,9.58,497.0,0.0,5,13,7,28,2,3,2013,True,1,0,1,0,1,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,898,0.0,,,,,0,0,1,9.58,673.0,0.0,6,14,7,28,2,3,2013,True,2,0,1,0,1,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,899,0.0,,,,,0,1,1,9.58,400.0,0.0,0,15,7,29,3,3,2013,False,3,0,1,0,1,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,900,0.0,,,,,0,0,0,9.58,424.0,0.0,1,16,7,29,3,3,2013,False,4,0,1,0,1,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,CA,901,0.0,,,,,0,0,0,9.58,345.0,0.0,2,17,7,29,3,3,2013,False,5,0,1,0,1,0,0,0,0,0,0,0,0,0


In [0]:
#dropping some features

In [0]:
def drop_features(dfx):
    """Strip the identifier, event and raw ``sales`` columns from ``dfx``.

    The frame is edited in place and the same object is returned. A missing
    column makes the underlying ``drop`` raise ``KeyError``.
    """
    unwanted_columns = [
        'id',
        'item_id',
        'event_name_1',
        'event_type_1',
        'event_name_2',
        'event_type_2',
        'sales',
        'dept_id',
        'state_id',
    ]
    dfx.drop(columns=unwanted_columns, inplace=True)
    return dfx

In [0]:
# drop_features edits `df` in place and returns the same object; running this cell a
# second time raises KeyError because the columns are already gone.
df = drop_features(df)

In [0]:
# TEMPORAL SPLIT 

In [0]:
# Boundaries of the split, expressed on the day index `d`.
first_train_day = 800      # rows with d <= 800 are not used for training
horizon_start = 1914       # first day that goes into df_validation
holdout_days = 28          # length of the df_test window just before horizon_start

holdout_start = horizon_start - holdout_days
day = df['d']

df_train = df[(day > first_train_day) & (day < holdout_start)]
df_test = df[(day >= holdout_start) & (day < horizon_start)]
df_validation = df[day >= horizon_start]

In [0]:
# The three frames are slices of `df`; dropping with inplace=True on a slice triggers
# SettingWithCopyWarning, so each name is rebound to a new frame without `d` instead.
df_train = df_train.drop(columns=['d'])
df_test = df_test.drop(columns=['d'])
df_validation = df_validation.drop(columns=['d'])

In [0]:
# Setting targets

In [0]:
# The target is the product's share of its store/department sales.
y_train = df_train['product_share']
y_test = df_test['product_share']

# Remove the target from the feature frames. Rebinding (instead of inplace=True on a
# frame sliced from `df`) avoids SettingWithCopyWarning.
df_train = df_train.drop(columns=['product_share'])
df_test = df_test.drop(columns=['product_share'])

In [0]:
# Inspect the feature dtypes before the frames are handed to LightGBM.
df_train.dtypes

In [0]:
# Wrap features and targets for LightGBM; `df_test` is the hold-out window that is
# passed to training as the evaluation set.
dtrain = lgb.Dataset(data=df_train, label=y_train)
dval = lgb.Dataset(data=df_test, label=y_test)

In [0]:
# Containers handed to the Optuna LightGBM integration through the `best_params` /
# `tuning_history` arguments of `lgb.train` below.
best_params, tuning_history = dict(), list()

# `product_share` is expressed in percent (0-100). LightGBM's "cross_entropy" objective
# and metric only accept labels inside [0, 1] and abort on anything else, so an L2
# regression objective scored with RMSE is used for this target.
# NOTE(review): to keep cross-entropy instead, train on product_share / 100 and scale
# the predictions back to percent before computing the error.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# `lgb` is optuna.integration.lightgbm, so this call also tunes hyper-parameters.
# Early stopping looks at the validation sets; progress is logged every 100 rounds.
model = lgb.train(params,
                  dtrain,
                  valid_sets=[dtrain, dval],
                  verbose_eval=100,
                  early_stopping_rounds=100,
                  best_params=best_params,
                  tuning_history=tuning_history)


train is experimental (supported from v0.18.0). The interface can change in the future.

feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

In [0]:
# Predict on the test set

In [0]:
# Predict the hold-out window with the best iteration found by early stopping.
# NOTE(review): np.rint rounds the predicted share to a whole number; `product_share`
# is continuous, so confirm that this rounding is really wanted.
prediction = np.rint(model.predict(df_test, num_iteration=model.best_iteration))
# The value is reported as "RMSE" further down, so take the square root of the MSE.
error = np.sqrt(mean_squared_error(y_test, prediction))

In [0]:
# Report the parameters stored on the trained model together with the hold-out error.
# Note that `best_params` is rebound here to `model.params`.
best_params = model.params
print("Best params:", best_params)
print("  RMSE = {}".format(error))
print("  Params: ")
param_lines = ["    {}: {}".format(name, setting) for name, setting in best_params.items()]
if param_lines:
    print("\n".join(param_lines))

In [0]:
# The typographic quotes (‘ ’) used before are not valid Python string delimiters
# and made this cell fail with a SyntaxError.
print('Best Params:', best_params)
print('Tuning history:', tuning_history)