# Business Case #5 - Retail - Demand Forecasting

## Authors:
#### Débora Santos (m20200748),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

In [1]:
!pip install lightgbm



In [2]:
#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import seaborn as sns
from datetime import datetime
from itertools import product
from math import ceil
%matplotlib inline 
from collections import Counter
import matplotlib.cm as cm


#Models
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor

# Use garbage collection to minimise memory usage
import gc

import warnings
warnings.filterwarnings("ignore")

# Seeting seaborn style
sns.set()

In [3]:
def downcast1(df, verbose=True):
    
    """
    Funciton to reduce the memory used of a particular dataframe by downcasting to a less memory-intensive data type.
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
#import dataset in csv
df_forecast = pd.read_csv('df_demand_complete.csv')

In [6]:
df_forecast.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3,...,Product_ID_Quantity_mean_lag3,Product_ID_Store_ID_Quantity_mean_lag1,Product_ID_Store_ID_Quantity_mean_lag2,Product_ID_Store_ID_Quantity_mean_lag3,Store_ID_Quantity_mean_lag1,Store_ID_Quantity_mean_lag2,Store_ID_Quantity_mean_lag3,Quantity_lag3_mean,lag_grad_1,lag_grad_2
0,201728,1,1000,2,1.828851,2,2.163561,0,0,0,...,1.687042,0,0,0,1.985345,1.992302,1.791754,0.0,0.0,0.0
1,201728,1,1001,0,0.056235,0,2.163561,0,0,0,...,0.03423,0,0,0,1.985345,1.992302,1.791754,0.0,0.0,0.0
2,201728,1,1002,4,1.303178,4,2.163561,2,5,4,...,1.293398,2,5,4,1.985345,1.992302,1.791754,3.666667,0.4,1.25
3,201728,1,1003,0,2.026895,0,2.163561,0,0,0,...,1.762836,0,0,0,1.985345,1.992302,1.791754,0.0,0.0,0.0
4,201728,1,1004,0,2.00978,0,2.163561,0,2,2,...,1.684597,0,2,2,1.985345,1.992302,1.791754,1.333333,0.0,1.0


### Split the data

In [7]:
#Create train, val and test data spliting by last 6 weeks to test set, more 6 weeks to validation set 
train = df_forecast[df_forecast['Year_Week']<201932]

In [8]:
val = df_forecast[df_forecast['Year_Week']>=201932]
val = val[val['Year_Week']<201938]

In [9]:
test = df_forecast[df_forecast['Year_Week']>=201938]

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111647593 entries, 0 to 115659473
Data columns (total 22 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   Year_Week                               int32  
 1   Store_ID                                int16  
 2   Product_ID                              int16  
 3   Quantity                                int8   
 4   Product_ID_Quantity_mean                float32
 5   Product_ID_Store_ID_Quantity_mean       int16  
 6   Store_ID_Quantity_mean                  float32
 7   Quantity_lag1                           int8   
 8   Quantity_lag2                           int8   
 9   Quantity_lag3                           int8   
 10  Product_ID_Quantity_mean_lag1           float32
 11  Product_ID_Quantity_mean_lag2           float32
 12  Product_ID_Quantity_mean_lag3           float32
 13  Product_ID_Store_ID_Quantity_mean_lag1  int8   
 14  Product_ID_Store_ID_Quantity_m

In [11]:
val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4011881 entries, 14602527 to 105285597
Data columns (total 22 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   Year_Week                               int32  
 1   Store_ID                                int16  
 2   Product_ID                              int16  
 3   Quantity                                int8   
 4   Product_ID_Quantity_mean                float32
 5   Product_ID_Store_ID_Quantity_mean       int16  
 6   Store_ID_Quantity_mean                  float32
 7   Quantity_lag1                           int8   
 8   Quantity_lag2                           int8   
 9   Quantity_lag3                           int8   
 10  Product_ID_Quantity_mean_lag1           float32
 11  Product_ID_Quantity_mean_lag2           float32
 12  Product_ID_Quantity_mean_lag3           float32
 13  Product_ID_Store_ID_Quantity_mean_lag1  int8   
 14  Product_ID_Store_ID_Quant

In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4702273 entries, 115659474 to 120361746
Data columns (total 22 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   Year_Week                               int32  
 1   Store_ID                                int16  
 2   Product_ID                              int16  
 3   Quantity                                int8   
 4   Product_ID_Quantity_mean                float32
 5   Product_ID_Store_ID_Quantity_mean       int16  
 6   Store_ID_Quantity_mean                  float32
 7   Quantity_lag1                           int8   
 8   Quantity_lag2                           int8   
 9   Quantity_lag3                           int8   
 10  Product_ID_Quantity_mean_lag1           float32
 11  Product_ID_Quantity_mean_lag2           float32
 12  Product_ID_Quantity_mean_lag3           float32
 13  Product_ID_Store_ID_Quantity_mean_lag1  int8   
 14  Product_ID_Store_ID_Quan

In [13]:
#Remove target variable and save it in separate
X_train = train.drop('Quantity', axis=1)
y_train = train['Quantity']
X_val = val.drop('Quantity', axis=1)
y_val = val['Quantity']

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111647593 entries, 0 to 115659473
Data columns (total 21 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   Year_Week                               int32  
 1   Store_ID                                int16  
 2   Product_ID                              int16  
 3   Product_ID_Quantity_mean                float32
 4   Product_ID_Store_ID_Quantity_mean       int16  
 5   Store_ID_Quantity_mean                  float32
 6   Quantity_lag1                           int8   
 7   Quantity_lag2                           int8   
 8   Quantity_lag3                           int8   
 9   Product_ID_Quantity_mean_lag1           float32
 10  Product_ID_Quantity_mean_lag2           float32
 11  Product_ID_Quantity_mean_lag3           float32
 12  Product_ID_Store_ID_Quantity_mean_lag1  int8   
 13  Product_ID_Store_ID_Quantity_mean_lag2  int8   
 14  Product_ID_Store_ID_Quantity_m

In [15]:
X_train = downcast1(X_train)

0.0% compressed


In [16]:
del df_forecast
gc.collect()

30

### Models

#### DECISION TREE - REGRESSOR

In [17]:
#Run the model with parameters defined 
#Get the R2
model_DT = DecisionTreeRegressor(max_depth=3, min_samples_split=0.5)
model_DT.fit(X_train, y_train)
y_pred_DT = model_DT.predict(X_val)
DT_score = model_DT.score(X_val, y_val)
DT_score

0.8957234869353843

In [18]:
#Get the adjusted R2
r2 = r2_score(y_val, y_pred_DT)
n = len(y_val)
p = len(X_train.columns)

def adj_r2 (r2,n,p):
    return 1-(1-r2)*(n-1)/(n-p-1)

adj_r2(r2,n,p)

0.8957229411019504

In [19]:
#Get MSE
mean_squared_error(y_val, y_pred_DT, squared = True)

1.2680223040678613

#### LIGHT GBM

#### Light GBM is a gradient boosting that uses tree based learning algorithm. This algorithm is very good to work with very big dataset (our case). LGBM has a high speed and it takes lower memory to run.  In LGBM the tree grows vertically while the other algorithms the tree grows horizontally. 


In [20]:
#Create a function to apply the algorithm LIGHT GBM
def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    model = lgb.train(params=params, train_set=lgb_train, valid_sets=(lgb_train, lgb_val), verbose_eval=50,
                     categorical_feature=cat_features)
    return model

In [21]:
#Define the parameters of algorith
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 500,
    'min_data_in_leaf':100,
    'feature_fraction':0.7,
    'learning_rate': 0.01,
    'early_stopping_rounds':10,
    'seed': 1
}

In [22]:
#designating the categorical features which should be focused on
cat_features = ['Year_Week','Store_ID', 'Product_ID']

In [23]:
#Run the model with parameters defined 
lgb_model = build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5290
[LightGBM] [Info] Number of data points in the train set: 111647593, number of used features: 21
[LightGBM] [Info] Start training from score 1.335993
Training until validation scores don't improve for 10 rounds
[50]	training's rmse: 2.31338	valid_1's rmse: 2.17542
[100]	training's rmse: 1.4578	valid_1's rmse: 1.37191
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 1.4578	valid_1's rmse: 1.37191


In [24]:
#Get the R2
y_pred_lgb = lgb_model.predict(X_val)
r2_lgb = r2_score(y_val, y_pred_lgb)
r2_lgb

0.8452219965521868

In [25]:
#Get the adjusted R2
adj_r2(r2_lgb,n,p)

0.8452211863696573

In [26]:
#Get MSE
mean_squared_error(y_val, y_pred_lgb, squared = True)

1.8821300672885404

#### STORE RESULTS

In [28]:
X_train_new = X_train.copy()
X_train_new

Unnamed: 0,Year_Week,Store_ID,Product_ID,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3,Product_ID_Quantity_mean_lag1,...,Product_ID_Quantity_mean_lag3,Product_ID_Store_ID_Quantity_mean_lag1,Product_ID_Store_ID_Quantity_mean_lag2,Product_ID_Store_ID_Quantity_mean_lag3,Store_ID_Quantity_mean_lag1,Store_ID_Quantity_mean_lag2,Store_ID_Quantity_mean_lag3,Quantity_lag3_mean,lag_grad_1,lag_grad_2
0,201728,1,1000,1.828851,2,2.163561,0,0,0,1.672372,...,1.687042,0,0,0,1.985345,1.992302,1.791754,0.000000,0.000000,0.000000
1,201728,1,1001,0.056235,0,2.163561,0,0,0,0.046455,...,0.034230,0,0,0,1.985345,1.992302,1.791754,0.000000,0.000000,0.000000
2,201728,1,1002,1.303178,4,2.163561,2,5,4,1.124694,...,1.293398,2,5,4,1.985345,1.992302,1.791754,3.666667,0.400000,1.250000
3,201728,1,1003,2.026895,0,2.163561,0,0,0,1.929095,...,1.762836,0,0,0,1.985345,1.992302,1.791754,0.000000,0.000000,0.000000
4,201728,1,1004,2.009780,0,2.163561,0,2,2,1.826406,...,1.684597,0,2,2,1.985345,1.992302,1.791754,1.333333,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115659469,201727,363,993,17.122250,6,0.802512,6,9,14,17.691931,...,20.000000,6,9,14,0.945927,0.811174,0.800000,9.666667,0.666667,0.642857
115659470,201727,363,994,1.100245,0,0.802512,1,0,1,1.063570,...,1.298288,1,0,1,0.945927,0.811174,0.800000,0.666667,0.000000,0.000000
115659471,201727,363,995,2.586797,4,0.802512,2,0,2,2.388753,...,3.420538,2,0,2,0.945927,0.811174,0.800000,1.333333,0.000000,0.000000
115659472,201727,363,997,0.716381,0,0.802512,0,0,0,0.691932,...,0.655257,0,0,0,0.945927,0.811174,0.800000,0.000000,0.000000,0.000000


In [29]:
X_val_new = X_val.copy()
X_val_new 

Unnamed: 0,Year_Week,Store_ID,Product_ID,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3,Product_ID_Quantity_mean_lag1,...,Product_ID_Quantity_mean_lag3,Product_ID_Store_ID_Quantity_mean_lag1,Product_ID_Store_ID_Quantity_mean_lag2,Product_ID_Store_ID_Quantity_mean_lag3,Store_ID_Quantity_mean_lag1,Store_ID_Quantity_mean_lag2,Store_ID_Quantity_mean_lag3,Quantity_lag3_mean,lag_grad_1,lag_grad_2
14602527,201937,17,1,0.007335,2,2.530872,0,4,0,0.000000,...,0.009780,0,4,0,0.000000,2.579497,2.560144,1.333333,0.000000,0.00
14602528,201937,17,1000,1.396088,4,2.530872,3,2,4,1.330073,...,1.332518,3,2,4,2.472494,2.579497,2.560144,3.000000,1.500000,0.50
14602529,201937,17,1001,0.024450,0,2.530872,0,0,0,0.026895,...,0.026895,0,0,0,2.472494,2.579497,2.560144,0.000000,0.000000,0.00
14602530,201937,17,1002,0.838631,1,2.530872,1,2,4,0.833741,...,1.046455,1,2,4,2.472494,2.579497,2.560144,2.333333,0.500000,0.50
14602531,201937,17,1003,2.053790,0,2.530872,0,0,0,1.948655,...,2.188264,0,0,0,2.472494,2.579497,2.560144,0.000000,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105285593,201936,171,993,18.413202,14,0.773839,20,19,20,18.608803,...,16.863081,20,19,20,0.870610,0.868227,0.778788,20.000000,1.052632,0.95
105285594,201936,171,994,0.481663,0,0.773839,0,0,0,0.525672,...,0.601467,0,0,0,0.870610,0.868227,0.778788,0.000000,0.000000,0.00
105285595,201936,171,995,2.603912,0,0.773839,2,0,0,2.569682,...,2.246944,2,0,0,0.870610,0.868227,0.778788,0.666667,0.000000,0.00
105285596,201936,171,997,0.308068,0,0.773839,0,0,0,0.442543,...,0.432763,0,0,0,0.870610,0.868227,0.778788,0.000000,0.000000,0.00


In [30]:
X_train_new = downcast1(X_train_new)

0.0% compressed


In [31]:
#Create the columns to store the results LGBM 
X_train_new['lgb_pred'] = lgb_model.predict(X_train).clip(0,20)
X_train_new['target'] = y_train
X_train_new['sq_err_lgb'] = (X_train_new['lgb_pred']-X_train_new['target'])**2

In [32]:
#Create the columns to store the results LGBM 
X_val_new['lgb_pred'] = lgb_model.predict(X_val).clip(0,20)
X_val_new['target'] = y_val
X_val_new['sq_err_lgb'] = (X_val_new['lgb_pred']-X_val_new['target'])**2

In [35]:
#Create the columns to store the results DT
X_train_new['DT_pred'] = model_DT.predict(X_train).clip(0,20)
X_train_new['sq_err_DT'] = (X_train_new['DT_pred']-X_train_new['target'])**2

In [36]:
#Create the columns to store the results DT
X_val_new['DT_pred'] = model_DT.predict(X_val).clip(0,20)
X_val_new['sq_err_DT'] = (X_val_new['DT_pred']-X_val_new['target'])**2

#### APPLY ON TEST SET 

In [38]:
#import dataset in csv
test_original = pd.read_csv('teste.csv')

In [39]:
#TEST SET ORIGINAL
test_original.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,Quantity
0,201938,17,1,1.0
1,201938,17,1000,3.0
2,201938,17,1001,0.0
3,201938,17,1002,5.0
4,201938,17,1003,1.0


In [40]:
#TEST SET MODIFIED (WITHZERO QUANTITIES AND PREPROCESSED)
X_test = test.drop('Quantity',axis = 1 )
y_test = test['Quantity']

In [41]:
#DEFINE FEATURES TO MERGE TEST DATASETS
index_feats = ["Year_Week", "Store_ID", "Product_ID"]

In [42]:
#MERGE DATASETS
X_test_new =pd.merge(X_test, test_original, on=index_feats, how="left")
X_test_new 

Unnamed: 0,Year_Week,Store_ID,Product_ID,Product_ID_Quantity_mean,Product_ID_Store_ID_Quantity_mean,Store_ID_Quantity_mean,Quantity_lag1,Quantity_lag2,Quantity_lag3,Product_ID_Quantity_mean_lag1,...,Product_ID_Store_ID_Quantity_mean_lag1,Product_ID_Store_ID_Quantity_mean_lag2,Product_ID_Store_ID_Quantity_mean_lag3,Store_ID_Quantity_mean_lag1,Store_ID_Quantity_mean_lag2,Store_ID_Quantity_mean_lag3,Quantity_lag3_mean,lag_grad_1,lag_grad_2,Quantity
0,201938,17,1,0.0,0,0.0,2,1,6,0.007335,...,2,1,6,2.530872,2.728897,2.852888,3.000000,2.0,0.166667,1.0
1,201938,17,1000,0.0,0,0.0,4,4,12,1.396088,...,4,4,12,2.530872,2.728897,2.852888,6.666667,1.0,0.333333,3.0
2,201938,17,1001,0.0,0,0.0,0,1,0,0.024450,...,0,1,0,2.530872,2.728897,2.852888,0.333333,0.0,0.000000,0.0
3,201938,17,1002,0.0,0,0.0,1,2,3,0.838631,...,1,2,3,2.530872,2.728897,2.852888,2.000000,0.5,0.666667,5.0
4,201938,17,1003,0.0,0,0.0,0,0,1,2.053790,...,0,0,1,2.530872,2.728897,2.852888,0.333333,0.0,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4702268,201940,221,993,0.0,0,0.0,0,5,3,0.000000,...,0,5,3,0.000000,0.435835,0.426642,2.666667,0.0,1.666667,5.0
4702269,201940,221,994,0.0,0,0.0,0,0,0,0.000000,...,0,0,0,0.000000,0.435835,0.426642,0.000000,0.0,0.000000,0.0
4702270,201940,221,995,0.0,0,0.0,0,0,0,0.000000,...,0,0,0,0.000000,0.435835,0.426642,0.000000,0.0,0.000000,0.0
4702271,201940,221,997,0.0,0,0.0,0,0,0,0.000000,...,0,0,0,0.000000,0.435835,0.426642,0.000000,0.0,0.000000,0.0


In [43]:
#STORE RESULTS LGBM
X_test_new['lgb_pred'] = lgb_model.predict(X_test).clip(0,20)
X_test_new['target'] = X_test_new['Quantity']
X_test_new['sq_err_lgb'] = (X_test_new['lgb_pred']-X_test_new['target'])**2

In [44]:
#STORE RESULTS DT
X_test_new['DT_pred'] = model_DT.predict(X_test).clip(0,20)
X_test_new['sq_err_DT'] = (X_test_new['DT_pred']-X_test_new['target'])**2

In [45]:
#GET RMSE
mean_squared_error(X_test_new['target'], X_test_new['lgb_pred'], squared = True)

36.34839174029889

In [46]:
#GET MSE
mean_squared_error(X_test_new['target'], X_test_new['DT_pred'], squared = True)

38.7793630867455

In [47]:
#CREATE A COLUMN TO IDENTIFY WITCH PART OF DATASET 
X_train_new["dataset"] = 'Train'
X_val_new["dataset"] = 'Validation'
X_test_new["dataset"] = 'Test'

In [48]:
#Create a function to calculate WAPE
def wmape_gr(df_in, st_actual, st_forecast):
    # we take two series and calculate an output a wmape from it

    # make a series called mape
    se_mape = abs(df_in[st_actual] - df_in[st_forecast]) / df_in[st_actual]

    # get a float of the sum of the actual
    ft_actual_sum = df_in[st_actual].sum()

    # get a series of the multiple of the actual & the mape
    se_actual_prod_mape = df_in[st_actual] * se_mape

    # summate the prod of the actual and the mape
    ft_actual_prod_mape_sum = se_actual_prod_mape.sum()

    # float: wmape of forecast
    ft_wmape_forecast = ft_actual_prod_mape_sum / ft_actual_sum

    # return a float
    return ft_wmape_forecast

In [49]:
#Calculate wape to lbgm
df_gr_train_LGB = X_train_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','lgb_pred')

In [50]:
df_gr_train_LGB.head()

Store_ID  Product_ID  dataset
1         1           Train           NaN
          2           Train           NaN
          3           Train           NaN
          4           Train      0.252321
          5           Train      0.273440
dtype: float64

In [51]:
#Calculate wape to DT
df_gr_train_DT = X_train_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','DT_pred')

In [52]:
#Reset index
df_gr_train_LGB = df_gr_train_LGB.reset_index()
#Rename aggregate collumn
df_gr_train_LGB = df_gr_train_LGB.rename(columns={0:'wape_lgb'})
#Replace nams by 0
df_gr_train_LGB = df_gr_train_LGB.fillna(0)
df_gr_train_LGB

Unnamed: 0,Store_ID,Product_ID,dataset,wape_lgb
0,1,1,Train,0.000000
1,1,2,Train,0.000000
2,1,3,Train,0.000000
3,1,4,Train,0.252321
4,1,5,Train,0.273440
...,...,...,...,...
865848,410,2846,Train,0.330612
865849,410,2847,Train,0.342647
865850,410,2848,Train,0.000000
865851,410,2849,Train,0.005618


In [53]:
#Reset index
df_gr_train_DT = df_gr_train_DT.reset_index()
#Rename aggregate collumn
df_gr_train_DT = df_gr_train_DT.rename(columns={0:'wape_DT'})
#Replace nams by 0
df_gr_train_DT = df_gr_train_DT.fillna(0)
df_gr_train_DT

Unnamed: 0,Store_ID,Product_ID,dataset,wape_DT
0,1,1,Train,0.000000
1,1,2,Train,0.000000
2,1,3,Train,0.000000
3,1,4,Train,0.309775
4,1,5,Train,0.346096
...,...,...,...,...
865848,410,2846,Train,0.283234
865849,410,2847,Train,0.234630
865850,410,2848,Train,0.000000
865851,410,2849,Train,0.380450


In [54]:
#merge wape DT and WAPE LGBM
df_gr_train = pd.merge(df_gr_train_DT, df_gr_train_LGB, on= ['Store_ID', 'Product_ID','dataset'], how="left")

In [55]:
del df_gr_train_DT
gc.collect()

60

In [56]:
del df_gr_train_LGB
gc.collect()

15

In [57]:
#Calculate wape to lbgm
df_gr_val_LGB = X_val_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','lgb_pred')

In [58]:
#Calculate wape to DT
df_gr_val_DT = X_val_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','DT_pred')

In [59]:
df_gr_val_LGB = df_gr_val_LGB.reset_index()
df_gr_val_LGB = df_gr_val_LGB.rename(columns={0:'wape_lgb'})
df_gr_val_LGB = df_gr_val_LGB.fillna(0)
df_gr_val_LGB

Unnamed: 0,Store_ID,Product_ID,dataset,wape_lgb
0,1,1,Validation,0.000000
1,1,2,Validation,0.000000
2,1,3,Validation,0.000000
3,1,4,Validation,0.000000
4,1,5,Validation,0.260470
...,...,...,...,...
777913,410,2846,Validation,0.351313
777914,410,2847,Validation,0.330202
777915,410,2848,Validation,0.000000
777916,410,2849,Validation,0.000000


In [60]:
df_gr_val_DT = df_gr_val_DT.reset_index()
df_gr_val_DT = df_gr_val_DT.rename(columns={0:'wape_DT'})
df_gr_val_DT = df_gr_val_DT.fillna(0)
df_gr_val_DT

Unnamed: 0,Store_ID,Product_ID,dataset,wape_DT
0,1,1,Validation,0.000000
1,1,2,Validation,0.000000
2,1,3,Validation,0.000000
3,1,4,Validation,0.000000
4,1,5,Validation,0.309775
...,...,...,...,...
777913,410,2846,Validation,0.232896
777914,410,2847,Validation,0.275363
777915,410,2848,Validation,0.000000
777916,410,2849,Validation,0.000000


In [61]:
#Merge results
df_gr_val = pd.merge(df_gr_val_DT, df_gr_val_LGB, on= ['Store_ID', 'Product_ID','dataset'], how="left")

In [62]:
#Calculate wape to lbgm
df_gr_test_LGB = X_test_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','lgb_pred')

In [63]:
#Calculate wape to DT
df_gr_test_DT = X_test_new.groupby(['Store_ID', 'Product_ID','dataset']).apply(wmape_gr,'target','DT_pred')

In [64]:
df_gr_test_LGB = df_gr_test_LGB.reset_index()
df_gr_test_LGB = df_gr_test_LGB.rename(columns={0:'wape_lgb'})
df_gr_test_LGB = df_gr_test_LGB.fillna(0)
df_gr_test_LGB

Unnamed: 0,Store_ID,Product_ID,dataset,wape_lgb
0,1,1,Test,0.000000
1,1,2,Test,0.000000
2,1,3,Test,0.000000
3,1,4,Test,0.000000
4,1,5,Test,0.000000
...,...,...,...,...
795909,410,2846,Test,0.948917
795910,410,2847,Test,0.962015
795911,410,2848,Test,0.000000
795912,410,2849,Test,0.000000


In [65]:
df_gr_test_DT = df_gr_test_DT.reset_index()
df_gr_test_DT = df_gr_test_DT.rename(columns={0:'wape_DT'})
df_gr_test_DT = df_gr_test_DT.fillna(0)
df_gr_test_DT

Unnamed: 0,Store_ID,Product_ID,dataset,wape_DT
0,1,1,Test,0.0
1,1,2,Test,0.0
2,1,3,Test,0.0
3,1,4,Test,0.0
4,1,5,Test,0.0
...,...,...,...,...
795909,410,2846,Test,1.0
795910,410,2847,Test,1.0
795911,410,2848,Test,0.0
795912,410,2849,Test,0.0


In [66]:
#Merge results
df_gr_test = pd.merge(df_gr_test_DT, df_gr_test_LGB, on= ['Store_ID', 'Product_ID','dataset'], how="left")

In [67]:
#Append all in one dataframe
df_gr_all = df_gr_train.append(df_gr_val)
df_gr_all = df_gr_all.append(df_gr_test)

In [68]:
data = X_train_new[['Year_Week', 'Store_ID', 'Product_ID', 'lgb_pred', 'target', 'sq_err_lgb','DT_pred','sq_err_DT','dataset']]

In [69]:
data.head()

Unnamed: 0,Year_Week,Store_ID,Product_ID,lgb_pred,target,sq_err_lgb,DT_pred,sq_err_DT,dataset
0,201728,1,1000,1.552632,2,0.200138,1.38045,0.383842,Train
1,201728,1,1001,0.491883,0,0.241948,0.0,0.0,Train
2,201728,1,1002,2.893234,4,1.224932,4.474146,0.224815,Train
3,201728,1,1003,0.53954,0,0.291103,0.0,0.0,Train
4,201728,1,1004,0.73447,0,0.539447,0.0,0.0,Train


In [70]:
data_val = X_val_new[['Year_Week', 'Store_ID', 'Product_ID', 'lgb_pred', 'target', 'sq_err_lgb','DT_pred','sq_err_DT','dataset']]

In [71]:
data_test = X_test_new[['Year_Week', 'Store_ID', 'Product_ID', 'lgb_pred', 'target', 'sq_err_lgb','DT_pred','sq_err_DT','dataset']]

In [72]:
#Append all in one dataframe
data_all = data.append(data_val)
data_all = data_all.append(data_test)

In [None]:
data_all.to_csv(os.path.join("df_demand_final.csv"), index=False)

In [None]:
df_gr_all.to_csv(os.path.join("df_demand_wape.csv"), index=False)