# XGBoost Estimation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_log_error as rmsle
import xgboost as xgb
import os
import gc

## Load The Training Data


In [2]:
# Data set from Kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# Build the data directory with os.path.join rather than a literal containing
# a backslash: "\p" is not a valid escape sequence (Python warns about it) and
# a hard-coded backslash separator only resolves on Windows.
data_path = os.path.join("..", "product-inventory")
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

# Load only the four columns used below.
train = pd.read_csv(filename,
                    usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# Rename the Spanish column names to the English names used in the rest of the notebook.
train = train.rename(columns={'Semana': 'Week_num',
                              'Cliente_ID': 'Client_ID',
                              'Demanda_uni_equil': 'adjusted_demand',
                              'Producto_ID': 'Product_ID'})

# Optional down-sampling (disabled): keep every row of a random 10% of the
# client-product pairs. Uncomment to iterate faster on a smaller frame.
# train['ID'] = train.groupby(['Client_ID', 'Product_ID']).ngroup()
# unique_ids = train['ID'].unique()
# fraction = 0.1  # share of client-product IDs to keep
# sample_size = int(len(unique_ids) * fraction)
# rng = np.random.default_rng(4325252122)
# sampled_ids = rng.choice(unique_ids, size=sample_size, replace=False)  # draw from the seeded generator
# train = train[train['ID'].isin(sampled_ids)].drop(columns='ID')
# print(len(train))

  data_path = "..\product-inventory"


In [3]:
# Collapse duplicate client-product-week rows into a single row per key.
# NOTE(review): duplicates are summed, not averaged -- confirm that the summed
# demand is the intended target definition.
train = (
    train
    .groupby(['Client_ID', 'Product_ID', 'Week_num'], as_index=False)
    .agg({'adjusted_demand': 'sum'})
)

# Latest observed week of every client-product pair, aligned row-by-row with train.
last_week = train.groupby(['Client_ID', 'Product_ID'])['Week_num'].transform('max')
is_last_week = train['Week_num'] == last_week
del last_week

# Target dataset: the final observation of each client-product pair.
val = train.loc[is_last_week]

# Training dataset: every observation before that final week.
train = train.loc[~is_last_week]
del is_last_week


In [4]:
# Row counts of the training and validation splits, followed by a blank gap.
print(len(train), len(val), end='\n\n\n')
# Column layout of both splits, one Index repr per line.
print(train.columns, val.columns, sep='\n')

48181485 25831538


Index(['Client_ID', 'Product_ID', 'Week_num', 'adjusted_demand'], dtype='object')
Index(['Client_ID', 'Product_ID', 'Week_num', 'adjusted_demand'], dtype='object')


In [5]:
#create a dataframe of aggregate statistics for each client
client_stats = train.groupby(['Client_ID'], as_index=False).agg({'Product_ID':'nunique', 'adjusted_demand':['mean', 'median']})
client_stats.columns = ['Client_ID', 'Products', 'adj_dem_mean', 'adj_dem_median']
client_stats['adj_dem_mean'] = client_stats['adj_dem_mean'].round(2)
client_stats['adj_dem_median'] = client_stats['adj_dem_median'].astype(int)

#create a dataframe of aggregate statistics for each product
product_stats = train.groupby(['Product_ID'], as_index=False).agg({'Client_ID':'nunique', 'adjusted_demand':['mean', 'median']})
product_stats.columns = ['Product_ID', 'Clients', 'adj_dem_mean', 'adj_dem_median']
product_stats['adj_dem_mean'] = product_stats['adj_dem_mean'].round(2)
product_stats['adj_dem_median'] = product_stats['adj_dem_median'].astype(int)
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

#create a dataframe of aggregate client-product information i.e. removing the time dimension
train = train.groupby(['Client_ID', 'Product_ID'], as_index=False).agg({'Week_num':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})
train.columns = ['Client_ID', 'Product_ID', 'num_weeks', 'adj_dem_mean', 'adj_dem_median', 'adj_dem_min', 'adj_dem_max']

In [6]:
# Assemble the modelling table in one chain:
#   1. client-level statistics (overlapping column names get the '_c' suffix),
#   2. product-level statistics (overlapping column names get the '_p' suffix),
#   3. the held-out last-week demand of each pair, used as the target.
train = (
    train
    .merge(client_stats, how='left', on='Client_ID', suffixes=('', '_c'))
    .merge(product_stats, how='left', on='Product_ID', suffixes=('', '_p'))
    .merge(val[['Client_ID', 'Product_ID', 'adjusted_demand']],
           how='left',
           on=['Product_ID', 'Client_ID'])
)

# The lookup tables are no longer needed once merged in.
del client_stats, product_stats

train.columns

Index(['Client_ID', 'Product_ID', 'num_weeks', 'adj_dem_mean',
       'adj_dem_median', 'adj_dem_min', 'adj_dem_max', 'Products',
       'adj_dem_mean_c', 'adj_dem_median_c', 'Clients', 'adj_dem_mean_p',
       'adj_dem_median_p', 'median_pct', 'adjusted_demand'],
      dtype='object')

In [None]:
# Model inputs, listed in the column order passed to the regressor.
features = [
    # client-product level
    'num_weeks', 'adj_dem_median', 'adj_dem_min', 'adj_dem_max',
    # client level
    'Products', 'adj_dem_mean_c', 'adj_dem_median_c',
    # product level
    'Clients', 'adj_dem_mean_p', 'adj_dem_median_p', 'median_pct',
]

# Label column, kept as a one-element list so it can be concatenated with other column lists.
target = ['adjusted_demand']

In [8]:
# Gradient-boosted regressor configured with the squared-log-error objective.
xgb_reg = xgb.XGBRegressor(
    n_estimators=300,                 # number of boosting rounds
    learning_rate=0.01,
    max_depth=5,
    objective='reg:squaredlogerror',
)

In [None]:
# Fit the regressor on the feature columns; the label is passed as a one-column frame.
bst = xgb_reg.fit(X=train[features], y=train[target])

In [None]:
# In-sample demand prediction: floor negative values at zero and keep two decimals.
train['pred'] = np.maximum(0, xgb_reg.predict(train[features])).round(2)

# Show target and prediction next to the identifiers and the model inputs.
train[[*target, 'pred', 'Client_ID', 'Product_ID', *features]]

Unnamed: 0,adjusted_demand,pred,Client_ID,Product_ID,num_weeks,adj_dem_median,adj_dem_min,adj_dem_max,Products,adj_dem_mean_c,adj_dem_median_c,Clients,adj_dem_mean_p,adj_dem_median_p,median_pct
0,0,12.69,26,1182,2,19.5,0,39,38,27.74,20,143631,9.29,5,0.448790
1,42,24.15,26,4767,2,42.0,42,42,38,27.74,20,36244,10.07,3,0.278088
2,13,13.01,26,31393,6,17.0,15,22,38,27.74,20,4796,14.12,2,0.149907
3,10,4.12,26,32962,1,3.0,3,3,38,27.74,20,2089,20.79,15,0.696772
4,30,15.46,26,33246,4,20.0,10,30,38,27.74,20,172,12.79,1,0.065487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16707872,3,17.15,2015152015,2233,3,20.0,17,52,13,32.48,10,391725,6.01,3,0.278088
16707873,10,3.76,2015152015,2665,1,4.0,4,4,13,32.48,10,68693,6.03,3,0.278088
16707874,10,7.74,2015152015,4270,1,10.0,10,10,13,32.48,10,51314,7.39,4,0.374302
16707875,8,4.03,2015152015,4280,1,4.0,4,4,13,32.48,10,92584,8.52,5,0.448790


## Load The Testing Data

In [11]:
# Load test.csv.
# Build the data directory with os.path.join rather than a literal containing
# a backslash: "\p" is not a valid escape sequence (Python warns about it) and
# a hard-coded backslash separator only resolves on Windows.
data_path = os.path.join("..", "product-inventory")
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/test.csv.zip")

# Only the row id and the client / product keys are read.
test = pd.read_csv(filename,
                   usecols=['id', 'Producto_ID', 'Cliente_ID'])

# Rename the key columns to the English names used for the training frame.
test = test.rename(columns={'Cliente_ID': 'Client_ID',
                            'Producto_ID': 'Product_ID'})

  data_path = "..\product-inventory"


In [12]:
# Attach the model prediction of each client-product pair to the test rows.
# Pairs that are absent from train keep a missing 'pred' after the left join.
test = (
    test[['id', 'Client_ID', 'Product_ID']]
    .merge(train[['Client_ID', 'Product_ID', 'pred']],
           how='left',
           on=['Client_ID', 'Product_ID'])
    .sort_values(by=['Client_ID', 'Product_ID'])
    .reset_index(drop=True)
)
test

Unnamed: 0,id,Client_ID,Product_ID,pred
0,1569352,26,31518,
1,4728674,26,31520,
2,1547831,26,34206,39.189999
3,6667200,26,34210,22.450001
4,1592616,26,34785,14.410000
...,...,...,...,...
6999246,6093628,2015152015,1232,2.090000
6999247,2542921,2015152015,1238,
6999248,3223836,2015152015,1250,14.910000
6999249,1889878,2015152015,2233,17.150000


In [13]:
# Fraction of test rows whose 'pred' value is missing.
test['pred'].isna().mean()

0.3231074296378284

## Out-of-Sample Prediction
There are a few cases where the client-product combo is not present in the training data. I fill in these observations' missing values. 
### 1. Existing Products
This case consists of two possible scenarios:
* New Client
* Existing Client but a new combo

In both scenarios, the in-sample estimation cannot predict the demand, so we use the product's average demand in weeks 3-9 as our prediction.
In the second scenario, this approach has some shortcomings. For example, it does not take into account that a client might have a low demand for a new product. On the other hand, if we used the client's average demand as our prediction, it would not take into account the variation in demand across products. As a first pass, we use the 'average product demand' as the prediction.


In [14]:
# Fallback 1: rows still lacking a prediction receive the product-level mean
# demand stored in train ('adj_dem_mean_p'), looked up by Product_ID.
product_fallback = train.groupby('Product_ID')['adj_dem_mean_p'].mean()

# Products that do not occur in train map to NaN, so their rows stay unfilled here.
test['pred'] = test['pred'].fillna(test['Product_ID'].map(product_fallback))
del product_fallback

print('Share of Missing Preiction:', test['pred'].isna().mean())

Share of Missing Preiction: 0.02010443688903284


### 2. Existing Clients

In [15]:
# Fallback 2: rows still lacking a prediction receive the client-level mean
# demand stored in train ('adj_dem_mean_c'), looked up by Client_ID.
client_fallback = train.groupby('Client_ID')['adj_dem_mean_c'].mean()

# Clients that do not occur in train map to NaN, so their rows stay unfilled here.
test['pred'] = test['pred'].fillna(test['Client_ID'].map(client_fallback))
print('Share of Missing Preiction:', test['pred'].isna().mean())

del client_fallback
# WATCH OUT (author's caveat, kept as written): this replaces missing values for
# existing clients and existing products with the client's average demand.
# They should be replaced with the actual prediction.

Share of Missing Preiction: 0.000156731055937271


In [16]:
# Display the columns currently held by the training frame.
train.columns

Index(['Client_ID', 'Product_ID', 'num_weeks', 'adj_dem_mean',
       'adj_dem_median', 'adj_dem_min', 'adj_dem_max', 'Products',
       'adj_dem_mean_c', 'adj_dem_median_c', 'Clients', 'adj_dem_mean_p',
       'adj_dem_median_p', 'median_pct', 'adjusted_demand', 'pred'],
      dtype='object')

### 3. New Clients, New Products
This case covers the test rows where both the client and the product are new. Here the first guess is the average demand for all products across all weeks.


In [17]:
# Fallback 3: any prediction that is still missing gets the mean of the
# per-pair average demand ('adj_dem_mean') over all client-product pairs in train.
test = test.assign(pred=test['pred'].fillna(train['adj_dem_mean'].mean()))
print('Share of Missing Preiction:', test['pred'].isna().mean())

Share of Missing Preiction: 0.0


In [18]:
# Display the test frame after all fallback fills have been applied.
test

Unnamed: 0,id,Client_ID,Product_ID,pred
0,1569352,26,31518,18.480000
1,4728674,26,31520,31.440000
2,1547831,26,34206,39.189999
3,6667200,26,34210,22.450001
4,1592616,26,34785,14.410000
...,...,...,...,...
6999246,6093628,2015152015,1232,2.090000
6999247,2542921,2015152015,1238,3.480000
6999248,3223836,2015152015,1250,14.910000
6999249,1889878,2015152015,2233,17.150000


In [19]:
# Keep the row id and the prediction; the prediction column is renamed to
# 'Demanda_uni_equil', the name of the demand column in train.csv.
output = test[['id', 'pred']].rename(columns={'pred': 'Demanda_uni_equil'})

# Build the output directory with os.path.join rather than a literal containing
# a backslash: "\p" is not a valid escape sequence (Python warns about it) and
# a hard-coded backslash separator only resolves on Windows.
data_path = os.path.join("..", "product-inventory")
filename = os.path.join(data_path, "prediction_3.csv")
output.to_csv(filename, index=False)

  data_path = "..\product-inventory"
