# XGBoost Estimation

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

## Load The Training Data


In [2]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Demanda_uni_equil': 'adjusted_demand',
                                'Producto_ID': 'Product_ID'})
# define client-product ID
train['ID'] = train.groupby(['Client_ID', 'Product_ID']).ngroup()
unique_ids = train['ID'].unique()

# Define the fraction of IDs to sample
fraction = 0.1  # sample 10% of the IDs

# Calculate the number of IDs to sample
sample_size = int(len(unique_ids) * fraction)

rng = np.random.default_rng(4325252122)

# Choose a random sample of IDs
sampled_ids = np.random.choice(unique_ids, size=sample_size, replace=False)

# Filter the DataFrame to keep all rows with the sampled IDs
train = train[train['ID'].isin(sampled_ids)]
train = train.drop(columns='ID')
print(len(train))

  data_path = "..\product-inventory"


7426328


In [3]:
#create a dataframe of aggregate statistics for each client
testagg = train.groupby(['Client_ID'], as_index=False).agg({'Product_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

client_stats = pd.DataFrame()

client_stats['Client_ID'] = testagg['Client_ID']
client_stats['Products'] = testagg['Product_ID']['nunique']
client_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
client_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
client_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
client_stats['adj_dem_max'] = testagg['adjusted_demand']['max']

del testagg

#create a dataframe of aggregate statistics for each product
testagg = train.groupby(['Product_ID'], as_index=False).agg({'Client_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

product_stats =  pd.DataFrame()

product_stats['Product_ID'] = testagg['Product_ID']
product_stats['Clients'] = testagg['Client_ID']['nunique']
product_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
product_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
product_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
product_stats['adj_dem_max'] = testagg['adjusted_demand']['max']
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

del testagg

In [None]:
train = train.merge(right=client_stats,
                    how='left',
                    on='Client_ID')
train = train.merge(right=product_stats,
                    how='left',
                    on='Product_ID',
                    suffixes=('_c', '_p'))

In [11]:
# denote categorical variables
train['Week_num'] = train['Week_num'].astype('category')
train['Client_ID'] = train['Client_ID'].astype('category')
train['Product_ID'] = train['Product_ID'].astype('category')
# train['Sales_Depot_ID'] = train['Sales_Depot_ID'].astype('category')
# train['Sales_Channel_ID'] = train['Sales_Channel_ID'].astype('category')
# train['Route_ID'] = train['Route_ID'].astype('category')
# train['ID'] = train['ID'].astype('category')
# train['ccid'] = train['ccid'].astype('category')
# train['cpid'] = train['cpid'].astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7426328 entries, 0 to 7426327
Data columns (total 15 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Week_num          category
 1   Client_ID         category
 2   Product_ID        category
 3   adjusted_demand   int64   
 4   Products          int64   
 5   adj_dem_mean_c    float64 
 6   adj_dem_median_c  int32   
 7   adj_dem_min_c     int64   
 8   adj_dem_max_c     int64   
 9   Clients           int64   
 10  adj_dem_mean_p    float64 
 11  adj_dem_median_p  int32   
 12  adj_dem_min_p     int64   
 13  adj_dem_max_p     int64   
 14  median_pct        float64 
dtypes: category(3), float64(3), int32(2), int64(7)
memory usage: 694.3 MB


In [13]:
features = ['Week_num', 'Client_ID', 'Product_ID', 'adjusted_demand', 'Products',
       'adj_dem_mean_c', 'adj_dem_median_c', 'adj_dem_min_c', 'adj_dem_max_c',
       'Clients', 'adj_dem_mean_p', 'adj_dem_median_p', 'adj_dem_min_p',
       'adj_dem_max_p', 'median_pct']

X_train = train[features]
y_train = train['adjusted_demand']

In [None]:
model = xgb.XGBRegressor(objective='reg:squaredlogerror',
                         n_estimators=5,             # Number of boosting rounds
                         learning_rate=0.1,
                         random_state=42)

In [39]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
bst = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, evals=X_train, dtrain=dtrain, num_boost_round=25)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
bst.eval

TypeError: Booster.eval_set() missing 1 required positional argument: 'evals'