In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

In [2]:
# Data source: Kaggle "Grupo Bimbo Inventory Demand"
# https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# Spanish -> English column names, shared by both files. Entries whose source
# column was not loaded are simply left unused by DataFrame.rename.
column_names_en = {
    'Semana': 'Week_num',
    'Agencia_ID': 'Sales_Depot_ID',
    'Canal_ID': 'Sales_Channel_ID',
    'Ruta_SAK': 'Route_ID',
    'Cliente_ID': 'Client_ID',
    'Venta_uni_hoy': 'Sales_unit_this_week',
    'Venta_hoy': 'Sales_this_week',
    'Dev_uni_proxima': 'Returns_unit_next_week',
    'Dev_proxima': 'Returns_next_week',
    'Demanda_uni_equil': 'adjusted_demand',
    'Producto_ID': 'Product_ID',
}

# Columns read from both CSV files; train adds the target, test adds the row id.
shared_cols = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID']

train = pd.read_csv("train.csv", usecols=shared_cols + ['Demanda_uni_equil']).rename(columns=column_names_en)
test = pd.read_csv("test.csv", usecols=shared_cols + ['id']).rename(columns=column_names_en)

# Number the key combinations over train and test together so that the same
# combination receives the same integer id in both frames.
n_train = len(train)
combined_df = pd.concat([train, test])

route_keys = ['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID']
group_id_keys = {
    # one id per depot / channel / route / client / product combination
    'ID': route_keys + ['Client_ID', 'Product_ID'],
    # combined client id: depot / channel / route / client
    'ccid': route_keys + ['Client_ID'],
    # combined product id: depot / channel / route / product
    'cpid': route_keys + ['Product_ID'],
}
for id_col, keys in group_id_keys.items():
    combined_df[id_col] = combined_df.groupby(keys).ngroup()

# Split back into the two original frames by row position.
train = combined_df.iloc[:n_train].copy()
test = combined_df.iloc[n_train:].copy()

del combined_df

# After the concat each frame carries the other's extra column (all NaN):
# drop it and restore the integer dtype of the column that was upcast.
train = (
    train.drop(columns='id')
         .astype({'adjusted_demand': int})
         .sort_values(by=['ID', 'Week_num'])
         .reset_index(drop=True)
)

test = (
    test.drop(columns='adjusted_demand')
        .astype({'id': int})
        .sort_values(by=['ID', 'Week_num'])
        .reset_index(drop=True)
)

In [3]:
def summarize_demand(frame, key, counted_col, count_name):
    """Aggregate adjusted demand per value of ``key``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``key``, ``counted_col`` and ``adjusted_demand``.
    key : str
        Column to group by.
    counted_col : str
        Column whose number of distinct values per group is reported.
    count_name : str
        Name of the output column holding that distinct count.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``key``, ``count_name``,
        ``adj_dem_mean`` (rounded to 2 decimals), ``adj_dem_median``
        (cast to int, so a fractional median is truncated),
        ``adj_dem_min`` and ``adj_dem_max``.
    """
    # groupby already orders the group keys, so no explicit sort of the
    # full frame is needed beforehand.
    stats = frame.groupby(key, as_index=False).agg(
        **{count_name: (counted_col, 'nunique')},
        adj_dem_mean=('adjusted_demand', 'mean'),
        adj_dem_median=('adjusted_demand', 'median'),
        adj_dem_min=('adjusted_demand', 'min'),
        adj_dem_max=('adjusted_demand', 'max'),
    )
    stats['adj_dem_mean'] = stats['adj_dem_mean'].round(2)
    stats['adj_dem_median'] = stats['adj_dem_median'].astype(int)
    return stats


# NOTE(review): these statistics are computed from the training target itself;
# if they are later used as features on the same rows, that is a form of
# target leakage -- confirm this is intended.

# Aggregate statistics for each combined client id.
client_stats = summarize_demand(train, 'ccid', 'Product_ID', 'Products')

# Aggregate statistics for each combined product id, plus the percentile rank
# of each product's median demand among all products.
product_stats = summarize_demand(train, 'cpid', 'Client_ID', 'Clients')
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

In [4]:
# Preview the first rows of the prepared training frame.
train.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid
0,4,1110,7,3301,15766,325,1,0,0,4
1,4,1110,7,3301,15766,328,1,1,0,5
2,3,1110,7,3301,15766,1212,3,2,0,15
3,4,1110,7,3301,15766,1212,4,2,0,15
4,5,1110,7,3301,15766,1212,5,2,0,15


In [None]:
# One-hot encode the week number into integer Week_num_* indicator columns.
# The guard makes the cell safe to re-run: once Week_num has been replaced by
# its indicators, calling get_dummies on it again would raise a KeyError.
# dtype=int produces integer indicators directly, so the other columns are
# left untouched instead of casting the whole frame.
if 'Week_num' in train.columns:
    train = pd.get_dummies(train, columns=['Week_num'], dtype=int)

train.head()

Unnamed: 0,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,Week_num_3,Week_num_4,Week_num_5,Week_num_6,Week_num_7,Week_num_8,Week_num_9
0,1110,7,3301,15766,325,1,0,0,4,False,True,False,False,False,False,False
1,1110,7,3301,15766,328,1,1,0,5,False,True,False,False,False,False,False
2,1110,7,3301,15766,1212,3,2,0,15,True,False,False,False,False,False,False
3,1110,7,3301,15766,1212,4,2,0,15,False,True,False,False,False,False,False
4,1110,7,3301,15766,1212,5,2,0,15,False,False,True,False,False,False,False


In [6]:
# Convert boolean indicator columns to integers. Only bool columns are cast:
# a whole-frame astype(int) would also truncate float feature columns
# (e.g. per-group means) if this cell were re-run after they had been added.
indicator_cols = train.select_dtypes(include='bool').columns
train = train.astype({col: int for col in indicator_cols})

train.head()

Unnamed: 0,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,Week_num_3,Week_num_4,Week_num_5,Week_num_6,Week_num_7,Week_num_8,Week_num_9
0,1110,7,3301,15766,325,1,0,0,4,0,1,0,0,0,0,0
1,1110,7,3301,15766,328,1,1,0,5,0,1,0,0,0,0,0
2,1110,7,3301,15766,1212,3,2,0,15,1,0,0,0,0,0,0
3,1110,7,3301,15766,1212,4,2,0,15,0,1,0,0,0,0,0
4,1110,7,3301,15766,1212,5,2,0,15,0,0,1,0,0,0,0


In [7]:
# Spot-check one of the one-hot week indicator columns (values and dtype).
train['Week_num_3']

0           0
1           0
2           1
3           0
4           0
           ..
74180459    0
74180460    1
74180461    0
74180462    0
74180463    0
Name: Week_num_3, Length: 74180464, dtype: int32

In [None]:
# Denote categorical variables.
# Week_num no longer exists once it has been one-hot encoded into Week_num_*
# indicator columns; in that case there is nothing to convert, and skipping
# avoids a KeyError when the notebook is run top to bottom.
if 'Week_num' in train.columns:
    train['Week_num'] = train['Week_num'].astype('category')

# The identifier columns (Sales_Depot_ID, Sales_Channel_ID, Route_ID,
# Client_ID, Product_ID, ID, ccid, cpid) are intentionally left numeric.
# XGBoost requires every categorical value to be below 16777216, as stated in
# the "Invalid categorical value detected" error recorded in this notebook.
# NOTE(review): presumably the grouped id columns exceed that limit on this
# data set -- confirm with train[col].nunique() before re-enabling any of them.

In [8]:
# Attach the per-ccid and per-cpid demand statistics to every training row.
# Series.map accepts a Series and looks each value up in its index, so the
# statistics tables only need to be indexed by their id column once. No
# pre-filtering to the ids present in train is needed: map only ever looks up
# ids that occur in train, and an id without statistics maps to NaN.
client_lookup = client_stats.set_index('ccid')
product_lookup = product_stats.set_index('cpid')

train['ccid_mean'] = train['ccid'].map(client_lookup['adj_dem_mean'])
train['ccid_median'] = train['ccid'].map(client_lookup['adj_dem_median'])

train['cpid_mean'] = train['cpid'].map(product_lookup['adj_dem_mean'])
train['cpid_median'] = train['cpid'].map(product_lookup['adj_dem_median'])
train['cpid_median_pct'] = train['cpid'].map(product_lookup['median_pct']).round(3)

# The lookup tables are only needed for the mapping above.
del client_lookup, product_lookup

In [10]:
# Put the target column last so that every other column precedes it.
# Index.difference with sort=False keeps the remaining columns in their
# current order.
feature_cols = train.columns.difference(['adjusted_demand'], sort=False)
train = train[[*feature_cols, 'adjusted_demand']]

In [25]:
# Preview the training frame again to check its current columns and their order.
train.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,ID,ccid,cpid,ccid_mean,ccid_median,cpid_mean,cpid_median,cpid_median_pct,adjusted_demand
0,4,1110,7,3301,15766,325,0,0,4,5.52,4,1.94,1,0.082,1
1,4,1110,7,3301,15766,328,1,0,5,5.52,4,1.91,2,0.231,1
2,3,1110,7,3301,15766,1212,2,0,15,5.52,4,2.48,2,0.231,3
3,4,1110,7,3301,15766,1212,2,0,15,5.52,4,2.48,2,0.231,4
4,5,1110,7,3301,15766,1212,2,0,15,5.52,4,2.48,2,0.231,5


In [12]:
# Scikit-learn style XGBoost regressor trained on squared log error.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
regressor_settings = {
    'objective': 'reg:squaredlogerror',
    'n_estimators': 25,        # number of boosting rounds
    'learning_rate': 0.1,
    'random_state': 42,
}
model = xgb.XGBRegressor(**regressor_settings)

In [22]:
# Split features and target by column name, so the split does not depend on
# the target having been moved to the last position by an earlier cell.
target_col = 'adjusted_demand'
X_train = train.drop(columns=target_col)
y_train = train[target_col]

# XGBoost rejects categorical values that are not below 16777216 (the limit
# quoted in its "Invalid categorical value detected" error). Category codes
# run from 0 to n_categories - 1, so check the category counts up front and
# name the offending columns instead of failing later inside xgb.train.
XGB_MAX_CATEGORIES = 16_777_216
oversized = {
    col: X_train[col].cat.categories.size
    for col in X_train.select_dtypes(include='category').columns
    if X_train[col].cat.categories.size > XGB_MAX_CATEGORIES
}
if oversized:
    raise ValueError(
        f"Categorical columns with more than {XGB_MAX_CATEGORIES} categories "
        f"cannot be used by XGBoost; keep them numeric instead: {oversized}"
    )

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [23]:
# Train a booster through the native XGBoost API for 25 boosting rounds.
booster_params = {
    'max_depth': 8,
    'eta': 0.1,
    'objective': 'reg:squaredlogerror',
}
bst = xgb.train(booster_params, dtrain, num_boost_round=25)

XGBoostError: [17:59:15] C:\b\abs_90_bwj_86a\croot\xgboost-split_1724073762025\work\src\common\categorical.h:76: Invalid categorical value detected.  Categorical value should be non-negative, less than total number of categories in training data and less than 16777216

In [None]:
# Scratch value: the same number as the categorical-value limit quoted in the
# XGBoostError message recorded in this notebook (16777216 == 2**24).
16777216