In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

In [34]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

#office
train = pd.read_csv("train.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])
test = pd.read_csv("test.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'id'])

train = train.rename(columns={'Semana': 'Week_num',
                              'Agencia_ID': 'Sales_Depot_ID',
                              'Canal_ID': 'Sales_Channel_ID',
                              'Ruta_SAK': 'Route_ID',
                              'Cliente_ID': 'Client_ID',
                              'Venta_uni_hoy': 'Sales_unit_this_week',
                              'Venta_hoy': 'Sales_this_week',
                              'Dev_uni_proxima': 'Returns_unit_next_week',
                              'Dev_proxima': 'Returns_next_week',
                              'Demanda_uni_equil': 'adjusted_demand',
                              'Producto_ID': 'Product_ID'})

test = test.rename(columns={'Semana': 'Week_num',
                            'Agencia_ID': 'Sales_Depot_ID',
                            'Canal_ID': 'Sales_Channel_ID',
                            'Ruta_SAK': 'Route_ID',
                            'Cliente_ID': 'Client_ID',
                            'Venta_uni_hoy': 'Sales_unit_this_week',
                            'Venta_hoy': 'Sales_this_week',
                            'Dev_uni_proxima': 'Returns_unit_next_week',
                            'Dev_proxima': 'Returns_next_week',
                            'Demanda_uni_equil': 'adjusted_demand',
                            'Producto_ID': 'Product_ID'})



#set a unique id for each sales depot id, sales channel id, route id, client, product combination (thanks Gemini)
combined_df = pd.concat([train,test])
combined_df['ID'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID', 'Product_ID']).ngroup()

#set a combined client ID, consisting of a unique sales depot ID, sales channel ID, route ID, and client ID
combined_df['ccid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID']).ngroup()

#set a combined product ID, consisting of a unique sales depot ID, sales channel ID, route ID, and product ID
combined_df['cpid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Product_ID']).ngroup()

train = combined_df.iloc[:len(train)].copy()
test = combined_df.iloc[len(train):].copy()

del combined_df


train = train.drop(columns='id')
train['adjusted_demand'] = train['adjusted_demand'].astype(int)
train = train.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

test = test.drop(columns='adjusted_demand')
test['id'] = test['id'].astype(int)
test = test.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

In [35]:
#create a dataframe of aggregate statistics for each client
testagg = train[train['Week_num'] <= 8].sort_values(by=['ccid']).groupby(['ccid'], as_index=False).agg({'Product_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

client_stats = pd.DataFrame()

client_stats['ccid'] = testagg['ccid']
client_stats['Products'] = testagg['Product_ID']['nunique']
client_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
client_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
client_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
client_stats['adj_dem_max'] = testagg['adjusted_demand']['max']

del testagg

#create a dataframe of aggregate statistics for each product
testagg = train[train['Week_num'] <= 8].sort_values(by=['cpid']).groupby(['cpid'], as_index=False).agg({'Client_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

product_stats =  pd.DataFrame()

product_stats['cpid'] = testagg['cpid']
product_stats['Clients'] = testagg['Client_ID']['nunique']
product_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
product_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
product_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
product_stats['adj_dem_max'] = testagg['adjusted_demand']['max']
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

del testagg

In [36]:
# get ccid, cpid means, medians and cpid median percentage into training data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_mean'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_median'] = train['ccid'].map(cidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_mean'] = train['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_median'] = train['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['median_pct'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_median_pct'] = train['cpid'].map(pidmapping).round(3)

del cidmapping, pidmapping

In [5]:
print("There are", len(train['ID'].unique()), "unique training IDs.")
print("There are", len(test['ID'].unique()), "unique testing IDs.")

There are 26396648 unique training IDs.
There are 6237461 unique testing IDs.


In [5]:
wk8IDs = train[train['Week_num'] == 8]['ID'].unique().tolist()
wk9IDs = train[train['Week_num'] == 9]['ID'].unique().tolist()
wk8and9IDs = list(set(wk8IDs) & set(wk9IDs))

iw9 = train[['ID', 'ccid', 'ccid_mean', 'ccid_median', 'cpid', 'cpid_mean', 'cpid_median', 'cpid_median_pct']].drop_duplicates(subset='ID', keep='first').reset_index(drop=True).copy(deep=True)



# are the cpid, ccids in the training data?
iw9.loc[:, 'cpid_in_train'] = iw9['cpid'].isin(train[train['Week_num'] <= 8]['cpid'].unique().tolist())
iw9.loc[:, 'ccid_in_train'] = iw9['ccid'].isin(train[train['Week_num'] <= 8]['ccid'].unique().tolist())

# initialize adjusted demand column
iw9['adjusted_demand'] = np.zeros(len(iw9))

# for ccid and cpid not in training data, set adjusted demand to 5
iw9.loc[(iw9['cpid_in_train'] == False) & (iw9['ccid_in_train'] == False), 'adjusted_demand'] = np.log1p(5)

# for cpid in training data and ccid not in training data, use cpid median
iw9.loc[(iw9['ccid_in_train'] == False) & (iw9['cpid_in_train'] == True), 'adjusted_demand'] = iw9[(iw9['ccid_in_train'] == False) & (iw9['cpid_in_train'] == True)]['cpid_median']

# for ccid in training data and cpid not in training data, use ccid median
iw9.loc[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == False), 'adjusted_demand'] = iw9[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == False)]['ccid_median']

# for ccid, cpid in training data, use (0.5 + (cpid median pct)) * (0.65 * (ccid median) + (1-0.65) * (ccid mean))
iw9.loc[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == True), 'adjusted_demand'] = (0.5 + iw9[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == True)]['cpid_median_pct'])*(0.65*iw9[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == True)]['ccid_median'] + (1-0.65)*iw9[(iw9['ccid_in_train'] == True) & (iw9['cpid_in_train'] == True)]['ccid_mean'])



# override with linear regression predicted adjusted demand for the IDs that have week 9 data


# only use data points (adjusted demand) if they appear in consecutive weeks, i.e. week 3 and week 4 or week 6 and week 7.
conseq_col = train['Week_num'].diff().dropna().astype(int)
conseq_col.loc[0] = 0
conseq_col = conseq_col.sort_index()

train['conseq_pts'] = conseq_col

train['adj_dem_lag1'] = train['adjusted_demand'].shift(1)

lr = LinearRegression()

lr.fit(X=train[(train['conseq_pts'] == 1) & (train['Week_num'] <= 8)][['adj_dem_lag1']].values, y=train[(train['conseq_pts'] == 1) & (train['Week_num'] <= 8)][['adjusted_demand']].values)

# get week 9 prediction
iw9.loc[iw9['ID'].isin(wk8and9IDs), 'adjusted_demand'] = lr.predict(train[(train['Week_num'] == 8) & (train['ID'].isin(train[train['Week_num'] == 9]['ID'].unique().tolist()))][['adjusted_demand']].values)

rmsle(train[train['Week_num'] == 9]['adjusted_demand'], iw9.loc[iw9['ID'].isin(wk9IDs), 'adjusted_demand'])

0.5800542488834454

In [39]:
train['adjusted_demand'].max()

np.int64(5000)

In [41]:
train[train['adjusted_demand'] >= 4000]

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,ccid_mean,ccid_median,cpid_mean,cpid_median,cpid_median_pct
7855045,7,1139,8,3402,24510,2604,4340,2683521,220878,94303,1241.90,30.0,2205.00,2338.0,1.000
7855201,5,1139,8,3402,853464,2604,4732,2683555,220887,94303,1752.30,1488.0,2205.00,2338.0,1.000
7855203,7,1139,8,3402,853464,2604,4732,2683555,220887,94303,1752.30,1488.0,2205.00,2338.0,1.000
9155613,3,1160,8,3601,827594,1166,4608,3300497,280416,129827,2735.27,3040.0,3072.00,3456.0,1.000
9155620,3,1160,8,3601,827594,1167,4608,3300498,280416,129828,2735.27,3040.0,3643.20,3456.0,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68793008,8,4011,11,3938,7727092,43069,4997,25766544,2125300,1037160,2802.96,3373.0,363.64,3.0,0.387
68793009,9,4011,11,3938,7727092,43069,4343,25766544,2125300,1037160,2802.96,3373.0,363.64,3.0,0.387
68799563,4,4011,11,3974,4214925,43231,4700,25769111,2125726,1037626,2973.30,3028.0,1732.60,77.0,0.952
68799564,5,4011,11,3974,4214925,43231,4100,25769111,2125726,1037626,2973.30,3028.0,1732.60,77.0,0.952


In [42]:
train[train['ID'] == 2683521]

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,ccid_mean,ccid_median,cpid_mean,cpid_median,cpid_median_pct
7855043,3,1139,8,3402,24510,2604,3388,2683521,220878,94303,1241.9,30.0,2205.0,2338.0,1.0
7855044,6,1139,8,3402,24510,2604,3080,2683521,220878,94303,1241.9,30.0,2205.0,2338.0,1.0
7855045,7,1139,8,3402,24510,2604,4340,2683521,220878,94303,1241.9,30.0,2205.0,2338.0,1.0
7855046,8,1139,8,3402,24510,2604,1512,2683521,220878,94303,1241.9,30.0,2205.0,2338.0,1.0
7855047,9,1139,8,3402,24510,2604,3080,2683521,220878,94303,1241.9,30.0,2205.0,2338.0,1.0


In [37]:
train[train['ID'] == 6257653]

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,ccid_mean,ccid_median,cpid_mean,cpid_median,cpid_median_pct
16788840,5,1245,1,1027,1440936,1182,76,6257653,541971,229602,39.32,37.0,34.76,41.0,0.914
16788841,6,1245,1,1027,1440936,1182,42,6257653,541971,229602,39.32,37.0,34.76,41.0,0.914
16788842,7,1245,1,1027,1440936,1182,41,6257653,541971,229602,39.32,37.0,34.76,41.0,0.914
16788843,8,1245,1,1027,1440936,1182,21,6257653,541971,229602,39.32,37.0,34.76,41.0,0.914
16788844,9,1245,1,1027,1440936,1182,4872,6257653,541971,229602,39.32,37.0,34.76,41.0,0.914


In [44]:
test[test['ID'] == 6257653]

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,id,ID,ccid,cpid
1615954,10,1245,1,1027,1440936,1182,5156127,6257653,541971,229602
1615955,11,1245,1,1027,1440936,1182,1223407,6257653,541971,229602


In [6]:
# create training data based on ID in order to use lagged adjusted demand
trainIDdf = pd.DataFrame()

trainIDdf = train[['ID', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'cpid_mean', 'cpid_median']].drop_duplicates(subset='ID', keep='first').reset_index(drop=True)

# get adjusted demand for the week for each ID 
for j in [6,7,8,9]:
    wkmap = pd.Series(train[train['Week_num'] == j].set_index('ID')['adjusted_demand'], index=train['ID'].unique()).to_dict()
    trainIDdf[f'Wk_{j}_dem'] = trainIDdf['ID'].map(wkmap)

trainIDdf['ID'] = trainIDdf['ID'].astype('category')
trainIDdf['Client_ID'] = trainIDdf['Client_ID'].astype('category')
trainIDdf['Product_ID'] = trainIDdf['Product_ID'].astype('category')

trainIDdf['Wk_9_dem'] = trainIDdf['Wk_9_dem'].fillna(iw9['adjusted_demand'])

trainIDdf.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  trainIDdf['Wk_9_dem'].fillna(iw9['adjusted_demand'], inplace=True)


Unnamed: 0,ID,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,Wk_6_dem,Wk_7_dem,Wk_8_dem,Wk_9_dem
0,0,15766,325,5.51,4.0,1.83,1.0,,,,2.635587
1,1,15766,328,5.51,4.0,1.88,2.0,,,,3.305805
2,2,15766,1212,5.51,4.0,2.51,2.0,,,4.0,1.0
3,3,15766,1216,5.51,4.0,2.16,2.0,1.0,2.0,5.0,3.305805
4,4,15766,1220,5.51,4.0,3.1,3.0,,,1.0,4.01678


In [25]:
trainIDdf[trainIDdf['Wk_9_dem'] == 6815.79825]

Unnamed: 0,ID,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,Wk_6_dem,Wk_7_dem,Wk_8_dem,Wk_9_dem
10911634,11442862,2408814,32799,3728.33,4983.0,3728.33,4983.0,4999.0,4983.0,1203.0,6815.79825


In [7]:
del train

In [8]:
X_train = trainIDdf.iloc[:, 1:-1].copy(deep=True)
y_train = trainIDdf['Wk_9_dem'].copy(deep=True)

#X_train.drop(['Wk_3_dem', 'Wk_4_dem','Wk_5_dem'], axis=1, inplace=True)
X_train.rename(columns={'Wk_6_dem': 'lag_3', 'Wk_7_dem': 'lag_2', 'Wk_8_dem': 'lag_1'}, inplace=True)

X_train.head()

Unnamed: 0,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1
0,15766,325,5.51,4.0,1.83,1.0,,,
1,15766,328,5.51,4.0,1.88,2.0,,,
2,15766,1212,5.51,4.0,2.51,2.0,,,4.0
3,15766,1216,5.51,4.0,2.16,2.0,1.0,2.0,5.0
4,15766,1220,5.51,4.0,3.1,3.0,,,1.0


In [9]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [None]:
model = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=100)

In [12]:
model2 = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=250)

test_pred = model2.predict(xgb.DMatrix(X_train, enable_categorical=True))
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

0.3306716098506324


In [9]:
model3 = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror', 'tree_method': 'gpu_hist'}, dtrain, num_boost_round=500)


    E.g. tree_method = "hist", device = "cuda"



In [None]:
test_pred = model.predict(xgb.DMatrix(X_train, enable_categorical=True))
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

0.45922917692141374
0.4553438057287803


In [10]:
test_pred = model3.predict(xgb.DMatrix(X_train, enable_categorical=True))
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))


    E.g. tree_method = "hist", device = "cuda"



0.4519915830991541


In [18]:
# get ccid, cpid means and medians into test data
# cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
# test['ccid_mean'] = test['ccid'].map(cidmapping)

# cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
# test['ccid_median'] = test['ccid'].map(cidmapping)

# pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
# test['cpid_mean'] = test['cpid'].map(pidmapping)

# pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
# test['cpid_median'] = test['cpid'].map(pidmapping)

# del cidmapping, pidmapping

test = test[['id', 'ID', 'Week_num', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'cpid_mean', 'cpid_median']].sort_values(by='id')
test.reset_index(drop=True, inplace=True)


# get adjusted demand from previous weeks
lagmap = pd.Series(trainIDdf[['ID', 'Wk_7_dem']].set_index('ID')['Wk_7_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_3'] = test['ID'].map(lagmap)

lagmap = pd.Series(trainIDdf[['ID', 'Wk_8_dem']].set_index('ID')['Wk_8_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_2'] = test['ID'].map(lagmap)

lagmap = pd.Series(trainIDdf[['ID', 'Wk_9_dem']].set_index('ID')['Wk_9_dem'], index=trainIDdf['ID'].tolist()).to_dict()
test['lag_1'] = test['ID'].map(lagmap)

del lagmap


test['Client_ID'] = test['Client_ID'].astype('category')
test['Product_ID'] = test['Product_ID'].astype('category')

test.head()

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1
0,0,25973294,11,4639078,35305,2.73,2.0,4.79,4.0,,,4.0
1,1,23662849,11,4705135,1238,3.47,2.0,2.07,2.0,,,
2,2,21257171,10,4549769,32940,6.78,4.0,2.45,2.0,,2.0,2.0
3,3,5334985,11,4717855,43066,2.12,1.0,1.33,1.0,,,
4,4,4150753,11,966351,1277,5.24,5.0,,,,,


In [21]:
test['lag_1'].max()

np.float64(4872.0)

In [22]:
test[test['lag_1'] == 4872]

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1
1223407,1223407,6257653,11,1440936,1182,39.32,37.0,34.76,41.0,41.0,21.0,4872.0
5156127,5156127,6257653,10,1440936,1182,39.32,37.0,34.76,41.0,41.0,21.0,4872.0


In [23]:
submission[submission['id'] == 1223407]

Unnamed: 0,id,Demanda_uni_equil
1223407,1223407,42.454136


In [27]:
# week 10 test data then make the prediction
X_test_wk10 = xgb.DMatrix(test[test['Week_num'] == 10].iloc[:, 3:], enable_categorical=True)
predictions_10 = model2.predict(X_test_wk10)


# add week 10 prediction to test dataframe
test['wk_10_pred_dem'] = np.nan
test.loc[test['Week_num'] == 10, 'wk_10_pred_dem'] = predictions_10


# week 11 test data then make the prediction
test_wk11 = test[test['Week_num'] == 11].copy(deep=True)
test_wk11.drop(['lag_3'], axis=1, inplace=True)
test_wk11.rename(columns={'lag_2': 'lag_3', 'lag_1': 'lag_2', 'wk_10_pred_dem': 'lag_1'}, inplace=True)
X_test_wk11 = xgb.DMatrix(test_wk11.iloc[:, 3:], enable_categorical=True)

predictions_11 = model2.predict(X_test_wk11)

# add week 11 prediction to test dataframe
test['wk_11_pred_dem'] = np.nan
test.loc[test['Week_num'] == 11, 'wk_11_pred_dem'] = predictions_11

test.head()

Unnamed: 0,id,ID,Week_num,Client_ID,Product_ID,ccid_mean,ccid_median,cpid_mean,cpid_median,lag_3,lag_2,lag_1,wk_10_pred_dem,wk_11_pred_dem
0,0,25973294,11,4639078,35305,2.73,2.0,4.79,4.0,,,4.0,,2.279682
1,1,23662849,11,4705135,1238,3.47,2.0,2.07,2.0,,,,,1.728094
2,2,21257171,10,4549769,32940,6.78,4.0,2.45,2.0,,2.0,2.0,2.496694,
3,3,5334985,11,4717855,43066,2.12,1.0,1.33,1.0,,,,,0.829748
4,4,4150753,11,966351,1277,5.24,5.0,,,,,,,4.022825


In [29]:
print(test['lag_3'].max())
print(test['lag_2'].max())
print(test['lag_1'].max())
print(test['wk_10_pred_dem'].max())
print(test['wk_11_pred_dem'].max())

4340.0
4975.0
4872.0
208.63990783691406
135.15640258789062


In [38]:
test[test['lag_2'] == 4975]

KeyError: 'lag_2'

In [15]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = test['wk_10_pred_dem'].combine_first(test['wk_11_pred_dem'])
submission.loc[submission['Demanda_uni_equil'] < 0, 'Demanda_uni_equil'] = 0

submission.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,2.279682
1,1,1.728094
2,2,2.496694
3,3,0.829748
4,4,4.022825


In [17]:
submission['Demanda_uni_equil'].max()

np.float64(208.63990783691406)

In [31]:
y_train.max()

np.float64(6815.79825)

In [33]:
submission.to_csv("xgb_prediction_9.csv", index=False)

In [None]:
# denote categorical variables
# train['Week_num'] = train['Week_num'].astype('category')
# train['Sales_Depot_ID'] = train['Sales_Depot_ID'].astype('category')
# train['Sales_Channel_ID'] = train['Sales_Channel_ID'].astype('category')
# train['Route_ID'] = train['Route_ID'].astype('category')
# train['Client_ID'] = train['Client_ID'].astype('category')
# train['Product_ID'] = train['Product_ID'].astype('category')
# train['ID'] = train['ID'].astype('category')
# train['ccid'] = train['ccid'].astype('category')
# train['cpid'] = train['cpid'].astype('category')

In [5]:
train.drop(columns=['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'cpid_median_pct', 'ID', 'ccid', 'cpid', 'ccid_mean', 'cpid_mean'], axis=1, inplace=True)

train.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,adjusted_demand,ccid_median,cpid_median
0,4,15766,325,1,4,1
1,4,15766,328,1,4,2
2,3,15766,1212,3,4,2
3,4,15766,1212,4,4,2
4,5,15766,1212,5,4,2


In [6]:
# make adjusted demand last column

# Separate the column to move from the other columns
columns = [col for col in train.columns if col != 'adjusted_demand']

# Append the column to move at the end
new_order = columns + ['adjusted_demand']

# Reorder the DataFrame columns
train = train[new_order]

train.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,ccid_median,cpid_median,adjusted_demand
0,4,15766,325,4,1,1
1,4,15766,328,4,2,1
2,3,15766,1212,4,2,3
3,4,15766,1212,4,2,4
4,5,15766,1212,4,2,5


In [19]:
model = xgb.XGBRegressor(objective='reg:squaredlogerror',
                         n_estimators=25,             # Number of boosting rounds
                         learning_rate=0.1,
                         random_state=42)

In [22]:
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

In [23]:
model.fit(X_train, y_train)

In [24]:
test.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,id,ID,ccid,cpid
0,11,1110,7,3301,15766,1216,924190,3,0,16
1,11,1110,7,3301,15766,1240,4521987,6,0,20
2,10,1110,7,3301,15766,1242,6217476,7,0,21
3,10,1110,7,3301,15766,1643,4740450,10,0,25
4,10,1110,7,3301,15766,3894,970784,12,0,31


In [9]:
# get ccid, cpid medians into test data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_median'] = test['ccid'].map(cidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_median'] = test['cpid'].map(pidmapping)

del cidmapping, pidmapping

In [26]:
test.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,id,ID,ccid,cpid,ccid_median,cpid_median
0,11,1110,7,3301,15766,1216,924190,3,0,16,4.0,2.0
1,11,1110,7,3301,15766,1240,4521987,6,0,20,4.0,4.0
2,10,1110,7,3301,15766,1242,6217476,7,0,21,4.0,3.0
3,10,1110,7,3301,15766,1643,4740450,10,0,25,4.0,3.0
4,10,1110,7,3301,15766,3894,970784,12,0,31,4.0,5.0


In [10]:
test.drop(columns=['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'ID', 'ccid', 'cpid'], axis=1, inplace=True)

test.head()

Unnamed: 0,Week_num,Client_ID,Product_ID,id,ccid_median,cpid_median
0,11,15766,1216,924190,4.0,2.0
1,11,15766,1240,4521987,4.0,4.0
2,10,15766,1242,6217476,4.0,3.0
3,10,15766,1643,4740450,4.0,3.0
4,10,15766,3894,970784,4.0,5.0


In [11]:
test = test[['id', 'Week_num', 'Client_ID', 'Product_ID', 'ccid_median', 'cpid_median']].sort_values(by='id')

test.head()

Unnamed: 0,id,Week_num,Client_ID,Product_ID,ccid_median,cpid_median
6558101,0,11,4639078,35305,2.0,4.0
5991443,1,11,4705135,1238,2.0,2.0
5398738,2,10,4549769,32940,4.0,2.0
1383046,3,11,4717855,43066,1.0,1.0
1110570,4,11,966351,1277,5.0,


In [29]:
X_test = test.iloc[:, 2:]

In [30]:
predictions = model.predict(X_test)

In [31]:
predictions

array([2.3926792, 1.5295428, 2.1165032, ..., 2.4110427, 2.4110427,
       1.5295428], dtype=float32)

In [32]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = predictions

submission.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,2.392679
1,1,1.529543
2,2,2.116503
3,3,0.986436
4,4,6.951692


In [33]:
submission.to_csv("xgb_prediction_1st.csv", index=False)

In [None]:


dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [23]:
bst = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=25)

XGBoostError: [17:59:15] C:\b\abs_90_bwj_86a\croot\xgboost-split_1724073762025\work\src\common\categorical.h:76: Invalid categorical value detected.  Categorical value should be non-negative, less than total number of categories in training data and less than 16777216

In [7]:
train['Client_ID'] = train['Client_ID'].astype('category')
train['Product_ID'] = train['Product_ID'].astype('category')

X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

In [8]:
del train

In [12]:
model2 = xgb.train({'max_depth': 8, 'eta': 0.1, 'objective': 'reg:squaredlogerror'}, dtrain, num_boost_round=100)

In [13]:
test['Client_ID'] = test['Client_ID'].astype('category')
test['Product_ID'] = test['Product_ID'].astype('category')

In [14]:
X_test = xgb.DMatrix(test.iloc[:, 2:], enable_categorical=True)

In [15]:
predictions = model2.predict(X_test)

In [26]:
submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = predictions
submission[submission['Demanda_uni_equil'] < 0]['Demanda_uni_equil'] = 0

submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission[submission['Demanda_uni_equil'] < 0]['Demanda_uni_equil'] = 0


Unnamed: 0,id,Demanda_uni_equil
0,0,2.841015
1,1,1.466883
2,2,2.406399
3,3,0.991431
4,4,10.515233


In [24]:
submission[submission['Demanda_uni_equil'] < 0] = 0

In [28]:
submission.to_csv("xgb_prediction_4.csv", index=False)