In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error

In [4]:
# Data set from Kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

#office
# Key columns read from both CSV files; each file contributes one extra column.
shared_columns = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID']

# Spanish -> English column names. rename() ignores keys that a frame does not
# contain, so a single mapping is applied to both frames.
english_names = {
    'Semana': 'Week_num',
    'Agencia_ID': 'Sales_Depot_ID',
    'Canal_ID': 'Sales_Channel_ID',
    'Ruta_SAK': 'Route_ID',
    'Cliente_ID': 'Client_ID',
    'Venta_uni_hoy': 'Sales_unit_this_week',
    'Venta_hoy': 'Sales_this_week',
    'Dev_uni_proxima': 'Returns_unit_next_week',
    'Dev_proxima': 'Returns_next_week',
    'Demanda_uni_equil': 'adjusted_demand',
    'Producto_ID': 'Product_ID',
}

train = pd.read_csv("train.csv", usecols=shared_columns + ['Demanda_uni_equil']).rename(columns=english_names)
test = pd.read_csv("test.csv", usecols=shared_columns + ['id']).rename(columns=english_names)

# Number the key combinations over train and test stacked together, so that the
# same combination receives the same integer in both frames (thanks Gemini).
n_train_rows = len(train)
stacked = pd.concat([train, test])

route_keys = ['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID']
group_ids = {
    # depot + channel + route + client + product
    'ID': route_keys + ['Client_ID', 'Product_ID'],
    # combined client ID: depot + channel + route + client
    'ccid': route_keys + ['Client_ID'],
    # combined product ID: depot + channel + route + product
    'cpid': route_keys + ['Product_ID'],
}
for id_column, key_columns in group_ids.items():
    stacked[id_column] = stacked.groupby(key_columns).ngroup()

# Split back by position: the first n_train_rows rows are the training rows.
# 'id' exists only in the test file and 'adjusted_demand' only in the train
# file; the concat filled the other frame's rows with NaN, so each column is
# dropped where it is empty and cast back to int where it is real.
train = (
    stacked.iloc[:n_train_rows]
    .drop(columns='id')
    .astype({'adjusted_demand': int})
    .sort_values(by=['ID', 'Week_num'])
    .reset_index(drop=True)
)

test = (
    stacked.iloc[n_train_rows:]
    .drop(columns='adjusted_demand')
    .astype({'id': int})
    .sort_values(by=['ID', 'Week_num'])
    .reset_index(drop=True)
)

del stacked

In [5]:
def _demand_stats(frame, key, counted, count_name):
    """Summarise adjusted demand per value of `key`.

    Parameters
    ----------
    frame : DataFrame with columns `key`, `counted` and 'adjusted_demand'.
    key : name of the column to group by.
    counted : column whose number of distinct values per group is reported.
    count_name : name of the output column holding that distinct count.

    Returns
    -------
    DataFrame with one row per `key` value (ordered by key) and the columns
    [key, count_name, adj_dem_mean, adj_dem_median, adj_dem_min, adj_dem_max].
    The mean is rounded to 2 decimals; the median is truncated to int.
    """
    # No explicit sort of `frame` is needed: groupby orders the groups by key
    # and none of the aggregations below depends on row order.
    stats = frame.groupby(key, as_index=False).agg(**{
        count_name: (counted, 'nunique'),
        'adj_dem_mean': ('adjusted_demand', 'mean'),
        'adj_dem_median': ('adjusted_demand', 'median'),
        'adj_dem_min': ('adjusted_demand', 'min'),
        'adj_dem_max': ('adjusted_demand', 'max'),
    })
    stats['adj_dem_mean'] = stats['adj_dem_mean'].round(2)
    stats['adj_dem_median'] = stats['adj_dem_median'].astype(int)
    return stats


# aggregate statistics for each combined client id (ccid), from training data
client_stats = _demand_stats(train, key='ccid', counted='Product_ID', count_name='Products')

# aggregate statistics for each combined product id (cpid), from training data
product_stats = _demand_stats(train, key='cpid', counted='Client_ID', count_name='Clients')

# percentile rank (0-1] of each product's median demand among all products
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

Sample-submission baseline: predict an adjusted demand of 7 for every test row.

Kaggle score: 0.96195

In [9]:
# Constant predicted for every row in the sample-submission baseline.
SAMPLE_SUBMISSION_DEMAND = 7

submission = pd.DataFrame({
    # take the ids that are actually in the test set (ascending) instead of
    # assuming they are exactly 0..len(test)-1
    'id': np.sort(test['id'].to_numpy()),
    'Demanda_uni_equil': SAMPLE_SUBMISSION_DEMAND,
})

submission.tail()

# to produce the .csv file, uncomment below
# submission.to_csv("7_prediction.csv", index=False)

Unnamed: 0,id,Demanda_uni_equil
6999246,6999246,7
6999247,6999247,7
6999248,6999248,7
6999249,6999249,7
6999250,6999250,7


Simple prediction: for every depot/channel/route/client/product combination that has week 9 data, reuse its week 9 adjusted demand for weeks 10 and 11. Otherwise, set adjusted demand to 7.

Kaggle score: 0.73549

In [None]:
# Prediction used when a combination has no week 9 observation.
FALLBACK_DEMAND = 7

submission = test[['ID', 'id']].copy()

# Week 9 adjusted demand indexed by combination ID. Series.map accepts a Series
# directly, so no intermediate dict or id list is needed. If a combination has
# more than one week 9 row, the last one wins.
wk9_demand = (
    train.loc[train['Week_num'] == 9, ['ID', 'adjusted_demand']]
    .drop_duplicates('ID', keep='last')
    .set_index('ID')['adjusted_demand']
)

# IDs without week 9 data map to NaN and receive the fallback; the explicit
# cast keeps the column integer regardless of pandas' upcasting rules.
submission['Demanda_uni_equil'] = (
    submission['ID'].map(wk9_demand).fillna(FALLBACK_DEMAND).astype(int)
)

submission = submission.drop(columns='ID').sort_values(by='id')

submission.head()

# to produce the .csv file, uncomment below
# submission.to_csv("simple_prediction.csv", index=False)

Improved simple prediction

Use week 9 data for weeks 10, 11 when possible.

Otherwise:
If new client and new product, set adjusted demand to 5.

If new client and old product, set adjusted demand as the median of the product's overall sales.

If old client and new product, set adjusted demand as the median of the client's overall sales.

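If old client and old product, set adjusted demand to $(0.5 + p)\,\bigl(0.65\,m + (1 - 0.65)\,\mu\bigr)$, where $p$ is the product's median percentile rank, $m$ is the client's median adjusted demand and $\mu$ is the client's mean adjusted demand. All predictions are finally rounded down to an integer.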

Kaggle score: 0.58959

In [None]:
# Tunable constants of the heuristic.
UNSEEN_DEMAND = 5            # neither the client nor the product appears in training
PRODUCT_PCT_OFFSET = 0.5     # added to the product's median percentile rank
CLIENT_MEDIAN_WEIGHT = 0.65  # weight of the client median; the mean gets the rest

# are the cpid, ccids in the training data?
known_product = test['cpid'].isin(train['cpid'].unique())
known_client = test['ccid'].isin(train['ccid'].unique())

# per-row client / product statistics (NaN where the id was never seen in training)
client_lookup = client_stats.set_index('ccid')
product_lookup = product_stats.set_index('cpid')

ccid_mean = test['ccid'].map(client_lookup['adj_dem_mean'])
ccid_median = test['ccid'].map(client_lookup['adj_dem_median'])
cpid_median = test['cpid'].map(product_lookup['adj_dem_median'])
cpid_median_pct = test['cpid'].map(product_lookup['median_pct']).round(3)

# known client and known product: scale a median/mean blend of the client's
# demand by how high the product's median ranks among all products
blended = (PRODUCT_PCT_OFFSET + cpid_median_pct) * (
    CLIENT_MEDIAN_WEIGHT * ccid_median + (1 - CLIENT_MEDIAN_WEIGHT) * ccid_mean
)

# the four cases are mutually exclusive and exhaustive, so the default is never used
baseline = pd.Series(
    np.select(
        [
            ~known_client & ~known_product,  # new client, new product
            ~known_client & known_product,   # new client, old product
            known_client & ~known_product,   # old client, new product
            known_client & known_product,    # old client, old product
        ],
        [UNSEEN_DEMAND, cpid_median, ccid_median, blended],
        default=0.0,
    ),
    index=test.index,
)

# override with week 9 adjusted demand for the IDs that have week 9 data
# (if an ID has several week 9 rows, the last one wins)
wk9_demand = (
    train.loc[train['Week_num'] == 9, ['ID', 'adjusted_demand']]
    .drop_duplicates('ID', keep='last')
    .set_index('ID')['adjusted_demand']
)
demand = test['ID'].map(wk9_demand).fillna(baseline)

assert demand.notna().all(), "some test rows received no prediction"

submission = pd.DataFrame({
    'id': test['id'],
    'Demanda_uni_equil': np.floor(demand).astype(int),
}).sort_values(by='id')

submission.head()

# to produce the .csv file, uncomment below
# submission.to_csv("improved_simple_baseline.csv", index=False)

In [22]:
# preview the first five rows of the current submission frame
submission.head(n=5)

Unnamed: 0,id,Demanda_uni_equil
6558101,0,4
5991443,1,1
5398738,2,2
1383046,3,0
1110570,4,5
