# Data Preparation


In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc

sns.set_style("whitegrid")

## Load Data

In [2]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Demanda_uni_equil': 'adjusted_demand',
                                'Producto_ID': 'Product_ID'})
print(len(train))

  data_path = "..\product-inventory"


74180464


In [3]:
# duplicates client-product-week observation -> take the average as adjusted demand
train = train.groupby(['Client_ID', 'Product_ID', 'Week_num'], as_index=False).agg({'adjusted_demand': 'sum'})

## Time Series Estimation

### Define Variables

In [4]:
# Imputed Data
# Fill in the missing values of adjusted demand with average client-product demand.
df_imputed = train.groupby(by=['Product_ID', 'Client_ID'], as_index=False).agg({'adjusted_demand': 'mean'})
df_imputed = df_imputed.rename(columns={'adjusted_demand': 'mean_demand'})
print('Imputed Data Created')

# Define lagged demand in the training data
train['adj_demand_1'] = train['adjusted_demand'].shift(1)
train['week_1'] = train['Week_num'].shift(1)
train = train.merge(right=df_imputed,
                    how = 'left',
                    on = ['Client_ID', 'Product_ID'])
print('Training Data mergred with Imputed Data')
train['adj_demand_1'] = train['adj_demand_1'].where(train['week_1'] + 1 == train['Week_num'], np.nan)
train['adj_demand_1'] = train['adj_demand_1'].fillna(train['mean_demand'])
train['adj_demand_1'] = train['adj_demand_1'].where(train['Week_num'] != 3, np.nan)
train = train.drop(columns=['mean_demand', 'week_1'])


# Define log demand and log lagged demand
train['y'] = np.log(train['adjusted_demand'])
train['y'] = train['y'].replace([np.inf, -np.inf], np.nan)
train['y_1'] = np.log(train['adj_demand_1'])
train['y_1'] = train['y_1'].replace([np.inf, -np.inf], np.nan)

Imputed Data Created
Training Data mergred with Imputed Data


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
train.loc[train['Week_num']==3,'y_1'].isna().mean()

1.0

## Models
1. $demand_{t}$ on $demand_{t-1}$
2. $log(demand_{t})$ on $log(demand_{t-1})$
3. $demand_{t} = demand_{t-1}$

In all model, I drop observations where the outcome variable or the independent variables are missing after imputation.

In [6]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import root_mean_squared_log_error as rmsle

In [7]:
# # cross-validation for linear regression
# # y = adjusted demand
# # x = lagged adjusted demand

# # def cross_val(train):
# gap = 1
# min_week = train['Week_num'].min()
# max_week = train['Week_num'].max()
# n_folds = 4
# n_models = 3
# i = 0
# # model_mse = np.zeros(shape=(n_folds, n_models))
# model_msle = np.zeros(shape=(n_folds, n_models))
# lr = LinearRegression()
# lr2 = LinearRegression()
# for week in range(min_week + gap + 1, max_week):
#     # model 1
#     print('Model 1', i)
#     train = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
#     df_tt = train[train['Week_num'] < week]
#     df_ho = train[train['Week_num'] == week]
#     lr.fit(X=df_tt[['adj_demand_1']], y=df_tt['adjusted_demand'])
#     pred = lr.predict(X=df_ho[['adj_demand_1']])
#     # model_mse[i, 0] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)
#     model_msle[i, 0] = rmsle(y_true = df_ho['adjusted_demand'], y_pred = pred)

#     #model 2
#     print('Model 2', i)
#     train = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
#     df_tt = train[train['Week_num'] < week]
#     df_ho = train[train['Week_num'] == week]
#     pred = df_ho['adj_demand_1']
#     # model_mse[i, 2] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)
#     model_msle[i, 1] = rmsle(y_true = df_ho['adjusted_demand'], y_pred = pred)
    
#     # model 3
#     print('Model 3', i)
#     train = train.dropna(subset=['y', 'y_1'])
#     df_tt = train[train['Week_num'] < week]
#     df_ho = train[train['Week_num'] == week]
#     lr2.fit(X=df_tt[['y_1']], y=df_tt['y'])
#     pred = np.exp(lr2.predict(X=df_ho[['y_1']]))
#     # model_mse[i, 1] = root_mean_squared_error(y_true = np.exp(df_ho['y']), y_pred = pred)
#     model_msle[i, 2] = rmsle(y_true = np.exp(df_ho['y']), y_pred = pred)
#     i += 1
# # print(model_mse.mean(axis=0))  
# print(model_msle.mean(axis=0)) 

# # cross_val(train)

In [8]:
## Model 3 is the best performing model
## Run the final model
train = train.dropna(subset=['y', 'y_1'])

# test data set
# I impute the missing client-product demand in week 9
# by using the average client-product demand in week 3-8
# df_ho = train_wo_na.groupby(by=['Product_ID', 'Client_ID'], as_index=False).agg({'adjusted_demand': 'mean'})
# df_ho['y_1'] = np.log(df_ho['adjusted_demand'])
# df_ho['y_1'] = df_ho['y_1'].replace([np.inf, -np.inf], np.nan)

df_ho = train.loc[train['Week_num'] == 9, ['Client_ID', 'Product_ID', 'y_1']]

In [9]:
# fit the model
lr = LinearRegression()
lr.fit(X=train[['y_1']], y=train['y'])

# predict the demand
df_ho['pred'] = np.exp(lr.predict(X=df_ho[['y_1']]))

In [None]:
train

Unnamed: 0,Client_ID,Product_ID,Week_num,adjusted_demand,adj_demand_1,y,y_1
4,26,4767,7,42,42.0,3.737670,3.737670
5,26,4767,8,42,42.0,3.737670,3.737670
6,26,30235,9,96,42.0,4.564348,3.737670
7,26,30314,7,48,48.0,3.871201,3.871201
9,26,31393,4,16,20.0,2.772589,2.995732
...,...,...,...,...,...,...,...
74013015,2015152015,2665,9,10,7.0,2.302585,1.945910
74013016,2015152015,3270,8,14,14.0,2.639057,2.639057
74013018,2015152015,4270,6,10,10.0,2.302585,2.302585
74013020,2015152015,4280,9,8,6.0,2.079442,1.791759


## Merge with Test Data
Let's merge the prediction value with the test data based on `Product_ID` and `Client_ID`.

In [10]:
# load test.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/test.csv.zip")

test = pd.read_csv(filename, 
                 usecols=['id', 'Semana', 'Producto_ID', 'Cliente_ID'])
# 
# rename columns
test = test.rename(columns={'Semana': 'Week_num',
                            'Cliente_ID': 'Client_ID',
                            'Producto_ID': 'Product_ID'})

  data_path = "..\product-inventory"


In [11]:
test = test[['id','Client_ID', 'Product_ID', 'Week_num']].merge(right=df_ho[['Client_ID', 'Product_ID', 'pred']], 
                                                                how='left', 
                                                                on=['Client_ID', 'Product_ID'])
test = test.sort_values(by=['Client_ID', 'Product_ID', 'Week_num']).reset_index(drop=True)

In [17]:
test.head()

Unnamed: 0,id,Client_ID,Product_ID,Week_num,pred
0,1569352,26,31518,10,
1,4728674,26,31520,11,
2,1547831,26,34206,11,49.971135
3,6667200,26,34210,10,25.353574
4,1592616,26,34785,10,9.617806


**What share of test sample is not in week 9 of the training sample?**

42 percent

In [None]:
test['pred'].isna().mean()

0.42490946531278845

## Out-of-Sample Prediction
One of the main challenges is to predict the demand for the following cases:
1. Existing Clients, New Products
2. New Clients, Exisiting Products
3. New Clients, New Prodcuts

In [None]:
# # load train.csv
# data_path = "..\product-inventory"
# filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

# train = pd.read_csv(filename, 
#                  usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# # rename columns
# train = train.rename(columns={  'Semana': 'Week_num',
#                                 'Cliente_ID': 'Client_ID',
#                                 'Demanda_uni_equil': 'adjusted_demand',
#                                 'Producto_ID': 'Product_ID'})

# # duplicates client-product-week observation -> take the average as adjusted demand
# train = train.groupby(['Client_ID', 'Product_ID', 'Week_num'], as_index=False).agg({'adjusted_demand': 'sum'})

# # load test.csv
# # data_path = "..\product-inventory"
# # filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/test.csv.zip")
# # 
# # test = pd.read_csv(filename, 
#                 #  usecols=['id','Semana', 'Producto_ID', 'Cliente_ID'])

# # rename columns
# # test = test.rename(columns={'Semana': 'Week_num',
#                             # 'Cliente_ID': 'Client_ID',
#                             # 'Producto_ID': 'Product_ID'})


The list of existing clients in the test data.

In [None]:
# # list of exisiting and new clients
# testID = test['Client_ID'].unique().tolist()
# trainID = train['Client_ID'].unique().tolist()
# commonID = list(set(testID).intersection(set(trainID)))
# newID = list(set(testID) - set(trainID))

# print(len(newID)/len(test['Client_ID'].unique()))

# print(len(test['Client_ID'].unique()))

The list of existing and new products in the test data

In [22]:
# # list of existing and new products
# testPID = test['Product_ID'].unique().tolist()
# trainPID = train['Product_ID'].unique().tolist()
# commonPID = list(set(testPID).intersection(set(trainPID)))
# newPID = list(set(testPID) - set(trainPID))

# print(len(newPID)/len(test['Product_ID'].unique()))

# print(len(test['Product_ID'].unique()))

In [None]:
# l = len(test)
# l1 = len(test.loc[(test['Client_ID'].isin(commonID)) & (test['Product_ID'].isin(newPID))])
# l2 = len(test.loc[(test['Client_ID'].isin(newID)) & (test['Product_ID'].isin(commonPID))])
# l3 = len(test.loc[(test['Client_ID'].isin(newID)) & (test['Product_ID'].isin(newPID))])
# l4 = len(test.loc[(test['Client_ID'].isin(commonID)) & (test['Product_ID'].isin(commonPID))])

# print(l1/l, l2/l, l3/l, l4/l)

### 1. Existing Products
This case consists of two possible scanarios:
* New Client
* Existing Client but a new combo

In both scenarios, the in-sample estimation can not predict the demand. We use the average product demand in weeks 3-9 as our prediction.
In the 2nd scenario, our prediction model has some shortcomings. For example, our measure does not take into account that a client might have a low demand for a new product. On the other hand, if we use the client's average demand as our prediction, it does not take into account the variation in products' demand. As a first pass, we use 'average prodct demand' as the prediction


In [23]:
# use product's average demand in week 3-9 as a prediction for the new client.
# pred_1 contains existing product with new client, and a prediction for the client's demand.
pred_1 = train.groupby('Product_ID', as_index=False).agg({'adjusted_demand': 'mean'})
test = test.merge(right=pred_1, 
                  how='left', 
                  on='Product_ID')
test['pred'] = test['pred'].fillna(test['adjusted_demand'])
print('Share of Missing Preiction:', test['pred'].isna().mean())

del pred_1
test = test.drop(columns='adjusted_demand')

Share of Missing Preiction: 0.0036663923039765255


### 2. Existing Clients

In [24]:
# use client's average demand in week 3-9 as a prediction for new product.
#pred_1 contains existing clients with new products, and a prediction for the product.
pred_1 = train.groupby('Client_ID', as_index=False).agg({'adjusted_demand': 'mean'})
test = test.merge(right=pred_1, 
                  how='left', 
                  on='Client_ID')

test['pred'] = test['pred'].fillna(test['adjusted_demand'])
print('Share of Missing Preiction:', test['pred'].isna().mean())

del pred_1
test = test.drop(columns='adjusted_demand')
# WATCH OUT: This replaces missing values for existing clients and existing products with the client's average demand. 
# THEY SHOULD BE REPLACED WITH THE ACTUAL PREDICTION

Share of Missing Preiction: 1.814479863631123e-05


### 3. New Clients, New Products
The intersection of new clients and new product in the test data. Here the first guess is the average demand for all product across all weeks.


In [25]:
test['pred'] = test['pred'].fillna(train['adjusted_demand'].mean())
print('Share of Missing Preiction:', test['pred'].isna().mean())

# test

Share of Missing Preiction: 0.0


In [26]:
output = test[['id', 'pred']]
output = output.rename(columns={'pred': 'Demanda_uni_equil'})

data_path = "..\product-inventory"
filename = os.path.join(data_path, "prediction_2.csv")
output.to_csv(filename, index=False)

  data_path = "..\product-inventory"


In [27]:
output.columns

Index(['id', 'Demanda_uni_equil'], dtype='object')

## Leftover

In [2]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Demanda_uni_equil': 'adjusted_demand',
                                'Producto_ID': 'Product_ID'})
# define client-product ID
train['ID'] = train.groupby(['Client_ID', 'Product_ID']).ngroup()
unique_ids = train['ID'].unique()

# Define the fraction of IDs to sample
fraction = 0.2  # sample 10% of the IDs

# Calculate the number of IDs to sample
sample_size = int(len(unique_ids) * fraction)

rng = np.random.default_rng(4325252122)

# Choose a random sample of IDs
sampled_ids = np.random.choice(unique_ids, size=sample_size, replace=False)

# Filter the DataFrame to keep all rows with the sampled IDs
train = train[train['ID'].isin(sampled_ids)]

print(len(train))

  data_path = "..\product-inventory"


14843968


### Data Imputation Function
This function imputes missing observations based on the firms' demand. In this case, I set all missing observations to zero.

In [None]:
def fillin(df):
    '''
    Input
        df: A dataframe of length at most 7, with column names 'Week_num', 'Client_ID', 'Product_ID', 'adjusted_demand', 'ID',
        where 'ID' is the unique idenifier for client id and product id combinations.  The intended input is train[train['ID' == id]],
        where id is an element of the list train['ID'].unique().

    Outputs
        new_df: If df has 'adjusted_demand' values for each week (3 through 9), new_df = df, i.e. nothing happens.

                If df has missing 'adjusted_demand' values for any week, the 'adjusted_demand' for that week will be 0.
    '''

    # EB: I'm not sure if it matters, but does it need to be a deep copy?
    new_df = df.copy(deep=True).reset_index(drop=True)

    week_list = new_df['Week_num'].unique().tolist()
    missing_week_list = [x for x in [3,4,5,6,7,8,9] if x not in week_list]

    for i in missing_week_list:
        
        #create new row in new_df with the floor of the average value of prev_value and next_value
        new_df = pd.concat([new_df, pd.DataFrame({'Week_num': i,
                                                  'Client_ID': new_df['Client_ID'].iloc[0],
                                                  'Product_ID': new_df['Product_ID'].iloc[0],
                                                  'adjusted_demand': 0,
                                                  'ID': new_df['ID'].iloc[0]}, index=[i])]).sort_values(by=['Week_num']).reset_index(drop=True)
        
        #update week_list
        week_list.append(i)
        
    return new_df

Another way to impute the data is to expand the `train` such as it incluldes all possible ID x Week combination. We'll impute the data a later point.

In [None]:
# unq_week = pd.DataFrame({'Week_num': train['Week_num'].unique()})
# unq_week = unq_week.sort_values(by='Week_num').reset_index(drop=True)
# unq_id = pd.DataFrame({'ID': train['ID'].unique()})
# unq_id = unq_id.sort_values(by='ID').reset_index(drop=True)
# combo = unq_id.merge(unq_week, how='cross')
# train = combo.merge(train, how='outer', on=['ID', 'Week_num'], sort=True)

# del combo, unq_week, unq_id
# train

In [None]:
# x = np.arange(0, 10, 0.5).reshape(-1,1)
# y = lr2.predict(x)

# plt.scatter(df_ho['y_1'], df_ho['y'])
# plt.plot(x, y)

# plt.show()

* We can expand this model to include missing clients
* We can include longer lagged in the model
* Auto ARIMA i.e. find out the right number of lags
* We can use the average of the client's observations for prediction
* Calculate autocorrelation
* XGBoost