# Data Preparation


In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc

sns.set_style("whitegrid")

## Load Data

In [None]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Demanda_uni_equil': 'adjusted_demand',
                                'Producto_PID': 'Product_ID'})
# define client-product ID
train['PIS'] = train.groupby(['Client_PPID', 'Product_ID']).ngroup()
unique_ids = train['ID'].unique()

# Define the fraction of IDs to sample
fraction = 0.01  # sample 1% of the IDs

# Calculate the number of IDs to sample
sample_size = int(len(unique_ids) * fraction)

# Choose a random sample of IDs
sampled_ids = np.random.choice(unique_ids, size=sample_size, replace=False)

# Filter the DataFrame to keep all rows with the sampled IDs
train = train[train['ID'].isin(sampled_ids)]

print(len(train))

  data_path = "..\product-inventory"


746397


In [None]:
# Sort the data frame by ID and Week
train = train.groupby(['PIS', 'Week_num'], as_index=False).agg({'adjusted_demand': 'sum'})
train = train.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

## Data Imputation Function
This function imputes missing observations based on the firms' demand. In this case, I set all missing observations to zero.

In [6]:
def fillin(df):
    '''
    Input
        df: A dataframe of length at most 7, with column names 'Week_num', 'Client_ID', 'Product_ID', 'adjusted_demand', 'ID',
        where 'ID' is the unique idenifier for client id and product id combinations.  The intended input is train[train['ID' == id]],
        where id is an element of the list train['ID'].unique().

    Outputs
        new_df: If df has 'adjusted_demand' values for each week (3 through 9), new_df = df, i.e. nothing happens.

                If df has missing 'adjusted_demand' values for any week, the 'adjusted_demand' for that week will be 0.
    '''

    # EB: I'm not sure if it matters, but does it need to be a deep copy?
    new_df = df.copy(deep=True).reset_index(drop=True)

    week_list = new_df['Week_num'].unique().tolist()
    missing_week_list = [x for x in [3,4,5,6,7,8,9] if x not in week_list]

    for i in missing_week_list:
        
        #create new row in new_df with the floor of the average value of prev_value and next_value
        new_df = pd.concat([new_df, pd.DataFrame({'Week_num': i,
                                                  'Client_ID': new_df['Client_ID'].iloc[0],
                                                  'Product_ID': new_df['Product_ID'].iloc[0],
                                                  'adjusted_demand': 0,
                                                  'ID': new_df['ID'].iloc[0]}, index=[i])]).sort_values(by=['Week_num']).reset_index(drop=True)
        
        #update week_list
        week_list.append(i)
        
    return new_df

Another way to impute the data is to expand the `train` such as it incluldes all possible ID x Week combination. We'll impute the data a later point.

In [7]:
unq_week = pd.DataFrame({'Week_num': train['Week_num'].unique()})
unq_week = unq_week.sort_values(by='Week_num').reset_index(drop=True)
unq_id = pd.DataFrame({'ID': train['ID'].unique()})
unq_id = unq_id.sort_values(by='ID').reset_index(drop=True)
combo = unq_id.merge(unq_week, how='cross')
train = combo.merge(train, how='outer', on=['ID', 'Week_num'], sort=True)

del combo, unq_week, unq_id
train

Unnamed: 0,ID,Week_num,adjusted_demand
0,202,3,
1,202,4,
2,202,5,82.0
3,202,6,126.0
4,202,7,42.0
...,...,...,...
1808200,25831272,5,28.0
1808201,25831272,6,26.0
1808202,25831272,7,12.0
1808203,25831272,8,26.0


## Time Series Estimation

## Define Variables
Here I define new variables and modify the existing ones.

In [8]:
# Define lagged demand and lagged week
train['adj_demand_1'] = train['adjusted_demand'].shift(1)
train['week_1'] = train['Week_num']-1
train['adj_demand_1'] = train['adj_demand_1'].where(train['Week_num'] != 3, np.nan)

# Define log demand and log lagged demand
train['y'] = np.log(train['adjusted_demand'])
train['y'] = train['y'].replace([np.inf, -np.inf], np.nan)
train['y_1'] = np.log(train['adj_demand_1'])
train['y_1'] = train['y_1'].replace([np.inf, -np.inf], np.nan)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


## Models
1. $demand_{t}$ on $demand_{t-1}$
2. $log(demand_{t})$ on $log(demand_{t-1})$
3. $demand_{t} = demand_{t-1}$

In all model, I drop observations where the outcome variable or the independent variables are missing i.e. no imputation.

In [9]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import root_mean_squared_log_error as rmsle

In [10]:
# cross-validation for linear regression
# y = adjusted demand
# x = lagged adjusted demand

gap = 1
min_week = train['Week_num'].min()
max_week = train['Week_num'].max()

n_folds = 4
n_models = 3
i = 0
model_mse = np.zeros(shape=(n_folds, n_models))
model_msle = np.zeros(shape=(n_folds, n_models))
lr = LinearRegression()
lr2 = LinearRegression()

for week in range(min_week + gap + 1, max_week):

    # model 1
    train_wo_na = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]

    lr.fit(X=df_tt[['adj_demand_1']], y=df_tt['adjusted_demand'])
    pred = lr.predict(X=df_ho[['adj_demand_1']])

    model_mse[i, 0] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)
    model_msle[i, 0] = rmsle(y_true = df_ho['adjusted_demand'], y_pred = pred)

    # model 2
    train_wo_na = train.dropna(subset=['y', 'y_1'])

    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]
    lr2.fit(X=df_tt[['y_1']], y=df_tt['y'])
    pred = np.exp(lr2.predict(X=df_ho[['y_1']]))

    model_mse[i, 1] = root_mean_squared_error(y_true = np.exp(df_ho['y']), y_pred = pred)
    model_msle[i, 1] = rmsle(y_true = np.exp(df_ho['y']), y_pred = pred)

    #model 3
    train_wo_na = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]
    pred = df_ho['adj_demand_1']

    model_mse[i, 2] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)
    model_msle[i, 2] = rmsle(y_true = df_ho['adjusted_demand'], y_pred = pred)

    i += 1
print(model_mse.mean(axis=0))  
print(model_msle.mean(axis=0)) 


[ 21.2651801  114.79882598  20.73573902]
[0.55606515 0.51268461 0.56430154]


* We can expand this model to include missing clients
* We can include longer lagged in the model
* Auto ARIMA i.e. find out the right number of lags
* We can use the average of the client's observations for prediction
* Calculate autocorrelation
* XGBoost

## Out-of-Sample Prediction
One of the main challenges is to predict the demand for the following cases:
1. Existing Clients, New PID
2. New Clients, Exisiting Products
3. New Clients, New Prodcuts

### 1. Existing Clients, New Products
In this case, we can use the average demand of the client of other products. This model assumes a firm faces a similar demand for all its products.

In [11]:
# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Producto_ID': 'Product_ID'})

# load test.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/test.csv.zip")

test = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID'])

# rename columns
test = test.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Producto_ID': 'Product_ID'})


  data_path = "..\product-inventory"
  data_path = "..\product-inventory"


I find the existing clients in the test data.

In [None]:
# list of exisiting and new clients
testID = test['Client_ID'].unique().tolist()
trainID = train['Client_ID'].unique().tolist()
commonID = list(set(testID).intersection(set(trainID)))
newID = list(set(testID) - set(trainID))

print(len(newID),len(commonID))

print(len(test['Client_ID'].unique()))

9663 735501
745164


In [15]:
# list of existing and new products
testPID = test['Product_ID'].unique().tolist()
trainPID = train['Product_ID'].unique().tolist()
commonPID = list(set(testPID).intersection(set(trainPID)))
newPID = list(set(testPID) - set(trainPID))

print(len(newPID),len(commonPID))

print(len(test['Product_ID'].unique()))

34 1488
1522


In [13]:
# print(len(test['Product_ID'].unique().tolist()))
test_pid = test['Product_ID'].where(test['Client_ID'].isin(commonID)).unique().astype(int).tolist()
print(test_pid)
print(test.loc[test['Client_ID'].isin(commonID),'Product_ID'].unique().tolist())
# train_pid = train['Product_ID'].unique().tolist()
# common_pid = list(set(test_pid).intersection(set(train_pid)))
# new_pid = list(set(test_pid) - set(train_pid))

# print(len(new_pid),len(common_pid))

  test_pid = test['Product_ID'].where(test['Client_ID'].isin(commonID)).unique().astype(int).tolist()


[35305, 1238, 32940, 43066, 1277, 972, 1232, 1240, 43203, 1278, 2233, 4270, 43274, 37361, 43200, 1150, 35456, 30552, 1125, 35525, 43285, 36920, 41938, 37058, 3631, 43069, 43201, 42434, 31507, 40930, 43064, 30314, 2665, 34794, 44371, 1146, 1250, 1220, 30532, 31512, 6469, 325, 73, 30549, 31471, 43065, 815, 43316, 43338, 46772, 1216, 1284, 46085, 31423, 35306, 30555, 43040, 1212, 43206, 1129, -2147483648, 45112, 43058, 43398, 1242, 43220, 4245, 1230, 43307, 43188, 2425, 36610, 1064, 35727, 34868, 43147, 34054, 37577, 1109, 34053, 32873, 43209, 44102, 1309, 36745, 34938, 43118, 32846, 46876, 328, 43215, 43005, 43084, 43197, 43129, 37494, 34786, 35141, 5337, 43068, 48019, 3144, 46128, 35516, 31719, 34768, 31031, 43187, 40447, 1373, 3270, 32323, 35144, 31502, 36598, 8931, 36537, 32861, 43207, 1145, 35651, 8921, 35309, 34089, 43039, 31511, 48417, 31717, 35455, 36823, 34213, 30533, 36711, 35145, 31467, 35147, 43196, 4910, 31033, 31586, 8940, 43204, 34206, 37427, 49973, 3894, 43067, 1182, 72, 4

In [19]:
#use client's average demand as prediction
train.loc[(train['Client_ID'].isin(commonID)) & (train['Product_ID'].isin(commonPID))]

Unnamed: 0,Week_num,Client_ID,Product_ID
0,3,15766,1212
1,3,15766,1216
2,3,15766,1238
3,3,15766,1240
4,3,15766,1242
...,...,...,...
74180459,9,4528866,32873
74180460,9,4528866,34226
74180461,9,4528866,45112
74180462,9,4547943,40217
