# Data Preparation


In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc

sns.set_style("whitegrid")

In [3]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

# load train.csv
data_path = "..\product-inventory"
filename = os.path.join(data_path, "grupo-bimbo-inventory-demand/train.csv.zip")

train = pd.read_csv(filename, 
                 usecols=['Semana', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])

# rename columns
train = train.rename(columns={  'Semana': 'Week_num',
                                'Cliente_ID': 'Client_ID',
                                'Demanda_uni_equil': 'adjusted_demand',
                                'Producto_ID': 'Product_ID'})
# define client-product ID
train['ID'] = train.groupby(['Client_ID', 'Product_ID']).ngroup()
unique_ids = train['ID'].unique()

# Define the fraction of IDs to sample
fraction = 0.01  # sample 1% of the IDs

# Calculate the number of IDs to sample
sample_size = int(len(unique_ids) * fraction)

# Choose a random sample of IDs
sampled_ids = np.random.choice(unique_ids, size=sample_size, replace=False)

# Filter the DataFrame to keep all rows with the sampled IDs
train = train[train['ID'].isin(sampled_ids)]

print(len(train))

# calculate price
# train['Price'] = train['sales_this_week']/train['sales_unit_this_week']
# train['log_price'] = np.log(train['Price'])

# dependent variable
# train['log_adj_demand'] = np.log(train['adjusted_demand'])


  data_path = "..\product-inventory"


743257


In [4]:
train = train.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

In [5]:
def fillin(df):
    '''
    Input
        df: A dataframe of length at most 7, with column names 'Week_num', 'Client_ID', 'Product_ID', 'adjusted_demand', 'ID',
        where 'ID' is the unique idenifier for client id and product id combinations.  The intended input is train[train['ID' == id]],
        where id is an element of the list train['ID'].unique().

    Outputs
        new_df: If df has 'adjusted_demand' values for each week (3 through 9), new_df = df, i.e. nothing happens.

                If df has missing 'adjusted_demand' values for any week, the 'adjusted_demand' for that week will be 0.
    '''

    # EB: I'm not sure if it matters, but does it need to be a deep copy?
    new_df = df.copy(deep=True).reset_index(drop=True)

    week_list = new_df['Week_num'].unique().tolist()
    missing_week_list = [x for x in [3,4,5,6,7,8,9] if x not in week_list]

    for i in missing_week_list:
        
        #create new row in new_df with the floor of the average value of prev_value and next_value
        new_df = pd.concat([new_df, pd.DataFrame({'Week_num': i,
                                                  'Client_ID': new_df['Client_ID'].iloc[0],
                                                  'Product_ID': new_df['Product_ID'].iloc[0],
                                                  'adjusted_demand': 0,
                                                  'ID': new_df['ID'].iloc[0]}, index=[i])]).sort_values(by=['Week_num']).reset_index(drop=True)
        
        #update week_list
        week_list.append(i)
        
    return new_df

In [1]:
dfs = []
uid_list = train['ID'].unique().tolist()

for j in range(len(uid_list)):
    dfs.append(fillin(train[train['ID'] == uid_list[j]]))

new_train = pd.concat(dfs).reset_index(drop=True)

NameError: name 'train' is not defined

## Time Series Estimation
Let's write down the first regressions.
1. $demand_{t}$ on $demand_{t-1}$
2. $log(demand_{t})$ on $log(demand_{t-1})$
3. $demand_{t} = demand_{t-1}$

In [5]:
train = train.groupby(['ID', 'Week_num'], as_index=False).agg({'adjusted_demand': 'sum'})

In [6]:
df = train.groupby(by='ID', as_index=False).agg({'Week_num': 'first'}).rename(columns={'Week_num': 'first_week'})
train = train.merge(right = df, how='left', on='ID')

## Define Variables
Here I define new variables and modify the existing ones.

In [14]:

train['adj_demand_1'] = train['adjusted_demand'].shift(1)
train['week_1'] = train['Week_num'].shift(1)
train['adj_demand_1'] = train['adj_demand_1'].where(train['week_1']+1 == train['Week_num'], np.nan)
train['y'] = np.log(train['adjusted_demand'])
train['y'] = train['y'].replace([np.inf, -np.inf], np.nan)
train['y_1'] = np.log(train['adj_demand_1'])
train['y_1'] = train['y_1'].replace([np.inf, -np.inf], np.nan)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [None]:
#cross-validation for linear regression
# y = adjusted demand
# x = lagged adjusted demand

gap = 1
min_week = train['Week_num'].min()
max_week = train['Week_num'].max()

n_folds = 4
n_models = 3
i = 0
model_mse = np.zeros(shape=(n_folds, n_models))
lr = LinearRegression()
lr2 = LinearRegression()

for week in range(min_week + gap + 1, max_week):

    # model 1
    train_wo_na = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]

    lr.fit(X=df_tt[['adj_demand_1']], y=df_tt['adjusted_demand'])
    pred = lr.predict(X=df_ho[['adj_demand_1']])

    model_mse[i, 0] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)

    # model 2
    train_wo_na = train.dropna(subset=['y', 'y_1'])

    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]
    lr2.fit(X=df_tt[['y_1']], y=df_tt['y'])
    pred = np.exp(lr2.predict(X=df_ho[['y_1']]))

    model_mse[i, 1] = root_mean_squared_error(y_true = np.exp(df_ho['y']), y_pred = pred)

    #model 3
    train_wo_na = train.dropna(subset=['adj_demand_1', 'adjusted_demand'])
    df_tt = train_wo_na[train_wo_na['Week_num'] < week]
    df_ho = train_wo_na[train_wo_na['Week_num'] == week]
    pred = df_ho['adj_demand_1']

    model_mse[i, 2] = root_mean_squared_error(y_true = df_ho['adjusted_demand'], y_pred = pred)

    i += 1
print(model_mse.mean(axis=0))  


[11.90314417 17.70075184 12.09425018]


* We can expand this model to include missing clients
* We can include longer lagged in the model
* Auto ARIMA i.e. find out the right number of lags
* We can use the average of the client's observations for prediction
* Calculate autocorrelation
* XGBoost

## Vector Auto Regression

<!-- https://en.wikipedia.org/wiki/Autoregressive_model#n-step-ahead_forecasting -->
<!-- https://en.wikipedia.org/wiki/Autoregressive_model#Evaluating_the_quality_of_forecasts -->

A potential regression is
$y_t = y_{t-1} + x_{t-1}$

where $x_{t-1}$ is total sales of OTHER goods/total sales of ALL goods.

In [None]:
# for train_index, test_index in kfold.split(train):
#     print("TRAIN INDEX:", train_index)
#     print("TEST INDEX:", test_index)
#     print()
#     print()

In [None]:
# lr = LinearRegression()
# X = train[['Price']]
# X = X.fillna(0)
# y = train['Adjusted_demand']
# lr.fit(X, y)

# preds = lr.predict(X)
# print("Coefficient", lr.coef_ )
# print("Intercept:", lr.intercept_)

In [None]:
# plt.scatter(x = train[['Price']], y = train['Adjusted_demand'], c = 'blue')
# plt.scatter(x = train[['Price']], y = preds, c = 'red')

# plt.show()

In [None]:
# X = train[['log_Price']]
# X = X.fillna(0)
# y = train['log_adjusted_demand']
# lr.fit(X,y)

# preds = lr.predict(X)
# print("Coefficient", lr.coef_ )
# print("Intercept:", lr.intercept_)

In [None]:
# plt.scatter(x = train[['log_Price']], y = train['log_adjusted_demand'], c = 'blue')
# plt.scatter(x = train[['log_Price']], y = preds, c = 'red')

# plt.show()

Let's write down the first regression which includes price, week FE, and state FE.

In [None]:
# state_dummies = [col for col in train.columns if 'state_' in col]
# week_dummies = [col for col in train.columns if 'week_' in col]
# X = train[state_dummies + week_dummies + ['Price']]
# X = X.fillna(0)
# y = train['Adjusted_demand']
# lr.fit(X, y)

# preds = lr.predict(X)
# lr.coef_

In [None]:
# state_dummies = [col for col in train.columns if 'state_' in col]
# week_dummies = [col for col in train.columns if 'week_' in col]
# X = train[state_dummies + week_dummies + ['log_Price']]
# X = X.fillna(0)
# y = train['log_adjusted_demand']
# lr.fit(X, y)

# preds = lr.predict(X)
# lr.coef_

In [None]:
# plt.scatter(x = train[['log_Price']], y = train['log_adjusted_demand'], c = 'blue')
# plt.scatter(x = train[['log_Price']], y = preds, c = 'red')

# plt.show()