# Linear regressison on aggregate stats
Trained client-only, product-only, client-product models and used them as appropriate.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV

## Import Datasets

**Training Dataset**
The training dataset contains the following variables:
* Client ID
* Product ID
* Week
* Sales Depot
* Sales Channel
* Route ID
* Sales Quantity and Value
* Return Quantity and Value
* Adjusted Demand (target variable)

**Test Dataset**
The test dataset containes the following variables:
* Client ID
* Product ID
* Week
* Sales Channel
* Route ID

First, we combine the training and test datasets to define a new ID variables based on unique client ID, product ID, sales Channel, route ID, and sales depot. Then, we split the training and test dataset.

In [2]:
#data set from kaggle: https://www.kaggle.com/competitions/grupo-bimbo-inventory-demand/data

#office
train = pd.read_csv("train.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'Demanda_uni_equil'])
test = pd.read_csv("test.csv", usecols=['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Producto_ID', 'Cliente_ID', 'id'])

train = train.rename(columns={'Semana': 'Week_num',
                              'Agencia_ID': 'Sales_Depot_ID',
                              'Canal_ID': 'Sales_Channel_ID',
                              'Ruta_SAK': 'Route_ID',
                              'Cliente_ID': 'Client_ID',
                              'Venta_uni_hoy': 'Sales_unit_this_week',
                              'Venta_hoy': 'Sales_this_week',
                              'Dev_uni_proxima': 'Returns_unit_next_week',
                              'Dev_proxima': 'Returns_next_week',
                              'Demanda_uni_equil': 'adjusted_demand',
                              'Producto_ID': 'Product_ID'})

test = test.rename(columns={'Semana': 'Week_num',
                            'Agencia_ID': 'Sales_Depot_ID',
                            'Canal_ID': 'Sales_Channel_ID',
                            'Ruta_SAK': 'Route_ID',
                            'Cliente_ID': 'Client_ID',
                            'Venta_uni_hoy': 'Sales_unit_this_week',
                            'Venta_hoy': 'Sales_this_week',
                            'Dev_uni_proxima': 'Returns_unit_next_week',
                            'Dev_proxima': 'Returns_next_week',
                            'Demanda_uni_equil': 'adjusted_demand',
                            'Producto_ID': 'Product_ID'})



#set a unique id for each sales depot id, sales channel id, route id, client, product combination (thanks Gemini)
combined_df = pd.concat([train,test])
combined_df['ID'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID', 'Product_ID']).ngroup()

#set a combined client ID, consisting of a unique sales depot ID, sales channel ID, route ID, and client ID
combined_df['ccid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Client_ID']).ngroup()

#set a combined product ID, consisting of a unique sales depot ID, sales channel ID, route ID, and product ID
combined_df['cpid'] = combined_df.groupby(['Sales_Depot_ID', 'Sales_Channel_ID', 'Route_ID', 'Product_ID']).ngroup()

train = combined_df.iloc[:len(train)].copy()
test = combined_df.iloc[len(train):].copy()

del combined_df


train = train.drop(columns='id')
train['adjusted_demand'] = train['adjusted_demand'].astype(int)
train = train.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

test = test.drop(columns='adjusted_demand')
test['id'] = test['id'].astype(int)
test = test.sort_values(by=['ID', 'Week_num']).reset_index(drop=True)

## Aggregate Data at Client Level and Product Level
In this step, we create various aggregate measures of demand at client level and the product level. We create these measures using data from week 3-8 to avoid data leakage.

In [3]:
#create a dataframe of aggregate statistics for each client
testagg = train[train['Week_num'] <= 8].sort_values(by=['ccid']).groupby(['ccid'], as_index=False).agg({'Product_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

client_stats = pd.DataFrame()

client_stats['ccid'] = testagg['ccid']
client_stats['Products'] = testagg['Product_ID']['nunique']
client_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
client_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
client_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
client_stats['adj_dem_max'] = testagg['adjusted_demand']['max']
client_stats['mean_pct'] = client_stats['adj_dem_mean'].rank(pct=True, method='average')

del testagg

#create a dataframe of aggregate statistics for each product
testagg = train[train['Week_num'] <= 8].sort_values(by=['cpid']).groupby(['cpid'], as_index=False).agg({'Client_ID':'nunique', 'adjusted_demand':['mean', 'median', 'min', 'max']})

product_stats =  pd.DataFrame()

product_stats['cpid'] = testagg['cpid']
product_stats['Clients'] = testagg['Client_ID']['nunique']
product_stats['adj_dem_mean'] = testagg['adjusted_demand']['mean'].round(2)
product_stats['adj_dem_median'] = testagg['adjusted_demand']['median'].astype(int)
product_stats['adj_dem_min'] = testagg['adjusted_demand']['min']
product_stats['adj_dem_max'] = testagg['adjusted_demand']['max']
product_stats['median_pct'] = product_stats['adj_dem_median'].rank(pct=True, method='average')

del testagg

We merge the `client_stats` and `product_stats` dataframes with the `train` dataset.

In [4]:
# get ccid, cpid means, medians and cpid median percentage into training data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_mean'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_median'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_min'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_min'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['adj_dem_max'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_max'] = train['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())].set_index('ccid')['mean_pct'], index=client_stats[client_stats['ccid'].isin(train['ccid'].unique().tolist())]['ccid']).to_dict()
train['ccid_mean_pct'] = train['ccid'].map(cidmapping).round(3)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_mean'] = train['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_median'] = train['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())].set_index('cpid')['median_pct'], index=product_stats[product_stats['cpid'].isin(train['cpid'].unique().tolist())]['cpid']).to_dict()
train['cpid_median_pct'] = train['cpid'].map(pidmapping).round(3)

del cidmapping, pidmapping

train.head()

Unnamed: 0,Week_num,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,adjusted_demand,ID,ccid,cpid,ccid_mean,ccid_median,ccid_min,ccid_max,ccid_mean_pct,cpid_mean,cpid_median,cpid_median_pct
0,4,1110,7,3301,15766,325,1,0,0,4,5.51,4.0,1.0,23.0,0.693,1.83,1.0,0.082
1,4,1110,7,3301,15766,328,1,1,0,5,5.51,4.0,1.0,23.0,0.693,1.88,2.0,0.23
2,3,1110,7,3301,15766,1212,3,2,0,15,5.51,4.0,1.0,23.0,0.693,2.51,2.0,0.23
3,4,1110,7,3301,15766,1212,4,2,0,15,5.51,4.0,1.0,23.0,0.693,2.51,2.0,0.23
4,5,1110,7,3301,15766,1212,5,2,0,15,5.51,4.0,1.0,23.0,0.693,2.51,2.0,0.23


In [7]:
# create training data based on ID in order to use lagged adjusted demand
trainIDdf = pd.DataFrame()

trainIDdf = train[['ID', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'ccid_min', 'ccid_max', 'cpid_mean', 'cpid_median']].drop_duplicates(subset='ID', keep='first').reset_index(drop=True)

# get adjusted demand for the week for each ID 
for j in [6,7,8,9]:
    wkmap = pd.Series(train[train['Week_num'] == j].set_index('ID')['adjusted_demand'], index=train['ID'].unique()).to_dict()
    trainIDdf[f'Wk_{j}_dem'] = trainIDdf['ID'].map(wkmap)

trainIDdf['ID'] = trainIDdf['ID'].astype('category')
trainIDdf['Client_ID'] = trainIDdf['Client_ID'].astype('category')
trainIDdf['Product_ID'] = trainIDdf['Product_ID'].astype('category')


trainIDdf.head()

Unnamed: 0,ID,Client_ID,Product_ID,ccid_mean,ccid_median,ccid_min,ccid_max,cpid_mean,cpid_median,Wk_6_dem,Wk_7_dem,Wk_8_dem,Wk_9_dem
0,0,15766,325,5.51,4.0,1.0,23.0,1.83,1.0,,,,
1,1,15766,328,5.51,4.0,1.0,23.0,1.88,2.0,,,,
2,2,15766,1212,5.51,4.0,1.0,23.0,2.51,2.0,,,4.0,1.0
3,3,15766,1216,5.51,4.0,1.0,23.0,2.16,2.0,1.0,2.0,5.0,
4,4,15766,1220,5.51,4.0,1.0,23.0,3.1,3.0,,,1.0,


In [8]:
X_train = trainIDdf.drop(columns = ['ID', 'Client_ID', 'Product_ID', 'Wk_6_dem', 'Wk_7_dem', 'Wk_8_dem'])

In [9]:
X_train = X_train.dropna(subset=['Wk_9_dem'])
y_train = X_train['Wk_9_dem']
X_train = X_train.drop(columns = 'Wk_9_dem')
X_train.head()

Unnamed: 0,ccid_mean,ccid_median,ccid_min,ccid_max,cpid_mean,cpid_median
2,5.51,4.0,1.0,23.0,2.51,2.0
5,5.51,4.0,1.0,23.0,3.38,3.0
6,5.51,4.0,1.0,23.0,4.55,4.0
7,5.51,4.0,1.0,23.0,2.86,3.0
8,5.51,4.0,1.0,23.0,6.79,6.0


In [9]:
del train

In [18]:
X_train_product = X_train[np.logical_not(X_train['cpid_mean'].isnull())]
y_train_product = y_train[np.logical_not(X_train['cpid_mean'].isnull())]

X_train_client = X_train[np.logical_not(X_train['ccid_mean'].isnull())]
y_train_client = y_train[np.logical_not(X_train['ccid_mean'].isnull())]

X_train_full = X_train[np.logical_not(np.logical_or(X_train['cpid_mean'].isnull(), X_train['ccid_mean'].isnull()))]
y_train_full = y_train[np.logical_not(np.logical_or(X_train['cpid_mean'].isnull(), X_train['ccid_mean'].isnull()))]

In [19]:
X_train_product = X_train_product.drop(columns = ['ccid_mean', 'ccid_median', 'ccid_min', 'ccid_max'])
X_train_client = X_train_client.drop(columns = ['cpid_mean', 'cpid_median'])

In [21]:
model_product = LinearRegression()
model_product.fit(X_train_product, y_train_product)
test_pred = model_product.predict(X_train_product)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train_product))

0.609716542006597


In [22]:
model_client = LinearRegression()
model_client.fit(X_train_client, y_train_client)
test_pred = model_client.predict(X_train_client)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train_client))

0.7816017969592134


In [23]:
model_full = LinearRegression()
model_full.fit(X_train_full, y_train_full)
test_pred = model_full.predict(X_train_full)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train_full))

0.5796897860978377


In [24]:
# get ccid, cpid means and medians into test data
cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_mean'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_mean'] = test['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_median'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_median'] = test['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_min'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_min'] = test['ccid'].map(cidmapping)

cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['adj_dem_max'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
test['ccid_max'] = test['ccid'].map(cidmapping)

# cidmapping = pd.Series(client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())].set_index('ccid')['mean_pct'], index=client_stats[client_stats['ccid'].isin(test['ccid'].unique().tolist())]['ccid']).to_dict()
# test['ccid_mean_pct'] = test['ccid'].map(cidmapping).round(3)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_mean'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_mean'] = test['cpid'].map(pidmapping)

pidmapping = pd.Series(product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())].set_index('cpid')['adj_dem_median'], index=product_stats[product_stats['cpid'].isin(test['cpid'].unique().tolist())]['cpid']).to_dict()
test['cpid_median'] = test['cpid'].map(pidmapping)

del cidmapping, pidmapping

test = test[['id', 'ID', 'Week_num', 'Client_ID', 'Product_ID', 'ccid_mean', 'ccid_median', 'ccid_min', 'ccid_max', 'cpid_mean', 'cpid_median']].sort_values(by='id')
test.reset_index(drop=True, inplace=True)


In [26]:
test = test.drop(columns = ['ID', 'Week_num', 'Client_ID', 'Product_ID'])

In [27]:
test.head()

Unnamed: 0,id,ccid_mean,ccid_median,ccid_min,ccid_max,cpid_mean,cpid_median
0,0,2.73,2.0,0.0,5.0,4.79,4.0
1,1,3.47,2.0,1.0,8.0,2.07,2.0
2,2,6.78,4.0,1.0,40.0,2.45,2.0
3,3,2.12,1.0,1.0,5.0,1.33,1.0
4,4,5.24,5.0,1.0,20.0,,


In [28]:
X_test_product = test[np.logical_not(test['cpid_mean'].isnull())]
X_test_product = X_test_product.drop(columns = ['id', 'ccid_mean', 'ccid_median', 'ccid_min', 'ccid_max'])
y_test_product = model_product.predict(X_test_product)

In [29]:
X_test_client = test[np.logical_not(test['ccid_mean'].isnull())]
X_test_client = X_test_client.drop(columns = ['id', 'cpid_mean', 'cpid_median'])
y_test_client = model_client.predict(X_test_client)

In [30]:
X_test_full = test[np.logical_not(np.logical_or(test['cpid_mean'].isnull(), test['ccid_mean'].isnull()))]
X_test_full = X_test_full.drop(columns = ['id'])
y_test_full = model_full.predict(X_test_full)

In [31]:
test['adjusted_demand'] = 5

In [34]:
test.loc[np.logical_not(test['cpid_mean'].isnull()), 'adjusted_demand'] = y_test_product
test.loc[np.logical_not(test['ccid_mean'].isnull()), 'adjusted_demand'] = y_test_client
test.loc[np.logical_not(np.logical_or(test['cpid_mean'].isnull(), test['ccid_mean'].isnull())), 'adjusted_demand'] = y_test_full

In [35]:
test.head()

Unnamed: 0,id,ccid_mean,ccid_median,ccid_min,ccid_max,cpid_mean,cpid_median,adjusted_demand
0,0,2.73,2.0,0.0,5.0,4.79,4.0,3.640553
1,1,3.47,2.0,1.0,8.0,2.07,2.0,1.818389
2,2,6.78,4.0,1.0,40.0,2.45,2.0,3.141907
3,3,2.12,1.0,1.0,5.0,1.33,1.0,0.664384
4,4,5.24,5.0,1.0,20.0,,,5.422582


In [None]:
# make the submission file!

submission = pd.DataFrame()

submission['id'] = np.arange(len(test))
submission['Demanda_uni_equil'] = test['adjusted_demand']

submission.to_csv("prediction_linearregression.csv", index=False)

submission.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,3.640553
1,1,1.818389
2,2,3.141907
3,3,0.664384
4,4,5.422582


In [37]:
submission.describe()

Unnamed: 0,id,Demanda_uni_equil
count,6999251.0,6999251.0
mean,3499625.0,7.241931
std,2020510.0,17.98979
min,0.0,-25.18279
25%,1749812.0,2.699317
50%,3499625.0,4.34626
75%,5249438.0,7.212504
max,6999250.0,4345.728
