In [85]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler  
from sklearn.metrics import root_mean_squared_log_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
import numpy as np

In [86]:
Test = pd.read_csv("./data_predict/test.csv", index_col="date").drop(columns=["Unnamed: 0", "id"])
Train = pd.read_csv("./data_predict/train.csv", index_col="date").drop(columns=["Unnamed: 0", "transactions"])

In [87]:
# CATEGORICAL TO NUMBERS
categorical_cols = ['family', 'type', 'city', 'state']
for col in categorical_cols:
  encoder = OrdinalEncoder()
  Train[col] = encoder.fit_transform(Train[[col]])
  Test[col] = encoder.transform(Test[[col]])

In [88]:
# NORMALIZE
numerical_cols = ['onpromotion', 'oil_price']
for col in numerical_cols:
  encoder = MinMaxScaler(feature_range=(0,1))
  Train[col] = encoder.fit_transform(Train[[col]])
  Test[col] = encoder.transform(Test[[col]])

In [89]:
# LINEAR
Train = Train.reset_index()
Train['date'] = pd.to_datetime(Train['date'])

linearModelOrder = 10
fourierFrequency = 'MS'
fourierOrder = 10

stores_family = sorted(Train[['store_nbr', 'family']].drop_duplicates().values.tolist())

all_preds = []

for store_nbr, family in stores_family:
  store = Train[(Train['store_nbr'] == store_nbr) & (Train['family'] == family)].copy()

  store = store.set_index('date').asfreq('D')
  store[['sales', 'onpromotion']] = store[['sales', 'onpromotion']].fillna(0)

  y = store['sales']
  store = store.drop(columns=['sales'])

  fourier = CalendarFourier(freq=fourierFrequency, order=fourierOrder)
  dp = DeterministicProcess(
    index=store.index,
    order=linearModelOrder,
    seasonal=True,
    additional_terms=[fourier],
  )
  x = dp.in_sample()

  linear_model = LinearRegression()
  linear_model.fit(x, y)
  y_pred = linear_model.predict(x)
  y_pred = np.clip(y_pred, a_min=0, a_max=None)

  df_pred = pd.DataFrame({
    'date': store.index,
    'store_nbr': store_nbr,
    'family': family,
    'y_pred_lin': y_pred
  })
  all_preds.append(df_pred)

df_all_preds = pd.concat(all_preds, ignore_index=True)
Train = Train.merge(df_all_preds, on=['date', 'store_nbr', 'family'], how='left')

Train = Train.set_index("date")
Train.index = Train.index.date

In [90]:
# XGBRegressor
y = Train['sales']
y_lin = Train['y_pred_lin']

x_tree = Train.drop(columns=['sales', 'y_pred_lin'])
tree_model = XGBRegressor()
tree_model.fit(x_tree, y - y_lin)

y_pred_tree = tree_model.predict(x_tree)

In [91]:
y_pred_tree = np.clip(y_pred_tree, a_min=0, a_max=None)
root_mean_squared_log_error(y, y_pred_tree)

3.312776283829101