In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

In [2]:
store_sales = pd.read_csv(
    'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

In [6]:
y = store_sales.unstack(['store_nbr', 'family']).loc["2017"]


# Create training data
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()
X['NewYear'] = (X.index.dayofyear == 1)

y = y.stack(['store_nbr', 'family'])

model = XGBRegressor()
model.fit(X, y)
y_pred = pd.DataFrame(model.predict(X), index=y.index, columns=y.columns)

ValueError: Shape of passed values is (227, 1782), indices imply (404514, 1)

In [None]:
df_test = pd.read_csv(
    'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

# Create features for test set
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = (X_test.index.dayofyear == 1)

In [None]:
y_submit = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
y_submit = y_submit.stack(['store_nbr', 'family'])

y_submit = y_submit.join(df_test.id).reindex(columns=['id', 'sales'])
y_submit.to_csv('submission_kaggle_way_old.csv', index=False)

In [None]:
y_submit

In [None]:
x=y_submit.reset_index()

x.date = x.date.astype(np.datetime64)

sns.relplot(data=x[(x['date']>='2017-08-16') & (x['store_nbr']=='1') & (x['family']=='GROCERY I')], x='date', y='sales', kind='line')
plt.xticks(rotation=90)

# sns.relplot(data=all_df[(all_df['date']>='2017-08-16') & (all_df['store_nbr']=='1') & (all_df['family']=='GROCERY I')], x='date', y='sales', kind='line')
# plt.xticks(rotation=90)
# plt.show()