In [120]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib as plp
import lightgbm as lgb

In [92]:
# Load the long training split. unit_sales is log1p-transformed while parsing;
# any value that is not strictly positive is stored as 0 instead.
df_train = pd.read_csv(
    '../Data/train_set_long.csv',
    usecols=[1, 2, 3, 4],
    dtype={'onpromotion': bool},
    converters={
        'unit_sales': lambda u: 0 if not float(u) > 0 else np.log1p(float(u)),
    },
    parse_dates=["date"],
)

In [93]:
# Load the test rows, then key them by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [94]:
# Load the short training split used for validation. unit_sales gets the same
# parse-time treatment as df_train: log1p for positive values, 0 otherwise.
df_val = pd.read_csv(
    '../Data/train_set_short.csv',
    usecols=[1, 2, 3, 4],
    dtype={'onpromotion': bool},
    converters={
        'unit_sales': lambda u: 0 if not float(u) > 0 else np.log1p(float(u)),
    },
    parse_dates=["date"],
)

In [123]:
# Item metadata, indexed by item_nbr.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [95]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one unit_sales column
# per date. Combinations with no record on a date are filled with 0.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])
    .loc[:, ["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [96]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one unit_sales column
# per date. Combinations with no record on a date are filled with 0.
df_val = (
    df_val
    .set_index(["store_nbr", "item_nbr", "date"])
    .loc[:, ["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [97]:
# Flatten the (unit_sales, date) column MultiIndex produced by unstack() so the
# columns are plain dates. The level is selected by name rather than by
# position: positional level 1 no longer exists once the columns are flat, so
# re-running the cell used to raise IndexError. By name it is a no-op on re-run.
df_train.columns = df_train.columns.get_level_values("date")
df_val.columns = df_val.columns.get_level_values("date")

In [98]:
def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from a date-columned frame.

    Parameters
    ----------
    df : DataFrame whose column labels are dates.
    dt : reference date.
    minus : number of days before `dt` at which the window starts.
    periods : number of consecutive days in the window.

    Returns
    -------
    DataFrame restricted to the `periods` daily columns starting at
    `dt - minus days`.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods)
    return df[window]

In [105]:
def prepare_dataset(t2017, is_train=True):
    """Build features (and optionally targets) from the global `df_train`.

    Features are the per-row mean of the 3 and of the 7 daily columns that
    immediately precede `t2017`. When `is_train` is true, the targets are the
    16 daily columns starting at `t2017`.

    Returns `(X, y)` when `is_train` is true, otherwise just `X`.
    """
    last_3_days = get_timespan(df_train, t2017, 3, 3)
    last_7_days = get_timespan(df_train, t2017, 7, 7)
    X = pd.DataFrame({
        "mean_3_2017": last_3_days.mean(axis=1).values,
        "mean_7_2017": last_7_days.mean(axis=1).values,
    })
    if not is_train:
        return X
    target_days = pd.date_range(t2017, periods=16)
    return X, df_train[target_days].values

In [106]:
print("Preparing dataset...")
# Stack four training windows, each anchored one week after the previous one,
# starting from 2017-06-20.
t2017 = date(2017, 6, 20)
weekly_samples = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(4)
]
X_train = pd.concat([features for features, _ in weekly_samples], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_samples], axis=0)
# Drop the per-week pieces now that they have been concatenated.
del weekly_samples

Preparing dataset...


In [129]:
# Sanity check: four weekly windows stacked row-wise, two feature columns.
X_train.shape

(618140, 2)

In [108]:
# Wide frame: one row per (store_nbr, item_nbr) pair, one column per date.
df_train.shape

(154535, 57)

In [130]:
# Targets: same row count as X_train, 16 target days per row.
y_train.shape

(618140, 16)

In [111]:
print("Training and predicting models...")
# LightGBM hyper-parameters: L2 regression objective and metric, with row and
# feature subsampling redrawn on every iteration (bagging_freq=1).
params = dict(
    num_leaves=2 ** 5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

Training and predicting models...


In [113]:
def prepare_val_dataset(t2017, is_train=True):
    """Build features (and optionally targets) from the global `df_val`.

    Features are the per-row mean of the 3 and of the 7 daily columns that
    immediately precede `t2017`. When `is_train` is true, the targets are the
    16 daily columns starting at `t2017`.

    Returns `(X, y)` when `is_train` is true, otherwise just `X`.
    """
    last_3_days = get_timespan(df_val, t2017, 3, 3)
    last_7_days = get_timespan(df_val, t2017, 7, 7)
    X = pd.DataFrame({
        "mean_3_2017": last_3_days.mean(axis=1).values,
        "mean_7_2017": last_7_days.mean(axis=1).values,
    })
    if not is_train:
        return X
    target_days = pd.date_range(t2017, periods=16)
    return X, df_val[target_days].values

In [115]:
# Validation features and 16-day targets, built from df_val as of 2017-06-08.
X_val, y_val = prepare_val_dataset(date(2017,6,8))

In [141]:
# Load the hold-out set from valid_set.csv (first five columns).
# NOTE(review): unlike df_train / df_val, no log1p converter is applied to
# unit_sales here, so features built from df_g may be on a different scale
# than the training features. Confirm whether valid_set.csv is already
# log-transformed on disk.
df_g = pd.read_csv(
    "../Data/valid_set.csv",
    usecols=list(range(5)),
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)

In [142]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one unit_sales column
# per date, with missing combinations filled with 0. Then flatten the
# two-level column index so the columns are plain dates.
df_g = (
    df_g
    .set_index(["store_nbr", "item_nbr", "date"])
    .loc[:, ["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
df_g.columns = df_g.columns.get_level_values(1)

In [143]:
def prepare_g_dataset(t2017, is_train=True):
    """Build features (and optionally targets) from the global `df_g`.

    Features are the per-row mean of the 3 and of the 7 daily columns that
    immediately precede `t2017`. When `is_train` is true, the targets are the
    16 daily columns starting at `t2017`.

    Parameters
    ----------
    t2017 : date
        First day of the target window; feature windows end the day before.
    is_train : bool, default True
        Whether to also return the target matrix.

    Returns
    -------
    `(X, y)` when `is_train` is true, otherwise just `X`.
    """
    X = pd.DataFrame({
        "mean_3_2017": get_timespan(df_g, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_g, t2017, 7, 7).mean(axis=1).values,
    })
    if is_train:
        # Targets must come from the same frame as the features so the rows
        # line up. This previously read from df_train, whose rows need not
        # match df_g's.
        y = df_g[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [148]:
# Inference features from df_g as of 2017-08-16; is_train=False returns X only.
X_test = prepare_g_dataset(date(2017, 8, 16), is_train=False)