In [None]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide display settings: show every column of a frame and allow
# rendered rows to use up to 500 characters of width.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 500)

In [None]:
def load():
    """Read the competition CSV files and build the combined frame.

    The train and test files are parsed with ``date`` as a datetime column,
    the sample submission is read as-is, and train/test are stacked (train
    first, original column order kept) into one frame.

    Returns:
        tuple: ``(combined, train, test, sample_submission)`` DataFrames.
    """
    # Generator unpacking keeps the read order: train first, then test.
    train_part, test_part = (
        pd.read_csv(csv_path, parse_dates=["date"])
        for csv_path in ("Data/train.csv", "Data/test.csv")
    )
    submission_template = pd.read_csv("Data/sample_submission.csv")

    # No ignore_index: the stacked frame keeps each file's own row labels.
    combined = pd.concat([train_part, test_part], sort=False)
    return combined, train_part, test_part, submission_template

# `df` is the stacked train+test frame; `train`, `test` and `sample_sub` are the individual files.
df, train, test, sample_sub = load()

In [None]:
# First ten rows of the combined frame (train rows come first in the concat).
df.head(10)

In [None]:
# Last ten rows of the combined frame (the test file is concatenated last).
df.tail(10)

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Sections are printed in this order: shape, dtypes, per-column unique
    counts, the first ``head`` rows, the last ``head`` rows and per-column
    null counts. Nothing is returned.

    Args:
        dataframe: pandas DataFrame to summarise.
        head: Number of rows shown in the Head and Tail sections.
    """
    # Each section is (banner, thunk); the thunk is only evaluated right
    # before it is printed, so output order matches evaluation order.
    sections = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Unique #####################", lambda: dataframe.nunique()),
        ("##################### Head #####################", lambda: dataframe.head(head)),
        ("##################### Tail #####################", lambda: dataframe.tail(head)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

In [None]:
# Print shape, dtypes, unique counts, head/tail and null counts for the combined frame.
check_df(df)

In [None]:
# Number of distinct items that appear for each store.
df.groupby(["store"])["item"].nunique()

In [None]:
# Sales total, mean, median and standard deviation for every (store, item) pair.
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:
# Summary statistics with the listed percentiles (0, 5, 50, 95, 99, 100), transposed so each column is a row.
df.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T

In [None]:
def create_date_features(df):
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place (columns are assigned on the argument) and
    the same object is returned so the call can be used in an assignment.

    Args:
        df: DataFrame with a datetime-typed ``date`` column.

    Returns:
        The same DataFrame with these columns added, in this order:
        ``month``, ``day_of_month``, ``day_of_year``, ``week_of_year`` (ISO
        week), ``day_of_week`` (Monday=0 .. Sunday=6), ``year``, ``is_wknd``
        (1 for Saturday/Sunday, else 0), ``is_month_start`` and
        ``is_month_end`` (0/1 flags).
    """
    dates = df["date"].dt

    df["month"] = dates.month
    df["day_of_month"] = dates.day
    df["day_of_year"] = dates.dayofyear
    df["week_of_year"] = dates.isocalendar().week
    df["day_of_week"] = dates.dayofweek
    df["year"] = dates.year
    # Weekend means Saturday (5) and Sunday (6). The previous `weekday // 4`
    # also flagged Friday, because 4 // 4 == 1.
    df["is_wknd"] = (dates.dayofweek >= 5).astype(int)
    df["is_month_start"] = dates.is_month_start.astype(int)
    df["is_month_end"] = dates.is_month_end.astype(int)
    return df

In [None]:
# Add the calendar columns derived from `date`; the bare `df` renders the result.
df = create_date_features(df)
df

In [None]:
# Frame size and column dtypes after the date features were added.
print(
    "##################### Shape #####################",
    df.shape,
    "##################### Types #####################",
    df.dtypes,
    sep="\n",
)

In [None]:
def random_noise(dataframe, scale=1.6, rng=None):
    """Draw one zero-mean Gaussian noise value per row of ``dataframe``.

    Args:
        dataframe: Any sized object; only ``len(dataframe)`` is used.
        scale: Standard deviation of the noise. Defaults to 1.6, the value
            that was previously hard-coded.
        rng: Optional object exposing ``normal(scale=..., size=...)``, e.g. a
            ``numpy.random.Generator`` from ``np.random.default_rng(seed)``.
            Pass one to make the draw reproducible. When ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        numpy.ndarray: 1-D array of length ``len(dataframe)``.
    """
    sampler = np.random if rng is None else rng
    return sampler.normal(scale=scale, size=len(dataframe))

In [None]:
def lag_features(dataframe, lags):
    """Add noisy lagged-sales columns, one per entry in ``lags``.

    For each ``lag`` a column ``sales_lag_<lag>`` is assigned on ``dataframe``
    (in place). It holds ``sales`` shifted by ``lag`` rows within each
    ``(store, item)`` group, plus one ``random_noise`` draw per row.

    The shift is positional: rows are taken in the order they appear inside
    each group, and this function does not sort. Callers must therefore pass
    a frame whose rows are already chronological within each group.

    Args:
        dataframe: DataFrame with ``store``, ``item`` and ``sales`` columns.
        lags: Iterable of row offsets to shift by.

    Returns:
        The same DataFrame, with the new columns added.
    """
    # Group once: the group membership does not change while columns are
    # appended, so there is no need to rebuild the group-by per lag.
    sales_by_series = dataframe.groupby(["store", "item"])["sales"]
    for lag in lags:
        # GroupBy.shift is the built-in form of transform(lambda x: x.shift(lag));
        # it yields the same values without calling Python code per group.
        dataframe["sales_lag_" + str(lag)] = sales_by_series.shift(lag) + random_noise(dataframe)
    return dataframe

In [None]:
# Add one noisy lag column per offset (in rows, within each store-item group).
# NOTE(review): the smallest lag is 91 — presumably so the features stay
# available across a ~3-month forecast horizon; confirm.
df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 546, 728])
df

In [None]:
# Frame size and column dtypes after the lag features were added.
print(
    "##################### Shape #####################",
    df.shape,
    "##################### Types #####################",
    df.dtypes,
    sep="\n",
)

In [None]:
def roll_mean_features(dataframe, windows):
    """Add noisy triangular rolling-mean columns of past sales.

    For each ``window`` a column ``sales_roll_mean_<window>`` is assigned on
    ``dataframe`` (in place). Within every ``(store, item)`` group the sales
    are shifted by one row, averaged over a triangular-weighted rolling
    window (at least 10 observations required), and a ``random_noise`` draw
    is added per row.

    Args:
        dataframe: DataFrame with ``store``, ``item`` and ``sales`` columns.
        windows: Iterable of rolling window lengths (in rows).

    Returns:
        The same DataFrame, with the new columns added.
    """
    for window in windows:

        def shifted_triangular_mean(series):
            # Shift first so a row's own sales value never enters its window.
            previous = series.shift(1)
            return previous.rolling(window=window, min_periods=10, win_type="triang").mean()

        smoothed = dataframe.groupby(["store", "item"])["sales"].transform(shifted_triangular_mean)
        dataframe["sales_roll_mean_" + str(window)] = smoothed + random_noise(dataframe)
    return dataframe

In [None]:
# Add noisy rolling-mean columns for 365-row and 546-row windows.
df = roll_mean_features(df, [365, 546])
df

In [None]:
# Frame size and column dtypes after the rolling-mean features were added.
print(
    "##################### Shape #####################",
    df.shape,
    "##################### Types #####################",
    df.dtypes,
    sep="\n",
)

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean columns of lagged sales.

    For every ``(alpha, lag)`` combination a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is assigned on
    ``dataframe`` (in place). Within each ``(store, item)`` group the sales
    are shifted by ``lag`` rows and then smoothed with ``ewm(alpha=alpha)``.
    No noise is added here.

    Args:
        dataframe: DataFrame with ``store``, ``item`` and ``sales`` columns.
        alphas: Iterable of smoothing factors (outer loop).
        lags: Iterable of row offsets (inner loop).

    Returns:
        The same DataFrame, with the new columns added.
    """
    for alpha in alphas:
        # e.g. 0.95 -> "095", used inside the column name.
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:

            def lagged_ewm(series):
                return series.shift(lag).ewm(alpha=alpha).mean()

            column = "sales_ewm_alpha_" + alpha_tag + "_lag_" + str(lag)
            dataframe[column] = dataframe.groupby(["store", "item"])["sales"].transform(lagged_ewm)
    return dataframe

In [None]:
# Smoothing factors passed to `ewm(alpha=...)`.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
# Row offsets applied (per store-item group) before smoothing.
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

# 5 alphas x 9 lags -> 45 new `sales_ewm_alpha_*_lag_*` columns.
df = ewm_features(df, alphas, lags)
df

In [None]:
# Frame size and column dtypes after the exponentially weighted features were added.
print(
    "##################### Shape #####################",
    df.shape,
    "##################### Types #####################",
    df.dtypes,
    sep="\n",
)

In [None]:
# One-hot encode store, item, day_of_week and month as integer 0/1 columns.
# drop_first=True omits the first level of each, so it is encoded as all zeros.
# The four source columns are replaced by the dummy columns.
df = pd.get_dummies(df, columns=["store", "item", "day_of_week", "month"], drop_first=True, dtype=int)
df

In [None]:
# Model the target on the log1p scale; `lgbm_smape` maps values back with expm1 before scoring.
# NOTE(review): this overwrites `sales` in place, so re-running the cell applies log1p a second time.
df["sales"] = np.log1p(df["sales"].values)
df

In [None]:
# Training rows: every date strictly before 2017-01-01 (i.e. through the end of 2016).
train = df[df["date"] < "2017-01-01"]

# Validation rows: 2017-01-01 up to, but not including, 2017-04-01. The first
# three months of 2017 are used because the range to forecast is the first
# three months of 2018.
val = df[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01")]

In [None]:
# Feature columns: everything except the date, the row id, the target and `year`,
# kept in the frame's own column order.
cols = [name for name in train.columns if name not in {"date", "id", "sales", "year"}]

# Feature matrix / target vector for the fitting and validation windows.
X_train, y_train = train[cols], train["sales"]
X_val, y_val = val[cols], val["sales"]

y_train.shape, X_train.shape, y_val.shape, X_val.shape

In [None]:
def smape(y_pred, y_true):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Pairs where both the prediction and the actual value are zero are dropped
    from the sum, so they contribute nothing. The divisor is still the
    original number of observations, not the number of pairs kept.

    Args:
        y_pred: Array-like of predicted values.
        y_true: Array-like of actual values, same length as ``y_pred``.

    Returns:
        ``200 * sum(|pred - true| / (|pred| + |true|)) / len(y_true)``.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    total = len(actual)  # taken before masking on purpose

    # Keep a pair unless both sides are exactly zero (that would be 0 / 0).
    keep = (forecast != 0) | (actual != 0)
    forecast, actual = forecast[keep], actual[keep]

    ratios = np.abs(forecast - actual) / (np.abs(forecast) + np.abs(actual))
    return (200 * np.sum(ratios)) / total

def lgbm_smape(y_pred, y_true):
    """Custom evaluation metric for LightGBM: SMAPE on the original sales scale.

    Both inputs are mapped back from the log1p scale with ``expm1`` and then
    scored with ``smape``.

    NOTE(review): LightGBM's scikit-learn API documents custom metrics as
    ``func(y_true, y_pred)``, so the parameter names here appear to be in the
    opposite order to what is actually passed. The value is unaffected because
    ``smape`` treats its two arguments symmetrically — confirm before renaming.

    Args:
        y_pred: First positional array supplied by the caller (log1p scale).
        y_true: Second positional array supplied by the caller (log1p scale).

    Returns:
        tuple: ``("SMAPE", value, False)`` — metric name, metric value, and a
        flag that is ``False`` because lower values are better.
    """
    first_original_scale = np.expm1(y_true)
    second_original_scale = np.expm1(y_pred)
    score = smape(first_original_scale, second_original_scale)
    return "SMAPE", score, False

In [None]:
# Training callbacks: stop once 200 consecutive rounds bring no improvement,
# and log the evaluation results every 100 rounds.
es = early_stopping(stopping_rounds=200, verbose=True)
le = log_evaluation(period=100)

# Shallow trees with a small learning rate and a large round budget;
# the early-stopping callback decides where training actually ends.
model = LGBMRegressor(**{
    "num_leaves": 10,
    "learning_rate": 0.02,
    "feature_fraction": 0.8,
    "max_depth": 5,
    "n_estimators": 10000,
    "n_jobs": -1,
    "random_state": 42,
    "force_col_wise": True,
})

In [None]:
# Fit on the pre-2017 rows. Both the training data and the 2017 hold-out window
# are passed as evaluation sets and scored with the custom SMAPE metric;
# `es` and `le` are the early-stopping and logging callbacks.
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_val, y_val)],
          eval_metric=lgbm_smape,
          callbacks=[es, le])