In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecaster-v2/__results__.html
/kaggle/input/m5-forecaster-v2/__output__.json
/kaggle/input/m5-forecaster-v2/custom.css
/kaggle/input/m5-forecaster-v2/sub_dt_lgb.csv
/kaggle/input/m5-forecaster-v2/__script__.ipynb
/kaggle/input/m5-forecaster-v2/Rplot001.png
/kaggle/input/m5baseline/data.pkl
/kaggle/input/m5baseline/submission.csv
/kaggle/input/m5-magic-blending/__results__.html
/kaggle/input/m5-magic-blending/__output__.json
/kaggle/input/m5-magic-blending/custom.css
/kaggle/input/m5-magic-blending/__notebook__.ipynb
/kaggle/input/m5-magic-blending/submission.csv
/kaggle/input/m5-magic-blending/__resultx__.html
/kaggle/input/lags-features/lags_df_28.pkl


In [2]:
import os
import gc
import warnings

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
register_matplotlib_converters()
sns.set()

In [3]:
def train_lgb(bst_params, fit_params, X, y, cv=None, drop_when_train=None):
    models = []

    if drop_when_train is None:
        drop_when_train = []
    
    if cv is None:   
        train_set = lgb.Dataset(
            X.drop(drop_when_train, axis=1),
            label=y,
            categorical_feature=["item_id"],)
        model = lgb.train(
            bst_params,
            train_set,
            valid_sets=[train_set],
            valid_names=["train"],
            **fit_params,
        )
        
        del X, y
        gc.collect()
        
        return [model]
    
    scores = []
    n_iters = []
    for idx_fold, (idx_trn, idx_val) in enumerate(cv.split(X, y)):
        print(f"\n----- Fold: ({idx_fold + 1} / {cv.get_n_splits()}) -----\n")

        X_trn, X_val = X.iloc[idx_trn], X.iloc[idx_val]
        y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
        train_set = lgb.Dataset(
            X_trn.drop(drop_when_train, axis=1),
            label=y_trn,
            categorical_feature=["item_id"],
        )
        val_set = lgb.Dataset(
            X_val.drop(drop_when_train, axis=1),
            label=y_val,
            categorical_feature=["item_id"],
        )

        model = lgb.train(
            bst_params,
            train_set,
            valid_sets=[train_set, val_set],
            valid_names=["train", "valid"],
            **fit_params,
        )
        scores += [model.best_score['valid']['rmse']]
        n_iters += [model.best_iteration]
        models.append(model)

        del idx_trn, idx_val, X_trn, X_val, y_trn, y_val
        gc.collect()
    
    print(f'cv score: {np.mean(scores)}')
    print('all scores: ', *scores)
    print('num_estimators', *n_iters)
    
    return models

class CustomTimeSeriesSplitter:
    def __init__(self, n_splits=5, train_days=80, test_days=20, day_col="d"):
        self.n_splits = n_splits
        self.train_days = train_days
        self.test_days = test_days
        self.day_col = day_col

    def split(self, X, y=None, groups=None):
        SEC_IN_DAY = 3600 * 24
        sec = (X[self.day_col] - X[self.day_col].iloc[0]) * SEC_IN_DAY
        duration = sec.max()

        train_sec = self.train_days * SEC_IN_DAY
        test_sec = self.test_days * SEC_IN_DAY
        total_sec = test_sec + train_sec

        if self.n_splits == 1:
            train_start = duration - total_sec
            train_end = train_start + train_sec

            train_mask = (sec >= train_start) & (sec < train_end)
            test_mask = sec >= train_end

            yield sec[train_mask].index.values, sec[test_mask].index.values

        else:
            # step = (duration - total_sec) / (self.n_splits - 1)
            step = DAYS_PRED * SEC_IN_DAY

            for idx in range(self.n_splits):
                # train_start = idx * step
                shift = (self.n_splits - (idx + 1)) * step
                train_start = duration - total_sec - shift
                train_end = train_start + train_sec
                test_end = train_end + test_sec

                train_mask = (sec > train_start) & (sec <= train_end)

                if idx == self.n_splits - 1:
                    test_mask = sec > train_end
                else:
                    test_mask = (sec > train_end) & (sec <= test_end)

                yield sec[train_mask].index.values, sec[test_mask].index.values

    def get_n_splits(self):
        return self.n_splits
    
def make_submission(test, submission):
    preds = test[["id", "date", "demand"]]
    preds = preds.pivot(index="id", columns="date", values="demand").reset_index()
    preds.columns = ["id"] + ["F" + str(d + 1) for d in range(DAYS_PRED)]

    vals = submission[["id"]].merge(preds, how="inner", on="id")
    evals = submission[submission["id"].str.endswith("evaluation")]
    final = pd.concat([vals, evals])

    assert final.drop("id", axis=1).isnull().sum().sum() == 0
    assert final["id"].equals(submission["id"])

    final.to_csv("submission.csv", index=False)

def on_kaggle():
    return "KAGGLE_KERNEL_RUN_TYPE" in os.environ

In [4]:
%%time

INPUT_DIR = "/kaggle/input" if on_kaggle() else "input"
 
data = pd.read_pickle(f"{INPUT_DIR}/m5baseline/data.pkl")    
submission = pd.read_csv(f"{INPUT_DIR}/m5-forecasting-accuracy/sample_submission.csv")
lag_features = pd.read_pickle(f"{INPUT_DIR}/lags-features/lags_df_28.pkl")   

NUM_ITEMS = 30490
DAYS_PRED = submission.shape[1] - 1  # 28

dt_col = "date"
day_col = "d"

print("start date:", data[dt_col].min())
print("end date:", data[dt_col].max())
print("data shape:", data.shape)
print("lag_features shape:", lag_features.shape)
print("submission shape:", submission.shape)

start date: 2014-05-23 00:00:00
end date: 2016-05-22 00:00:00
data shape: (22288190, 51)
lag_features shape: (46881677, 22)
submission shape: (60980, 29)
CPU times: user 4.81 s, sys: 22.9 s, total: 27.8 s
Wall time: 28 s


In [5]:
22288190/ 30490, 46881677 / 30490

(731.0, 1537.6082978025581)

In [6]:
display(data.sample(10))
display(lag_features.sample(10))
display(submission.sample(10))

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,shift_t28,shift_t29,shift_t30,rolling_std_t7,rolling_std_t30,rolling_std_t60,rolling_std_t90,rolling_std_t180,rolling_mean_t7,rolling_mean_t30,rolling_mean_t60,rolling_mean_t90,rolling_mean_t180,rolling_min_t7,rolling_min_t30,rolling_min_t60,rolling_max_t7,rolling_max_t30,rolling_max_t60,rolling_skew_t30,rolling_kurt_t30,price_change_t1,price_change_t365,rolling_price_std_t7,rolling_price_std_t30,year,quarter,month,week,day,dayofweek,is_weekend
12747642,FOODS_3_598_CA_1_validation,1210,2,0,0,0,1629,0,train,2015-07-15,11524,,,,,0,1,1,1.98,0.0,3.0,1.0,1.112697,2.873582,2.973708,2.88861,3.635367,0.714286,2.133333,2.733333,2.755556,3.044445,0.0,0.0,0.0,3.0,13.0,13.0,2.145797,6.051272,0.0,0.0,4.094996e-09,6.648725e-09,2015,3,7,29,15,2,0
1511172,FOODS_2_090_TX_2_validation,305,1,0,5,1,1260,0,train,2014-07-11,11423,,,,,0,1,1,8.96,1.0,0.0,0.0,0.534522,,,,,0.428571,,,,,0.0,,,1.0,,,,,0.0,,0.0,0.0,2014,3,7,28,11,4,0
4747513,HOBBIES_1_228_WI_1_validation,1657,3,1,7,2,1366,0,train,2014-10-25,11439,,,,,0,0,0,6.87,0.0,0.0,0.0,0.48795,0.621455,0.700282,0.670029,,0.285714,0.4,0.466667,0.422222,,0.0,0.0,0.0,1.0,2.0,3.0,1.329992,0.830803,0.0,,0.0,0.0,2014,4,10,43,25,5,1
20082974,FOODS_3_035_TX_3_validation,648,2,0,6,1,1869,1,train,2016-03-11,11606,,,,,0,1,1,4.18,0.0,0.0,3.0,1.154701,0.83666,0.922261,0.845713,0.963567,1.0,0.7,0.716667,0.677778,0.694444,0.0,0.0,0.0,3.0,3.0,3.0,1.014388,0.393083,0.0,0.0,2.935267e-08,0.0,2016,1,3,10,11,4,0
1351707,HOUSEHOLD_1_445_CA_4_validation,2437,5,2,3,0,1255,3,train,2014-07-06,11423,,,,,1,1,1,1.97,2.0,0.0,2.0,0.755929,,,,,1.285714,,,,,0.0,,,2.0,,,,,0.0,,0.0,0.0,2014,3,7,27,6,6,1
22102430,HOBBIES_1_237_WI_3_validation,1666,3,1,9,2,1935,0,validation,2016-05-16,11616,,,,,0,0,0,7.98,0.0,0.0,0.0,0.377964,0.592094,0.600141,0.546924,0.60237,0.142857,0.166667,0.25,0.244444,0.35,0.0,0.0,0.0,1.0,3.0,3.0,4.232194,19.245749,0.0,0.0,0.0,0.0,2016,2,5,20,16,0,0
19642835,HOUSEHOLD_2_081_CA_3_validation,2614,6,2,2,0,1855,0,train,2016-02-26,11604,,,,,0,0,0,2.57,1.0,0.0,0.0,0.48795,1.093345,0.922261,0.847704,0.934477,0.285714,0.666667,0.616667,0.577778,0.622222,0.0,0.0,0.0,1.0,5.0,5.0,2.424761,7.613113,0.0,0.0,0.0,0.0,2016,1,2,8,26,4,0
6565996,HOUSEHOLD_2_404_CA_4_validation,2936,6,2,3,0,1426,1,train,2014-12-24,11447,0.0,2.0,,,0,0,0,2.97,0.0,0.0,0.0,0.0,0.610257,0.492887,0.471537,0.488985,0.0,0.2,0.166667,0.188889,0.2,0.0,0.0,0.0,0.0,3.0,3.0,3.784509,15.850643,0.0,,0.0,0.0,2014,4,12,52,24,2,0
2551316,FOODS_3_128_TX_3_validation,740,2,0,6,1,1294,0,train,2014-08-14,11428,,,,,0,0,1,2.98,5.0,2.0,0.0,1.799471,1.098065,,,,1.285714,0.633333,,,,0.0,0.0,,5.0,5.0,,2.649356,8.377532,0.0,,0.0,0.0,2014,3,8,33,14,3,0
2809848,FOODS_1_110_CA_2_validation,107,0,0,1,0,1303,10,train,2014-08-23,11430,,,,,0,0,0,1.96,12.0,9.0,3.0,2.878492,4.816161,6.013544,,,8.571428,6.333333,5.8,,,3.0,0.0,0.0,12.0,18.0,27.0,0.304337,-0.439672,0.0,,0.0,0.08095979,2014,3,8,34,23,5,1


Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,rolling_max_14,rolling_mean_14,rolling_std_14,rolling_max_30,rolling_mean_30,rolling_std_30,rolling_max_60,rolling_mean_60,rolling_std_60,rolling_max_180,rolling_mean_180,rolling_std_180
27391471,FOODS_1_189_TX_2_validation,1293,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20907760,HOBBIES_1_176_TX_3_validation,1057,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.285645,0.46875,3.0,0.266602,0.639648,3.0,0.283447,0.584961,3.0,0.338867,0.570801
15182062,HOUSEHOLD_2_439_TX_3_validation,828,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.571289,1.089844,4.0,0.466553,0.860352,4.0,0.233276,0.647461,4.0,0.116638,0.426514
27986800,HOBBIES_2_055_CA_3_validation,1314,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.071411,0.267334,1.0,0.033325,0.182617,1.0,0.033325,0.18103,3.0,0.083313,0.379395
700280,HOBBIES_1_264_CA_1_validation,56,0.0,0.0,1.0,0.0,2.0,1.0,1.0,2.0,2.0,0.643066,0.745117,,,,,,,,,
40746120,HOBBIES_1_067_WI_1_validation,1740,6.0,0.0,2.0,6.0,31.0,2.0,3.0,12.0,31.0,7.214844,8.257812,36.0,7.898438,8.773438,36.0,9.484375,8.703125,56.0,9.078125,9.695312
10579237,FOODS_3_307_TX_2_validation,622,5.0,1.0,0.0,0.0,1.0,2.0,4.0,2.0,9.0,2.357422,2.273438,9.0,2.433594,1.977539,9.0,2.466797,1.827148,11.0,1.933594,1.886719
27052812,FOODS_3_004_WI_1_validation,1281,1.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.428467,1.158203,4.0,0.533203,1.136719,4.0,0.31665,0.853516,4.0,0.394531,0.842285
31120838,HOUSEHOLD_1_376_TX_1_validation,1421,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,4.0,0.928711,1.268555,4.0,0.733398,1.172852,4.0,0.899902,1.00293,4.0,0.916504,0.973633
16399732,FOODS_3_724_CA_1_validation,879,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.214233,0.425781,1.0,0.133301,0.345703,2.0,0.133301,0.38916,2.0,0.099976,0.318848


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
54209,FOODS_3_152_WI_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42003,FOODS_3_142_CA_4_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41603,FOODS_2_140_CA_4_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15406,HOBBIES_1_166_TX_2_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
45464,FOODS_3_554_TX_1_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29085,FOODS_1_034_WI_3_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46899,HOUSEHOLD_2_068_TX_2_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30472,FOODS_3_810_WI_3_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36129,FOODS_3_366_CA_2_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47124,HOUSEHOLD_2_294_TX_2_evaluation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
cv_params = {
    "n_splits": 3,
    "train_days": int(365 * 1.5),
    "test_days": DAYS_PRED,
    "day_col": day_col,
}
cv = CustomTimeSeriesSplitter(**cv_params)

In [8]:
features = [
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "sell_price",
    # demand features
    "shift_t28",
    "shift_t29",
    "shift_t30",
    # std
    "rolling_std_t7",
    "rolling_std_t30",
    "rolling_std_t60",
    "rolling_std_t90",
    "rolling_std_t180",
    # mean
    "rolling_mean_t7",
    "rolling_mean_t30",
    "rolling_mean_t60",
    "rolling_mean_t90",
    "rolling_mean_t180",
    # min
    "rolling_min_t7",
    "rolling_min_t30",
    "rolling_min_t60",
    # max
    "rolling_max_t7",
    "rolling_max_t30",
    "rolling_max_t60",
    # others
    "rolling_skew_t30",
    "rolling_kurt_t30",
    # price features
    "price_change_t1",
    "price_change_t365",
    "rolling_price_std_t7",
    "rolling_price_std_t30",
    # time features
    "year",
    "quarter",
    "month",
    "week",
    "day",
    "dayofweek",
    "is_weekend",
]

# prepare training and test data.
# 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913
# 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public)
# 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private)

del lag_features
del submission
gc.collect()

is_train = data["d"] < 1914

# Attach "d" to X_train for cross validation.
X_train = data[is_train][[day_col] + features].reset_index(drop=True)
y_train = data[is_train]["demand"].reset_index(drop=True)
X_test = data[~is_train][features].reset_index(drop=True)

# keep these two columns to use later.
id_date = data[~is_train][["id", "date"]].reset_index(drop=True)

del data
gc.collect()

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (21434470, 46)
X_test shape: (853720, 45)


In [9]:
bst_params = {
    "boosting_type": "gbdt",
    "metric": "rmse",
    "objective": "poisson",
    "n_jobs": -1,
    "seed": 42,
    "learning_rate": 0.1,
    "bagging_fraction": 0.7,
    "bagging_freq": 10,
    "colsample_bytree": 0.7,
}

fit_params = {
    "num_boost_round": 1000,
    "early_stopping_rounds": 50,
    "verbose_eval": 100,
}

models = train_lgb(
    bst_params, fit_params, X_train, y_train, cv=cv, drop_when_train=[day_col]
)

del X_train, y_train
gc.collect()


----- Fold: (1 / 3) -----

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.30107	valid's rmse: 2.4651
[200]	train's rmse: 2.23313	valid's rmse: 2.41855
[300]	train's rmse: 2.19491	valid's rmse: 2.4027
[400]	train's rmse: 2.16347	valid's rmse: 2.39353
[500]	train's rmse: 2.14089	valid's rmse: 2.38079
[600]	train's rmse: 2.11895	valid's rmse: 2.36965
[700]	train's rmse: 2.10416	valid's rmse: 2.36662
Early stopping, best iteration is:
[748]	train's rmse: 2.09457	valid's rmse: 2.36237

----- Fold: (2 / 3) -----

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.31326	valid's rmse: 2.30927
[200]	train's rmse: 2.23459	valid's rmse: 2.29707
Early stopping, best iteration is:
[217]	train's rmse: 2.22467	valid's rmse: 2.29413

----- Fold: (3 / 3) -----

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.29339	valid's rmse: 2.21394
[200]	train's rmse: 2.22894	valid's rmse: 2.17998
[300]	train's r

0

In [10]:
%debug

In [11]:
# train to submit
model = *train_lgb(
    {**bst_params, "num_boost_round": 700}, fit_params, X_train, y_train, cv=None, drop_when_train=[day_col]
)

preds= model.predict(X_test)

SyntaxError: can't use starred expression here (<ipython-input-11-402d836c0486>, line 5)

In [12]:
 preds= model[0].predict(X_test)

NameError: name 'model' is not defined

In [13]:
lgb.plot_importance(model[0], figsize=(15, 20))

NameError: name 'model' is not defined

In [14]:
submission = pd.read_csv(f"{INPUT_DIR}/m5-forecasting-accuracy/sample_submission.csv")
make_submission(id_date.assign(demand=preds), submission)
submission = pd.read_csv(f"submission.csv")

NameError: name 'preds' is not defined

In [15]:
# blending

subm2 = pd.read_csv(f'{INPUT_DIR}/m5-magic-blending/submission.csv').sort_values(by = 'id').reset_index(drop=True)
subm3 = pd.read_csv(f'{INPUT_DIR}/m5-forecaster-v2/sub_dt_lgb.csv').sort_values(by = 'id').reset_index(drop=True)

for c in submission.columns :
    if c != 'id' :
        submission[c] = 0.1*submission[c] + 0.2*subm2[c] + 0.7*subm3[c]
        
submission.to_csv('submission.csv',index=False)