In [1]:
"""
This is an upgraded version of Ceshine's LGBM starter script that simply adds more
average features and weekly average features to it.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Training data: only columns 1-5 are read and every data row before the
# skiprows cutoff is skipped. unit_sales is transformed while parsing:
# log1p for positive values, 0 for everything else.
df_train = pd.read_csv(
    'input/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

# Test data, indexed by (store_nbr, item_nbr, date) so it can later be
# unstacked by date and joined with the predictions.
df_test = pd.read_csv(
    "input/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

# Item metadata keyed by item_nbr.
items = pd.read_csv(
    "input/items.csv",
).set_index("item_nbr")

# Keep only rows dated 2017-01-01 or later. pd.Timestamp is used instead of
# the pd.datetime alias, which was deprecated in pandas 1.0 and removed in 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]
del df_train

In [2]:
# Display the (rows, columns) size of the 2017 subset.
df_2017.shape

(23808261, 5)

In [3]:
# Pivot the training promotion flags: one row per (store_nbr, item_nbr), one
# column per date. Combinations absent from the data are filled with False.
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

# Same pivot for the test frame, then align its rows to the training rows;
# rows that exist only in training are filled with False.
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = (
    promo_2017_test
    .reindex(promo_2017_train.index)
    .fillna(False)
)

# Training dates followed by test dates, side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Pivot unit_sales the same way; missing combinations become 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
df_2017.columns = df_2017.columns.get_level_values(1)

# One items row per row of df_2017, looked up by the item_nbr index level.
items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window located relative to ``dt``.

    The window begins ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. The matching columns of ``df`` are returned.
    """
    window_start = dt - timedelta(days=minus)
    wanted_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_columns]

def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally targets) for the window starting at ``t2017``.

    Reads the module-level ``df_2017`` and ``promo_2017`` frames.

    Features, in column order:
      * ``day_1_2017``: value of the single day before ``t2017``.
      * ``mean_N_2017``: row mean over the N days before ``t2017``
        (N = 3, 7, 14, 30, 60, 140).
      * ``promo_N_2017``: row sum of promotion flags over the N days before
        ``t2017`` (N = 14, 60, 140).
      * ``mean_4_dow{i}_2017`` / ``mean_20_dow{i}_2017``: row mean over 4 / 20
        dates spaced 7 days apart, starting 28-i / 140-i days before ``t2017``.
      * ``promo_{i}``: promotion flag on ``t2017 + i`` days, i = 0..15, as uint8.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` holds the
    ``df_2017`` values for the 16 days starting at ``t2017``; otherwise
    returns ``X`` only.
    """
    # Assemble the base columns in a dict first; insertion order matches the
    # original literal (day_1, means, then promo sums).
    base_columns = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    for span in (3, 7, 14, 30, 60, 140):
        sales_window = get_timespan(df_2017, t2017, span, span)
        base_columns["mean_{}_2017".format(span)] = sales_window.mean(axis=1).values
    for span in (14, 60, 140):
        promo_window = get_timespan(promo_2017, t2017, span, span)
        base_columns["promo_{}_2017".format(span)] = promo_window.sum(axis=1).values
    X = pd.DataFrame(base_columns)

    # Weekly-strided averages: 4 and 20 dates, 7 days apart.
    for offset in range(7):
        short_run = get_timespan(df_2017, t2017, 28 - offset, 4, freq='7D')
        X['mean_4_dow{}_2017'.format(offset)] = short_run.mean(axis=1).values
        long_run = get_timespan(df_2017, t2017, 140 - offset, 20, freq='7D')
        X['mean_20_dow{}_2017'.format(offset)] = long_run.mean(axis=1).values

    # Promotion flag for each of the 16 days starting at t2017.
    for step in range(16):
        flag_day = t2017 + timedelta(days=step)
        X["promo_{}".format(step)] = promo_2017[flag_day].values.astype(np.uint8)

    if not is_train:
        return X

    target_days = pd.date_range(t2017, periods=16)
    y = df_2017[target_days].values
    return X, y

In [4]:
print("Preparing dataset...")
# Number of consecutive weekly training windows. Used both to build the
# training frames and to tile the per-row sample weights, so the two counts
# cannot drift apart.
N_TRAIN_WEEKS = 6
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(N_TRAIN_WEEKS):
    # Shift the window start by whole weeks.
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

# Sample weights are perishable * 0.25 + 1. They do not depend on the
# forecast step, so they are built once instead of on every loop iteration.
# The training weights repeat the per-row weights once per training window,
# matching the row order of X_train.
val_weight = items["perishable"] * 0.25 + 1
train_weight = pd.concat([items["perishable"]] * N_TRAIN_WEEKS) * 0.25 + 1

# One model per forecast step: column i of y is the target for day i.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Fall back to MAX_ROUNDS when best_iteration is falsy.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
# Rows follow df_2017's index, one column per forecast date; stack to long
# format so the result can be joined on (store_nbr, item_nbr, date).
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Test rows without a prediction get 0. Predictions are mapped back with
# expm1 (the inverse of the log1p applied at load time) and clipped to [0, 1000].
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=False)

Preparing dataset...
Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.301777	valid_1's l2: 0.29407
[200]	training's l2: 0.298198	valid_1's l2: 0.292699
[300]	training's l2: 0.295791	valid_1's l2: 0.29242
[400]	training's l2: 0.293698	valid_1's l2: 0.292212
[500]	training's l2: 0.29183	valid_1's l2: 0.292042
mean_7_2017: 1944148.64
mean_14_2017: 1149049.68
mean_30_2017: 126067.46
promo_0: 104085.02
mean_20_dow0_2017: 82034.91
day_1_2017: 76705.44
mean_3_2017: 69943.73
mean_4_dow0_2017: 59042.79
promo_14_2017: 28628.56
mean_60_2017: 24219.46
promo_7: 9040.54
mean_4_dow5_2017: 7291.80
promo_60_2017: 6685.58
mean_140_2017: 6441.20
promo_140_2017: 5762.20
mean_20_dow4_2017: 5633.53
mean_4_dow6_2017: 4987.13
mean_20_dow2_2017: 4031.89
mean_4_dow2_2017: 3853.97
mean_20_dow1_2017: 3190.93
promo_9: 2997.47
mean_4_dow3_2017: 2818.81
mean_4_dow1_2017: 2783.38
mean_20_dow3_2017: 2759.13
mean_4_dow4_2017: 2567.62
promo_14: 2538.95
mean_20_dow6_2017: 2343.79
mean_20_dow5_2017: 2034.40

Step 8
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.332742	valid_1's l2: 0.390067
[200]	training's l2: 0.328284	valid_1's l2: 0.388853
[300]	training's l2: 0.325326	valid_1's l2: 0.388599
Early stopping, best iteration is:
[297]	training's l2: 0.325424	valid_1's l2: 0.388493
mean_30_2017: 1202867.85
mean_14_2017: 1028549.77
mean_7_2017: 640305.13
promo_7: 181574.10
mean_20_dow0_2017: 162630.75
mean_60_2017: 101355.69
mean_4_dow0_2017: 62783.63
promo_0: 23871.92
day_1_2017: 19783.43
mean_3_2017: 17694.77
promo_14_2017: 13170.80
promo_60_2017: 12091.47
promo_14: 10412.20
promo_140_2017: 8820.41
mean_140_2017: 7372.59
mean_20_dow2_2017: 6141.55
promo_3: 5967.89
mean_20_dow4_2017: 4971.96
promo_5: 3467.43
promo_6: 2907.41
mean_4_dow5_2017: 2796.18
mean_4_dow6_2017: 2731.19
mean_20_dow1_2017: 2684.80
mean_20_dow3_2017: 2567.33
promo_9: 2492.33
mean_4_dow1_2017: 2358.24
mean_4_dow2_2017: 2342.24
promo_4: 2167.95
mean_20_dow6_2017: 2114.02
mean_20_dow5_

[400]	training's l2: 0.349653	valid_1's l2: 0.360783
Early stopping, best iteration is:
[358]	training's l2: 0.350615	valid_1's l2: 0.360664
mean_30_2017: 1481891.01
mean_14_2017: 459426.82
mean_7_2017: 367170.22
mean_60_2017: 246027.59
mean_20_dow6_2017: 221445.35
promo_13: 161702.97
mean_3_2017: 88083.14
mean_4_dow6_2017: 82157.45
day_1_2017: 24056.30
promo_14_2017: 16778.17
mean_4_dow5_2017: 11596.71
promo_60_2017: 10802.95
mean_20_dow5_2017: 10011.63
mean_140_2017: 9998.47
promo_14: 9490.52
promo_10: 8872.45
mean_20_dow1_2017: 7918.32
promo_140_2017: 6831.26
mean_20_dow0_2017: 6739.16
promo_6: 6313.49
promo_12: 5215.39
mean_20_dow3_2017: 3489.35
mean_4_dow0_2017: 3466.40
mean_4_dow1_2017: 3244.47
promo_0: 3203.18
mean_20_dow4_2017: 2857.14
promo_11: 2697.41
mean_4_dow3_2017: 2561.77
mean_20_dow2_2017: 2546.52
mean_4_dow2_2017: 2447.41
mean_4_dow4_2017: 2380.78
promo_9: 2324.16
promo_15: 2106.60
promo_8: 1612.07
promo_7: 1598.02
promo_2: 1040.94
promo_4: 510.85
promo_5: 506.15
promo