In [1]:
"""
This is an upgraded version of Ceshine's LGBM starter script; it simply adds more
average features and weekly average features to it.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load the tail of the training file. The first 66,458,908 data rows are
# skipped (the header row 0 is kept) so that reading starts at 2016-01-01.
# unit_sales is transformed while parsing: positive values become
# log1p(value), everything else (zero or negative) becomes 0.
df_train = pd.read_csv(
    '../Data/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

# Test rows, indexed by (store_nbr, item_nbr, date) so they can be unstacked
# into a wide per-date table and joined against the predictions later on.
df_test = pd.read_csv(
    "../Data/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

# Item metadata keyed by item_nbr.
items = pd.read_csv(
    "../Data/items.csv",
).set_index("item_nbr")

# Keep only the 2017 rows. `pd.datetime` was deprecated in pandas 1.0 and
# removed in pandas 2.0; `pd.Timestamp` is available in every pandas version.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]
del df_train  # release the larger frame; only df_2017 is used from here on

# --- Promotion flags as a wide table -------------------------------------
# One row per (store_nbr, item_nbr), one column per date. Combinations that
# are absent on a given date are treated as "not on promotion" (False).
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the outer "onpromotion" column level so the columns are plain dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

# Same layout for the test period (df_test is already indexed by the three keys).
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align the test rows with the training rows; pairs missing from the test
# period get False for every test date.
promo_2017_test = (
    promo_2017_test
    .reindex(promo_2017_train.index)
    .fillna(False)
)

# Train dates followed by test dates, side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# --- Unit sales as a wide table ------------------------------------------
# Same layout as promo_2017; missing (store, item, date) cells become 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat the item metadata so that it has one row per row of df_2017,
# in the same order (index level 1 holds the item_nbr values).
items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window that starts before ``dt``.

    The window begins ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. ``df`` must have those dates as column labels.

    Returns the sub-frame made of exactly those columns, in date order.
    """
    window_start = dt - timedelta(days=minus)
    wanted_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_dates]

def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally targets) for a forecast start date.

    Reads the module-level wide tables ``df_2017`` (sales) and ``promo_2017``
    (promotion flags); every returned row corresponds to one row of those tables.

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day horizon; all history features look strictly
        before this date.
    is_train : bool, default True
        When True the 16 sales columns starting at ``t2017`` are returned as
        targets as well.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix.
    y : numpy.ndarray
        Only when ``is_train`` is True: array with one column per horizon day.
    """
    # Sales on the day right before the horizon.
    features = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    # Mean sales over the trailing N days.
    for span in (3, 7, 14, 30, 60, 140):
        sales_window = get_timespan(df_2017, t2017, span, span)
        features["mean_{}_2017".format(span)] = sales_window.mean(axis=1).values
    # Number of promotion days over the trailing N days.
    for span in (14, 60, 140):
        promo_window = get_timespan(promo_2017, t2017, span, span)
        features["promo_{}_2017".format(span)] = promo_window.sum(axis=1).values
    X = pd.DataFrame(features)

    # Same-weekday averages: 4 and 20 samples taken 7 days apart, one pair of
    # columns for each of the 7 possible weekday offsets.
    for dow in range(7):
        last_4 = get_timespan(df_2017, t2017, 28 - dow, 4, freq='7D')
        X['mean_4_dow{}_2017'.format(dow)] = last_4.mean(axis=1).values
        last_20 = get_timespan(df_2017, t2017, 140 - dow, 20, freq='7D')
        X['mean_20_dow{}_2017'.format(dow)] = last_20.mean(axis=1).values

    # Promotion flag for each of the 16 days being predicted.
    for ahead in range(16):
        target_day = t2017 + timedelta(days=ahead)
        X["promo_{}".format(ahead)] = promo_2017[target_day].values.astype(np.uint8)

    if not is_train:
        return X

    horizon = pd.date_range(t2017, periods=16)
    y = df_2017[horizon].values
    return X, y

print("Preparing dataset...")
t2017 = date(2017, 5, 31)

# Six training windows, each starting one week after the previous one.
weekly_samples = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(6)
]
# Stack the weekly feature frames / target arrays on top of each other.
X_train = pd.concat([X_week for X_week, _ in weekly_samples], axis=0)
y_train = np.concatenate([y_week for _, y_week in weekly_samples], axis=0)
del weekly_samples

# Hold-out window for validation, and the feature matrix for the test period.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

# Sample weights: 1.25 where items["perishable"] is 1, 1.0 where it is 0.
# They do not depend on the horizon step, so build them once instead of on
# every loop iteration. The factor 6 must match the number of weekly windows
# stacked into X_train.
train_weight = pd.concat([items["perishable"]] * 6) * 0.25 + 1
val_weight = items["perishable"] * 0.25 + 1

# LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keyword
# arguments of lgb.train in favour of callbacks. `lgb.log_evaluation` exists
# from 3.3 on; older releases keep using the original keywords so their
# logging behaviour is unchanged.
use_callbacks = hasattr(lgb, "log_evaluation")

# One independent model per day of the 16-day horizon.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)
    if use_callbacks:
        # Callbacks are stateful, so fresh ones are created for every model.
        stopping_kwargs = {
            "callbacks": [lgb.early_stopping(50), lgb.log_evaluation(100)]
        }
    else:
        stopping_kwargs = {"early_stopping_rounds": 50, "verbose_eval": 100}
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], **stopping_kwargs
    )
    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration found by early stopping; fall back to
    # MAX_ROUNDS when best_iteration is falsy.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# val_pred is a list of 16 per-step vectors; transpose to (rows, 16) to match y_val.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
# test_pred holds one prediction vector per horizon day; transpose it so that
# rows line up with df_2017's (store, item) index and columns with the dates.
y_test = np.array(test_pred).transpose()
forecast_dates = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_dates)
# Wide -> long: one row per (store, item, date) with a single unit_sales column.
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Attach predictions to the test ids; rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and clip to [0, 1000].
unit_sales_raw = np.expm1(submission["unit_sales"])
submission["unit_sales"] = np.clip(unit_sales_raw, 0, 1000)
submission.to_csv('lgb_one_step.csv', float_format='%.4f', index=None)


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Preparing dataset...
Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.301612	valid_1's l2: 0.293815
[200]	training's l2: 0.29817	valid_1's l2: 0.292499
[300]	training's l2: 0.295722	valid_1's l2: 0.292093
[400]	training's l2: 0.293651	valid_1's l2: 0.291912
[500]	training's l2: 0.291829	valid_1's l2: 0.291832
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.291829	valid_1's l2: 0.291832
mean_7_2017: 1793172.87
mean_14_2017: 1280385.50
mean_3_2017: 119248.42
day_1_2017: 106723.76
promo_0: 102762.79
mean_20_dow0_2017: 85479.26
mean_4_dow0_2017: 66631.02
mean_30_2017: 58354.90
promo_14_2017: 28719.31
mean_60_2017: 22867.35
promo_7: 9464.83
mean_140_2017: 6552.11
promo_60_2017: 6491.79
mean_4_dow5_2017: 6437.41
promo_140_2017: 6257.70
mean_20_dow4_2017: 5557.56
mean_4_dow6_2017: 4648.37
mean_20_dow2_2017: 4236.30
mean_4_dow2_2017: 3739.05
promo_14: 3027.22
promo_9: 2960.38
mean_20_dow6_2017: 2856.63
mean_20_dow3_2017: 2841.16
mean_4_dow1_2017: 2774.74
m

Step 7
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.346445	valid_1's l2: 0.42134
[200]	training's l2: 0.341976	valid_1's l2: 0.420205
[300]	training's l2: 0.338989	valid_1's l2: 0.419848
Early stopping, best iteration is:
[288]	training's l2: 0.339326	valid_1's l2: 0.419738
mean_14_2017: 1353933.60
mean_30_2017: 754967.81
mean_7_2017: 425467.29
mean_20_dow6_2017: 176268.20
mean_3_2017: 145641.40
mean_60_2017: 144782.98
promo_6: 127569.41
mean_4_dow6_2017: 100791.62
promo_14_2017: 21812.39
day_1_2017: 15671.83
promo_3: 11245.04
promo_7: 9688.54
mean_20_dow5_2017: 8087.85
mean_4_dow5_2017: 6503.44
promo_60_2017: 6161.62
promo_140_2017: 6123.16
mean_140_2017: 5973.32
mean_20_dow1_2017: 5139.13
promo_5: 4157.67
promo_13: 3621.19
mean_20_dow0_2017: 3404.38
mean_4_dow1_2017: 3155.92
promo_0: 3059.59
mean_20_dow3_2017: 2857.55
mean_4_dow0_2017: 2844.52
promo_4: 2657.59
promo_9: 2224.81
mean_20_dow4_2017: 2199.98
mean_4_dow2_2017: 2144.71
mean_20_dow2_20

[200]	training's l2: 0.365881	valid_1's l2: 0.376371
[300]	training's l2: 0.362647	valid_1's l2: 0.376202
Early stopping, best iteration is:
[269]	training's l2: 0.363549	valid_1's l2: 0.376164
mean_30_2017: 1315028.92
mean_14_2017: 867381.36
mean_60_2017: 337492.00
mean_7_2017: 300282.68
mean_3_2017: 196176.26
promo_12: 93763.20
mean_20_dow5_2017: 82840.27
mean_4_dow5_2017: 76112.38
promo_13: 19373.88
promo_14_2017: 17084.43
promo_14: 12470.75
promo_10: 11046.41
mean_140_2017: 9060.49
day_1_2017: 6499.91
promo_60_2017: 6395.81
promo_140_2017: 6299.86
mean_20_dow0_2017: 5850.94
mean_20_dow6_2017: 4452.34
mean_20_dow3_2017: 3960.20
mean_4_dow6_2017: 3920.97
promo_11: 3774.90
promo_9: 3312.85
mean_4_dow0_2017: 3283.99
mean_20_dow2_2017: 2895.39
promo_15: 2596.85
mean_20_dow4_2017: 2298.20
mean_20_dow1_2017: 2184.81
mean_4_dow2_2017: 2167.50
promo_0: 2160.31
promo_7: 2110.95
mean_4_dow3_2017: 2067.90
mean_4_dow1_2017: 2061.86
mean_4_dow4_2017: 2047.20
promo_8: 1669.84
promo_5: 1330.49
pro