In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df_train = pd.read_csv(
    '../Data/train_set.csv', usecols=[1, 2, 3, 4, 5],
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],dtype={'onpromotion': bool}
)

In [3]:
df_test = pd.read_csv(
    "../Data/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [4]:
items = pd.read_csv(
    "../Data/items.csv",
).set_index("item_nbr") #In order to give weight to item perishable

In [5]:
promo_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [6]:
promo_train.columns = promo_train.columns.get_level_values(1)

In [7]:
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [8]:
promo_test = promo_test.reindex(promo_train.index).fillna(False)
promo_2017 = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [9]:
df_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_train.columns = df_train.columns.get_level_values(1)

In [10]:
items = items.reindex(df_train.index.get_level_values(1))

In [11]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [12]:
# def prepare_dataset(t2017, is_train=True):
#     X = pd.DataFrame({
#         "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel(),
#         "mean_3_2017": get_timespan(df_train, t2017, 3, 3).mean(axis=1).values,
#         "mean_7_2017": get_timespan(df_train, t2017, 7, 7).mean(axis=1).values,
#         "mean_14_2017": get_timespan(df_train, t2017, 14, 14).mean(axis=1).values,
#         "mean_16_2017": get_timespan(df_train, t2017, 16, 16).mean(axis=1).values,
#         "mean_30_2017": get_timespan(df_train, t2017, 30, 30).mean(axis=1).values,
#         "median_3_2017": get_timespan(df_train, t2017, 3, 3).median(axis=1).values,
#         "median_7_2017": get_timespan(df_train, t2017, 7, 7).median(axis=1).values,
#         "median_14_2017": get_timespan(df_train, t2017, 14, 14).median(axis=1).values,
#         "median_16_2017": get_timespan(df_train, t2017, 16, 16).median(axis=1).values,
#         "median_30_2017": get_timespan(df_train, t2017, 30, 30).median(axis=1).values,
#         "std_3_2017": get_timespan(df_train, t2017, 3, 3).std(axis=1).values,
#         "std_7_2017": get_timespan(df_train, t2017, 7, 7).std(axis=1).values,
#         "std_14_2017": get_timespan(df_train, t2017, 14, 14).std(axis=1).values,
#         "std_16_2017": get_timespan(df_train, t2017, 16, 16).std(axis=1).values,
#         "std_30_2017": get_timespan(df_train, t2017, 30, 30).std(axis=1).values,
#         "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
#         "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values 
#     })
#     for i in range(7):
#         X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
#     for i in range(16):
#         X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
#     if is_train:
#         y = df_train[
#             pd.date_range(t2017, periods=16)
#         ].values
#         return X, y
#     return X

In [13]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
          "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel()
        })
    for i in [3,7,14,16,21,30]:
            X["mean_"+str(i)] = get_timespan(df_train, t2017, i, i).mean(axis=1).values
            X["median_"+str(i)] = get_timespan(df_train, t2017, i, i).median(axis=1).values
            X["std_"+str(i)] = get_timespan(df_train, t2017, i, i).std(axis=1).values
            X["promo_"+str(i)] = get_timespan(promo_2017, t2017, i, i).sum(axis=1).values
            X["max_"+str(i)] = get_timespan(promo_2017, t2017, i, i).max(axis=1).values
            X["min_"+str(i)] = get_timespan(promo_2017, t2017, i, i).min(axis=1).values
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_11_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 77-i, 4, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_train[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [15]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

Preparing dataset...


In [16]:
X_val, y_val = prepare_dataset(date(2017, 7, 26))

In [17]:
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [18]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=items["perishable"] * 0.25 + 1
    )
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.325351	valid_1's l2: 0.322473
[200]	training's l2: 0.301869	valid_1's l2: 0.302369
[300]	training's l2: 0.297426	valid_1's l2: 0.300188
[400]	training's l2: 0.294695	valid_1's l2: 0.299465
[500]	training's l2: 0.292506	valid_1's l2: 0.299155
[600]	training's l2: 0.290569	valid_1's l2: 0.299029
[700]	training's l2: 0.288707	valid_1's l2: 0.298941
[800]	training's l2: 0.286958	valid_1's l2: 0.298898
[900]	training's l2: 0.285282	valid_1's l2: 0.298871
[1000]	training's l2: 0.283692	valid_1's l2: 0.298795
[1100]	training's l2: 0.282126	valid_1's l2: 0.298777
Early stopping, best iteration is:
[1095]	training's l2: 0.282202	valid_1's l2: 0.29877
mean_7: 4836012.02
mean_14: 3074572.38
mean_16: 331768.12
promo_0: 313316.78
median_7: 216233.42
mean_4_dow0_2017: 197249.13
mean_30: 167631.28
mean_21: 133400.95
day_1_2017: 105830.82
mean_11_dow0_2017: 83591.92
mean_3: 72691.24
median_16: 45494.01
median_14: 3850

[600]	training's l2: 0.352954	valid_1's l2: 0.365222
[700]	training's l2: 0.350535	valid_1's l2: 0.365064
Early stopping, best iteration is:
[740]	training's l2: 0.349625	valid_1's l2: 0.364971
mean_4_dow4_2017: 6091904.97
mean_14: 3088820.30
mean_7: 616347.19
mean_3: 306277.31
mean_11_dow4_2017: 303441.08
mean_30: 301926.14
mean_21: 273798.34
mean_16: 230348.39
promo_4: 207110.24
mean_4_dow3_2017: 42016.65
std_30: 36178.80
median_3: 35341.19
promo_16: 26671.50
promo_5: 26081.69
mean_11_dow5_2017: 24890.13
median_7: 24432.71
std_21: 24126.56
promo_30: 22764.52
promo_21: 20210.03
promo_3: 19855.67
std_16: 17699.31
std_7: 16535.61
std_14: 15077.51
mean_11_dow3_2017: 14908.14
max_3: 14728.61
day_1_2017: 13539.12
std_3: 13376.96
promo_7: 12742.93
mean_11_dow2_2017: 12416.71
mean_11_dow6_2017: 11660.07
mean_4_dow0_2017: 11565.37
mean_11_dow1_2017: 11438.36
mean_11_dow0_2017: 11324.28
median_30: 11117.50
min_3: 11026.47
mean_4_dow5_2017: 10424.77
promo_6: 9387.87
mean_4_dow1_2017: 9288.80
pr

Step 10
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.377391	valid_1's l2: 0.405865
[200]	training's l2: 0.354116	valid_1's l2: 0.383125
[300]	training's l2: 0.348155	valid_1's l2: 0.380035
[400]	training's l2: 0.344411	valid_1's l2: 0.379188
[500]	training's l2: 0.3417	valid_1's l2: 0.378973
[600]	training's l2: 0.339246	valid_1's l2: 0.378912
Early stopping, best iteration is:
[620]	training's l2: 0.338808	valid_1's l2: 0.378902
mean_21: 2761709.25
mean_30: 1661493.18
mean_14: 1406937.41
mean_4_dow2_2017: 1287335.64
mean_7: 946458.38
promo_9: 401161.71
mean_11_dow2_2017: 239501.94
std_30: 66431.88
median_7: 47155.51
median_30: 43970.28
promo_30: 43129.07
median_14: 40868.76
promo_21: 36449.85
promo_2: 35571.09
mean_3: 33038.02
std_21: 27616.47
promo_16: 27350.00
mean_16: 23320.61
promo_10: 21596.27
mean_11_dow1_2017: 20585.41
mean_11_dow3_2017: 19630.22
promo_8: 18950.25
std_7: 17271.00
promo_7: 17062.60
mean_4_dow1_2017: 16812.36
mean_11_dow0_2

Step 15
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.381904	valid_1's l2: 0.379375
[200]	training's l2: 0.358167	valid_1's l2: 0.360913
[300]	training's l2: 0.35283	valid_1's l2: 0.359155
[400]	training's l2: 0.34949	valid_1's l2: 0.358767
[500]	training's l2: 0.346848	valid_1's l2: 0.35871
Early stopping, best iteration is:
[499]	training's l2: 0.346871	valid_1's l2: 0.358704
mean_30: 3123138.68
mean_21: 2674673.10
mean_7: 568081.81
promo_14: 504764.91
mean_4_dow0_2017: 401766.26
median_30: 395453.14
mean_14: 385967.14
mean_16: 301448.36
mean_11_dow0_2017: 129111.46
median_16: 87121.22
median_7: 81238.93
std_30: 56459.67
promo_7: 55303.41
mean_11_dow2_2017: 41281.84
promo_30: 39192.80
promo_0: 37511.03
mean_3: 31427.06
median_14: 29950.61
promo_15: 28680.79
promo_21: 27534.72
promo_12: 24414.59
promo_16: 22570.37
promo_13: 22440.22
mean_11_dow1_2017: 20600.98
mean_4_dow2_2017: 18981.08
day_1_2017: 18275.66
std_21: 18191.12
median_21: 15547.41
me

In [19]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../Result/modified_weight_promo_week_2.csv', float_format='%.4f', index=None)

Making submission...
