In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df_train = pd.read_csv(
    '../Data/train_set_one_year.csv', usecols=[1, 2, 3, 4, 5],
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],dtype={'onpromotion': bool}
)

In [3]:
df_train.tail(5)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
23808256,2017-08-15,54,2089339,1.609438,False
23808257,2017-08-15,54,2106464,0.693147,True
23808258,2017-08-15,54,2110456,5.26269,False
23808259,2017-08-15,54,2113914,5.293305,True
23808260,2017-08-15,54,2116416,1.098612,False


In [4]:
df_test = pd.read_csv(
    "../Data/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [5]:
items = pd.read_csv(
    "../Data/items.csv",
).set_index("item_nbr") #In order to give weight to item perishable

In [6]:
promo_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [7]:
promo_train.columns = promo_train.columns.get_level_values(1)

In [8]:
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [9]:
promo_test = promo_test.reindex(promo_train.index).fillna(False)
promo_2017 = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [10]:
df_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_train.columns = df_train.columns.get_level_values(1)

In [11]:
items = items.reindex(df_train.index.get_level_values(1))

In [12]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [13]:
# def prepare_dataset(t2017, is_train=True):
#     X = pd.DataFrame({
#         "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel(),
#         "mean_3_2017": get_timespan(df_train, t2017, 3, 3).mean(axis=1).values,
#         "mean_7_2017": get_timespan(df_train, t2017, 7, 7).mean(axis=1).values,
#         "mean_14_2017": get_timespan(df_train, t2017, 14, 14).mean(axis=1).values,
#         "mean_16_2017": get_timespan(df_train, t2017, 16, 16).mean(axis=1).values,
#         "mean_30_2017": get_timespan(df_train, t2017, 30, 30).mean(axis=1).values,
#         "median_3_2017": get_timespan(df_train, t2017, 3, 3).median(axis=1).values,
#         "median_7_2017": get_timespan(df_train, t2017, 7, 7).median(axis=1).values,
#         "median_14_2017": get_timespan(df_train, t2017, 14, 14).median(axis=1).values,
#         "median_16_2017": get_timespan(df_train, t2017, 16, 16).median(axis=1).values,
#         "median_30_2017": get_timespan(df_train, t2017, 30, 30).median(axis=1).values,
#         "std_3_2017": get_timespan(df_train, t2017, 3, 3).std(axis=1).values,
#         "std_7_2017": get_timespan(df_train, t2017, 7, 7).std(axis=1).values,
#         "std_14_2017": get_timespan(df_train, t2017, 14, 14).std(axis=1).values,
#         "std_16_2017": get_timespan(df_train, t2017, 16, 16).std(axis=1).values,
#         "std_30_2017": get_timespan(df_train, t2017, 30, 30).std(axis=1).values,
#         "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
#         "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values 
#     })
#     for i in range(7):
#         X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
#     for i in range(16):
#         X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
#     if is_train:
#         y = df_train[
#             pd.date_range(t2017, periods=16)
#         ].values
#         return X, y
#     return X

In [14]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
          "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel()
        })
    for i in [3,7,14,30,60,140]:
            X["mean_"+str(i)] = get_timespan(df_train, t2017, i, i).mean(axis=1).values
            X["median_"+str(i)] = get_timespan(df_train, t2017, i, i).median(axis=1).values
            X["std_"+str(i)] = get_timespan(df_train, t2017, i, i).std(axis=1).values
            X["promo_"+str(i)] = get_timespan(promo_2017, t2017, i, i).sum(axis=1).values
            X["max_"+str(i)] = get_timespan(promo_2017, t2017, i, i).max(axis=1).values
            X["min_"+str(i)] = get_timespan(promo_2017, t2017, i, i).min(axis=1).values
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_train[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [15]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

Preparing dataset...


In [16]:
X_val, y_val = prepare_dataset(date(2017, 7, 26))

In [17]:
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [18]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=items["perishable"] * 0.25 + 1
    )
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.315138	valid_1's l2: 0.313303
[200]	training's l2: 0.291849	valid_1's l2: 0.293323
[300]	training's l2: 0.287378	valid_1's l2: 0.290963
[400]	training's l2: 0.284605	valid_1's l2: 0.290122
[500]	training's l2: 0.282467	valid_1's l2: 0.289784
[600]	training's l2: 0.28054	valid_1's l2: 0.289611
[700]	training's l2: 0.278726	valid_1's l2: 0.289458
[800]	training's l2: 0.276979	valid_1's l2: 0.289324
[900]	training's l2: 0.275365	valid_1's l2: 0.289273
Early stopping, best iteration is:
[947]	training's l2: 0.274617	valid_1's l2: 0.289247
mean_7: 5396161.24
mean_14: 3171275.71
promo_0: 304568.64
mean_20_dow0_2017: 201132.27
median_7: 189160.43
mean_30: 156295.28
median_14: 140117.57
mean_4_dow0_2017: 123287.36
day_1_2017: 115399.99
mean_3: 72015.69
mean_60: 71678.17
std_14: 41303.26
std_7: 40787.11
promo_7: 32929.38
median_60: 32772.82
min_3: 32175.98
std_140: 30178.26
median_3: 24708.79
std_30: 24392.37
p

Step 6
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.373195	valid_1's l2: 0.384967
[200]	training's l2: 0.350845	valid_1's l2: 0.360515
[300]	training's l2: 0.345584	valid_1's l2: 0.357428
[400]	training's l2: 0.34185	valid_1's l2: 0.356172
[500]	training's l2: 0.339058	valid_1's l2: 0.355916
[600]	training's l2: 0.336627	valid_1's l2: 0.355791
[700]	training's l2: 0.334315	valid_1's l2: 0.355739
Early stopping, best iteration is:
[729]	training's l2: 0.333685	valid_1's l2: 0.355673
mean_14: 4579253.13
mean_30: 2074965.64
mean_7: 1517324.21
median_60: 283204.06
mean_60: 255306.84
mean_3: 240023.91
promo_5: 227847.50
mean_4_dow5_2017: 182810.15
mean_20_dow5_2017: 159776.85
median_30: 52021.97
std_14: 36436.24
promo_30: 34109.86
promo_3: 29606.32
std_30: 28897.06
median_7: 25643.27
std_140: 25269.05
promo_6: 23149.79
std_60: 22008.96
mean_4_dow6_2017: 20104.07
median_3: 18533.41
min_3: 16923.22
mean_20_dow6_2017: 16336.79
promo_7: 16100.19
mean_140:

Step 11
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.396266	valid_1's l2: 0.395952
[200]	training's l2: 0.370819	valid_1's l2: 0.374842
[300]	training's l2: 0.364188	valid_1's l2: 0.37224
[400]	training's l2: 0.35982	valid_1's l2: 0.371306
[500]	training's l2: 0.356563	valid_1's l2: 0.371054
[600]	training's l2: 0.353969	valid_1's l2: 0.370934
[700]	training's l2: 0.35154	valid_1's l2: 0.370931
Early stopping, best iteration is:
[704]	training's l2: 0.351455	valid_1's l2: 0.370916
mean_30: 5244475.18
mean_14: 1753132.58
mean_7: 1419040.09
mean_60: 616239.90
mean_4_dow3_2017: 515264.43
mean_20_dow3_2017: 461241.98
promo_10: 286999.09
median_60: 263650.33
std_30: 57935.03
mean_4_dow4_2017: 56898.38
mean_3: 50951.64
std_140: 43128.16
promo_30: 37843.55
std_14: 37069.87
mean_4_dow2_2017: 27749.13
std_60: 24626.58
promo_140: 24540.52
median_7: 22882.40
promo_12: 22078.09
promo_11: 20919.30
promo_7: 18941.33
mean_140: 18336.77
promo_9: 18128.51
median_

Step 16
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.377658	valid_1's l2: 0.388166
[200]	training's l2: 0.35788	valid_1's l2: 0.371835
[300]	training's l2: 0.352394	valid_1's l2: 0.369907
[400]	training's l2: 0.3487	valid_1's l2: 0.369446
Early stopping, best iteration is:
[421]	training's l2: 0.34803	valid_1's l2: 0.3694
mean_30: 3718869.06
mean_14: 1320605.50
median_30: 692460.13
median_60: 517263.02
mean_60: 500316.89
promo_15: 404384.88
mean_7: 287443.94
mean_20_dow1_2017: 243781.54
std_140: 45060.10
mean_20_dow2_2017: 39116.03
mean_4_dow1_2017: 38729.85
median_14: 37689.36
median_7: 34834.02
std_60: 30491.22
promo_30: 30233.91
promo_14: 28808.96
mean_140: 27681.89
median_140: 27509.35
std_30: 23273.35
mean_3: 23167.98
promo_140: 19580.44
day_1_2017: 16943.30
mean_20_dow4_2017: 14867.89
promo_60: 14513.45
std_14: 11712.08
std_7: 10575.59
median_3: 10184.76
mean_20_dow0_2017: 10123.87
mean_4_dow2_2017: 9642.67
promo_10: 8657.88
promo_7: 7616.0

In [19]:
np.array(test_pred).shape

(16, 167515)

In [20]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../Result/modified_weight_promo_year_2.csv', float_format='%.4f', index=None)

Making submission...
