In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
# Load the training rows, reading only the columns at positions 1-5.
# unit_sales is transformed while parsing: log1p(value) for positive values,
# and a literal 0 for anything that is zero or negative.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
    converters={
        'unit_sales': lambda raw: np.log1p(float(raw)) if float(raw) > 0 else 0,
    },
)

In [3]:
# Load the test rows (columns at positions 0-4) and index them by
# (store_nbr, item_nbr, date).
df_test = (
    pd.read_csv(
        "../Data/test.csv",
        usecols=[0, 1, 2, 3, 4],
        parse_dates=["date"],
        dtype={'onpromotion': bool},
    )
    .set_index(['store_nbr', 'item_nbr', 'date'])
)

In [4]:
# Item metadata keyed by item_nbr. It is loaded so that the "perishable"
# column can be turned into per-row sample weights when training.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [5]:
# Pivot the promotion flag into a wide table: one row per (store_nbr, item_nbr),
# one column per date. Combinations with no row in df_train become False.
# The chain is kept as a single expression so no large intermediate frame
# stays referenced after the cell finishes.
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)

In [6]:
# Flatten the ("onpromotion", date) column MultiIndex down to its date level.
# The level is selected by name rather than by position so that re-running this
# cell on the already-flattened columns is a harmless no-op instead of raising
# IndexError ("Too many levels"). The level is named "date" because it comes
# from the "date" index level that was unstacked into the columns.
promo_train.columns = promo_train.columns.get_level_values("date")

In [7]:
# Same wide pivot for the test period: rows are (store_nbr, item_nbr), columns
# are the test dates, and missing combinations are filled with False.
promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the outer column level so the columns are plain dates.
promo_test.columns = promo_test.columns.get_level_values(1)

In [8]:
# Align the test-period promotions to the training rows (pairs that do not
# occur in the test set get False), then append the test dates as extra
# columns to the right of the training dates.
promo_2017 = pd.concat(
    [promo_train, promo_test.reindex(promo_train.index).fillna(False)],
    axis=1,
)
# The two halves are not needed on their own any more; free the memory.
del promo_train, promo_test

In [9]:
# Replace the long training frame with a wide table of unit_sales values:
# one row per (store_nbr, item_nbr), one column per date, 0 where a
# store/item/date combination has no row.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the column MultiIndex.
df_train.columns = df_train.columns.get_level_values(1)

In [10]:
# Expand the item metadata to one row per (store_nbr, item_nbr) row of
# df_train, in the same order, so row-aligned values can be read from it.
#
# Duplicate index labels are dropped first. On the first run this changes
# nothing for a uniquely-keyed items table, but once this cell has run,
# `items` holds each item_nbr once per store, and reindexing a frame whose
# index has duplicate labels raises ValueError. De-duplicating therefore makes
# the cell safe to re-run and yields the same result every time.
items = items.loc[~items.index.duplicated(keep="first")].reindex(
    df_train.index.get_level_values(1)
)

In [11]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window positioned relative to ``dt``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart.

    Args:
        df: frame whose column labels are dates.
        dt: reference date the window is positioned against.
        minus: number of days before ``dt`` at which the window starts.
        periods: number of dates in the window.
        freq: pandas frequency string giving the spacing between dates.

    Returns:
        The sub-frame of ``df`` restricted to the window's date columns.
    """
    window_start = dt - timedelta(days=minus)
    window_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_dates]

In [14]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally the targets) anchored at ``t2017``.

    Reads the module-level wide frames ``df_train`` (sales per date) and
    ``promo_2017`` (promotion flag per date). One feature row is produced per
    row of those frames.

    Features, in column order:
      * ``day_1_2017``: the value on the day before ``t2017``.
      * ``{mean,median,std}_{3,7,14,16,30}_2017``: row-wise statistic over the
        N days immediately preceding ``t2017``.
      * ``promo_{14,30}_2017``: row-wise sum of the promotion flags over the
        N days immediately preceding ``t2017``.
      * ``mean_4_dow{i}_2017`` (i = 0..6): mean over the four dates starting
        ``28 - i`` days before ``t2017`` and spaced 7 days apart.
      * ``promo_{i}`` (i = 0..15): promotion flag on day ``t2017 + i`` as uint8.

    Args:
        t2017: first day of the 16-day forecast window.
        is_train: when true, also return the targets.

    Returns:
        ``(X, y)`` when ``is_train`` is true, where ``y`` holds the ``df_train``
        values for the 16 days starting at ``t2017``; otherwise just ``X``.
    """
    features = {
        "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel(),
    }

    # Rolling statistics over trailing windows, grouped by statistic so the
    # resulting column order is mean_*, then median_*, then std_*.
    trailing_days = (3, 7, 14, 16, 30)
    for stat in ("mean", "median", "std"):
        for days in trailing_days:
            history = get_timespan(df_train, t2017, days, days)
            features["{}_{}_2017".format(stat, days)] = getattr(history, stat)(axis=1).values

    # Number of promotion days in the trailing 14 and 30 days.
    for days in (14, 30):
        promo_history = get_timespan(promo_2017, t2017, days, days)
        features["promo_{}_2017".format(days)] = promo_history.sum(axis=1).values

    X = pd.DataFrame(features)

    # Mean of four dates one week apart, for each of the seven weekday offsets.
    for offset in range(7):
        same_weekday = get_timespan(df_train, t2017, 28 - offset, 4, freq='7D')
        X['mean_4_dow{}_2017'.format(offset)] = same_weekday.mean(axis=1).values

    # Promotion flag for each of the 16 days being forecast.
    for step in range(16):
        forecast_day = t2017 + timedelta(days=step)
        X["promo_{}".format(step)] = promo_2017[forecast_day].values.astype(np.uint8)

    if not is_train:
        return X

    y = df_train[pd.date_range(t2017, periods=16)].values
    return X, y

In [15]:
print("Preparing dataset...")
# Build four training windows, each starting one week after the previous one
# (the first starts on 2017-06-21), and stack them row-wise.
t2017 = date(2017, 6, 21)
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(4)
]
X_train = pd.concat([features for features, _ in weekly_sets], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_sets], axis=0)
# The per-week pieces are now duplicated inside X_train / y_train.
del weekly_sets

Preparing dataset...


In [16]:
# Validation set: features and targets for the window starting 2017-07-26.
X_val, y_val = prepare_dataset(date(2017, 7, 26), is_train=True)

In [17]:
# Test features for the window starting 2017-08-16; no targets are built.
X_test = prepare_dataset(t2017=date(2017, 8, 16), is_train=False)

In [18]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

# Sample weights are the same for every forecast step, so build them once
# instead of once per loop iteration. Each row weighs 1 + 0.25 * perishable.
item_weight = items["perishable"] * 0.25 + 1

# X_train is several stacked copies of the per-(store, item) rows. Derive the
# number of copies from the data rather than hard-coding it, so the weights
# stay aligned if the number of training windows changes.
if len(item_weight) == 0 or len(X_train) % len(item_weight) != 0:
    raise ValueError(
        "X_train has %d rows, which is not a whole multiple of the %d item rows"
        % (len(X_train), len(item_weight))
    )
n_train_windows = len(X_train) // len(item_weight)
train_weight = pd.concat([item_weight] * n_train_windows)

# One independent model per forecast day: model i predicts column i of the targets.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )

    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=item_weight
    )

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by the
    # LightGBM version this notebook was run with. LightGBM >= 4.0 removed them
    # in favour of callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)];
    # switch when upgrading.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )

    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the early-stopped round count; fall back to MAX_ROUNDS when
    # no best iteration was recorded.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

# Unweighted MSE over all 16 forecast steps of the validation window.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.428629	valid_1's l2: 0.420654
[100]	training's l2: 0.327646	valid_1's l2: 0.324848
[150]	training's l2: 0.30975	valid_1's l2: 0.308677
[200]	training's l2: 0.304843	valid_1's l2: 0.304818
[250]	training's l2: 0.302383	valid_1's l2: 0.30322
[300]	training's l2: 0.300737	valid_1's l2: 0.302494
[350]	training's l2: 0.29939	valid_1's l2: 0.302056
[400]	training's l2: 0.298281	valid_1's l2: 0.301841
[450]	training's l2: 0.297294	valid_1's l2: 0.301691
[500]	training's l2: 0.29635	valid_1's l2: 0.301572
[550]	training's l2: 0.29546	valid_1's l2: 0.301488
[600]	training's l2: 0.294606	valid_1's l2: 0.301423
[650]	training's l2: 0.293756	valid_1's l2: 0.301358
[700]	training's l2: 0.292952	valid_1's l2: 0.301311
[750]	training's l2: 0.292166	valid_1's l2: 0.30126
[800]	training's l2: 0.291388	valid_1's l2: 0.301245
[850]	training's l2: 0.290659	valid_1's l2: 0.301255
Early stopping, best iteration is:
[813]	tra

[350]	training's l2: 0.369065	valid_1's l2: 0.374134
[400]	training's l2: 0.367413	valid_1's l2: 0.373628
[450]	training's l2: 0.36594	valid_1's l2: 0.373231
[500]	training's l2: 0.364666	valid_1's l2: 0.373089
[550]	training's l2: 0.363473	valid_1's l2: 0.372953
[600]	training's l2: 0.362336	valid_1's l2: 0.372837
[650]	training's l2: 0.361212	valid_1's l2: 0.372737
[700]	training's l2: 0.360161	valid_1's l2: 0.372712
Early stopping, best iteration is:
[673]	training's l2: 0.360696	valid_1's l2: 0.372689
mean_4_dow4_2017: 6491116.21
mean_14_2017: 2926207.37
mean_30_2017: 515391.84
mean_16_2017: 483864.46
mean_7_2017: 425575.12
mean_3_2017: 226591.50
promo_4: 205460.46
promo_14_2017: 64254.30
mean_4_dow3_2017: 59969.76
std_30_2017: 52950.52
promo_30_2017: 47844.89
median_30_2017: 35778.68
median_3_2017: 33661.37
std_16_2017: 33366.81
promo_5: 24768.62
median_7_2017: 22775.36
promo_3: 19763.93
std_7_2017: 17622.96
std_14_2017: 16786.89
std_3_2017: 15795.44
day_1_2017: 14997.66
mean_4_do

Step 10
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.476463	valid_1's l2: 0.511077
[100]	training's l2: 0.381929	valid_1's l2: 0.412969
[150]	training's l2: 0.36456	valid_1's l2: 0.394226
[200]	training's l2: 0.359203	valid_1's l2: 0.389235
[250]	training's l2: 0.356108	valid_1's l2: 0.387364
[300]	training's l2: 0.3537	valid_1's l2: 0.386308
[350]	training's l2: 0.351859	valid_1's l2: 0.385889
[400]	training's l2: 0.350438	valid_1's l2: 0.385661
[450]	training's l2: 0.349081	valid_1's l2: 0.385466
[500]	training's l2: 0.347824	valid_1's l2: 0.385331
[550]	training's l2: 0.346711	valid_1's l2: 0.3853
[600]	training's l2: 0.345694	valid_1's l2: 0.385299
Early stopping, best iteration is:
[567]	training's l2: 0.346363	valid_1's l2: 0.385265
mean_30_2017: 3436848.94
mean_14_2017: 2281086.54
mean_4_dow2_2017: 1397879.26
mean_7_2017: 806739.84
promo_9: 388807.14
mean_16_2017: 240580.82
median_30_2017: 170282.79
std_30_2017: 91980.40
promo_30_2017: 8453

Step 15
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.480883	valid_1's l2: 0.472237
[100]	training's l2: 0.385365	valid_1's l2: 0.382669
[150]	training's l2: 0.368143	valid_1's l2: 0.367969
[200]	training's l2: 0.362942	valid_1's l2: 0.364515
[250]	training's l2: 0.360162	valid_1's l2: 0.363427
[300]	training's l2: 0.358075	valid_1's l2: 0.362945
[350]	training's l2: 0.356406	valid_1's l2: 0.362645
[400]	training's l2: 0.35512	valid_1's l2: 0.36253
Early stopping, best iteration is:
[397]	training's l2: 0.355191	valid_1's l2: 0.362523
mean_30_2017: 5016608.25
mean_16_2017: 715774.93
median_30_2017: 702854.19
mean_14_2017: 607180.39
promo_14: 530692.32
mean_7_2017: 462850.08
mean_4_dow0_2017: 421339.91
median_16_2017: 147562.41
std_30_2017: 75369.18
promo_30_2017: 69397.88
median_7_2017: 52209.39
promo_7: 49702.32
promo_14_2017: 45344.52
promo_0: 44024.00
mean_4_dow2_2017: 30399.07
mean_3_2017: 29344.66
promo_13: 24047.06
promo_12: 21003.79
median_1

In [20]:
print("Making submission...")
# test_pred holds one prediction vector per forecast step; transpose so rows
# are (store, item) pairs and columns are the 16 forecast dates.
y_test = np.array(test_pred).transpose()
forecast_dates = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=forecast_dates)

# Back to long format: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; ids without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied to unit_sales at load time and bound the
# result to [0, 1000].
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(0, 1000)
submission.to_csv('modified_weight_promo_week.csv', float_format='%.4f', index=None)

Making submission...
