In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df_train = pd.read_csv(
    '../Data/train_set_one_year.csv', usecols=[1, 2, 3, 4, 5],
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],dtype={'onpromotion': bool}
)

In [3]:
df_train.tail(5)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
23808256,2017-08-15,54,2089339,1.609438,False
23808257,2017-08-15,54,2106464,0.693147,True
23808258,2017-08-15,54,2110456,5.26269,False
23808259,2017-08-15,54,2113914,5.293305,True
23808260,2017-08-15,54,2116416,1.098612,False


In [4]:
df_test = pd.read_csv(
    "../Data/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [5]:
items = pd.read_csv(
    "../Data/items.csv",
).set_index("item_nbr") #In order to give weight to item perishable

In [6]:
promo_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [7]:
promo_train.columns = promo_train.columns.get_level_values(1)

In [8]:
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [9]:
promo_test = promo_test.reindex(promo_train.index).fillna(False)
promo_2017 = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [10]:
df_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_train.columns = df_train.columns.get_level_values(1)

In [11]:
items = items.reindex(df_train.index.get_level_values(1))

In [12]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [13]:
# def prepare_dataset(t2017, is_train=True):
#     X = pd.DataFrame({
#         "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel(),
#         "mean_3_2017": get_timespan(df_train, t2017, 3, 3).mean(axis=1).values,
#         "mean_7_2017": get_timespan(df_train, t2017, 7, 7).mean(axis=1).values,
#         "mean_14_2017": get_timespan(df_train, t2017, 14, 14).mean(axis=1).values,
#         "mean_16_2017": get_timespan(df_train, t2017, 16, 16).mean(axis=1).values,
#         "mean_30_2017": get_timespan(df_train, t2017, 30, 30).mean(axis=1).values,
#         "median_3_2017": get_timespan(df_train, t2017, 3, 3).median(axis=1).values,
#         "median_7_2017": get_timespan(df_train, t2017, 7, 7).median(axis=1).values,
#         "median_14_2017": get_timespan(df_train, t2017, 14, 14).median(axis=1).values,
#         "median_16_2017": get_timespan(df_train, t2017, 16, 16).median(axis=1).values,
#         "median_30_2017": get_timespan(df_train, t2017, 30, 30).median(axis=1).values,
#         "std_3_2017": get_timespan(df_train, t2017, 3, 3).std(axis=1).values,
#         "std_7_2017": get_timespan(df_train, t2017, 7, 7).std(axis=1).values,
#         "std_14_2017": get_timespan(df_train, t2017, 14, 14).std(axis=1).values,
#         "std_16_2017": get_timespan(df_train, t2017, 16, 16).std(axis=1).values,
#         "std_30_2017": get_timespan(df_train, t2017, 30, 30).std(axis=1).values,
#         "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
#         "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values 
#     })
#     for i in range(7):
#         X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
#     for i in range(16):
#         X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
#     if is_train:
#         y = df_train[
#             pd.date_range(t2017, periods=16)
#         ].values
#         return X, y
#     return X

In [14]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
          "day_1_2017": get_timespan(df_train, t2017, 1, 1).values.ravel()
        })
    for i in [3,7,14,16,21,30,60,140]:
            X["mean_"+str(i)] = get_timespan(df_train, t2017, i, i).mean(axis=1).values
            X["median_"+str(i)] = get_timespan(df_train, t2017, i, i).median(axis=1).values
            X["std_"+str(i)] = get_timespan(df_train, t2017, i, i).std(axis=1).values
            X["promo_"+str(i)] = get_timespan(promo_2017, t2017, i, i).sum(axis=1).values
            X["max_"+str(i)] = get_timespan(promo_2017, t2017, i, i).max(axis=1).values
            X["min_"+str(i)] = get_timespan(promo_2017, t2017, i, i).min(axis=1).values
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_train, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_train[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [15]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

Preparing dataset...


In [16]:
X_val, y_val = prepare_dataset(date(2017, 7, 26))

In [17]:
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [19]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=items["perishable"] * 0.25 + 1
    )
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.314318	valid_1's l2: 0.309892
[200]	training's l2: 0.291895	valid_1's l2: 0.288639
[300]	training's l2: 0.2879	valid_1's l2: 0.284772
[400]	training's l2: 0.285527	valid_1's l2: 0.282451
[500]	training's l2: 0.283773	valid_1's l2: 0.280664
[600]	training's l2: 0.282196	valid_1's l2: 0.279068
[700]	training's l2: 0.280817	valid_1's l2: 0.277705
[800]	training's l2: 0.279469	valid_1's l2: 0.276358
[900]	training's l2: 0.278185	valid_1's l2: 0.275073
[1000]	training's l2: 0.276955	valid_1's l2: 0.273864
[1100]	training's l2: 0.275771	valid_1's l2: 0.272699
[1200]	training's l2: 0.274636	valid_1's l2: 0.271556
[1300]	training's l2: 0.273518	valid_1's l2: 0.270498
[1400]	training's l2: 0.27243	valid_1's l2: 0.269416
[1500]	training's l2: 0.27139	valid_1's l2: 0.268396
[1600]	training's l2: 0.270391	valid_1's l2: 0.267393
[1700]	training's l2: 0.269406	valid_1's l2: 0

Step 3
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.343473	valid_1's l2: 0.35966
[200]	training's l2: 0.320316	valid_1's l2: 0.33394
[300]	training's l2: 0.315477	valid_1's l2: 0.328492
[400]	training's l2: 0.312567	valid_1's l2: 0.325309
[500]	training's l2: 0.310516	valid_1's l2: 0.32312
[600]	training's l2: 0.308716	valid_1's l2: 0.321213
[700]	training's l2: 0.307051	valid_1's l2: 0.319464
[800]	training's l2: 0.305484	valid_1's l2: 0.317765
[900]	training's l2: 0.304028	valid_1's l2: 0.316223
[1000]	training's l2: 0.302611	valid_1's l2: 0.314713
[1100]	training's l2: 0.301278	valid_1's l2: 0.31329
[1200]	training's l2: 0.299958	valid_1's l2: 0.311915
[1300]	training's l2: 0.298697	valid_1's l2: 0.310575
[1400]	training's l2: 0.29748	valid_1's l2: 0.309303
[1500]	training's l2: 0.296272	valid_1's l2: 0.308016
[1600]	training's l2: 0.295115	valid_1's l2: 0.306807
[1700]	training's l2: 0.293983	valid_1's l2: 0.305618
[1800]	training's l2: 0.292

Step 5
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.379458	valid_1's l2: 0.377392
[200]	training's l2: 0.352736	valid_1's l2: 0.350952
[300]	training's l2: 0.347015	valid_1's l2: 0.345005
[400]	training's l2: 0.343555	valid_1's l2: 0.341474
[500]	training's l2: 0.341082	valid_1's l2: 0.339012
[600]	training's l2: 0.338958	valid_1's l2: 0.336885
[700]	training's l2: 0.337045	valid_1's l2: 0.334965
[800]	training's l2: 0.335231	valid_1's l2: 0.333194
[900]	training's l2: 0.333565	valid_1's l2: 0.331579
[1000]	training's l2: 0.331977	valid_1's l2: 0.329996
[1100]	training's l2: 0.330402	valid_1's l2: 0.328424
[1200]	training's l2: 0.328907	valid_1's l2: 0.326941
[1300]	training's l2: 0.327509	valid_1's l2: 0.325562
[1400]	training's l2: 0.326136	valid_1's l2: 0.324207
[1500]	training's l2: 0.324783	valid_1's l2: 0.322894
[1600]	training's l2: 0.323477	valid_1's l2: 0.321635
[1700]	training's l2: 0.322217	valid_1's l2: 0.320386
[1800]	training's l2: 

Step 7
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.373432	valid_1's l2: 0.435027
[200]	training's l2: 0.352296	valid_1's l2: 0.400639
[300]	training's l2: 0.34744	valid_1's l2: 0.39351
[400]	training's l2: 0.344406	valid_1's l2: 0.389667
[500]	training's l2: 0.342142	valid_1's l2: 0.386971
[600]	training's l2: 0.340201	valid_1's l2: 0.384644
[700]	training's l2: 0.338342	valid_1's l2: 0.382525
[800]	training's l2: 0.336657	valid_1's l2: 0.380526
[900]	training's l2: 0.335052	valid_1's l2: 0.378664
[1000]	training's l2: 0.33353	valid_1's l2: 0.376877
[1100]	training's l2: 0.332049	valid_1's l2: 0.375196
[1200]	training's l2: 0.330606	valid_1's l2: 0.373514
[1300]	training's l2: 0.32924	valid_1's l2: 0.371937
[1400]	training's l2: 0.327872	valid_1's l2: 0.370371
[1500]	training's l2: 0.326581	valid_1's l2: 0.368946
[1600]	training's l2: 0.325312	valid_1's l2: 0.367503
[1700]	training's l2: 0.324066	valid_1's l2: 0.366063
[1800]	training's l2: 0.32

Step 9
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.364204	valid_1's l2: 0.39118
[200]	training's l2: 0.345185	valid_1's l2: 0.366948
[300]	training's l2: 0.3405	valid_1's l2: 0.361547
[400]	training's l2: 0.337639	valid_1's l2: 0.358518
[500]	training's l2: 0.335416	valid_1's l2: 0.356245
[600]	training's l2: 0.333542	valid_1's l2: 0.354339
[700]	training's l2: 0.331822	valid_1's l2: 0.352612
[800]	training's l2: 0.330216	valid_1's l2: 0.350946
[900]	training's l2: 0.328638	valid_1's l2: 0.349373
[1000]	training's l2: 0.327146	valid_1's l2: 0.347824
[1100]	training's l2: 0.325704	valid_1's l2: 0.346382
[1200]	training's l2: 0.324293	valid_1's l2: 0.344914
[1300]	training's l2: 0.322927	valid_1's l2: 0.343458
[1400]	training's l2: 0.321588	valid_1's l2: 0.342061
[1500]	training's l2: 0.320301	valid_1's l2: 0.340732
[1600]	training's l2: 0.319054	valid_1's l2: 0.33944
[1700]	training's l2: 0.317829	valid_1's l2: 0.338197
[1800]	training's l2: 0.31

Step 11
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.396509	valid_1's l2: 0.392059
[200]	training's l2: 0.372077	valid_1's l2: 0.36813
[300]	training's l2: 0.366193	valid_1's l2: 0.362708
[400]	training's l2: 0.362467	valid_1's l2: 0.359561
[500]	training's l2: 0.359905	valid_1's l2: 0.357313
[600]	training's l2: 0.35771	valid_1's l2: 0.355351
[700]	training's l2: 0.355741	valid_1's l2: 0.353508
[800]	training's l2: 0.353907	valid_1's l2: 0.351864
[900]	training's l2: 0.352189	valid_1's l2: 0.3503
[1000]	training's l2: 0.350552	valid_1's l2: 0.34878
[1100]	training's l2: 0.34899	valid_1's l2: 0.347345
[1200]	training's l2: 0.347547	valid_1's l2: 0.346009
[1300]	training's l2: 0.346022	valid_1's l2: 0.344635
[1400]	training's l2: 0.344615	valid_1's l2: 0.343321
[1500]	training's l2: 0.343241	valid_1's l2: 0.34204
[1600]	training's l2: 0.341885	valid_1's l2: 0.340788
[1700]	training's l2: 0.340564	valid_1's l2: 0.33955
[1800]	training's l2: 0.33921

Step 13
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.396097	valid_1's l2: 0.389812
[200]	training's l2: 0.374168	valid_1's l2: 0.369592
[300]	training's l2: 0.368737	valid_1's l2: 0.364795
[400]	training's l2: 0.365466	valid_1's l2: 0.361984
[500]	training's l2: 0.362979	valid_1's l2: 0.359762
[600]	training's l2: 0.360851	valid_1's l2: 0.357879
[700]	training's l2: 0.358937	valid_1's l2: 0.356145
[800]	training's l2: 0.357178	valid_1's l2: 0.354591
[900]	training's l2: 0.355455	valid_1's l2: 0.353051
[1000]	training's l2: 0.353794	valid_1's l2: 0.351528
[1100]	training's l2: 0.352245	valid_1's l2: 0.350079
[1200]	training's l2: 0.350721	valid_1's l2: 0.348687
[1300]	training's l2: 0.349252	valid_1's l2: 0.347316
[1400]	training's l2: 0.34781	valid_1's l2: 0.345983
[1500]	training's l2: 0.346437	valid_1's l2: 0.34471
[1600]	training's l2: 0.345073	valid_1's l2: 0.343477
[1700]	training's l2: 0.343749	valid_1's l2: 0.342247
[1800]	training's l2: 0

Step 15
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.375962	valid_1's l2: 0.362253
[200]	training's l2: 0.35267	valid_1's l2: 0.341906
[300]	training's l2: 0.347558	valid_1's l2: 0.337547
[400]	training's l2: 0.344549	valid_1's l2: 0.334917
[500]	training's l2: 0.342326	valid_1's l2: 0.332988
[600]	training's l2: 0.340427	valid_1's l2: 0.331335
[700]	training's l2: 0.338704	valid_1's l2: 0.329806
[800]	training's l2: 0.337009	valid_1's l2: 0.328286
[900]	training's l2: 0.33543	valid_1's l2: 0.326887
[1000]	training's l2: 0.333913	valid_1's l2: 0.325531
[1100]	training's l2: 0.332454	valid_1's l2: 0.324266
[1200]	training's l2: 0.33108	valid_1's l2: 0.323031
[1300]	training's l2: 0.329733	valid_1's l2: 0.321829
[1400]	training's l2: 0.328409	valid_1's l2: 0.320643
[1500]	training's l2: 0.327144	valid_1's l2: 0.31949
[1600]	training's l2: 0.325904	valid_1's l2: 0.318334
[1700]	training's l2: 0.324685	valid_1's l2: 0.317225
[1800]	training's l2: 0.3

Validation mse: 0.2959389420983095


In [29]:
np.array(test_pred).shape

(16, 167515)

In [20]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../Result/modified_weight_promo_year_1.csv', float_format='%.4f', index=None)

Making submission...


In [27]:
y_test.shape

(167515, 16)