In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def _log1p_unit_sales(raw):
    """Convert a raw ``unit_sales`` field: log1p(value) when positive, else 0."""
    sales = float(raw)
    return np.log1p(sales) if sales > 0 else 0


# Training rows: CSV columns 1-5 (by position), with "date" parsed as
# datetimes, "onpromotion" read as bool, and unit_sales log1p-transformed
# while parsing (non-positive values are mapped to 0).
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4, 5],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_unit_sales},
)

In [3]:
# Test rows: CSV columns 0-4 (by position), keyed by (store_nbr, item_nbr, date)
# so they can later be joined against the per-day predictions.
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
).set_index(['store_nbr', 'item_nbr', 'date'])

In [4]:
# Item metadata keyed by item_nbr; its "perishable" column is used further
# down to build the per-row sample weights.
items = pd.read_csv("../Data/items.csv").set_index("item_nbr")

In [5]:
# Pivot the promotion flag to wide form: one row per (store_nbr, item_nbr),
# one column per date. Days with no record for a pair are treated as
# "not on promotion".
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)  # innermost index level, i.e. "date"
    .fillna(False)
)

In [6]:
# unstack() left two column levels ("onpromotion", date); keep only the dates.
promo_train_dates = promo_train.columns.get_level_values(1)
promo_train.columns = promo_train_dates

In [7]:
# Same wide pivot for the test period: rows are (store_nbr, item_nbr),
# columns are dates, missing days count as "not on promotion".
promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the constant "onpromotion" column level so columns are plain dates.
promo_test.columns = promo_test.columns.get_level_values(1)

In [8]:
# Put train and test promotion flags side by side on a shared date axis.
# The test frame is first aligned to the training rows; pairs that do not
# appear in the test set are filled with False.
promo_2017 = pd.concat(
    [promo_train, promo_test.reindex(promo_train.index).fillna(False)],
    axis=1,
)
# The two halves are no longer needed once combined.
del promo_train, promo_test

In [9]:
# Pivot unit_sales to wide form: one row per (store_nbr, item_nbr), one
# column per date, with 0 for days that have no sales record.
# NOTE: this rebinds df_train, so the cell cannot be re-run on its own --
# the key columns it indexes by no longer exist afterwards.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the constant "unit_sales" column level so columns are plain dates.
df_train.columns = df_train.columns.get_level_values(1)

In [10]:
# Align item metadata row-for-row with df_train by looking up each row's
# item_nbr (second level of df_train's index).
item_nbr_per_row = df_train.index.get_level_values(1)
items = items.reindex(item_nbr_per_row)

In [11]:
def get_timespan(df, dt, minus, periods):
    """Return the columns of ``df`` for a run of consecutive days.

    The run starts ``minus`` days before ``dt`` and covers ``periods`` daily
    columns; ``df`` is expected to have one column per date.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods)
    return df[window]

In [12]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally targets) anchored at ``t2017``.

    Features, one row per row of ``df_train``:
      * mean / median / std of sales over the 3, 7, 14 and 16 days that end
        the day before ``t2017``;
      * ``promo_14_2017``: number of promotion days in the 14 days before
        ``t2017``;
      * ``promo_0`` .. ``promo_15``: promotion flag (uint8) on ``t2017`` and
        each of the following 15 days.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` holds the
    ``df_train`` values for the 16 days starting at ``t2017``; otherwise
    returns ``X`` alone.
    """
    features = {}
    # Outer loop over statistic, inner over window length, so the columns
    # come out grouped by statistic: all means, then medians, then stds.
    for stat in ("mean", "median", "std"):
        for days in (3, 7, 14, 16):
            span = get_timespan(df_train, t2017, days, days)
            features["{}_{}_2017".format(stat, days)] = getattr(span, stat)(axis=1).values
    features["promo_14_2017"] = get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    X = pd.DataFrame(features)

    # Promotion flags for each day of the forecast horizon.
    for offset in range(16):
        day = t2017 + timedelta(days=offset)
        X["promo_{}".format(offset)] = promo_2017[day].values.astype(np.uint8)

    if not is_train:
        return X
    y = df_train[pd.date_range(t2017, periods=16)].values
    return X, y

In [13]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
# One (X, y) pair per anchor date: four anchors, one week apart, starting
# at t2017. The pairs are stacked row-wise into a single training set.
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(4)
]
X_train = pd.concat([X_week for X_week, _ in weekly_sets], axis=0)
y_train = np.concatenate([y_week for _, y_week in weekly_sets], axis=0)
del weekly_sets

Preparing dataset...


In [14]:
# Validation set: features and 16-day targets anchored at 2017-07-26.
val_start = date(2017, 7, 26)
X_val, y_val = prepare_dataset(val_start, is_train=True)

In [15]:
# Test features anchored at 2017-08-16; no targets are built (is_train=False).
test_start = date(2017, 8, 16)
X_test = prepare_dataset(test_start, is_train=False)

In [16]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

# Sample weights: 1.25 for perishable items, 1.0 otherwise. They do not
# depend on the forecast step, so build them once instead of on every
# iteration of the loop below.
val_weight = items["perishable"] * 0.25 + 1
# X_train is several anchor-date blocks stacked row-wise, each with one row
# per row of `items`; derive the block count rather than hard-coding it so
# the weights stay aligned if the number of training anchors changes.
n_train_blocks, leftover_rows = divmod(len(X_train), len(val_weight))
if leftover_rows:
    raise ValueError(
        "X_train has %d rows, which is not a whole multiple of the %d rows in items"
        % (len(X_train), len(val_weight))
    )
train_weight = pd.concat([val_weight] * n_train_blocks)

# One independent model per forecast day (column of y_train).
for i in range(y_train.shape[1]):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )

    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=val_weight
    )

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
    # lgb.train in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)] instead.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )

    # Features ranked by total gain, most important first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the early-stopped iteration; fall back to MAX_ROUNDS when
    # best_iteration is falsy.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# Unweighted MSE over all forecast days, on the log1p scale of the targets.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.442702	valid_1's l2: 0.42459
[100]	training's l2: 0.343199	valid_1's l2: 0.330369
[150]	training's l2: 0.326979	valid_1's l2: 0.315563
[200]	training's l2: 0.32286	valid_1's l2: 0.312049
[250]	training's l2: 0.321088	valid_1's l2: 0.310823
[300]	training's l2: 0.319856	valid_1's l2: 0.310099
[350]	training's l2: 0.318933	valid_1's l2: 0.309671
[400]	training's l2: 0.318192	valid_1's l2: 0.309466
[450]	training's l2: 0.317526	valid_1's l2: 0.309371
[500]	training's l2: 0.316931	valid_1's l2: 0.309294
[550]	training's l2: 0.316382	valid_1's l2: 0.309243
[600]	training's l2: 0.315822	valid_1's l2: 0.309155
[650]	training's l2: 0.315306	valid_1's l2: 0.309126
[700]	training's l2: 0.314835	valid_1's l2: 0.309107
Early stopping, best iteration is:
[676]	training's l2: 0.315065	valid_1's l2: 0.3091
mean_7_2017: 7939499.90
mean_14_2017: 3745661.56
mean_16_2017: 1562151.12
promo_0: 443305.87
median_16_2017: 2708

Step 6
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.493465	valid_1's l2: 0.514469
[100]	training's l2: 0.402348	valid_1's l2: 0.415113
[150]	training's l2: 0.386952	valid_1's l2: 0.396313
[200]	training's l2: 0.382592	valid_1's l2: 0.391182
[250]	training's l2: 0.380388	valid_1's l2: 0.389209
[300]	training's l2: 0.378764	valid_1's l2: 0.388194
[350]	training's l2: 0.377438	valid_1's l2: 0.387566
[400]	training's l2: 0.376477	valid_1's l2: 0.387211
[450]	training's l2: 0.375592	valid_1's l2: 0.386961
[500]	training's l2: 0.374855	valid_1's l2: 0.386896
[550]	training's l2: 0.374165	valid_1's l2: 0.38685
[600]	training's l2: 0.373525	valid_1's l2: 0.386793
[650]	training's l2: 0.372873	valid_1's l2: 0.386776
[700]	training's l2: 0.37224	valid_1's l2: 0.386762
[750]	training's l2: 0.371618	valid_1's l2: 0.386765
Early stopping, best iteration is:
[709]	training's l2: 0.372124	valid_1's l2: 0.386751
mean_16_2017: 10080513.92
mean_14_2017: 1663469.33


[450]	training's l2: 0.405455	valid_1's l2: 0.411623
[500]	training's l2: 0.404628	valid_1's l2: 0.411536
[550]	training's l2: 0.403876	valid_1's l2: 0.411483
[600]	training's l2: 0.403144	valid_1's l2: 0.411403
[650]	training's l2: 0.402466	valid_1's l2: 0.411381
[700]	training's l2: 0.401819	valid_1's l2: 0.411356
[750]	training's l2: 0.401194	valid_1's l2: 0.411329
[800]	training's l2: 0.400584	valid_1's l2: 0.411303
Early stopping, best iteration is:
[767]	training's l2: 0.400964	valid_1's l2: 0.411295
mean_14_2017: 7625177.73
mean_16_2017: 5043506.58
mean_7_2017: 1487607.92
promo_10: 427662.47
promo_14_2017: 195201.07
median_16_2017: 129165.79
median_7_2017: 117140.23
std_16_2017: 116922.49
median_14_2017: 61158.94
std_14_2017: 54364.88
promo_9: 53166.83
mean_3_2017: 52730.17
std_7_2017: 35431.35
promo_7: 35146.19
promo_14: 31218.19
promo_12: 29978.72
promo_11: 27075.19
promo_13: 22625.64
std_3_2017: 22111.04
median_3_2017: 19650.39
promo_8: 18930.80
promo_3: 15956.68
promo_0: 104

[350]	training's l2: 0.379959	valid_1's l2: 0.396873
[400]	training's l2: 0.379056	valid_1's l2: 0.396627
[450]	training's l2: 0.378286	valid_1's l2: 0.396526
[500]	training's l2: 0.377588	valid_1's l2: 0.396425
[550]	training's l2: 0.376939	valid_1's l2: 0.396396
[600]	training's l2: 0.37631	valid_1's l2: 0.396353
[650]	training's l2: 0.375726	valid_1's l2: 0.396325
[700]	training's l2: 0.375133	valid_1's l2: 0.396252
[750]	training's l2: 0.374573	valid_1's l2: 0.396278
Early stopping, best iteration is:
[713]	training's l2: 0.374991	valid_1's l2: 0.396249
mean_16_2017: 7922609.09
mean_14_2017: 1377842.44
median_16_2017: 596046.39
promo_15: 531755.71
median_14_2017: 206828.53
mean_7_2017: 196849.40
promo_14_2017: 168585.82
std_16_2017: 89409.26
median_7_2017: 62905.24
promo_14: 52759.24
std_14_2017: 40895.39
mean_3_2017: 38700.40
std_7_2017: 26089.37
std_3_2017: 20045.06
median_3_2017: 19811.68
promo_13: 16671.68
promo_10: 16302.64
promo_9: 15047.66
promo_7: 14013.88
promo_12: 12143.1

In [18]:
print("Making submission...")
# test_pred holds one prediction vector per forecast day; transpose so rows
# are (store_nbr, item_nbr) series and columns are days.
y_test = np.array(test_pred).T
forecast_days = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=forecast_days)
# Wide -> long: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; test rows with no matching prediction
# get 0, which stays 0 after the inverse transform below.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p applied to unit_sales at load time, then bound to [0, 1000].
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(lower=0, upper=1000)
submission.to_csv('modified_weight_promo_3.csv', float_format='%.4f', index=None)

Making submission...
