In [2]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib as plp
import lightgbm as lgb

In [2]:
def _log1p_sales(raw):
    """Convert one raw `unit_sales` field to log1p scale; non-positive values become 0."""
    sales = float(raw)
    return np.log1p(sales) if sales > 0 else 0


# Load the training rows, transforming `unit_sales` while parsing.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4, 5],
    converters={'unit_sales': _log1p_sales},
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)

In [3]:
# Load the test rows and key them by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [4]:
# Item metadata keyed by item_nbr; its `perishable` column is used later
# to build the per-sample training weights.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [5]:
# Pivot the promotion flag into a wide frame: one row per (store_nbr, item_nbr),
# one column per date. Cells with no source row are filled with False.
promo_train = df_train.set_index(["store_nbr", "item_nbr", "date"])
promo_train = promo_train[["onpromotion"]].unstack(level=-1)
promo_train = promo_train.fillna(False)

In [6]:
# Flatten the two-level column index left by unstack(): keep only the date level.
promo_train_dates = promo_train.columns.get_level_values(1)
promo_train.columns = promo_train_dates

In [7]:
# Same wide promotion layout for the test period: rows are (store_nbr, item_nbr),
# columns are dates, missing cells become False.
promo_test = df_test[["onpromotion"]].unstack(level=-1)
promo_test = promo_test.fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)

In [9]:
# Align the test promotions to the training rows (pairs absent from the test
# frame are treated as not on promotion), then join both periods column-wise.
promo_test = promo_test.reindex(promo_train.index)
promo_test = promo_test.fillna(False)
promo_2017 = pd.concat([promo_train, promo_test], axis=1)
# The per-period frames are no longer needed once combined.
del promo_train, promo_test

In [10]:
# Replace the long training frame with a wide one: rows are (store_nbr, item_nbr),
# columns are dates, values are the transformed unit_sales (0 where no row existed).
# NOTE: this rebinds `df_train`, so the cell cannot be re-run without reloading the data.
df_train = df_train.set_index(["store_nbr", "item_nbr", "date"])
df_train = df_train[["unit_sales"]].unstack(level=-1).fillna(0)
df_train.columns = df_train.columns.get_level_values(1)

In [11]:
# Repeat/reorder the item metadata so that it has one row per df_train row,
# looked up by the item_nbr level (level 1) of df_train's row index.
item_order = df_train.index.get_level_values(1)
items = items.reindex(item_order)

In [12]:
def get_timespan(df, dt, minus, periods):
    """Return the columns of `df` for a run of consecutive days.

    The run starts `minus` days before `dt` and contains `periods` daily
    columns. `df` must have those dates among its column labels.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

In [13]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for a 16-day window.

    Reads the notebook-level wide frames `df_train` (sales) and `promo_2017`
    (promotion flags), both of which have one column per date.

    Parameters
    ----------
    t2017 : datetime.date, datetime.datetime or pandas.Timestamp
        First day of the 16-day window.
    is_train : bool, default True
        When True, also return the `df_train` values for the 16 days starting
        at `t2017` as the target array.

    Returns
    -------
    X : pandas.DataFrame
        Mean / median / std of sales over the 3, 7, 14 and 16 days before
        `t2017`, the promotion count over the 14 days before `t2017`, and one
        promotion flag (uint8) for each of the 16 days starting at `t2017`.
    y : numpy.ndarray
        Only when `is_train` is True; returned as the tuple `(X, y)`.
    """
    # Normalise once so every column lookup below uses a Timestamp key; a plain
    # datetime.date is not reliably accepted as a DatetimeIndex key across
    # pandas versions.
    t2017 = pd.Timestamp(t2017)

    # Slice each look-back window once and reuse it for all three statistics.
    windows = (3, 7, 14, 16)
    spans = {w: get_timespan(df_train, t2017, w, w) for w in windows}

    # Insertion order matches the original column order: all means, then all
    # medians, then all stds, then the promotion count.
    features = {}
    for stat in ("mean", "median", "std"):
        for w in windows:
            features["{}_{}_2017".format(stat, w)] = getattr(spans[w], stat)(axis=1).values
    features["promo_14_2017"] = get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    X = pd.DataFrame(features)

    # Promotion flag for each day of the window itself.
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)

    if is_train:
        y = df_train[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [14]:
print("Preparing dataset...")
# Nine training windows, each starting one week after the previous one.
# NOTE(review): the last window starts 2017-07-11, so its 16 label days run
# through 2017-07-26 and overlap the validation window that begins 2017-07-23
# -- confirm this overlap is intended.
t2017 = date(2017, 5, 16)
weekly_samples = [
    prepare_dataset(t2017 + timedelta(weeks=week)) for week in range(9)
]
X_train = pd.concat([features for features, _ in weekly_samples], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_samples], axis=0)
del weekly_samples

Preparing dataset...


In [15]:
# Validation window: 16 days starting 2017-07-23.
val_start = date(2017, 7, 23)
X_val, y_val = prepare_dataset(val_start)

In [16]:
# Test window: 16 days starting 2017-08-16; no targets exist, so features only.
test_start = date(2017, 8, 16)
X_test = prepare_dataset(test_start, is_train=False)

In [17]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

# Per-row sample weights: 1.25 where items["perishable"] is 1, otherwise 1.
# They do not depend on the forecast step, so build them once instead of on
# every loop iteration.
item_weight = items["perishable"] * 0.25 + 1
# X_train stacks one copy of the row set per training window; derive the number
# of copies from the data rather than repeating the window count as a literal.
n_train_windows = len(X_train) // len(items)
train_weight = pd.concat([item_weight] * n_train_windows)

# One independent model per forecast day (column i of the target arrays).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )

    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=item_weight
    )

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by the
    # LightGBM version this notebook was run with; newer releases expect
    # callbacks instead -- confirm against the installed version before upgrading.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )

    # Report features by total gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Fall back to the full round budget when no best iteration was recorded.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

# val_pred is (steps, rows); transpose to match y_val's (rows, steps) layout.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.445759	valid_1's l2: 0.573596
[100]	training's l2: 0.353489	valid_1's l2: 0.454515
[150]	training's l2: 0.338403	valid_1's l2: 0.429042
[200]	training's l2: 0.334211	valid_1's l2: 0.421773
[250]	training's l2: 0.332217	valid_1's l2: 0.418703
[300]	training's l2: 0.330931	valid_1's l2: 0.416977
[350]	training's l2: 0.329947	valid_1's l2: 0.41615
[400]	training's l2: 0.329279	valid_1's l2: 0.415644
[450]	training's l2: 0.328646	valid_1's l2: 0.415435
[500]	training's l2: 0.328112	valid_1's l2: 0.415194
[550]	training's l2: 0.327617	valid_1's l2: 0.414991
[600]	training's l2: 0.32715	valid_1's l2: 0.4149
[650]	training's l2: 0.326708	valid_1's l2: 0.41477
[700]	training's l2: 0.326288	valid_1's l2: 0.414606
[750]	training's l2: 0.325877	valid_1's l2: 0.414494
[800]	training's l2: 0.325478	valid_1's l2: 0.414442
[850]	training's l2: 0.325101	valid_1's l2: 0.414426
[900]	training's l2: 0.324723	valid_1's l2:

[300]	training's l2: 0.402634	valid_1's l2: 0.408256
[350]	training's l2: 0.401102	valid_1's l2: 0.407845
[400]	training's l2: 0.400024	valid_1's l2: 0.407619
[450]	training's l2: 0.399132	valid_1's l2: 0.4074
[500]	training's l2: 0.398346	valid_1's l2: 0.407269
[550]	training's l2: 0.397672	valid_1's l2: 0.407192
[600]	training's l2: 0.397024	valid_1's l2: 0.407177
[650]	training's l2: 0.396424	valid_1's l2: 0.407098
[700]	training's l2: 0.395856	valid_1's l2: 0.407074
[750]	training's l2: 0.395316	valid_1's l2: 0.406948
Early stopping, best iteration is:
[742]	training's l2: 0.395404	valid_1's l2: 0.406935
mean_16_2017: 17537940.44
mean_3_2017: 3206783.99
mean_14_2017: 2461606.31
median_16_2017: 918520.13
promo_5: 436543.57
median_3_2017: 243097.77
promo_14_2017: 216728.48
std_16_2017: 169558.09
mean_7_2017: 132162.44
promo_4: 47667.13
std_3_2017: 47211.26
promo_8: 46118.90
std_14_2017: 44971.33
promo_6: 44936.49
promo_7: 41087.80
std_7_2017: 31963.00
promo_1: 28293.20
promo_3: 25561

[850]	training's l2: 0.367555	valid_1's l2: 0.492347
[900]	training's l2: 0.367168	valid_1's l2: 0.492213
[950]	training's l2: 0.366784	valid_1's l2: 0.492113
[1000]	training's l2: 0.366417	valid_1's l2: 0.492076
[1050]	training's l2: 0.366068	valid_1's l2: 0.492025
[1100]	training's l2: 0.365698	valid_1's l2: 0.491862
[1150]	training's l2: 0.365338	valid_1's l2: 0.49181
[1200]	training's l2: 0.364992	valid_1's l2: 0.491761
[1250]	training's l2: 0.364649	valid_1's l2: 0.491703
Early stopping, best iteration is:
[1225]	training's l2: 0.364818	valid_1's l2: 0.491665
mean_14_2017: 7850626.34
mean_16_2017: 5358190.27
mean_7_2017: 2173634.59
promo_9: 590428.79
median_7_2017: 422740.65
median_16_2017: 289445.50
promo_14_2017: 266358.55
std_16_2017: 119212.09
median_14_2017: 106918.19
std_14_2017: 68373.14
mean_3_2017: 67445.58
promo_8: 65356.50
std_7_2017: 49320.10
std_3_2017: 42262.10
promo_10: 39933.50
promo_11: 37293.59
median_3_2017: 35921.50
promo_12: 24779.61
promo_1: 24720.24
promo_4:

[650]	training's l2: 0.383364	valid_1's l2: 0.525846
[700]	training's l2: 0.382924	valid_1's l2: 0.525774
[750]	training's l2: 0.382469	valid_1's l2: 0.525673
[800]	training's l2: 0.382029	valid_1's l2: 0.525565
[850]	training's l2: 0.381608	valid_1's l2: 0.525481
[900]	training's l2: 0.381188	valid_1's l2: 0.525404
[950]	training's l2: 0.380782	valid_1's l2: 0.525333
[1000]	training's l2: 0.380387	valid_1's l2: 0.525286
[1050]	training's l2: 0.380024	valid_1's l2: 0.525227
[1100]	training's l2: 0.379637	valid_1's l2: 0.525152
[1150]	training's l2: 0.379259	valid_1's l2: 0.525194
Early stopping, best iteration is:
[1102]	training's l2: 0.379624	valid_1's l2: 0.525151
mean_16_2017: 12672300.93
mean_14_2017: 2115492.84
mean_7_2017: 1789706.39
promo_14: 867710.97
median_16_2017: 514645.20
promo_14_2017: 248914.91
std_16_2017: 146364.18
promo_15: 120813.04
mean_3_2017: 116682.61
median_7_2017: 112066.14
promo_13: 83582.44
promo_0: 72736.88
promo_11: 71489.65
promo_7: 67537.36
std_14_2017: 

In [19]:
print("Making submission...")
# test_pred is a list of 16 per-day prediction vectors; transpose to (rows, days).
y_test = np.array(test_pred).transpose()
# Back to long format: one row per (store_nbr, item_nbr, date).
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
# Assign renamed index instead of mutating it in place.
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows with no prediction get 0 (which stays 0 after expm1).
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied to unit_sales at load time, then clip to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# `index` is documented as a bool; pass False explicitly rather than relying on
# None being falsy.
submission.to_csv('modified_weight_promo_1.csv', float_format='%.4f', index=False)

Making submission...
