In [None]:
"""
This is an upgraded version of Ceshine's LGBM starter script. It simply adds more
average features and weekly average features to it.
"""
from datetime import date, timedelta
import calendar as ca
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
print('Loading Data')
# Training data: rows before the skiprows cut-off are never read (the original
# author's note marks the cut-off as 2016-01-01). unit_sales is converted while
# parsing: log1p for positive values, 0 for zero or negative values.
df_train = pd.read_csv(
    'input/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)
# Items with at least one training row after 2017-08-10; the submission step
# zeroes the forecast of every item that is not in this set.
# pd.Timestamp replaces pd.datetime, which newer pandas releases no longer provide.
item_nbr_u = df_train[df_train.date > pd.Timestamp(2017, 8, 10)].item_nbr.unique()

# Test data, indexed so that it can later be unstacked by date and joined with
# the predictions on (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "input/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

# Item metadata keyed by item number (the "perishable" column is used for weights).
items = pd.read_csv(
    "input/items.csv",
).set_index("item_nbr")

# Only 2017 rows are used for feature building; release the larger frame.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
del df_train



def _dates_to_columns(indexed, value_col, fill_value):
    """Spread the last index level of ``indexed`` into columns for one value column.

    ``indexed`` must already be indexed by (store_nbr, item_nbr, date). Gaps
    created by the pivot are filled with ``fill_value``.
    """
    wide = indexed[[value_col]].unstack(level=-1).fillna(fill_value)
    # Drop the outer column level (the value column name); keep only the dates.
    wide.columns = wide.columns.get_level_values(1)
    return wide


# Promotion flags: one row per (store_nbr, item_nbr), one column per date.
# The test-period flags are aligned to the training rows and appended on the right.
promo_2017_train = _dates_to_columns(
    df_2017.set_index(["store_nbr", "item_nbr", "date"]), "onpromotion", False)
promo_2017_test = _dates_to_columns(df_test, "onpromotion", False)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Sales in the same wide layout; missing (store, item, date) cells become 0.
df_2017 = _dates_to_columns(
    df_2017.set_index(["store_nbr", "item_nbr", "date"]), "unit_sales", 0)

# One metadata row per row of the wide sales table, in the same order.
items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    """Return the columns of ``df`` for a run of dates that starts ``minus`` days before ``dt``.

    The run holds ``periods`` dates spaced by ``freq``; ``df`` must have those
    dates as column labels.
    """
    window_start = dt - timedelta(days=minus)
    wanted_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_dates]

def get_nearwd(date, b_date):
    """Return the latest date on a 7-day grid ending at ``date`` that is not after ``b_date``.

    The grid holds 21 dates: ``date - 140 days`` up to ``date`` in weekly steps,
    so every candidate falls on the same weekday as ``date``. Raises IndexError
    when no candidate is on or before ``b_date``.
    """
    weekly_grid = pd.date_range(date - timedelta(140), periods=21, freq='7D').date
    not_after_bound = weekly_grid <= b_date
    return weekly_grid[not_after_bound][-1]
def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for one forecast origin.

    Reads the module-level wide tables ``promo_2017`` and ``df_2017`` (one
    column per date) through ``get_timespan`` and ``get_nearwd``.

    Changes from the earlier version of this function:

    * Features are collected in a dict and turned into a DataFrame once,
      instead of inserting roughly 200 columns one at a time. Column names and
      column order are unchanged.
    * The "off promotion" flag behind the ``aft_promo_*`` columns is an explicit
      0/1 value. It used to be computed as ``(promo - 1).astype(np.uint8)``,
      which wraps -1 around to 255. The ordering of values within each column is
      the same; only the scale differs (1 instead of 255).

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day forecast window.
    is_train : bool, default True
        When true, the targets are returned together with the features.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of ``df_2017`` and one column per feature.
    (X, y) : tuple
        Returned instead of ``X`` when ``is_train`` is true. ``y`` holds the
        ``df_2017`` values for the 16 days starting at ``t2017``.
    """
    feats = {
        # Number of promotion days in the 14 / 60 / 140 days before t2017.
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values,
        # Number of non-promotion days among horizon days 1..15 (day 0 is dropped by iloc).
        "unpromo_16aftsum_2017": (1 - get_timespan(
            promo_2017, t2017 + timedelta(16), 16, 16)).iloc[:, 1:].sum(axis=1).values,
    }

    for i in range(16):
        on_promo = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
        off_promo = 1 - on_promo  # explicit 0/1 flag, no unsigned wrap-around
        feats["promo_{}".format(i)] = on_promo
        for j in [14, 60, 140]:
            # Past promotion count, kept only for rows that are off promotion on day i.
            feats["aft_promo_{}{}".format(i, j)] = off_promo * feats['promo_{}_2017'.format(j)]
        if i == 15:
            # No horizon days remain after the last one.
            feats["bf_unpromo_{}".format(i)] = np.zeros(len(on_promo), dtype=np.int64)
        else:
            # Share of non-promotion days among horizon days i+1..15, kept only
            # for rows that are on promotion on day i.
            feats["bf_unpromo_{}".format(i)] = (1 - get_timespan(
                promo_2017, t2017 + timedelta(16), 16 - i, 16 - i)).iloc[:, 1:].sum(
                    axis=1).values / (15 - i) * on_promo

    for i in range(7):
        # Mean sales over the last 4 / 20 dates that share a weekday with t2017 + i.
        feats['mean_4_dow{}_2017'.format(i)] = get_timespan(
            df_2017, t2017, 28 - i, 4, freq='7D').mean(axis=1).values
        feats['mean_20_dow{}_2017'.format(i)] = get_timespan(
            df_2017, t2017, 140 - i, 20, freq='7D').mean(axis=1).values

        # Latest date with the weekday of t2017 + i that is not after t2017.
        anchor = get_nearwd(t2017 + timedelta(i), t2017)
        ahead = (t2017 - anchor).days
        if ahead != 0:
            # anchor + ahead days equals t2017, so both windows end the day before t2017.
            feats['ahead0_{}'.format(i)] = get_timespan(
                df_2017, t2017, ahead, ahead).mean(axis=1).values
            feats['ahead7_{}'.format(i)] = get_timespan(
                df_2017, t2017, ahead + 7, ahead + 7).mean(axis=1).values
        # Suffix 1: window ending the day before the anchor; suffix 2: one week earlier.
        feats["day_1_2017_{}1".format(i)] = get_timespan(df_2017, anchor, 1, 1).values.ravel()
        feats["day_1_2017_{}2".format(i)] = get_timespan(
            df_2017, anchor - timedelta(7), 1, 1).values.ravel()
        for m in [3, 7, 14, 30, 60, 140]:
            feats["mean_{}_2017_{}1".format(m, i)] = get_timespan(
                df_2017, anchor, m, m).mean(axis=1).values
            feats["mean_{}_2017_{}2".format(m, i)] = get_timespan(
                df_2017, anchor - timedelta(7), m, m).mean(axis=1).values

    X = pd.DataFrame(feats)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

Loading Data


In [None]:
print("Preparing dataset...")

# The validation fold forecasts the 16 days starting 2017-07-26.
val_start = date(2017, 7, 26)
N_TRAIN_WEEKS = 4
# Weekly training origins: 2017-06-14, 06-21, 06-28 and 07-05.
# The earlier start of 2017-07-05 produced a last training origin of
# 2017-07-26, the same date as the validation origin. The validation rows were
# therefore part of the training set, so early stopping and the reported
# validation score carried no information.
t2017 = date(2017, 6, 14)
last_train_start = t2017 + timedelta(days=7 * (N_TRAIN_WEEKS - 1))
assert last_train_start + timedelta(days=16) <= val_start, \
    "training label windows must end before the validation window starts"

X_l, y_l = [], []
for i in range(N_TRAIN_WEEKS):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
# Stack the weekly folds row-wise; each fold has one row per row of df_2017.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(val_start)
# The test origin has no known targets, so only features are built.
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

print("Training and predicting models...")
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2_root',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

# Sample weights are the same for all 16 horizon steps, so build them once.
# Each row gets 1 + 0.25 * perishable (assumes "perishable" is a 0/1 flag;
# confirm against items.csv).
item_weights = items["perishable"].values * 0.25 + 1
# X_train is a stack of whole weekly folds, each with one row per item row, so
# the fold count follows from the lengths instead of a hard-coded 4.
n_folds = len(X_train) // len(item_weights)
train_weights = np.tile(item_weights, n_folds)
assert len(train_weights) == len(X_train), \
    "X_train must consist of whole stacked folds"

# One model per horizon day: model i predicts column i of the target matrix.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weights
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=item_weights,
        categorical_feature=cate_vars)
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead. Confirm
    # the installed version before upgrading.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    # Gain-based feature importances, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Fall back to MAX_ROUNDS when best_iteration is falsy.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# The square root of the MSE is taken, so the value printed is an RMSE
# (the label previously said "mse").
print("Validation rmse:", mean_squared_error(
    y_val, np.array(val_pred).transpose())**0.5)

print("Making submission...")
# test_pred holds one prediction array per horizon day; transpose to rows x days.
y_test = np.array(test_pred).transpose()
horizon_dates = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=horizon_dates)
# Back to long format: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index.names = ["store_nbr", "item_nbr", "date"]

# Test rows without a prediction get 0 via the left join plus fillna.
submission = df_test[["id"]].join(df_preds, how="left")
submission = submission.fillna(0).reset_index()
# Undo the log1p transform applied when loading the data, then clip to [0, 10000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 10000)
# Items absent from the item_nbr_u set are forecast as zero.
unseen_items = ~submission.item_nbr.isin(item_nbr_u)
submission.loc[unseen_items, 'unit_sales'] = 0
del item_nbr_u
submission[['id', 'unit_sales']].to_csv('lgb.csv', float_format='%.4f', index=None)

Preparing dataset...
Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.538544	valid_1's rmse: 0.534744
[200]	training's rmse: 0.533852	valid_1's rmse: 0.530217
[300]	training's rmse: 0.530128	valid_1's rmse: 0.52655
[400]	training's rmse: 0.526911	valid_1's rmse: 0.523431
[500]	training's rmse: 0.523938	valid_1's rmse: 0.520579
mean_7_2017_01: 809045.86
mean_14_2017_01: 631297.47
ahead7_6: 353870.38
ahead7_5: 246269.23
promo_0: 49363.50
day_1_2017_01: 42966.65
mean_20_dow0_2017: 41514.32
ahead0_5: 40127.73
mean_30_2017_01: 39001.09
mean_4_dow0_2017: 35073.04
ahead0_2: 22843.58
bf_unpromo_0: 13873.93
aft_promo_014: 12723.87
bf_unpromo_7: 8319.76
mean_3_2017_32: 8310.96
promo_14_2017: 6924.20
mean_60_2017_01: 6567.35
bf_unpromo_14: 5806.81
ahead0_6: 5717.85
mean_4_dow2_2017: 4560.88
mean_3_2017_01: 4264.82
ahead7_1: 3331.11
mean_60_2017_42: 3309.60
ahead7_4: 3154.20
day_1_2017_11: 2890.34
aft_promo_060: 2751.13
aft_promo_0140: 2720.01
mean_60_2017_32: 2562.13
mean_20_dow4

Step 4
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.577074	valid_1's rmse: 0.584606
[200]	training's rmse: 0.571866	valid_1's rmse: 0.579014
[300]	training's rmse: 0.56793	valid_1's rmse: 0.574849
[400]	training's rmse: 0.564425	valid_1's rmse: 0.571268
[500]	training's rmse: 0.561119	valid_1's rmse: 0.567874
ahead7_2: 700594.95
ahead7_1: 541360.14
ahead7_3: 258618.47
ahead0_2: 246108.47
ahead0_1: 215342.90
mean_7_2017_01: 192259.41
mean_4_dow3_2017: 137742.09
mean_20_dow3_2017: 71068.32
mean_30_2017_01: 63077.07
promo_3: 39752.96
mean_60_2017_01: 17363.10
mean_3_2017_51: 16223.46
ahead7_6: 13495.49
ahead0_3: 13426.51
mean_14_2017_01: 9813.34
mean_60_2017_61: 9582.73
unpromo_16aftsum_2017: 9576.43
aft_promo_314: 9173.27
promo_14_2017: 8561.09
bf_unpromo_3: 7303.60
mean_30_2017_51: 6508.94
mean_4_dow4_2017: 5003.48
day_1_2017_41: 4076.46
day_1_2017_51: 3463.21
day_1_2017_01: 3164.92
mean_3_2017_01: 3126.10
mean_20_dow5_2017: 2890.08
promo_0: 265

Step 6
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.58652	valid_1's rmse: 0.590572
[200]	training's rmse: 0.581537	valid_1's rmse: 0.585614
[300]	training's rmse: 0.577663	valid_1's rmse: 0.581589
[400]	training's rmse: 0.574077	valid_1's rmse: 0.577932
[500]	training's rmse: 0.570861	valid_1's rmse: 0.574694
ahead7_4: 683107.61
ahead7_3: 582919.64
mean_30_2017_01: 266198.45
ahead7_5: 209008.19
mean_14_2017_01: 194729.01
mean_3_2017_01: 103949.60
mean_4_dow5_2017: 59989.19
promo_5: 50744.69
mean_20_dow5_2017: 36090.70
ahead0_3: 31988.07
mean_60_2017_01: 24110.40
aft_promo_514: 11626.68
unpromo_16aftsum_2017: 10082.43
ahead0_4: 9968.75
ahead0_5: 7917.17
promo_14_2017: 6980.68
mean_60_2017_41: 4314.47
promo_3: 3195.14
mean_60_2017_32: 3157.67
mean_4_dow6_2017: 2774.58
mean_60_2017_21: 2550.79
ahead7_2: 2416.77
mean_20_dow6_2017: 2406.20
promo_0: 2335.60
mean_7_2017_01: 2228.82
day_1_2017_61: 2192.59
ahead7_6: 2090.07
mean_60_2017_11: 2025.27
mean

Step 8
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.577358	valid_1's rmse: 0.60242
[200]	training's rmse: 0.571907	valid_1's rmse: 0.59611
[300]	training's rmse: 0.567629	valid_1's rmse: 0.591377
[400]	training's rmse: 0.564118	valid_1's rmse: 0.587577
[500]	training's rmse: 0.560837	valid_1's rmse: 0.584049
mean_14_2017_01: 714632.86
mean_30_2017_01: 639169.04
ahead7_5: 275327.41
mean_7_2017_01: 253639.37
promo_7: 79756.26
ahead7_6: 74854.60
mean_20_dow0_2017: 74840.74
mean_4_dow0_2017: 47039.85
ahead7_2: 22974.40
bf_unpromo_7: 21953.53
ahead0_5: 14728.56
mean_60_2017_01: 13187.47
unpromo_16aftsum_2017: 10627.66
promo_0: 8926.57
aft_promo_714: 8570.98
mean_60_2017_41: 7710.67
bf_unpromo_14: 7392.99
bf_unpromo_0: 5326.52
mean_4_dow5_2017: 4922.19
mean_60_2017_31: 4882.81
promo_14_2017: 4482.76
day_1_2017_01: 4188.66
aft_promo_914: 3914.59
promo_14: 3533.22
mean_60_2017_12: 3460.44
aft_promo_7140: 3421.97
mean_3_2017_31: 3203.89
promo_60_2017: 3

Step 10
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.589252	valid_1's rmse: 0.596159
[200]	training's rmse: 0.583528	valid_1's rmse: 0.590967
[300]	training's rmse: 0.579155	valid_1's rmse: 0.586888
[400]	training's rmse: 0.575427	valid_1's rmse: 0.583387
[500]	training's rmse: 0.571909	valid_1's rmse: 0.579944
mean_30_2017_01: 799127.78
mean_14_2017_01: 599784.44
mean_7_2017_01: 262424.26
mean_20_dow2_2017: 193843.48
mean_4_dow2_2017: 192228.56
promo_9: 77670.34
bf_unpromo_9: 13045.14
unpromo_16aftsum_2017: 11175.08
mean_60_2017_41: 9407.21
ahead7_1: 9295.78
ahead0_1: 9020.40
promo_14_2017: 8162.30
mean_60_2017_61: 7717.13
day_1_2017_31: 6905.31
mean_3_2017_41: 5823.46
aft_promo_9140: 5602.58
bf_unpromo_2: 5552.03
mean_60_2017_01: 5496.00
day_1_2017_01: 5072.95
mean_20_dow1_2017: 4634.92
aft_promo_960: 4543.27
promo_10: 4396.26
aft_promo_914: 4180.86
ahead0_2: 3505.70
mean_20_dow4_2017: 3298.41
ahead0_5: 3191.62
bf_unpromo_7: 2902.07
promo_2: 

Step 12
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.609227	valid_1's rmse: 0.613039
[200]	training's rmse: 0.603531	valid_1's rmse: 0.60788
[300]	training's rmse: 0.599077	valid_1's rmse: 0.603606
[400]	training's rmse: 0.59515	valid_1's rmse: 0.599913
[500]	training's rmse: 0.591614	valid_1's rmse: 0.596531
mean_4_dow4_2017: 888494.18
ahead7_3: 830739.83
ahead7_4: 225247.41
ahead0_3: 180699.87
mean_20_dow4_2017: 158893.47
mean_30_2017_01: 110909.79
promo_11: 70004.14
mean_60_2017_01: 44411.20
mean_3_2017_61: 30277.58
mean_14_2017_01: 30105.49
mean_30_2017_61: 22302.48
ahead0_2: 16051.62
unpromo_16aftsum_2017: 12202.86
mean_60_2017_41: 9688.16
day_1_2017_51: 8944.43
ahead0_1: 8681.20
promo_14_2017: 8079.10
aft_promo_1114: 5908.63
bf_unpromo_11: 5824.13
mean_60_2017_12: 5519.72
promo_12: 5171.44
mean_4_dow3_2017: 5021.46
ahead7_1: 4991.01
mean_60_2017_61: 4336.30
mean_3_2017_01: 4222.28
mean_7_2017_22: 4212.00
promo_14: 2874.66
aft_promo_11140:

Step 14
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.606703	valid_1's rmse: 0.595078
[200]	training's rmse: 0.601149	valid_1's rmse: 0.590396
[300]	training's rmse: 0.59688	valid_1's rmse: 0.586622
[400]	training's rmse: 0.592993	valid_1's rmse: 0.583157
[500]	training's rmse: 0.589667	valid_1's rmse: 0.58015
mean_30_2017_01: 982769.45
mean_14_2017_01: 288172.47
ahead7_5: 234172.78
ahead7_4: 136600.61
promo_13: 99471.48
mean_60_2017_01: 86806.22
mean_20_dow6_2017: 75440.52
mean_4_dow6_2017: 49018.61
ahead7_6: 37699.28
mean_3_2017_01: 30887.03
ahead0_3: 25497.53
mean_30_2017_61: 17886.67
ahead0_5: 17136.04
mean_7_2017_22: 10827.13
unpromo_16aftsum_2017: 9821.81
mean_7_2017_32: 8735.73
mean_3_2017_12: 6558.85
promo_14_2017: 6268.23
day_1_2017_01: 5275.28
aft_promo_1314: 4977.42
mean_140_2017_01: 4732.19
mean_20_dow5_2017: 4171.85
mean_14_2017_31: 3911.52
mean_20_dow1_2017: 3631.13
mean_60_2017_61: 3530.86
bf_unpromo_0: 3362.15
bf_unpromo_13: 3345

Step 16
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.601268	valid_1's rmse: 0.601114
[200]	training's rmse: 0.596058	valid_1's rmse: 0.596053
[300]	training's rmse: 0.591881	valid_1's rmse: 0.59197
[400]	training's rmse: 0.588269	valid_1's rmse: 0.588589
[500]	training's rmse: 0.584707	valid_1's rmse: 0.585092
mean_30_2017_01: 896842.65
mean_14_2017_01: 501073.06
promo_15: 84451.89
mean_60_2017_01: 82680.44
mean_20_dow1_2017: 73141.31
mean_7_2017_01: 68976.93
ahead7_6: 35739.89
ahead7_1: 23604.71
ahead0_2: 20363.54
mean_60_2017_41: 16027.83
ahead0_1: 14309.08
mean_20_dow2_2017: 13530.33
mean_60_2017_51: 13500.04
unpromo_16aftsum_2017: 8528.04
promo_14_2017: 7534.29
mean_4_dow1_2017: 7463.80
mean_7_2017_12: 6688.66
mean_60_2017_61: 5543.28
day_1_2017_01: 5107.56
mean_7_2017_22: 4608.68
mean_60_2017_31: 4009.13
mean_30_2017_61: 3830.42
mean_3_2017_41: 3758.18
mean_4_dow2_2017: 3639.88
ahead0_5: 3454.35
aft_promo_1514: 3259.39
promo_14: 2959.84
af