In [1]:
"""
This is an upgraded version of the starter script by Ceshine, Linzhi and Andy Harless; it simply adds more
average features and weekly average features on top of it.
"""
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM
from keras import callbacks
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import gc

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Root directory of the competition data; the CSV reads and the submission path below are built from it.
PATH = 'data/favorita/'
# IPython shell escape: list the contents of the data directory ({PATH} is interpolated).
!ls {PATH}

1stplace_nn_sub_100e.csv  raw_test
1stplace_nn_sub_10e.csv   sample_submission.csv
holidays_events.csv	  stores.csv
items.csv		  test.csv
march2017-v1.csv.gz	  tmp
march2017-v5.csv.gz	  train.csv
march2017-v6.csv.gz	  transactions.csv
march2017-v7.csv.gz	  winning_with_sample_weight.csv
models			  winning_with_sample_weight.csv.gz
oil.csv


In [3]:
def _log1p_if_positive(raw):
    """Parse one raw `unit_sales` field: log1p of a positive value, 0 otherwise."""
    sales = float(raw)
    return np.log1p(sales) if sales > 0 else 0


# File rows 1..101688778 are skipped (row 0 is the header) so that only the tail
# of the file is loaded. The first retained row is still dated 2016-12-31; the
# notebook filters on date >= 2017-01-01 afterwards.
df_train = pd.read_csv(
    f'{PATH}train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_if_positive},
    parse_dates=["date"],
    skiprows=range(1, 101688779),
)

In [4]:
# Preview the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-12-31,54,2048674,0.693147,False
1,2017-01-01,25,99197,0.693147,False
2,2017-01-01,25,103665,2.079442,False
3,2017-01-01,25,105574,0.693147,False
4,2017-01-01,25,105857,1.609438,False


In [5]:
# (rows, columns) of the loaded training frame.
df_train.shape

(23808262, 5)

In [6]:
# Load the test set (first five columns), parsing `date` as datetimes.
df_test = pd.read_csv(
    f'{PATH}test.csv',
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
# Key the rows by (store, item, date) so they can later be unstacked by date
# and joined with the predictions.
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [7]:
# Preview the first rows of the indexed test frame.
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [8]:
# Item metadata, indexed by item number.
items = pd.read_csv(f'{PATH}items.csv', index_col="item_nbr")

In [9]:
# Preview the item metadata.
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103501,CLEANING,3008,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1


In [10]:
# Store metadata, indexed by store number.
stores = pd.read_csv(f'{PATH}stores.csv', index_col="store_nbr")

In [11]:
# Preview the store metadata.
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
2,Quito,Pichincha,D,13
3,Quito,Pichincha,D,8
4,Quito,Pichincha,D,9
5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [12]:
# Replace the categorical text columns with integer codes. One encoder object is
# re-fitted for every column, so after this cell `le` only holds the mapping of
# the last column encoded (stores['type']).
le = LabelEncoder()
items['family'] = le.fit_transform(items['family'].values)

for store_col in ('city', 'state', 'type'):
    stores[store_col] = le.fit_transform(stores[store_col].values)

In [13]:
# Keep only rows dated 2017-01-01 or later. `pd.Timestamp` is used instead of the
# `pd.datetime` alias, which was deprecated in pandas 1.0 and later removed.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]
# Release the unfiltered frame to free memory (this cell cannot be re-run
# without reloading `df_train`).
del df_train

In [14]:
# (rows, columns) of the 2017-only frame.
df_2017.shape

(23808261, 5)

In [15]:
# Preview the first rows of the 2017-only frame.
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
1,2017-01-01,25,99197,0.693147,False
2,2017-01-01,25,103665,2.079442,False
3,2017-01-01,25,105574,0.693147,False
4,2017-01-01,25,105857,1.609438,False
5,2017-01-01,25,106716,1.098612,False


In [16]:
def _wide_promo(indexed_frame):
    """Pivot `onpromotion` to one column per date; missing cells become False.

    `indexed_frame` must be indexed by (store_nbr, item_nbr, date).
    """
    wide = indexed_frame[["onpromotion"]].unstack(level=-1).fillna(False)
    # Drop the constant "onpromotion" column level, keeping only the dates.
    wide.columns = wide.columns.get_level_values(1)
    return wide


promo_2017_train = _wide_promo(
    df_2017.set_index(["store_nbr", "item_nbr", "date"]))
# Align the test-period promotions to the (store, item) rows of the training
# matrix; pairs that do not appear in the test set are filled with False.
promo_2017_test = _wide_promo(df_test).reindex(
    promo_2017_train.index).fillna(False)
# Training dates followed by test dates, side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [17]:
# Pivot the (log1p) unit sales to wide form: one row per (store, item), one
# column per date; days with no record are filled with 0.
sales_wide = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the constant "unit_sales" column level, keeping only the dates.
sales_wide.columns = sales_wide.columns.get_level_values(1)
# Rebind `df_2017` to the wide matrix (the long-format frame is discarded).
df_2017 = sales_wide
del sales_wide

In [18]:
# Expand the metadata tables so they have one row per row of the wide sales
# matrix, in the same order: level 0 of its index is the store number and
# level 1 is the item number.
store_level = df_2017.index.get_level_values(0)
item_level = df_2017.index.get_level_values(1)
items = items.reindex(item_level)
stores = stores.reindex(store_level)

In [19]:
# Per-item totals: sum the wide sales matrix over all rows (stores) sharing an item number.
df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()
# Same aggregation for the promotion flags (per item and date, summed over the item's rows).
promo_2017_item = promo_2017.groupby('item_nbr')[promo_2017.columns].sum()

In [20]:
# Sales aggregated per (item class, store).
sales_flat = df_2017.reset_index()
# `items` has been reindexed to the rows of `df_2017`, so the class values line
# up positionally with the flattened sales rows.
sales_flat['class'] = items['class'].values
# Per-row (class, store) keys, used later to broadcast the aggregated features
# back onto the (store, item) rows.
df_2017_store_class_index = sales_flat[['class', 'store_nbr']]
df_2017_store_class = sales_flat.groupby(['class', 'store_nbr'])[df_2017.columns].sum()
del sales_flat

In [21]:
# Promotion flags aggregated per (item class, store), mirroring the sales aggregate.
promo_flat = promo_2017.reset_index()
# Class values are attached positionally from the row-aligned `items` table.
promo_flat['class'] = items['class'].values
df_2017_promo_store_class_index = promo_flat[['class', 'store_nbr']]
df_2017_promo_store_class = promo_flat.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()
del promo_flat

In [22]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select a window of date columns from a wide, date-columned frame.

    The window starts `minus` days before `dt` and contains `periods` dates
    spaced by `freq` (daily by default).

    Returns the sub-frame of `df` restricted to those date columns.
    """
    window_start = dt - timedelta(days=minus)
    window_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_columns]

In [23]:
def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    """Build the feature matrix (and optionally the targets) for one forecast origin.

    Parameters
    ----------
    df : DataFrame
        Wide sales matrix, one column per date.
    promo_df : DataFrame
        Wide promotion matrix with the same rows as `df`, one column per date.
    t2017 : datetime.date
        Forecast origin: the first of the 16 target days.
    is_train : bool
        When True, also return the targets (the 16 daily columns of `df`
        starting at `t2017`).
    name_prefix : str or None
        When given, every feature column is renamed to '<prefix>_<name>'.
        (Previously the prefix was silently ignored when `is_train` was True.)

    Returns
    -------
    X : DataFrame of features, or (X, y) when `is_train` is True, where `y` is
    an array with one column per target day.
    """
    # Promotion counts over windows ending the day before the origin, and over
    # windows starting the day after the origin (t2017 + 16 - 15 = t2017 + 1).
    X = {
        "promo_14_2017": get_timespan(promo_df, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_df, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_df, t2017, 140, 140).sum(axis=1).values,
        "promo_3_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 14).sum(axis=1).values,
    }

    # Summary statistics of sales over the last i days before the origin.
    for i in [3, 7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017, i, i)
        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        # Weighted sum with weight 0.9**k for the day k+1 days before the origin.
        X['mean_%s_decay' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X['mean_%s' % i] = tmp.mean(axis=1).values
        X['median_%s' % i] = tmp.median(axis=1).values
        X['min_%s' % i] = tmp.min(axis=1).values
        X['max_%s' % i] = tmp.max(axis=1).values
        X['std_%s' % i] = tmp.std(axis=1).values

    # The same statistics for windows that end one week earlier.
    for i in [3, 7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017 + timedelta(days=-7), i, i)
        X['diff_%s_mean_2' % i] = tmp.diff(axis=1).mean(axis=1).values
        X['mean_%s_decay_2' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X['mean_%s_2' % i] = tmp.mean(axis=1).values
        X['median_%s_2' % i] = tmp.median(axis=1).values
        X['min_%s_2' % i] = tmp.min(axis=1).values
        X['max_%s_2' % i] = tmp.max(axis=1).values
        X['std_%s_2' % i] = tmp.std(axis=1).values

    # Activity features: number of days with sales / promotion in the window,
    # and positional encodings of the most recent and the earliest such day.
    for i in [7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017, i, i)
        X['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_sales_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

        tmp = get_timespan(promo_df, t2017, i, i)
        X['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_promo_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    # The same promotion-activity features for the 15 days after the origin.
    after_days = 15
    tmp = get_timespan(promo_df, t2017 + timedelta(days=16), 15, after_days)
    X['has_promo_days_in_after_15_days'] = (tmp > 0).sum(axis=1).values
    # Fix: the offset is the window length (15). The original subtracted from
    # the loop variable `i` left over from the loop above (140).
    X['last_has_promo_day_in_after_15_days'] = after_days - ((tmp > 0) * np.arange(after_days)).max(axis=1).values
    X['first_has_promo_day_in_after_15_days'] = ((tmp > 0) * np.arange(after_days, 0, -1)).max(axis=1).values

    # Raw sales of each of the 15 days preceding the origin.
    for i in range(1, 16):
        X['day_%s_2017' % i] = get_timespan(df, t2017, i, 1).values.ravel()

    # Mean sales over the last 4 and last 20 dates spaced 7 days apart, one
    # feature per weekday offset i.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    # Daily promotion flags from 16 days before to 15 days after the origin.
    for i in range(-16, 16):
        X["promo_{}".format(i)] = promo_df[t2017 + timedelta(days=i)].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]

    if is_train:
        y = df[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [24]:
print("Preparing dataset...")
# Training samples come from `num_days` forecast origins spaced one week apart,
# starting at `t2017`.
num_days = 8
t2017 = date(2017, 5, 31)
item_level = df_2017.index.get_level_values(1)
X_parts, y_parts = [], []
for week in range(num_days):
    origin = t2017 + timedelta(days=7 * week)

    # Row-level (store, item) features and the 16-day targets.
    X_base, y_base = prepare_dataset(df_2017, promo_2017, origin)

    # Item-level features, broadcast back to one row per (store, item).
    X_item = prepare_dataset(df_2017_item, promo_2017_item, origin, is_train=False, name_prefix='item')
    X_item.index = df_2017_item.index
    X_item = X_item.reindex(item_level).reset_index(drop=True)

    # (class, store)-level features, broadcast back the same way.
    X_store_class = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, origin, is_train=False, name_prefix='store_class')
    X_store_class.index = df_2017_store_class.index
    X_store_class = X_store_class.reindex(df_2017_store_class_index).reset_index(drop=True)

    # Append the item and store metadata columns.
    X_parts.append(pd.concat(
        [X_base, X_item, X_store_class, items.reset_index(), stores.reset_index()],
        axis=1))
    y_parts.append(y_base)

X_train = pd.concat(X_parts, axis=0)
y_train = np.concatenate(y_parts, axis=0)

del X_parts, y_parts

Preparing dataset...


In [25]:
def _assemble_features(origin, is_train):
    """Build the full feature frame for one forecast origin.

    Combines the (store, item) features with the item-level and
    (class, store)-level features broadcast back to one row per (store, item),
    plus the item and store metadata columns.

    Returns (features, targets); `targets` is None when `is_train` is False.
    """
    if is_train:
        base, targets = prepare_dataset(df_2017, promo_2017, origin)
    else:
        base = prepare_dataset(df_2017, promo_2017, origin, is_train=False)
        targets = None

    item_feats = prepare_dataset(df_2017_item, promo_2017_item, origin, is_train=False, name_prefix='item')
    item_feats.index = df_2017_item.index
    item_feats = item_feats.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    store_class_feats = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, origin, is_train=False, name_prefix='store_class')
    store_class_feats.index = df_2017_store_class.index
    store_class_feats = store_class_feats.reindex(df_2017_store_class_index).reset_index(drop=True)

    features = pd.concat(
        [base, item_feats, store_class_feats, items.reset_index(), stores.reset_index()],
        axis=1)
    return features, targets


# Validation origin: 2017-07-26 (targets are available).
X_val, y_val = _assemble_features(date(2017, 7, 26), is_train=True)
# Test origin: 2017-08-16 (no targets).
X_test = _assemble_features(date(2017, 8, 16), is_train=False)[0]

# The aggregate frames are not needed after feature construction.
del df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
gc.collect()

2420

In [26]:
# Standardize all features. The scaler is fitted on the concatenation of the
# train, validation and test feature frames, so its statistics include
# validation and test rows.
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val, X_test]))
# Slice assignment writes the scaled values back into the existing DataFrames
# (same index and column labels).
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_test[:] = scaler.transform(X_test)

In [27]:
# Convert the feature frames to NumPy arrays. `.values` replaces
# `DataFrame.as_matrix()`, which was removed in pandas 1.0.
X_train = X_train.values
X_test = X_test.values
X_val = X_val.values
# Insert a length-1 middle axis, giving (rows, 1, features), which is the
# 3-D input shape passed to the LSTM layer.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

In [28]:
# Shape of the reshaped training array.
X_train.shape

(1340120, 1, 561)

In [29]:
def build_model(input_shape=None):
    """Create the (uncompiled) network used for one forecast step.

    Architecture: LSTM(512) -> BatchNorm -> Dropout(0.2), then six
    Dense -> PReLU -> BatchNorm -> Dropout blocks of decreasing width, and a
    single linear output unit.

    Parameters
    ----------
    input_shape : tuple or None
        Shape of one input sample passed to the LSTM layer. When None (the
        default, matching the original behaviour) it is taken from the global
        `X_train` as (X_train.shape[1], X_train.shape[2]).

    Returns
    -------
    The Keras `Sequential` model; compiling is left to the caller.
    """
    if input_shape is None:
        input_shape = (X_train.shape[1], X_train.shape[2])

    model = Sequential()
    model.add(LSTM(512, input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(.2))

    # (units, dropout rate) of each dense block, in order.
    dense_blocks = (
        (256, .1),
        (256, .1),
        (128, .05),
        (64, .05),
        (32, .05),
        (16, .05),
    )
    for units, drop_rate in dense_blocks:
        model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    model.add(Dense(1))

    return model

In [31]:
# Maximum number of training epochs for each per-step model.
N_EPOCHS = 15

In [32]:
val_pred = []
test_pred = []
# wtpath = 'weights.hdf5'  # To save best epoch. But need Keras bug to be fixed first.
# Per-sample training weight: perishable flag * 0.25 + 1, repeated once for each
# of the `num_days` training origins stacked in X_train.
sample_weights = np.array(pd.concat([items["perishable"]] * num_days) * 0.25 + 1)
# One model per forecast horizon (one column of y_train per target day).
n_horizons = y_train.shape[1]
for i in range(n_horizons):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    y = y_train[:, i]
    # Targets are centred on the training mean; the mean is added back to the predictions.
    y_mean = y.mean()
    yv = y_val[:, i]
    model = build_model()
    opt = optimizers.Adam(lr=0.001)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    # Named `fit_callbacks` so the imported `keras.callbacks` module is not
    # overwritten by this list.
    fit_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min')
        ]
    model.fit(X_train, y - y_mean, batch_size = 65536, epochs = N_EPOCHS, verbose=2,
               sample_weight=sample_weights, validation_data=(X_val, yv - y_mean), callbacks=fit_callbacks )
    val_pred.append(model.predict(X_val)+y_mean)
    test_pred.append(model.predict(X_test)+y_mean)

Step 1
Train on 1340120 samples, validate on 167515 samples
Epoch 1/15
 - 44s - loss: 0.9239 - mean_squared_error: 0.8708 - val_loss: 2.3893 - val_mean_squared_error: 2.3893
Epoch 2/15
 - 42s - loss: 0.5161 - mean_squared_error: 0.4866 - val_loss: 1.1362 - val_mean_squared_error: 1.1362
Epoch 3/15
 - 42s - loss: 0.4799 - mean_squared_error: 0.4530 - val_loss: 0.6137 - val_mean_squared_error: 0.6137
Epoch 4/15
 - 42s - loss: 0.4587 - mean_squared_error: 0.4334 - val_loss: 0.4411 - val_mean_squared_error: 0.4411
Epoch 5/15
 - 41s - loss: 0.4399 - mean_squared_error: 0.4162 - val_loss: 0.3632 - val_mean_squared_error: 0.3632
Epoch 6/15
 - 41s - loss: 0.4236 - mean_squared_error: 0.4013 - val_loss: 0.3387 - val_mean_squared_error: 0.3387
Epoch 7/15
 - 42s - loss: 0.4103 - mean_squared_error: 0.3892 - val_loss: 0.3244 - val_mean_squared_error: 0.3244
Epoch 8/15
 - 43s - loss: 0.4001 - mean_squared_error: 0.3798 - val_loss: 0.3147 - val_mean_squared_error: 0.3147
Epoch 9/15
 - 43s - loss: 0.

Epoch 6/15
 - 43s - loss: 0.5052 - mean_squared_error: 0.4800 - val_loss: 0.3873 - val_mean_squared_error: 0.3873
Epoch 7/15
 - 43s - loss: 0.4936 - mean_squared_error: 0.4691 - val_loss: 0.3740 - val_mean_squared_error: 0.3740
Epoch 8/15
 - 42s - loss: 0.4811 - mean_squared_error: 0.4573 - val_loss: 0.3691 - val_mean_squared_error: 0.3691
Epoch 9/15
 - 42s - loss: 0.4719 - mean_squared_error: 0.4485 - val_loss: 0.3648 - val_mean_squared_error: 0.3648
Epoch 10/15
 - 42s - loss: 0.4626 - mean_squared_error: 0.4397 - val_loss: 0.3642 - val_mean_squared_error: 0.3642
Epoch 11/15
 - 42s - loss: 0.4555 - mean_squared_error: 0.4328 - val_loss: 0.3573 - val_mean_squared_error: 0.3573
Epoch 12/15
 - 43s - loss: 0.4481 - mean_squared_error: 0.4258 - val_loss: 0.3548 - val_mean_squared_error: 0.3548
Epoch 13/15
 - 43s - loss: 0.4425 - mean_squared_error: 0.4205 - val_loss: 0.3552 - val_mean_squared_error: 0.3552
Epoch 14/15
 - 42s - loss: 0.4366 - mean_squared_error: 0.4149 - val_loss: 0.3497 - 

Epoch 12/15
 - 43s - loss: 0.4067 - mean_squared_error: 0.3850 - val_loss: 0.3639 - val_mean_squared_error: 0.3639
Epoch 13/15
 - 42s - loss: 0.4029 - mean_squared_error: 0.3814 - val_loss: 0.3628 - val_mean_squared_error: 0.3628
Epoch 14/15
 - 43s - loss: 0.3993 - mean_squared_error: 0.3780 - val_loss: 0.3623 - val_mean_squared_error: 0.3623
Epoch 15/15
 - 43s - loss: 0.3958 - mean_squared_error: 0.3747 - val_loss: 0.3628 - val_mean_squared_error: 0.3628
Step 10
Train on 1340120 samples, validate on 167515 samples
Epoch 1/15
 - 45s - loss: 0.9632 - mean_squared_error: 0.9029 - val_loss: 1.7811 - val_mean_squared_error: 1.7811
Epoch 2/15
 - 42s - loss: 0.5778 - mean_squared_error: 0.5425 - val_loss: 0.8779 - val_mean_squared_error: 0.8779
Epoch 3/15
 - 42s - loss: 0.5406 - mean_squared_error: 0.5083 - val_loss: 0.5285 - val_mean_squared_error: 0.5285
Epoch 4/15
 - 42s - loss: 0.5188 - mean_squared_error: 0.4885 - val_loss: 0.4263 - val_mean_squared_error: 0.4263
Epoch 5/15
 - 41s - los

Epoch 2/15
 - 43s - loss: 0.5675 - mean_squared_error: 0.5370 - val_loss: 0.7452 - val_mean_squared_error: 0.7452
Epoch 3/15
 - 43s - loss: 0.5365 - mean_squared_error: 0.5077 - val_loss: 0.4904 - val_mean_squared_error: 0.4904
Epoch 4/15
 - 44s - loss: 0.5164 - mean_squared_error: 0.4889 - val_loss: 0.4155 - val_mean_squared_error: 0.4155
Epoch 5/15
 - 44s - loss: 0.4995 - mean_squared_error: 0.4732 - val_loss: 0.3871 - val_mean_squared_error: 0.3871
Epoch 6/15
 - 44s - loss: 0.4854 - mean_squared_error: 0.4600 - val_loss: 0.3770 - val_mean_squared_error: 0.3770
Epoch 7/15
 - 44s - loss: 0.4737 - mean_squared_error: 0.4491 - val_loss: 0.3720 - val_mean_squared_error: 0.3720
Epoch 8/15
 - 43s - loss: 0.4654 - mean_squared_error: 0.4413 - val_loss: 0.3680 - val_mean_squared_error: 0.3680
Epoch 9/15
 - 43s - loss: 0.4583 - mean_squared_error: 0.4345 - val_loss: 0.3625 - val_mean_squared_error: 0.3625
Epoch 10/15
 - 43s - loss: 0.4511 - mean_squared_error: 0.4278 - val_loss: 0.3624 - val_

In [33]:
# Weighted RMSE of the validation predictions in log1p space. Each row is
# weighted by perishable flag * 0.25 + 1, and the sum is averaged over the
# 16 forecast days.
weight = items["perishable"] * 0.25 + 1
# val_pred is a list of per-step predictions; stack it to (rows, steps).
val_pred_matrix = np.array(val_pred).squeeze(axis=2).transpose()
squared_err = (y_val - val_pred_matrix) ** 2
weighted_err = squared_err.sum(axis=1) * weight
err = np.sqrt(weighted_err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err))

nwrmsle = 0.5928160636612965


In [34]:
# Stack the per-step validation predictions to (rows, steps). A separate name is
# used so the ground-truth `y_val` is not overwritten; otherwise re-running the
# metric cell would compare the predictions with themselves.
val_pred_matrix = np.array(val_pred).squeeze(axis=2).transpose()
df_preds = pd.DataFrame(
    val_pred_matrix, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
# Invert the log1p transform and clip to [0, 1000] before writing.
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('nn_cv.csv', index=False)

In [35]:
print("Making submission...")
# Stack the per-step test predictions to (rows, steps).
y_test = np.array(test_pred).squeeze(axis=2).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
# Wide (row x day) predictions, reshaped to one row per (store, item, date).
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_days)
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [36]:
# Attach the predictions to the test ids. Test rows without a prediction get 0,
# which stays 0 after the inverse log1p transform.
submission = df_test[["id"]].join(df_preds, how="left")
submission = submission.fillna(0)
# Invert the log1p transform and clip to [0, 1000].
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(lower=0, upper=1000)
submission.to_csv(f'{PATH}nn_sub_LSTM_15e.csv', float_format='%.4f', index=None)

In [37]:
# Layer summary of `model`, which at this point is the last network built in the training loop.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_16 (LSTM)               (None, 512)               2199552   
_________________________________________________________________
batch_normalization_106 (Bat (None, 512)               2048      
_________________________________________________________________
dropout_106 (Dropout)        (None, 512)               0         
_________________________________________________________________
dense_106 (Dense)            (None, 256)               131328    
_________________________________________________________________
p_re_lu_91 (PReLU)           (None, 256)               256       
_________________________________________________________________
batch_normalization_107 (Bat (None, 256)               1024      
_________________________________________________________________
dropout_107 (Dropout)        (None, 256)               0         
__________