In [3]:
import tensorflow as tf
from tensorflow import keras

from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [5]:
def _log1p_sales(raw):
    """Parse a raw ``unit_sales`` field: log1p of positive values, 0 otherwise."""
    value = float(raw)
    if value > 0:
        return np.log1p(value)
    return 0


# Only CSV columns 1-4 are read; unit_sales is log-transformed while parsing.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4],
    converters={'unit_sales': _log1p_sales},
    parse_dates=["date"],
)

In [6]:
# Load the test rows and index them by (store_nbr, item_nbr, date).
test_index_cols = ['store_nbr', 'item_nbr', 'date']
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
).set_index(test_index_cols)

In [8]:
# Item metadata, indexed by item_nbr.
items = pd.read_csv("../Data/items.csv", index_col="item_nbr")

In [7]:
# Pivot the long sales table to wide form: one row per (store_nbr, item_nbr),
# one column per date, with missing days filled with 0.
pivot_keys = ["store_nbr", "item_nbr", "date"]
df_train = (
    df_train
    .set_index(pivot_keys)[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the outer "unit_sales" column level so columns are plain dates.
df_train.columns = df_train.columns.get_level_values(1)

In [10]:
# Align item metadata row-for-row with the item_nbr level of df_train's index.
train_item_ids = df_train.index.get_level_values(1)
items = items.reindex(train_item_ids)

In [8]:
def get_timespan(df, dt, minus, periods, freq="D"):
    """Select the date columns of ``df`` for a window placed relative to ``dt``.

    Parameters
    ----------
    df : DataFrame whose columns are dates.
    dt : anchor date.
    minus : number of days before ``dt`` at which the window starts.
    periods : number of columns to select.
    freq : spacing between selected columns (pandas offset alias). The
        default ``"D"`` selects consecutive days, matching the behaviour
        of the original four-argument call.

    Returns
    -------
    DataFrame holding only the selected date columns, in order.
    """
    window_start = dt - timedelta(days=minus)
    wanted_days = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_days]

In [9]:
def prepare_dataset(t2017, is_train=True):
    """Build trailing-mean features (and optionally targets) anchored at ``t2017``.

    Reads the module-level ``df_train`` frame. For each window length ``w`` in
    (3, 7, 14, 16), the feature ``mean_<w>_2017`` is the row-wise mean of the
    ``w`` daily columns immediately before ``t2017``.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` is the array of
    the 16 daily columns starting at ``t2017``; otherwise returns ``X`` alone.
    """
    features = {}
    for window in (3, 7, 14, 16):
        span = get_timespan(df_train, t2017, window, window)
        features["mean_%d_2017" % window] = span.mean(axis=1).values
    X = pd.DataFrame(features)
    if not is_train:
        return X
    target_days = pd.date_range(t2017, periods=16)
    y = df_train[target_days].values
    return X, y

In [62]:
print("Preparing dataset...")
t2017 = date(2017, 5, 16)
# Nine training windows, each anchored one week after the previous one.
# NOTE(review): the last window's 16-day target range ends 2017-07-26, which
# overlaps a validation window anchored at 2017-07-23 - confirm this is intended.
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week)) for week in range(9)
]
X_train = pd.concat([features for features, _ in weekly_sets], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_sets], axis=0)
del weekly_sets

Preparing dataset...


In [14]:
# Peek at the first five rows of the (still unscaled) training features.
X_train.head()

Unnamed: 0,mean_3_2017,mean_7_2017,mean_14_2017,mean_16_2017
0,0.0,0.198042,0.148532,0.129965
1,0.366204,0.454008,0.227004,0.198628
2,0.597253,0.888939,1.029376,0.944026
3,1.059351,1.111889,1.054853,0.922996
4,1.762756,1.980641,1.973509,1.896073


In [63]:
# Anchor dates for the validation window and for the forecast (test) window.
val_start = date(2017, 7, 23)
test_start = date(2017, 8, 16)
X_val, y_val = prepare_dataset(val_start)
# No targets are built for the test window.
X_test = prepare_dataset(test_start, is_train=False)

In [64]:
# Fit the scaling statistics on the training features only. Including the
# validation and test features in the fit leaks information from the held-out
# sets into preprocessing; they are only *transformed* with these statistics.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [65]:
# The scaler has been fitted but not applied yet, so these are raw features.
X_train.head()

Unnamed: 0,mean_3_2017,mean_7_2017,mean_14_2017,mean_16_2017
0,0.0,0.198042,0.148532,0.129965
1,0.366204,0.454008,0.227004,0.198628
2,0.597253,0.888939,1.029376,0.944026
3,1.059351,1.111889,1.054853,0.922996
4,1.762756,1.980641,1.973509,1.896073


In [69]:
# Apply the fitted scaler to every split. X_test goes through the same call
# because prepare_dataset builds the same four feature columns for it.
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [71]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
# Using -1 for the feature axis keeps the cell re-runnable: on an array that
# is already 3-D the reshape is a no-op, whereas sizing the last axis from
# shape[1] would raise on a second run because shape[1] is then 1.
X_train = X_train.reshape((X_train.shape[0], 1, -1))
X_test = X_test.reshape((X_test.shape[0], 1, -1))
X_val = X_val.reshape((X_val.shape[0], 1, -1))

In [72]:
def build_model(input_shape=None):
    """Build an uncompiled feed-forward regression network.

    Parameters
    ----------
    input_shape : tuple, optional
        Per-sample input shape handed to the ``Flatten`` layer. When omitted
        it is taken from the module-level ``X_train`` (all axes after the
        first), which reproduces the original zero-argument behaviour.

    Returns
    -------
    A ``keras.Sequential`` model: Flatten, then ReLU ``Dense`` layers of
    256, 128, 64, 32 and 16 units, then a single-unit linear output layer.
    """
    if input_shape is None:
        # Fall back to the shape of the prepared training array.
        input_shape = X_train.shape[1:]
    hidden_layers = [
        keras.layers.Dense(units, activation=tf.nn.relu)
        for units in (256, 128, 64, 32, 16)
    ]
    return keras.Sequential(
        [keras.layers.Flatten(input_shape=input_shape)]
        + hidden_layers
        + [keras.layers.Dense(1)]
    )


In [None]:
# Per-row training weights: 1 + 0.25 * perishable (i.e. 1.25 for perishable
# items, assuming `perishable` is a 0/1 flag - TODO confirm).
# X_train stacks one copy of the item rows per training window, so the repeat
# count is derived from the row counts instead of hard-coding the number of
# windows used when the training set was assembled.
n_windows, leftover = divmod(X_train.shape[0], len(items))
assert leftover == 0, "X_train rows must be a whole multiple of the item rows"
sample_weights = np.tile(items["perishable"].values * 0.25 + 1, n_windows)

In [73]:
N_EPOCHS = 100

val_pred = []
test_pred = []
# Train one independent model per forecast day (one column of y_train each).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    # Release the global Keras state left by the previous iteration's model;
    # without this, building 16 models in a loop keeps accumulating memory.
    keras.backend.clear_session()
    y = y_train[:, i]
    # Targets are centred on the training mean; the mean is added back to
    # the predictions below.
    y_mean = y.mean()
    yv = y_val[:, i]
    model = build_model()
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mse'])
    model.fit(
        X_train, y - y_mean,
        batch_size=65536, epochs=N_EPOCHS, verbose=2,
        # The per-row weights computed earlier were never passed to fit, so
        # they had no effect on training; wire them in here.
        sample_weight=sample_weights,
        validation_data=(X_val, yv - y_mean),
    )
    val_pred.append(model.predict(X_val) + y_mean)
    test_pred.append(model.predict(X_test) + y_mean)

Step 1
Train on 1466226 samples, validate on 162914 samples
Epoch 1/100


KeyboardInterrupt: 

In [48]:
print("Making submission...")
# Stack the per-day predictions, drop the size-1 last axis, and transpose so
# rows line up with df_train's index and columns with the forecast days.
forecast_days = pd.date_range("2017-08-16", periods=16)
y_test = np.array(test_pred).squeeze(axis=2).T
df_preds = pd.DataFrame(y_test, index=df_train.index, columns=forecast_days)
# Wide -> long: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows with no matching prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Invert the log1p applied to unit_sales at load time, then bound to [0, 1000].
raw_sales = np.expm1(submission["unit_sales"])
submission["unit_sales"] = np.clip(raw_sales, 0, 1000)
submission.to_csv('nn.csv', float_format='%.4f', index=False)

Making submission...
