## M5 forecasting Accuracy


## Import necessary packages

In [5]:
import pandas as pd
import numpy as np
import os
import gc

from sklearn.preprocessing import StandardScaler,MinMaxScaler

import tensorflow as tf


from keras import backend as K


Using TensorFlow backend.


## Explore the data a bit

In [None]:
# Folder that holds the competition CSV files
data_dir = './data'

# List the available files before loading anything
print(os.listdir(data_dir))

# Load the three raw tables: daily sales per series, the calendar, and prices
sales_train_validation, calendar, sell_prices = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('sales_train_validation.csv', 'calendar.csv', 'sell_prices.csv')
)


In [None]:
# Inspect every field of the first series (first row, all columns)
print(sales_train_validation.iloc[0, :])

In [None]:
# Print the hierarchy identifiers of the first series, one value per line
print(
    *sales_train_validation.iloc[0][['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
    sep='\n',
)





In [None]:
# Preview the sales table; a bare last expression uses the notebook's rich
# table rendering instead of the truncated plain-text repr that print() gives
sales_train_validation.head()

In [None]:
# Preview the calendar table; a bare last expression uses the notebook's rich
# table rendering instead of the plain-text repr that print() gives
calendar.head()

In [None]:
# Preview the price table; a bare last expression uses the notebook's rich
# table rendering instead of the plain-text repr that print() gives
sell_prices.head()

In [None]:
# Day numbers 1..1913 and the matching sales column labels d_1 .. d_1913
days = range(1, 1914)
time_series_columns = ['d_' + str(day) for day in days]

In [None]:
# Show only the first and last few labels; displaying the whole list would
# flood the cell output with 1913 entries
time_series_columns[:3], time_series_columns[-3:]

In [None]:
# Keep only the per-day sales columns (all rows), dropping the id columns
time_series_data = sales_train_validation.loc[:, time_series_columns]


## First Approach
The data is a bit complex because of all the features that influence the sales, as well as the format of the data. What makes it complex, in my opinion, is that we have both products and days: for each product there are 1913 days. It would be simpler if we had a single product and 1913 days, because that problem could be translated to the weather problem used in this [tutorial](https://www.tensorflow.org/tutorials/structured_data/time_series#the_weather_dataset), where they have days and temperature. So maybe we should think of this as a situation where we have n series of days and temperature, where n is the number of products.

Thus I would like to build a simple MLP, where the inputs are the products and their sales on different days, and then use an activation function that enables the MLP to "learn" the values.





In [None]:
# Display the day-column-only sales frame selected above via time_series_columns
time_series_data

In [None]:
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    """Cut a 1-D series into (history window, future target) training pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)`` the
    window is the ``history_size`` values immediately before ``i`` and the
    label is the single value at ``i + target_size``.

    Args:
        dataset: 1-D indexable series (e.g. a NumPy array).
        start_index: Index of the first value allowed inside a window.
        end_index: Exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of past values per window.
        target_size: Offset from ``i`` of the value used as the label.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays; each window in ``data`` has
        shape ``(history_size, 1)``.
    """
    first_position = start_index + history_size
    last_position = len(dataset) - target_size if end_index is None else end_index

    windows, targets = [], []
    for position in range(first_position, last_position):
        history = dataset[range(position - history_size, position)]
        # Add a trailing feature axis: (history_size,) -> (history_size, 1)
        windows.append(np.reshape(history, (history_size, 1)))
        targets.append(dataset[position + target_size])

    return np.array(windows), np.array(targets)

In [None]:
# Line plot of the full daily sales history for the series at row position 10
time_series_data.iloc[10].plot()

In [2]:
# Directory the CSV files below are read from (relative to the notebook)
input_path = "./data"

def get_salesval_coltypes():
    """Build the column-name -> dtype mapping for sales_train_validation.csv.

    The six identifier columns come first ('id' as object, the rest as
    category), followed by the day columns d_1 .. d_1913 as uint16.

    Returns:
        dict mapping each column name to a dtype string, in column order.
    """
    id_dtypes = {
        'id': 'object',
        'item_id': 'category',
        'dept_id': 'category',
        'cat_id': 'category',
        'store_id': 'category',
        'state_id': 'category',
    }
    day_dtypes = {f"d_{day}": "uint16" for day in range(1, 1914)}
    return {**id_dtypes, **day_dtypes}

# Sample submission: supplies the row ids expected in the final output file
submission_csv = os.path.join(input_path, 'sample_submission.csv')
submission = pd.read_csv(submission_csv)

# Sales history, read with compact dtypes (category ids, uint16 daily counts)
sales_csv = os.path.join(input_path, 'sales_train_validation.csv')
sales_train_val = pd.read_csv(sales_csv, dtype=get_salesval_coltypes())

In [3]:
sales_train_val.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [84]:
# Prepare scalers to normalize data
input_scaler = MinMaxScaler()
output_scaler = StandardScaler()

# Our timeseries data is in cols d_1 to d_1913 (the first 6 columns are ids)
data = sales_train_val.iloc[:, 6:]

# For LSTM, X needs to be a stack of shape (samples, timesteps, features).
# Here that is (n_windows * n_series, timesteps, 1), and the matching labels
# have shape (n_windows * n_series, prediction_steps).

# For later - test train split, for now just get shapes right
base = []
predictions = []

timesteps = 10
prediction_steps = 28

# Slice consecutive, non-overlapping input windows of `timesteps` days, each
# followed by the next `prediction_steps` days as labels.
# NOTE: range(1, 12) yields only 11 windows, covering column positions 10..147
# of `data`; the rest of the 1913 days is not used for training. With a proper
# train/test split this could be made more precise.
for i in range(1, 12):
    window_start = i * timesteps
    window_end = window_start + timesteps
    samples = data.iloc[:, window_start:window_end]
    preds = data.iloc[:, window_end:window_end + prediction_steps]
    base.extend(samples.to_numpy())
    predictions.extend(preds.to_numpy())
    print(f"Samples {samples.shape}, preds {preds.shape}")


# Scale and reshape our input (add the trailing feature axis for the LSTM)
X_train = np.array(base)
input_scaler.fit(X_train)
X_train = input_scaler.transform(X_train)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))


# Scale our prediction labels; keep the unscaled copy in Y_train_orig
Y_train_orig = np.array(predictions)
output_scaler.fit(Y_train_orig)
Y_train = output_scaler.transform(Y_train_orig)
print(X_train.shape)
print(Y_train.shape)

# Note this could be horrible on memory. Later, need to look at generating this in batches
del predictions
del base
gc.collect()

Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
Samples (30490, 10), preds (30490, 28)
(335390, 10, 1)
(335390, 28)


1463

In [85]:
def root_mean_squared_scaled_error(y_true,y_pred):
    """Batch-level approximation of the RMSSE loss.

    Returns ``sqrt((1 / forecasting_horizon) * numerator / denominator)`` where
    the numerator is the scaled sum of squared errors over the whole batch and
    the denominator is the scaled sum of squared differences between
    consecutive entries of ``y_true`` along its first axis.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values, same shape as ``y_true``.

    Returns:
        Scalar tensor with the loss value.
    """
    sample_length = 10
    forecasting_horizon = 28

    numerator = (1/(sample_length-1)) * K.sum(K.square(y_true-y_pred))
    # NOTE(review): the differences are taken along the first axis of y_true
    # (presumably the batch axis, not time) - confirm this is the intended scale.
    denominator = (1/(sample_length - 1 )) * K.sum(K.square(y_true[1:] - y_true[:-1]))
    # Guard against a zero denominator (single-sample or constant batch), which
    # would otherwise turn the loss into inf/NaN.
    value_to_be_sqrt = (1/forecasting_horizon) * (numerator/(denominator + K.epsilon()))
    # Take the square ROOT, as the function name and variable name require
    # (this previously squared the value instead).
    result = K.sqrt(value_to_be_sqrt)

    return result

In [86]:
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over all elements of the batch.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values, same shape as ``y_true``.

    Returns:
        Scalar tensor: ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Debug shape prints removed: a loss function should not write to stdout.
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Network dimensions come straight from the prepared training arrays
steps, n_features = X_train.shape[1], X_train.shape[2]
n_steps_out = Y_train.shape[1]

# Two stacked bidirectional LSTM layers followed by a dense layer that emits
# one value per forecast step
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(20, return_sequences=True, input_shape=(steps, n_features))
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10)),
    tf.keras.layers.Dense(n_steps_out),
])
# this loss needs changing to competition loss.
model.compile(optimizer='adam', loss=root_mean_squared_scaled_error)


In [87]:
# Shape of a single training sample: (timesteps, features)
print(X_train.shape[1:])

(10, 1)


In [88]:
%%time

# Notes from earlier runs, kept verbatim
# (NOTE(review): the meaning of the columns is not recorded here - confirm):
# 0.6345 200 56
# 0.5633 200 56 4m 14s

# Train for two passes over the full training set with progress output
model.fit(x=X_train, y=Y_train, epochs=2, verbose=1)

Train on 335390 samples
Epoch 1/2
Epoch 2/2
CPU times: user 19min 48s, sys: 3min 7s, total: 22min 56s
Wall time: 6min 22s


<tensorflow.python.keras.callbacks.History at 0x7f3a411ea510>

In [90]:
%%time

# Use the last `timesteps` days of every series as the model input
x_pred = data.iloc[:, -timesteps:].to_numpy()

# Normalize with the scaler that was fitted on the training windows
x_pred = input_scaler.transform(x_pred)
# Add the trailing feature axis the LSTM expects: (samples, timesteps, 1)
x_pred = x_pred.reshape((x_pred.shape[0], x_pred.shape[1], 1))

# Get our predictions
raw_pred = model.predict(x_pred)

# Inverse transform to get the predictions back at the original scale
all_pred = output_scaler.inverse_transform(raw_pred)
# Sales cannot be negative: clamp negative forecasts to zero (taking the
# absolute value would turn -3 into 3 units sold), then round to integers
all_pred = np.round(np.clip(all_pred, 0, None))


CPU times: user 14.5 s, sys: 2.34 s, total: 16.8 s
Wall time: 5.18 s


In [91]:
# The same forecast block is stacked twice, so the frame has two rows per series
first_block = pd.DataFrame(all_pred[:, 0:prediction_steps])
second_block = pd.DataFrame(all_pred[:, -prediction_steps:])

# Integer forecasts on a fresh 0..n-1 index so rows line up with the submission frame
validation = pd.concat([first_block, second_block]).astype(int).reset_index(drop=True)

# Attach the ids from the sample submission (aligned on index) and move them to the front
validation['id'] = submission.id
forecast_labels = [label for label in validation.columns if label != 'id']
validation = validation[['id'] + forecast_labels]

# Rename the forecast columns to the names required by the submission format
validation.columns = ['id'] + [f"F{i}" for i in range(1, 29)]

validation.to_csv('submission.csv', index=False)