In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Conv1D, MaxPooling1D, Flatten, TimeDistributed, RepeatVector

In [2]:
# read data

data_dir = 'data/'

train_sales = pd.read_csv(data_dir + 'sales_train_validation.csv')
#sell_prices = pd.read_csv(data_dir + 'sell_prices.csv')
#calendar = pd.read_csv(data_dir + 'calendar.csv')
submission_file = pd.read_csv(data_dir + 'sample_submission.csv')

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
train_sales = reduce_mem_usage(train_sales) # takes about 4mins

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [5]:
# create training data

sales = train_sales.drop(["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"], axis=1).T

scaler = MinMaxScaler()
scaler.fit(sales)
sales = scaler.transform(sales)
sales = pd.DataFrame(sales)

In [6]:
# basic parameters

start_day = 1

timesteps = 28
prediction_steps = 1
len_window = timesteps + prediction_steps

nr_training_days = sales.shape[0] - start_day + 1
nr_sets = nr_training_days - len_window + 1

In [7]:
# create X and y from scratch or load them from npz file

loadXandYfromfile = False

In [8]:
# create X and y

if (loadXandYfromfile):
    XandY = np.load('lstm_x_and_y.npz')
    X = XandY['arr_0']
    y = XandY['arr_1']
    del XandY
else:
    base, predictions = [], []
    for i in range(nr_sets):
        samples = sales.iloc[i:i+timesteps]
        pred = sales.iloc[i+timesteps]
        base.append(samples.to_numpy())
        predictions.append(pred.to_numpy())
    X = np.array(base)
    y = np.array(predictions)
    del base, predictions

In [9]:
# split into training and validation set

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25)
del X, y

In [27]:
print(X_train.shape)
print(y_train.shape)

(1413, 28, 30490)
(1413, 30490)


In [None]:
# basic LSTM model

n_features = X_train.shape[2] # = 30490
n_outputs = y_train.shape[1]  # = 30490

model = Sequential()
model.add(Bidirectional(LSTM(20, return_sequences=True, input_shape=(timesteps, n_features))))
model.add(Bidirectional(LSTM(10)))
model.add(Dense(n_outputs))
model.compile(optimizer='adam', loss='mse')

In [35]:
# CNN-LSTM model

n_features = X_train.shape[2] # = 30490
n_outputs = y_train.shape[1]  # = 30490

model = Sequential()
model.add(Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(timesteps, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(RepeatVector(n_outputs))
model.add(LSTM(20, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(n_outputs)))
model.compile(loss='mse', optimizer='adam')

In [36]:
# train & validate model (1 epoch takes about 2mins)

model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=1, validation_data=(X_val, y_val))

ValueError: Error when checking target: expected time_distributed_9 to have 3 dimensions, but got array with shape (1413, 30490)

In [None]:
# test model

for i in range(28):    
    # get input for prediction by selecting last 28 days from sales
    X_pred = []
    X_pred.append(sales.iloc[-timesteps:].to_numpy())
    X_pred = np.array(X_pred)
    
    # get prediction
    prediction = model.predict(X_pred)
    
    # add prediction to sales so that it can be used for next prediction
    sales.loc[sales.shape[0]] = prediction[0]

# get all predictions, 'unnormalize' them, and round to integers
predictions = sales.iloc[-28:]
predictions = scaler.inverse_transform(predictions)
predictions = np.round(np.abs(predictions))
predictions = pd.DataFrame(predictions).T

In [None]:
# create submission file

predictions_copy = predictions
final_submission = pd.concat([predictions, predictions_copy])
final_submission.reset_index(drop=True, inplace=True)
final_submission = final_submission.astype(int)
final_submission.insert(0, 'id', submission_file['id'])
final_submission.columns = ['id'] + [f"F{i}" for i in range(1, 29)]

final_submission.to_csv('submission.csv', index=False)