In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Conv1D, MaxPooling1D, Flatten, TimeDistributed, RepeatVector
import keras
from tensorflow.keras.utils import Sequence
import math
import os
import tensorflow as tf

In [11]:

# Directory that holds the input data files.
data_dir = '../code/data/'

# Name of the .npz archive.
filename = 'lstm_x_and_y.npz'

# Start with empty containers for x and y.
X, Y = [], []


In [4]:
# ## load the data <-- disabled: unfortunately this does not work, the kernel keeps dying.
# with np.load(os.path.join(data_dir,filename)) as data:
#     X = data['arr_0']
#     Y = data['arr_1']
    

In [5]:
# Load the input tables from `data_dir`.  Paths are built with os.path.join so
# the cell works whether or not `data_dir` ends with a path separator.
train_sales = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
#sell_prices = pd.read_csv(os.path.join(data_dir, 'sell_prices.csv'))
#calendar = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
submission_file = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
eval_file = pd.read_csv(os.path.join(data_dir, 'sales_train_evaluation.csv'))

In [6]:
# Build the training matrix. For now it contains only the sales values and no
# extra features: the identifier columns are dropped and the frame is
# transposed so that each row is one day (d_1, d_2, ...) and each column is one
# series.
# Normalization is not done here; it happens in the later "# normalize" cell.
sales = train_sales.drop(
    columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
).transpose()

sales.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_1,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
d_2,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
d_3,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
d_4,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
d_5,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0


In [None]:
# Scratch loop: prints 0 through 3, one value per line.
for i in (0, 1, 2, 3):
    print(i)

In [7]:

class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves sliding-window batches from a DataFrame.

    Every sample pairs ``timesteps`` consecutive rows of the frame (the input
    window) with the single row that immediately follows them (the target).

    Batches are addressed purely by their ``index``: batch ``k`` always holds
    the windows whose start rows are ``indexes[k * batch_size:(k + 1) * batch_size]``.
    No cursor is carried between calls, so a batch can be requested out of
    order, more than once, or from several workers and still contain the same
    data.

    Parameters
    ----------
    csv_file : pandas.DataFrame
        Frame with one row per time step; it is accessed through ``.iloc``.
    timesteps : int
        Number of consecutive rows in each input window.
    sliding_window : int
        Total number of rows one sample spans.  The number of available
        windows is ``rows - sliding_window + 1``.  It must be at least
        ``timesteps + 1`` when targets are generated (``to_fit=True``) and at
        least ``timesteps`` otherwise.
    batch_size : int, default 32
        Number of windows per batch.  A trailing partial batch is dropped.
    shuffle : bool, default False
        When true, the window start positions are reshuffled at the start of
        every epoch.
    to_fit : bool, default True
        When true ``__getitem__`` returns ``(X, y)``; otherwise only ``X``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``sliding_window`` is too small
        for the requested ``timesteps``.
    """

    def __init__(self, csv_file, timesteps, sliding_window, batch_size=32, shuffle=False, to_fit=True):
        # Let the Keras base class set up whatever state it needs.
        super().__init__()

        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        # A sample needs `timesteps` input rows plus one target row when fitting.
        required_span = timesteps + 1 if to_fit else timesteps
        if sliding_window < required_span:
            raise ValueError(
                "sliding_window must be at least %d for timesteps=%d (to_fit=%s)"
                % (required_span, timesteps, to_fit)
            )

        self.data = csv_file
        self.timesteps = timesteps
        self.sliding_window = sliding_window
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.to_fit = to_fit
        self.on_epoch_end()

    def _num_windows(self):
        """Return how many complete sliding windows fit in the data (never negative)."""
        return max(self.data.shape[0] - self.sliding_window + 1, 0)

    def __len__(self):
        """
            number of batches per epoch
        """
        return self._num_windows() // self.batch_size

    def __getitem__(self, index):
        """
            generates a single batch of data

            ``index`` is the batch number in ``[0, len(self))``.  Returns
            ``(X, y)`` when ``to_fit`` is true and ``X`` alone otherwise.
            Raises ``IndexError`` for an out-of-range batch number.
        """
        if not 0 <= index < len(self):
            raise IndexError("batch index %s out of range" % str(index))

        # Start rows of the windows that belong to this batch.
        first = index * self.batch_size
        indexes = self.indexes[first:first + self.batch_size]

        X = self._generate_X(indexes)

        if self.to_fit:
            y = self._generate_Y(indexes)
            return X, y
        return X

    def on_epoch_end(self):
        """
            rebuilds the list of window start rows for the next epoch and
            reshuffles it when ``shuffle`` is enabled
        """
        self.indexes = np.arange(self._num_windows())
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _generate_X(self, indexes):
        """
            generates batch data, thus the sliding windows for the current batch

            ``indexes`` holds the start row of each window; every window is the
            ``timesteps`` rows beginning at that start row.
        """
        return np.array(
            [self.data.iloc[start:start + self.timesteps].to_numpy() for start in indexes]
        )

    def _generate_Y(self, indexes):
        """
            generate the labels corresponding to the sliding window

            The label of a window is the row directly after its last input row.
        """
        return np.array(
            [self.data.iloc[start + self.timesteps].to_numpy() for start in indexes]
        )
    
        
    




In [17]:
# normalize: fit a MinMaxScaler on the sales matrix, then replace `sales` with
# the scaled values wrapped in a fresh DataFrame (default integer labels).
scaler = MinMaxScaler()
sales = pd.DataFrame(scaler.fit(sales).transform(sales))

In [18]:
# Alias, not a copy: `sales_numpy` is bound to the same object as `sales`, so an
# in-place change made through one name is visible through the other.
# NOTE(review): despite the name this is still the pandas DataFrame built in the
# normalization cell, not a NumPy array.
sales_numpy = sales

In [19]:
# Windows of 28 input rows; each sample spans 28 + 1 rows in total.
train_generator = DataGenerator(
    sales_numpy,
    timesteps=28,
    sliding_window=28 + 1,
    batch_size=32,
    shuffle=False,
)

In [20]:
# Number of batches per epoch.
len(train_generator)

58

In [21]:
## sanity check

# for index,(x,y) in enumerate(train_generator):
#     print(x.shape)
#     print(y.shape)
#     print(index)


In [22]:
# create model
#
# Two stacked bidirectional LSTM layers followed by a dense layer that emits
# one value per feature.

n_features = 30490
timesteps = 28
model = Sequential()
model.add(Bidirectional(LSTM(20, return_sequences=True), input_shape=(timesteps, n_features)))
model.add(Bidirectional(LSTM(10)))
# Output width follows n_features instead of repeating the literal.
model.add(Dense(n_features))
opt = keras.optimizers.Adam(learning_rate=0.01)
# Pass the configured optimizer object; the string 'adam' would build a fresh
# Adam with default settings and silently ignore the learning rate set above.
model.compile(optimizer=opt, loss='mse')


In [23]:
# Train for 10 epochs on the batches served by the generator.
# NOTE(review): newer Keras / TensorFlow releases deprecate `fit_generator` in
# favour of `fit` - confirm against the installed version before upgrading.
model.fit_generator(
    train_generator,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f24cb3c1bd0>

In [24]:
# test model
#
# Recursive multi-step forecast: predict one day, feed that prediction back in
# as the newest row of the input window, and repeat.  The rolling window is a
# local array, so `sales_numpy` is left untouched and re-running this cell
# gives the same forecast instead of appending further rows to the data.

horizon = 28  # number of future days to forecast

# Seed the window with the last `timesteps` (scaled) rows of the sales data.
window = sales_numpy.iloc[-timesteps:].to_numpy()
forecast_rows = []

for _ in range(horizon):
    # Add the leading batch axis the model expects: (1, timesteps, n_features).
    next_row = model.predict(window[np.newaxis, ...])[0]
    forecast_rows.append(next_row)

    # Slide forward one day: drop the oldest row, append the new prediction.
    window = np.vstack([window[1:], next_row[np.newaxis, :]])

# Back to the original sales scale.
predictions = scaler.inverse_transform(np.array(forecast_rows))
# Negative forecasts are clipped to zero; taking the absolute value would turn
# them into positive sales.
predictions = np.round(np.clip(predictions, 0, None))
# One row per series, one column per forecast day.
predictions = pd.DataFrame(predictions).T

In [26]:
# create submission file
#
# The forecast block is stacked on top of itself, renumbered from zero, cast to
# integers, and then labelled with the ids of the sample submission.
# NOTE(review): presumably the duplicate fills a second half of the sample
# submission - confirm against the file layout.

predictions_copy = predictions
final_submission = pd.concat(
    [predictions, predictions_copy],
    ignore_index=True,
).astype(int)

# `insert` aligns on the row index, which now runs 0..n-1.
final_submission.insert(0, 'id', submission_file['id'])
final_submission.columns = ['id'] + [f"F{i}" for i in range(1, 29)]

final_submission.to_csv('submission_year_datagen_allbatches.csv', index=False)