In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Dense, SimpleRNN, GRU, ReLU
from tensorflow.keras.losses import MAPE

from collections import OrderedDict

In [2]:
train_df = pd.read_csv('../data/train_dataV3.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,ItemCode,Date,DailySales,CategoryCode
0,0,3418,2021-10-01,18,category_1
1,1,3418,2021-10-02,3,category_1
2,2,3418,2021-10-03,8,category_1
3,3,3418,2021-10-04,2,category_1
4,4,3418,2021-10-05,4,category_1


In [3]:
sale_df = pd.read_csv('../data/SalesPointsV2.csv')
sale_df.head()

Unnamed: 0.1,Unnamed: 0,ItemCode,#sales,stage,WeeklySales
0,0,3418,21,validation,"[29, 42, 41, 41, 44, 46, 43, 49, 54, 69, 101, ..."
1,1,3427,21,validation,"[11, 40, 20, 48, 54, 28, 36, 52, 42, 57, 46, 4..."
2,2,7666,21,validation,"[84, 119, 196, 185, 94, 15, 10, 30, 79, 82, 14..."
3,3,9925,21,test,"[1, 9, 11, 13, 5, 11, 11, 13, 12, 10, 21, 15, ..."
4,4,16936,21,test,"[9, 25, 40, 31, 21, 43, 7, 34, 151, 32, 28, 25..."


In [4]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
	X, y = list(), list()
	for i in range(len(sequence)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the sequence
		if end_ix > len(sequence)-1:
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
		X.append(seq_x)
		y.append(seq_y)
	return array(X), array(y)

In [5]:
from numpy import array
import random
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        rnd_ind = random.choice([0, 1, 2, 3,])
        seq_x, seq_y = sequence[i+rnd_ind:end_ix+rnd_ind], sequence[end_ix+rnd_ind:end_ix+rnd_ind+4]

        if len(seq_y) == 4:
            X.append(seq_x)
            y.append(seq_y)
        else:
            return X, y
 
# define input sequence
# raw_seq = range(10, 1000, 10)
# choose a number of time steps

# n_steps = 10
# split into samples
# X, y = split_sequence(raw_seq, n_steps)
# summarize the data
# for i in range(len(X)):
#     print(X[i], y[i])

In [6]:
def dataset_gen(df):
    X_dataset, y_dataset = [], []
    for l in df['WeeklySales'].values:
        x, y = split_sequence(eval(l), 15)
        X_dataset.extend(x)
        y_dataset.extend(y)

    return np.array(X_dataset), np.array(y_dataset)
    

In [7]:
X_dt, y_dt = dataset_gen(sale_df)
X_dt

array([[42, 41, 41, ..., 39, 83, 85],
       [42, 41, 41, ..., 39, 83, 85],
       [11, 40, 20, ..., 78, 32, 66],
       ...,
       [22, 26, 14, ..., 27, 31, 53],
       [ 3,  0,  0, ...,  0,  5,  7],
       [ 3,  0,  0, ...,  0,  5,  7]])

In [8]:
X_dt.shape

(225, 15)

In [9]:
# define model
model = Sequential()
model.add(GRU(20, activation='relu', input_shape=(15, 1)))
model.add(Dense(4))
model.add(ReLU())

opt = tf.keras.optimizers.Adagrad(learning_rate=0.02)
model.compile(optimizer=opt, loss='mape')



In [10]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X_dt = X_dt.reshape((X_dt.shape[0], X_dt.shape[1], n_features))

In [11]:
# fit model
model.fit(X_dt, y_dt, epochs=200, verbose=0)#, batch_size=4)

<keras.callbacks.History at 0x1b90b09ca30>

In [12]:
val_df = pd.read_csv('../data/validation_data.csv').sort_values(by=['ItemCode', 'Week'])
sales = val_df.groupby('ItemCode')['WeeklySales'].agg(list)
# val_df['Sales'] = val_df['ItemCode'].map(sales)
# val_df.head()
sales

ItemCode
3418          [25, 69, 120, 69]
3427            [7, 56, 72, 33]
7666               [48, 16, 17]
17287          [57, 60, 83, 64]
17296      [366, 514, 728, 771]
                   ...         
1097143        [43, 47, 32, 27]
1098493           [10, 8, 9, 9]
1098502            [7, 6, 7, 4]
1101661        [64, 69, 54, 88]
1105009        [11, 10, 10, 19]
Name: WeeklySales, Length: 95, dtype: object

In [13]:
def dataset_vgen(train_df, val_ser):
    tr_df = train_df.copy().set_index('ItemCode')
    X_dataset, y_dataset = [], []
    for i,l in tr_df.loc[tr_df['stage']=='validation', ['WeeklySales']].to_dict()['WeeklySales'].items():
        x = eval(l)[-15:]
        X_dataset.append(x)
        y_dataset.append(val_ser[i])

    return np.array(X_dataset), np.array(y_dataset)

In [14]:
X_val, y_val = dataset_vgen(sale_df, sales)

  return np.array(X_dataset), np.array(y_dataset)


In [15]:
y_hat = model(X_val).numpy()

In [16]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
print(y_val.ravel(), y_val.ravel())

In [None]:
mean_absolute_percentage_error(y_val.ravel(), y_val.ravel())

In [18]:
def dataset_tgen(train_df):
    tr_df = train_df.copy().set_index('ItemCode')
    X_dataset = []
    for i,l in tr_df.loc[tr_df['stage']=='test', ['WeeklySales']].to_dict()['WeeklySales'].items():
        x = eval(l)[-15:]
        X_dataset.append(x)
    tr_df.reset_index(inplace=True)
    return np.array(X_dataset), tr_df.loc[tr_df['stage']=='test', ['ItemCode']].values

In [19]:
X_test, item_idx = dataset_tgen(sale_df)

In [20]:
y_test = model(X_test).numpy()

In [21]:
I2C_map = pd.Series(train_df.CategoryCode.values,index=train_df.ItemCode).to_dict()

In [22]:
labels = []
for i in range(y_test.shape[0]):
    it = item_idx[i][0]
    labels.extend([f"{I2C_map[it]}_{it}_w1", f'{I2C_map[it]}_{it}_w2', f'{I2C_map[it]}_{it}_w3', f'{I2C_map[it]}_{it}_w4'])

In [23]:
test_dict = {'ID': labels, 'WeeklySales': y_test.astype(np.int).ravel()}
res_df = pd.DataFrame(test_dict)

In [24]:
res_df.head()

Unnamed: 0,ID,WeeklySales
0,category_2_9925_w1,0
1,category_2_9925_w2,0
2,category_2_9925_w3,0
3,category_2_9925_w4,0
4,category_3_16936_w1,0


In [25]:
res_df.to_csv('../data/answer2.csv')