In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
import statsmodels.api as sm
import datetime
from sklearn_pandas import DataFrameMapper

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense, Activation, Reshape, BatchNormalization, Dropout, concatenate, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import backend
from tensorflow.keras.constraints import NonNeg


from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import re
import gc
import matplotlib.pyplot as plt
%matplotlib inline

# from fastai.imports import *
# from fastai.column_data import *
# from fastai.structured import *



In [2]:
# Read data
data = pd.read_pickle(r'D:\Project\Pet_Project\Demand_Forecast\Data\data_2.pkl')

test = pd.read_pickle(r'D:\Project\Pet_Project\Demand_Forecast\Data\test.pkl')

In [3]:
# Select columns

data = data[[
    'date_block_num',
    'shop_id',
    'shop_category',
    'item_id',
    'item_cnt_month',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_4',
    'item_cnt_month_lag_5',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_shop_type_avg_item_cnt_lag_1',
    'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    'date_type_avg_item_cnt_lag_1',
    'date_subtype_avg_item_cnt_lag_1',
    'delta_price_lag',
    'month',
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale',
    'city_coord_1',
    'city_coord_2',
    'country_part',
    'weeknd_count',
    'days_in_month'
]]

In [4]:
# Define feature type
cat_feature = ["shop_id", "shop_category", "item_id", "city_code", "item_category_id", "type_code", "subtype_code", 'month', "country_part"]
con_feature = ['item_cnt_month_lag_1', 'item_cnt_month_lag_2','item_cnt_month_lag_3','item_cnt_month_lag_4', 'item_cnt_month_lag_5', 'item_cnt_month_lag_6', 'item_cnt_month_lag_12', 'date_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2', 'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6', 'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1', 'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12', 'date_cat_avg_item_cnt_lag_1', 'date_shop_cat_avg_item_cnt_lag_1', 'date_shop_type_avg_item_cnt_lag_1', 'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1', 'date_item_city_avg_item_cnt_lag_1', 'date_type_avg_item_cnt_lag_1', 'date_subtype_avg_item_cnt_lag_1', 'delta_price_lag', 'item_shop_last_sale', 'item_last_sale','item_shop_first_sale', 'item_first_sale', 'city_coord_1', 'city_coord_2', 'weeknd_count', 'days_in_month']

In [5]:
# Split data

X_train = data[data.date_block_num <= 33]
# X_valid = data[data.date_block_num == 33]
X_test = data[data.date_block_num == 34]

In [6]:
# Clean env
del data
gc.collect()

71

In [7]:
def prepare_df(data_df, isTrain=True, shuffle=True):

    if shuffle:
        data_df = data_df.sample(frac=1)

    for cat_f in cat_feature:
        data_df[cat_f] = data_df[cat_f].astype("category").cat.as_ordered()

    mapper = DataFrameMapper([
         (con_feature, StandardScaler())
    ])
    data_df[con_feature] = mapper.fit_transform(data_df)

    label_encoders = []
    for f_name in cat_feature:
        le = LabelEncoder()
        le.fit(data_df[f_name])
        label_encoders.append(le)
        data_df[f_name] = le.transform(data_df[f_name])

    sales_scaler = None
    if isTrain:
        sales_scaler = StandardScaler()
        sales_values = data_df.item_cnt_month.values.reshape(-1,1)
        scaled = sales_scaler.fit_transform(sales_values)
        data_df.item_cnt_month = scaled
    
    return data_df, sales_scaler, label_encoders

In [8]:
train_df, scaler, label_encoders = prepare_df(X_train)
train_df.head()

Unnamed: 0,date_block_num,shop_id,shop_category,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,month,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale,city_coord_1,city_coord_2,country_part,weeknd_count,days_in_month
9797431,29,1,1,9760,0.0,1,50,12,2,0.0,...,5,0.058824,0.0,0.878788,0.878788,0.899745,0.292587,1,0.0,0.666667
8695944,24,47,0,10387,0.049988,25,64,13,53,0.0,...,0,0.058824,0.0,0.515152,0.515152,0.882553,0.431409,4,0.0,1.0
5656821,15,24,0,2080,0.0,13,52,12,8,0.0,...,3,0.0,0.0,0.363636,0.363636,0.898822,0.289947,1,0.0,0.666667
5323490,14,24,0,5396,0.0,13,26,7,35,0.0,...,2,0.058824,0.0,0.212121,0.212121,0.898822,0.289947,1,1.0,1.0
6038554,16,34,1,14449,0.0,18,37,10,4,0.0,...,4,0.0,0.0,0.060606,0.181818,0.761451,0.306194,3,0.0,1.0


In [9]:
def data_for_model(data_df):
    x_fit = []

    for cat in cat_feature:
        x_fit.append(data_df[cat].values)

    for con in con_feature:
        x_fit.append(data_df[con].values)
        
    return x_fit

In [10]:
x_train_df = train_df[train_df.date_block_num < 33]
x_val_df = train_df[train_df.date_block_num == 33]
y_train, y_val = train_df[train_df.date_block_num < 33].item_cnt_month.values, train_df[train_df.date_block_num == 33].item_cnt_month.values

In [11]:
x_fit_train = data_for_model(x_train_df)
x_fit_val = data_for_model(x_val_df)

In [12]:
emb_space = [(len(le.classes_), min(25, len(le.classes_)) // 2 ) for idx, le in enumerate(label_encoders)]
emb_space

[(54, 12),
 (3, 1),
 (17054, 12),
 (31, 12),
 (79, 12),
 (19, 9),
 (62, 12),
 (12, 6),
 (5, 2)]

# Building keras model
- Each categorical feature goes thru embedding matrix
- Each continues feature (only Elapsed) goes thru simple Dense layer for relu activataion
- We add several dense layer and make singe linear output

In [None]:

def rmse(y_true, y_pred):
	return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

In [13]:
model_inputs = []
model_embeddings = []
    
for input_dim, output_dim in emb_space:
    i = Input(shape=(1,))
    emb = Embedding(input_dim=input_dim, output_dim=output_dim)(i)
    
    model_inputs.append(i)
    model_embeddings.append(emb)
    
    
con_outputs = []
for con in con_feature:
    elaps_input = Input(shape=(1,))
    elaps_output = Dense(10)(elaps_input) 
    #elaps_output = BatchNormalization()(elaps_output)
    elaps_output = Activation("relu")(elaps_output)
    
    elaps_output = Reshape(target_shape=(1,10))(elaps_output)

    model_inputs.append(elaps_input)
    con_outputs.append(elaps_output)

merge_embeddings = concatenate(model_embeddings, axis=-1)
if len(con_outputs) > 1:
    merge_con_output = concatenate(con_outputs)
else:
    merge_con_output = con_outputs[0]

merge_embedding_cont = concatenate([merge_embeddings, merge_con_output])
merge_embedding_cont

output_tensor = Dense(1000, name="dense1024")(merge_embedding_cont)
output_tensor = BatchNormalization()(output_tensor)
output_tensor = Activation('relu')(output_tensor)
#output_tensor = Dropout(0.5)(output_tensor)

output_tensor = Dense(500, name="dense512")(output_tensor)
output_tensor = BatchNormalization()(output_tensor)
output_tensor = Activation("relu")(output_tensor)
#output_tensor = Dropout(0.5)(output_tensor)

output_tensor = Dense(1, activation='linear', name="output", kernel_constraint = NonNeg())(output_tensor)

optimizer = Adam(lr=10e-3)

nn_model = Model(inputs=model_inputs, outputs=output_tensor)
nn_model.compile(loss='mse', optimizer=optimizer, metrics=['mean_squared_error'])


reduceLr=ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, verbose=1)
checkpoint = ModelCheckpoint("nn_model.hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')#val_mean_absolute_percentage_error
callbacks_list = [checkpoint, reduceLr]

In [14]:
history = nn_model.fit(x=x_fit_train, y=y_train.reshape(-1,1,1),
                       validation_data=(x_fit_val, y_val.reshape(-1,1,1)),
                       batch_size=1024, epochs=10, callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.00222, saving model to nn_model.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.00222 to 0.00215, saving model to nn_model.hdf5

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.00215

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.00215

Epoch 00004: ReduceLROnPlateau reducing learning rate to 7.999999215826393e-05.
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.00215

Epoch 00005: ReduceLROnPlateau reducing learning rate to 1.599999814061448e-05.
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.00215

Epoch 00006: ReduceLROnPlateau reducing learning rate to 3.199999628122896e-06.
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.00215

Epoch 00007: ReduceLROnPlateau reducing learning rate to 6.399999165296323e-07.
Epoch 8

In [15]:
from tensorflow.keras.models import load_model
tt_model = load_model(r'D:\Project\Pet_Project\Demand_Forecast\Experiments\nn_model.hdf5')

In [16]:
test_df, _, _ = prepare_df(X_test, isTrain=False, shuffle=False)
x_fit_test = data_for_model(test_df)

scaled_preds = tt_model.predict(x=x_fit_test)

In [17]:
scaled_preds.ravel().shape

(214200,)

In [18]:
y_predictions = scaler.inverse_transform(scaled_preds.reshape(-1, 1))
y_predictions = y_predictions.reshape(-1)

In [20]:
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": y_predictions
})
submission[submission < 0] = 0
submission.to_csv(r'D:\Project\Pet_Project\Demand_Forecast\Results\keras_nn_3.csv', index=False)