In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, random,datetime

# Extend the module search path BEFORE importing project-local modules,
# otherwise `tf_utils` cannot be found on a fresh kernel when it lives in
# one of these directories.
sys.path.append(".") # For execution from the main file
sys.path.append("..") # For execution from the notebook

# custom (project-local) imports
from tf_utils import *

# Kept after the star import so that this `Pool` still takes precedence over
# any same-named object exported by tf_utils.
from multiprocessing import Pool        # Multiprocess Runs

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# dtypes used when reading calendar.csv; columns typed "category" are later
# replaced by integer codes in create_dt.
CAL_DTYPES = dict(
    event_name_1="category",
    event_name_2="category",
    event_type_1="category",
    event_type_2="category",
    weekday="category",
    wm_yr_wk="int16",
    wday="int16",
    month="int16",
    year="int16",
    snap_CA="float32",
    snap_TX="float32",
    snap_WI="float32",
)

# dtypes used when reading sell_prices.csv (same "category" convention).
PRICE_DTYPES = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)


In [3]:
# h        : presumably the forecast horizon in days — TODO confirm
# max_lags : number of past days kept as history when building test-time features
# tr_last  : number of the last `d_<n>` sales column read from the training file
h, max_lags, tr_last = 28, 57, 1913

# First calendar date for which predictions are produced.
fday = datetime.datetime(year=2016, month=4, day=25)

In [4]:
def _encode_categories(frame, cols):
    """Replace each categorical column of ``frame`` listed in ``cols`` by int16 codes, in place."""
    for col in cols:
        frame[col] = frame[col].cat.codes.astype("int16")
        # pandas encodes missing categories as -1; shifting by the minimum
        # guarantees that the smallest code is 0.
        frame[col] -= frame[col].min()


def create_dt(is_train = True, nrows = None, first_day = 1200, data_dir = "../data/raw"):
    """Load the raw CSV files and build the long-format sales table.

    Reads ``sell_prices.csv``, ``calendar.csv`` and
    ``sales_train_validation.csv`` from ``data_dir``, converts every
    categorical column to int16 codes starting at 0, melts the daily
    ``d_<n>`` sales columns into rows and joins calendar and price data.

    Parameters
    ----------
    is_train : bool
        When True, days are read starting from ``max(1, first_day)``.
        When False, days are read starting from
        ``max(tr_last - max_lags, first_day)`` and ``h`` extra day columns
        (``d_{tr_last+1}`` .. ``d_{tr_last+h}``) are appended with NaN sales,
        to be filled with predictions by the caller.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file.
    first_day : int
        Lower bound for the first ``d_<n>`` column that is loaded.
    data_dir : str
        Directory containing the three raw CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per (series, day) with the id/category columns, ``d``,
        ``sales`` and the merged calendar and price columns. Both merges use
        the pandas default inner join, so rows without a matching calendar
        or price entry are dropped.

    Notes
    -----
    NOTE(review): category codes are derived independently for the price
    file and the sales file; the merge on ``store_id``/``item_id`` assumes
    both files yield the same code for the same label — verify when ``nrows``
    restricts the sales file.
    """
    prices = pd.read_csv(os.path.join(data_dir, "sell_prices.csv"), dtype = PRICE_DTYPES)
    _encode_categories(
        prices, [col for col, col_dtype in PRICE_DTYPES.items() if col_dtype == "category"])

    cal = pd.read_csv(os.path.join(data_dir, "calendar.csv"), dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_categories(
        cal, [col for col, col_dtype in CAL_DTYPES.items() if col_dtype == "category"])

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    # "id" stays a plain string column; every other identifier is categorical.
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(os.path.join(data_dir, "sales_train_validation.csv"),
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    _encode_categories(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Placeholder columns for the days to forecast (horizon `h`).
        for day in range(tr_last+1, tr_last+ h +1):
            dt[f"d_{day}"] = np.nan

    # Wide -> long: one row per (series, day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the columns ``id``, ``sales`` and a datetime ``date``.
        Rows are expected to be in chronological order within each ``id``,
        since lags are computed by position.

    Adds the columns
      * ``lag_7``, ``lag_28``: sales shifted by 7 / 28 rows within each id;
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows of ``lag_<lag>``
        within each id, for lag and win in {7, 28};
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
        int16 when already present, otherwise derived from ``date``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # Feature name -> attribute of the pandas `.dt` accessor used to derive it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            # Already provided (e.g. by the calendar merge): only normalise the dtype.
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # `Series.dt.weekofyear` was removed in pandas 2.0; the ISO calendar
            # week is its documented replacement and yields the same numbers.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [5]:
# First `d_<n>` day to load. 1 loads the complete history, which carries a
# high risk of running out of memory.
FIRST_DAY = 1

# Build the long training table, then add the features to it in place.
df = create_dt(first_day=FIRST_DAY, is_train=True)
create_fea(df)

# Footprint of the frame in GB (bytes / 1e9).
print(f"MEMORY USAGE : {df.memory_usage().sum()/1e9}")



MEMORY USAGE : 5.06307527


In [None]:
# Integer-coded categorical columns: item/store hierarchy first, calendar events second.
cat_feats = [
    "item_id",
    "dept_id",
    "store_id",
    "cat_id",
    "state_id",
    "event_name_1",
    "event_name_2",
    "event_type_1",
    "event_type_2",
]

# Columns that are excluded from the model inputs.
useless_cols = [
    "id",
    "date",
    "sales",
    "d",
    "wm_yr_wk",
    "weekday",
]

In [None]:
# Number of missing values per column.
df.isna().sum(axis=0)

In [None]:
# Remove, in place, every row holding at least one missing value, then ask
# the garbage collector to release the freed memory.
df.dropna(axis=0, how="any", inplace=True)
gc.collect()

In [None]:
# Footprint of the frame in GB (bytes / 1e9), index included.
df.memory_usage(index=True).sum() / 1e9

In [None]:
# Training sample weights: the day number parsed from the "d_<n>" label,
# divided by the largest day number, so older samples have less impact.
weights = df["d"].str[2:].astype(int)
weights = weights.div(weights.max())

In [None]:

# Every column that is neither excluded nor categorical is a numerical feature
# (original column order is kept).
num_feats = [col for col in df.columns
             if col not in useless_cols and col not in cat_feats]
train_cols = num_feats + cat_feats

# Model inputs and regression target.
X_train = df[train_cols]
y_train = df["sales"]

# np.random.seed(777)
# fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
# train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

# X_test,y_test = X_train.loc[fake_valid_inds],y_train.loc[fake_valid_inds]
# X_train,y_train = X_train.loc[train_inds],y_train.loc[train_inds]

# Largest integer code observed for each categorical column.
cardinality = df[cat_feats].max()
# Sample weights aligned on the training rows.
weights_train = weights.loc[X_train.index]

In [None]:
# further preprocessing
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train[num_feats] = scaler.fit_transform(X_train[num_feats])
# X_test[num_feats] = scaler.transform(X_test[num_feats])  # reuse the statistics fitted on the training data

# prepare input for tensorflow
# as there are several input types, the inputs are fed as a dict keyed by "input_<column name>"

input_dict = {f"input_{col}": X_train[col] for col in X_train.columns}
# input_dict_test = {f"input_{col}": X_test[col] for col in X_train.columns}

# Release the big frames. `X_test` only exists when the commented-out
# validation split is enabled, so it is removed only if present: a plain
# `del X_test` raises NameError here and prevents gc.collect() from running.
del df, X_train
globals().pop("X_test", None)
gc.collect()

In [None]:
## TF2 model

# Dense model, not sequential

# TensorFlow / Keras entry points, imported under short aliases.
import tensorflow as tf 
import tensorflow.keras as tfk
import tensorflow.keras.backend as K

# Short alias for the Keras layers namespace.
tfkl = tfk.layers

# # loss for a poisson regression 
# def poisson(y_true, y_pred): 
#     return K.mean(K.maximum(.0, y_pred) - y_true * K.log(K.maximum(.0, y_pred) + K.epsilon()), axis=-1)

# def tweedie_loss(y_true, y_pred):
#     p=1.5
#     dev = K.pow(y_true, 2-p)/((1-p) * (2-p)) \
#     - y_true * K.pow(K.maximum(.0, y_pred)+ K.epsilon(), 1-p)/(1-p) \
#     + K.pow(K.maximum(.0, y_pred)+ K.epsilon(), 2-p)/(2-p)
#     return K.mean(dev,axis=-1)

# alpha=.5
# def weighted_loss(y_true, y_pred):
#     ''' make a comprised loss of poisson and tweedie distribution'''
#     return (1 - alpha) * poisson(y_true, y_pred) + alpha * tweedie(y_true, y_pred)

# def create_mlp(layers_list=[512,256,128,64]):
#     '''
#     description : 
#     generate regression mlp with
#     both embedding entries for categorical features and 
#     standard inputs for numerical features

#     params:
#     layers_list : list of layers dimensions 
#     output :
#     compiled keras model  
#     '''

#     # define our MLP network
#     layers = []
#     output_num = []
#     inputs = []
#     output_cat = []
#     output_num = []
    
#     # sequencial inputs 

#     # numerical data part
#     if len(num_feats) > 1:
#         for num_var in num_feats:
#             print(num_var)
#             input_num = tfkl.Input(
#                 shape=(1,), name='input_{0}'.format(num_var))
#             inputs.append(input_num)
#             output_num.append(input_num)
#         output_num = tfkl.Concatenate(name='concatenate_num')(output_num)
#         output_num = tfkl.BatchNormalization()(output_num)

#     else:
#         input_num = tfkl.Input(
#             shape=(1,), name='input_{0}'.format(numeric_features[0]))
#         inputs.append(input_num)
#         output_num = input_num

#     # categorical data input 
#     for categorical_var in cat_feats:
#         no_of_unique_cat = cardinality[categorical_var] # should be nunique() but events are poorly preprocessed 
#         print(categorical_var , no_of_unique_cat)
#         embedding_size = min(np.ceil((no_of_unique_cat)/2), 50)
#         embedding_size = int(embedding_size)
#         vocab = no_of_unique_cat+1
#         # functionnal loop
#         input_cat = tfkl.Input(
#             shape=(1,), name='input_{0}'.format(categorical_var))
#         inputs.append(input_cat)
#         embedding = tfkl.Embedding(vocab,
#                                    embedding_size,
#                                    embeddings_regularizer = tf.keras.regularizers.l1(1e-8),
#                                    name='embedding_{0}'.format(categorical_var))(input_cat)
#         embedding = tfkl.Dropout(0.1)(embedding)
#         vec = tfkl.Flatten(name='flatten_{0}'.format(
#             categorical_var))(embedding)
        
#         output_cat.append(vec)
#     output_cat = tfkl.Concatenate(name='concatenate_cat')(output_cat)

#     # concatenate numerical input and embedding output
#     dense = tfkl.Concatenate(name='concatenate_all')([output_num, output_cat])

#     for i in range(len(layers_list)):
#         dense = tfkl.Dense(layers_list[i],
#                            name='Dense_{0}'.format(str(i)),
#                            activation='elu')(dense)
#         dense = tfkl.Dropout(.1)(dense)
#         dense = tfkl.BatchNormalization()(dense)

#     dense2 = tfkl.Dense(1, name='Output', activation='elu')(dense)
#     model = tfk.Model(inputs, dense2)

#     opt = tfk.optimizers.Adam(learning_rate=1e-2)
#     model.compile(loss=poisson, optimizer=opt, metrics=[tf.keras.metrics.RootMeanSquaredError()])
#     return model

# try:
#     del mdl
# except:
#     pass

# Build the model ONCE with the explicit configuration and keep the result.
# Previously the configured model was discarded and `mdl` came from a second,
# argument-less call, so these settings never reached the trained model.
# `create_mlp` is provided by the `tf_utils` star import.
mdl = create_mlp(layers_list=[512,256,128,64],
                 emb_dim=50,
                 loss_fn='poisson',
                 learning_rate=1e-2,
                 optimizer=tfk.optimizers.Adam,
                 cat_feats=cat_feats,
                 num_feats=num_feats,
                 cardinality=cardinality, verbose=1)

# mdl = create_mlp()
# mdl.summary()

In [None]:
# checkpoints
# NOTE(review): `model_save` is created but is not part of the `callbacks`
# list passed to fit() below, so no checkpoint is written — confirm whether
# checkpointing should be enabled.
model_save = tfk.callbacks.ModelCheckpoint('model_checkpoints')

# No validation data is passed to fit(), so no `val_*` metric is ever logged
# and a callback monitoring `val_root_mean_squared_error` can never trigger.
# Monitor the training loss, which is always available; switch back to a
# validation metric when `validation_data` is re-enabled.
early_stopping = tfk.callbacks.EarlyStopping(monitor='loss',
                                             patience = 7,
                                             restore_best_weights=True)

# Inputs are fed as a dict keyed by input-layer name; each sample is weighted
# by `weights_train`.
history = mdl.fit(input_dict,
                  y_train.values,
                  #validation_data=(input_dict_test, y_test.values),
                  batch_size=4096,
                  epochs=100,
                  shuffle=True,
                  sample_weight = weights_train.values,
                  callbacks=[early_stopping],
                  verbose=1,
                   )

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
mdl.save("keras_poisson_stable2.h5")

In [None]:
# import matplotlib.pyplot as plt 
# plt.plot(history.history['loss'])
# # plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'valid'], loc='upper left')
# plt.show()

# plt.plot(history.history['root_mean_squared_error'])
# # plt.plot(history.history['val_loss'])
# plt.title('model rmse')
# plt.ylabel('rmse')
# plt.xlabel('epoch')
# plt.legend(['train', 'valid'], loc='upper left')
# plt.show()
# plt.savefig('poisson.png')

In [None]:
# len(history.history['loss'])

In [None]:
from datetime import timedelta

# One recursive forecast is produced per multiplier in `alphas`; the forecasts
# are then averaged with equal weights.
alphas = [1.035, 1.03, 1.025]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

# Submission columns F1..F<h>, one per forecast day.
cols = [f"F{i}" for i in range(1, h + 1)]

# The test frame does not depend on alpha: read the CSV files once and give
# every run its own copy, since each run writes its predictions into it.
te_base = create_dt(False)

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = te_base.copy()

    for tdelta in range(0, h):
        day = fday + timedelta(days=tdelta)
        print(icount, day)
        # Only the last `max_lags` days are needed to compute the features of `day`.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        input_dict_predict = {f"input_{col}": tst[col] for col in tst.columns}
        pred = mdl.predict(input_dict_predict,batch_size=10000)
        # Keras returns a 2-D (n_rows, 1) array: flatten it before writing it
        # into the single "sales" column. The scaled prediction is stored so
        # that the following days can use it as lagged sales.
        te.loc[te.date == day, "sales"] = alpha * np.ravel(pred)
        print(pred)

    # Long -> wide: one row per id, one column per forecast day.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        # Rows are aligned: both frames are sorted by id with a fresh index.
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

# Duplicate the rows under the "evaluation" ids. `regex=True` is required:
# since pandas 2.0 the default is a literal match, which would leave the
# "validation$" pattern unmatched and the ids unchanged.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission_tf_stable.csv",index=False)

In [None]:
# # submit using the tensorflow model 

# alphas = [1.035, 1.03, 1.025]
# weights = [1/len(alphas)]*len(alphas)
# sub = 0.

# for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

#     te = create_dt(False)
#     cols = [f"F{i}" for i in range(1,29)]

#     for tdelta in range(0, 28):
#         day = fday + timedelta(days=tdelta)
#         print(icount, day)
#         tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
#         create_fea(tst)
#         tst = tst.loc[tst.date == day , train_cols]
#         input_dict_predict = {f"input_{col}": tst[col] for col in tst.columns}
#         te.loc[te.date == day, "sales"] = alpha*mdl.predict(input_dict_predict,batch_size=10000) # magic multiplier by kyakovlev

#     te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
#     te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
#     te_sub.fillna(0., inplace = True)
#     te_sub.sort_values("id", inplace = True)
#     te_sub.reset_index(drop=True, inplace = True)
#     te_sub.to_csv(f"submission_{icount}.csv",index=False)
#     if icount == 0 :
#         sub = te_sub
#         sub[cols] *= weight
#     else:
#         sub[cols] += te_sub[cols]*weight
#     print(icount, alpha, weight)

# sub2 = sub.copy()
# sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
# sub = pd.concat([sub, sub2], axis=0, sort=False)
# sub.to_csv("submission.csv",index=False)