In [1]:

import sys
sys.path.append('./tf_pipeline/')

from tf_pipeline.tf_utils import create_mlp_softmax

from tf_pipeline.conf import *
import pandas as pd
import os 
import numpy as np 
from datetime import datetime,timedelta


def create_dt(horizon="validation", tr_last=1913):
    """Assemble the long-format sales table used by the rest of the notebook.

    Reads ``sell_prices.csv``, ``calendar.csv`` and ``sales_train_<horizon>.csv``
    from ``RAW_PATH``, replaces every categorical column with zero-based int16
    codes, appends 2 * 28 empty (NaN) day columns after ``tr_last``, melts the
    day columns into ``d`` / ``sales`` rows and joins calendar, prices and a
    per (dept_id, store_id) ``rate`` column.

    Args:
        horizon: suffix of the sales file name (``sales_train_<horizon>.csv``).
        tr_last: index of the last training day; columns ``d_1`` .. ``d_<tr_last>``
            are read from the sales file.

    Returns:
        The merged long-format ``DataFrame``.
    """

    def to_zero_based_codes(frame, column):
        # Category codes use -1 for missing values; subtracting the minimum
        # shifts the codes so that the smallest one becomes 0.
        codes = frame[column].cat.codes.astype("int16")
        frame[column] = codes - codes.min()

    # --- prices -----------------------------------------------------------
    price_file = os.path.join(RAW_PATH, "sell_prices.csv")
    prices = pd.read_csv(price_file, dtype=PRICE_DTYPES)
    for name, kind in PRICE_DTYPES.items():
        if kind == "category":
            to_zero_based_codes(prices, name)

    # --- calendar ---------------------------------------------------------
    calendar_file = os.path.join(RAW_PATH, "calendar.csv")
    cal = pd.read_csv(calendar_file, dtype=CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for name, kind in CAL_DTYPES.items():
        if kind == "category":
            to_zero_based_codes(cal, name)

    # --- sales (wide format, one column per day) ----------------------------
    day_cols = [f"d_{day}" for day in range(1, tr_last + 1)]
    id_cols = ["id", "item_id", "dept_id", "store_id", "cat_id", "state_id"]
    coded_cols = [name for name in id_cols if name != "id"]

    sales_dtypes = {name: "float32" for name in day_cols}
    for name in coded_cols:
        sales_dtypes[name] = "category"

    sales_file = os.path.join(RAW_PATH, "sales_train_%s.csv" % horizon)
    sales = pd.read_csv(sales_file, usecols=id_cols + day_cols, dtype=sales_dtypes)
    for name in coded_cols:
        to_zero_based_codes(sales, name)

    # --- growth rate per (dept_id, store_id) ---------------------------------
    # Relative change of the group total against the value 28 rows (days)
    # earlier, over the last 365 days; only changes with |value| < 1 enter the
    # mean, and 1 is added to turn the mean change into a multiplier.
    group_totals = sales.groupby(["dept_id", "store_id"])[day_cols].sum()
    by_day = group_totals.T
    by_day_28_ago = by_day.shift(28)
    relative_change = (by_day - by_day_28_ago) / by_day_28_ago
    relative_change = relative_change.reset_index(drop=True).iloc[-365:, :]
    rates = relative_change[relative_change.abs() < 1].mean() + 1
    rates = rates.reset_index().rename(columns={0: "rate"})

    # --- placeholder columns for the 2 * 28 days after the training window ---
    first_future_day = tr_last + 1
    last_future_day = tr_last + 2 * 28
    for day in range(first_future_day, last_future_day + 1):
        sales[f"d_{day}"] = np.nan

    # --- wide -> long, then join the side tables -----------------------------
    melted_cols = [name for name in sales.columns if name.startswith("d_")]
    long_sales = pd.melt(
        sales,
        id_vars=id_cols,
        value_vars=melted_cols,
        var_name="d",
        value_name="sales",
    )

    long_sales = long_sales.merge(cal, on="d", copy=False)
    long_sales = long_sales.merge(
        prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False
    )
    # No explicit key: pandas joins on the columns shared with `rates`.
    long_sales = long_sales.merge(rates, how="left")

    return long_sales

def compute_share(dt):
    """Convert ``sales`` into each row's share of its group total.

    The group total is the sum of ``sales`` over all rows sharing the same
    ``dept_id``, ``store_id`` and ``date``. The input frame is left untouched;
    a new frame with the same columns is returned. There is no guard against a
    zero group total, so such rows follow pandas' division rules.

    Args:
        dt: frame with at least ``dept_id``, ``store_id``, ``date`` and ``sales``.

    Returns:
        A new ``DataFrame`` where ``sales`` holds ``sales / group total``.
    """
    group_keys = ["dept_id", "store_id", "date"]

    totals = dt.groupby(group_keys)["sales"].sum()
    totals = totals.reset_index().rename(columns={"sales": "gp_sales"})

    # Left join on the columns the two frames have in common, so every input
    # row is kept in its original order.
    with_totals = dt.merge(totals, how="left")
    with_totals["sales"] = with_totals["sales"] / with_totals["gp_sales"]

    return with_totals.drop(columns=["gp_sales"])

def _week_of_year(dates):
    """Return the ISO week number for every timestamp in the Series ``dates``.

    ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0,
    so ``isocalendar().week`` is used whenever the accessor provides it; the
    legacy attribute is only a fallback for older pandas versions.
    """
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        return accessor.isocalendar().week
    return accessor.weekofyear


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Columns added (all computed per ``id``, in row order):
      * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 / 28 rows.
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows of ``lag_<lag>``,
        for lag in (7, 28) and win in (7, 28).
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
        int16 when the column already exists, otherwise derived from ``date``.

    Args:
        dt: frame with ``id``, ``sales`` and a datetime ``date`` column.

    Returns:
        The same ``DataFrame`` object, with the feature columns added.
    """
    lags = [7, 28]
    wins = [7, 28]

    for lag in lags:
        dt[f"lag_{lag}"] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    # Column creation order (window outer, lag inner) is kept so the resulting
    # column layout matches what downstream cells have been run against.
    for win in wins:
        for lag in lags:
            lag_col = f"lag_{lag}"
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # feature column -> attribute of the ``.dt`` accessor used to derive it
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            values = dt[date_feat_name]
        elif date_feat_func == "weekofyear":
            # Plain getattr(..., "weekofyear") fails on pandas >= 2.0.
            values = _week_of_year(dt["date"])
        else:
            values = getattr(dt["date"].dt, date_feat_func)
        dt[date_feat_name] = values.astype("int16")
    return dt

In [2]:
horizon = "validation"

# Reject unknown horizons up front; only the two below are handled.
if horizon not in ("validation", "evaluation"):
    raise ValueError('Wrong horizon arg.')

# Last training day index and first forecast day. The evaluation horizon has
# 28 more training days, so its first forecast day is 28 days later.
tr_last = 1913 if horizon == "validation" else 1941
fday = datetime(2016, 4, 25)
if horizon == "evaluation":
    fday = fday + timedelta(days=28)

# Build the long sales table, then print the column index after each stage.
dataframe = create_dt(horizon, tr_last)
print(dataframe.columns)

dataframe = compute_share(dataframe)
print(dataframe.columns)

dataframe = create_fea(dataframe)
print(dataframe.columns)


Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'rate'],
      dtype='object')
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'rate'],
      dtype='object')
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'rate', 'lag_7',
       'lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28',
       'week', 'quarter', 'mday'],


In [3]:
# Display the feature frame built above; the rendered table is truncated to
# the first and last rows/columns rather than showing every row.
dataframe

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,rate,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1,0.022727,2011-01-29,11101,...,1.020196,,,,,,,4,1,29
1,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_2,0.030675,2011-01-30,11101,...,1.020196,,,,,,,4,1,30
2,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_3,0.000000,2011-01-31,11101,...,1.020196,,,,,,,5,1,31
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_4,0.000000,2011-02-01,11101,...,1.020196,,,,,,,5,1,1
4,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_5,0.000000,2011-02-02,11101,...,1.020196,,,,,,,5,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47735392,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1969,,2016-06-19,11621,...,1.033772,,,,,,,24,2,19
47735393,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1968,,2016-06-18,11621,...,1.033772,,,,,,,24,2,18
47735394,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1969,,2016-06-19,11621,...,1.033772,,,,,,,24,2,19
47735395,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1968,,2016-06-18,11621,...,1.033772,,,,,,,24,2,18


In [4]:
import tensorflow.keras as tfk 
class CustomCallback(tfk.callbacks.Callback):
    """Keras callback that prints a one-line summary every ``print_n_epochs`` epochs.

    The line always contains the training loss; the validation RMSE is added
    when the ``val_root_mean_squared_error`` entry is present in ``logs``.
    """

    def __init__(self, print_n_epochs=10):
        """Store the reporting period.

        Args:
            print_n_epochs: a line is printed when ``epoch % print_n_epochs == 0``.
        """
        super().__init__()
        self.print_n_epochs = print_n_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Print the metrics of ``epoch`` if it falls on a reporting epoch.

        Args:
            epoch: zero-based epoch index passed by Keras.
            logs: metric dictionary for the epoch; may be ``None``.
        """
        if epoch % self.print_n_epochs != 0:
            return

        # Explicit lookups replace the former bare ``except:``, which hid every
        # error (not only a missing validation metric) and still crashed when
        # ``logs`` was None.
        logs = logs or {}
        loss = logs.get('loss')
        if loss is None:
            # Nothing to report for this epoch.
            return

        val_rmse = logs.get('val_root_mean_squared_error')
        if val_rmse is not None:
            print(f"E: {epoch} loss : {loss:.2f}, vrmse :{val_rmse:.2f}")
        else:
            print(f"E: {epoch} loss : {loss:.2f}")


In [None]:
import gc
import warnings

from tqdm import tqdm

warnings.filterwarnings('ignore')  # silence the flood of TensorFlow warnings
gc.collect()

# One prediction frame (ids x F1..F28) is collected per (store_id, dept_id).
list_preds = list()

# Categorical inputs fed to the embedding layers; identical for every group,
# so they are defined once outside the loop.
cat_feats = ['wday',
             'quarter',
             "event_name_1",
             "event_name_2",
             "event_type_1",
             "event_type_2",
             ]

groups = dataframe.groupby(['store_id', 'dept_id'])

# `ngroups` keeps the progress bar correct whatever the number of groups is.
for _, df_gp in tqdm(groups, total=groups.ngroups):

    n_items = len(df_gp['item_id'].drop_duplicates())

    # Ids ordered by item_id, i.e. in the same order as the pivoted columns.
    ids = df_gp[['id', 'item_id']].drop_duplicates()\
                                  .sort_values('item_id')['id']\
                                  .tolist()

    # One row per day, one column per (value, item_id).
    X = df_gp[
        ['d',
         'item_id',
         'date',
         'rmean_28_28',
         'sales']+cat_feats].pivot_table(index=['d', 'date']+cat_feats,
                  columns=['item_id'],
                  values=['rmean_28_28', 'sales']).fillna(0)

    num_feats = ['_'.join(list(map(str, c))) for c in X.columns.tolist()]
    X.columns = num_feats

    # The split below assumes n_items 'rmean_28_28' columns followed by
    # n_items 'sales' columns; fail loudly instead of misaligning them.
    if len(num_feats) != 2 * n_items:
        raise ValueError(
            'Expected %d pivoted columns, got %d.' % (2 * n_items, len(num_feats))
        )

    target_feats = num_feats[n_items:]
    num_feats = num_feats[:n_items]

    # The pivot index is ordered by the *string* 'd' ("d_10" < "d_2"); sort by
    # date so the test rows are chronological and match the F1..F28 labels.
    X = X.reset_index().sort_values('date')

    # Training window: the 364 days before the first forecast day.
    train_mask = (X['date'] < fday) & (X['date'] >= fday - timedelta(days=364))
    test_mask = X['date'] >= fday

    X_train = X.loc[train_mask, num_feats+cat_feats]
    X_test = X.loc[test_mask, num_feats+cat_feats]

    input_dict_train = {'input_%s' %
                        c: X_train[c] for c in num_feats+cat_feats}
    input_dict_test = {'input_%s' % c: X_test[c] for c in num_feats+cat_feats}

    cardinality = X[cat_feats].nunique()

    y_train = X.loc[train_mask, target_feats].values

    mlp = create_mlp_softmax(layers_list=[32, 64, ],
                             emb_dim=1,
                             output_count=n_items,
                             cat_feats=cat_feats,
                             cardinality=cardinality,
                             num_feats=num_feats)

    training_params = {
        'x': input_dict_train,
        'y': y_train,
        'batch_size': 32,
        'epochs': 300,
        # To log progress, add: 'callbacks': [CustomCallback(49)],
        'shuffle': True,
        'verbose': 0
    }

    mlp.fit(**training_params)
    preds = mlp.predict(input_dict_test)
    preds = pd.DataFrame(preds,
                         index=['F%s' % c for c in range(1, 29)],
                         columns=ids).T

    list_preds.append(preds)

    # Dropping the Python reference is not enough: Keras keeps global state
    # for every model built, which grows over the loop. Reset it explicitly.
    del mlp
    tfk.backend.clear_session()
    gc.collect()

preds = pd.concat(list_preds)

 54%|█████████████████████████████████████████▊                                   | 38/70 [1:01:01<50:59, 95.60s/it]

In [None]:
# Stack the per-group prediction frames and move the id index into a column.
preds = pd.concat(list_preds).reset_index()

# The first column (the former index) is named 'id'; the others keep their names.
preds.columns = ['id', *preds.columns.tolist()[1:]]

# Written to EXTERNAL_PATH as tf_weights_<horizon>.csv, without the row index.
preds.to_csv(os.path.join(EXTERNAL_PATH, "tf_weights_%s.csv" % horizon), index=False)

In [None]:
# Shell escape: runs score_submission_validation.py with the `python` found on PATH.
# NOTE(review): presumably scores the predictions written above — confirm which files the script reads.
!python score_submission_validation.py