In [1]:

import sys
sys.path.append('./tf_pipeline/')

from tf_pipeline.tf_utils import create_mlp_softmax

from tf_pipeline.conf import *
import pandas as pd
import os 
import numpy as np 
from datetime import datetime,timedelta


def create_dt(horizon="validation", tr_last=1913):
    """Build the long-format (one row per id and day) sales table.

    Reads ``sell_prices.csv``, ``calendar.csv`` and
    ``sales_train_<horizon>.csv`` from ``RAW_PATH``, replaces categorical
    columns by zero-based int16 codes, appends ``2 * 28`` empty day columns
    after ``tr_last``, melts the day columns into rows and joins the calendar,
    the prices and a per (dept_id, store_id) growth rate.

    Parameters
    ----------
    horizon : str
        Suffix of the sales file to load (``sales_train_<horizon>.csv``).
    tr_last : int
        Number of the last ``d_<n>`` column read from the sales file.

    Returns
    -------
    pandas.DataFrame
        Melted sales (column ``sales``, NaN for the appended days) merged with
        the calendar, price and ``rate`` columns.
    """

    def _zero_based_codes(column):
        # int16 category codes shifted so the smallest code present becomes 0.
        codes = column.cat.codes.astype("int16")
        return codes - codes.min()

    # --- prices -----------------------------------------------------------
    prices = pd.read_csv(os.path.join(RAW_PATH, "sell_prices.csv"), dtype=PRICE_DTYPES)
    for name, declared in PRICE_DTYPES.items():
        if declared == "category":
            prices[name] = _zero_based_codes(prices[name])

    # --- calendar ---------------------------------------------------------
    cal = pd.read_csv(os.path.join(RAW_PATH, "calendar.csv"), dtype=CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for name, declared in CAL_DTYPES.items():
        if declared == "category":
            cal[name] = _zero_based_codes(cal[name])

    # --- sales (wide format) ----------------------------------------------
    numcols = [f"d_{day}" for day in range(1, tr_last + 1)]
    catcols = ["id", "item_id", "dept_id", "store_id", "cat_id", "state_id"]
    encoded_cols = [name for name in catcols if name != "id"]

    read_dtypes = {name: "float32" for name in numcols}
    for name in encoded_cols:
        read_dtypes[name] = "category"

    sales_file = os.path.join(RAW_PATH, "sales_train_%s.csv" % horizon)
    dt = pd.read_csv(sales_file, usecols=catcols + numcols, dtype=read_dtypes)
    for name in encoded_cols:
        dt[name] = _zero_based_codes(dt[name])

    # --- 28-day relative growth per (dept_id, store_id) ---------------------
    # Rows are days, columns are (dept_id, store_id) groups.
    totals_by_day = dt.groupby(["dept_id", "store_id"])[numcols].sum().T
    totals_28_days_before = totals_by_day.shift(28)
    growth = (totals_by_day - totals_28_days_before) / totals_28_days_before
    growth = growth.reset_index(drop=True).iloc[-365:, :]
    # Only growth values with magnitude below 1 enter the mean.
    rates = growth[growth.abs() < 1].mean() + 1
    rates = rates.reset_index().rename(columns={0: "rate"})

    # --- placeholder columns for the days after tr_last ---------------------
    first_new_day = tr_last + 1
    for day in range(first_new_day, first_new_day + 2 * 28):
        dt[f"d_{day}"] = np.nan

    # --- wide -> long, then join side tables --------------------------------
    day_columns = [name for name in dt.columns if name.startswith("d_")]
    dt = pd.melt(
        dt,
        id_vars=catcols,
        value_vars=day_columns,
        var_name="d",
        value_name="sales",
    )

    dt = dt.merge(cal, on="d", copy=False)
    dt = dt.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)
    dt = dt.merge(rates, how="left")

    return dt

def compute_share(dt):
    """Replace ``sales`` by its share of the (dept_id, store_id, date) total.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least ``dept_id``, ``store_id``, ``date`` and ``sales``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where ``sales`` has been divided by
        the summed ``sales`` of its (dept_id, store_id, date) group.
    """
    group_keys = ["dept_id", "store_id", "date"]

    # One row per group holding the group's total sales.
    group_totals = dt.groupby(group_keys)["sales"].sum()
    group_totals = group_totals.reset_index().rename(columns={"sales": "gp_sales"})

    # Attach the total to every row (joined on the shared key columns),
    # normalise, then discard the helper column.
    with_totals = dt.merge(group_totals, how="left")
    with_totals["sales"] = with_totals["sales"] / with_totals["gp_sales"]
    return with_totals.drop(columns=["gp_sales"])

def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` (modified in place).

    Columns added:

    * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 / 28 rows within each ``id``.
    * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows (7 or 28) of the
      corresponding lag column, again within each ``id``.
    * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
      int16 when the column already exists, otherwise derived from ``date``.

    The shifts and rolling windows operate on row order, so rows are assumed
    to be in chronological order within each ``id``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with ``id``, ``sales`` and a datetime ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``dt``, with the feature columns added.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # Feature name -> attribute of the ``.dt`` accessor used to derive it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
            continue

        date_accessor = dt["date"].dt
        if date_feat_func == "weekofyear" and hasattr(date_accessor, "isocalendar"):
            # ``Series.dt.weekofyear`` no longer exists in pandas 2.x;
            # ``isocalendar().week`` yields the same ISO week number.
            values = date_accessor.isocalendar().week
        else:
            values = getattr(date_accessor, date_feat_func)
        dt[date_feat_name] = values.astype("int16")
    return dt

def train_and_pred(horizon="validation"):
    """Train one softmax MLP per (store_id, dept_id) and save the predictions.

    For every (store_id, dept_id) group the per-item ``rmean_28_28`` features
    plus the ``wday`` / ``quarter`` categoricals are fed to a model built by
    ``create_mlp_softmax``; the targets are the per-item ``sales`` shares.
    Rows dated before ``fday`` are used for training, rows from ``fday`` on
    are predicted. All predictions are concatenated and written to
    ``tf_weights_<horizon>.csv`` under ``EXTERNAL_PATH``.

    Parameters
    ----------
    horizon : str
        ``"validation"`` (training data up to d_1913) or ``"evaluation"``
        (training data up to d_1941, forecast start 28 days later).

    Raises
    ------
    ValueError
        If ``horizon`` is neither ``"validation"`` nor ``"evaluation"``.
    """
    if horizon == "validation":
        tr_last = 1913
        fday = datetime(2016, 4, 25)
    elif horizon == "evaluation":
        tr_last = 1941
        fday = datetime(2016, 4, 25) + timedelta(days=28)
    else:
        raise ValueError('Wrong horizon arg.')

    dataframe = create_dt(horizon, tr_last)
    dataframe = compute_share(dataframe)
    dataframe = create_fea(dataframe)

    # Discard incomplete rows in the training period only. The rows of the
    # forecast period have NaN ``sales`` by construction (create_dt fills the
    # appended day columns with NaN), so an unconditional dropna() would remove
    # every row with date >= fday and leave nothing to predict on.
    is_complete = dataframe.notna().all(axis=1)
    in_forecast_period = dataframe['date'] >= fday
    dataframe = dataframe[is_complete | in_forecast_period]

    cat_feats = ['wday', 'quarter']
    list_preds = list()

    for _, df_gp in dataframe.groupby(['store_id', 'dept_id']):

        n_items = len(df_gp['item_id'].drop_duplicates())

        # Ids ordered by item_id, i.e. in the same order as the pivoted
        # per-item columns below.
        ids = df_gp[['id', 'item_id']].drop_duplicates()\
                                      .sort_values('item_id')['id']\
                                      .tolist()

        # One row per day, one column per (value, item_id) pair.
        X = df_gp[
            ['d',
             'item_id',
             'wday',
             'quarter',
             'date',
             'rmean_28_28',
             'sales',
             ]
        ].pivot_table(index=['d', 'date', 'wday', 'quarter'],
                      columns=['item_id'],
                      values=['rmean_28_28', 'sales']).fillna(0)

        num_feats = ['_'.join(list(map(str, c))) for c in X.columns.tolist()]
        X.columns = num_feats

        # The first n_items columns are the rmean_28_28 inputs, the remaining
        # ones are the sales targets.
        target_feats = num_feats[n_items:]
        num_feats = num_feats[:n_items]
        X = X.reset_index()

        is_train = X['date'] < fday
        X_train = X[is_train][num_feats + cat_feats]
        X_test = X[~is_train][num_feats + cat_feats]

        input_dict_train = {'input_%s' % c: X_train[c] for c in num_feats + cat_feats}
        input_dict_test = {'input_%s' % c: X_test[c] for c in num_feats + cat_feats}

        cardinality = X[cat_feats].nunique()

        y_train = X[is_train][target_feats].values

        mlp = create_mlp_softmax(layers_list=[2048, 2048],
                                 output_count=n_items,
                                 cat_feats=cat_feats,
                                 cardinality=cardinality,
                                 num_feats=num_feats)

        training_params = {
            'x': input_dict_train,
            'y': y_train,
            'batch_size': 128,
            'epochs': 20,
            'shuffle': True,
        }

        mlp.fit(**training_params)
        preds = mlp.predict(input_dict_test)
        # Rows become ids, columns the 28 forecast days F1..F28.
        preds = pd.DataFrame(preds,
                             index=['F%s' % c for c in range(1, 29)],
                             columns=ids).T
        list_preds.append(preds)

    preds = pd.concat(list_preds)
    preds = preds.reset_index()
    preds.columns = ['id'] + preds.columns.tolist()[1:]

    preds.to_csv(
        os.path.join(EXTERNAL_PATH, "tf_weights_%s.csv" % horizon), index=False
    )

In [2]:
horizon = "validation"

# Only the two known horizons are accepted.
if horizon not in ("validation", "evaluation"):
    raise ValueError('Wrong horizon arg.')

# The evaluation horizon uses 28 more training days and starts 28 days later.
tr_last = 1941 if horizon == "evaluation" else 1913
fday = datetime(2016, 4, 25) + timedelta(days=28 if horizon == "evaluation" else 0)

# Load the data, convert sales to group shares, then add the features.
dataframe = create_fea(compute_share(create_dt(horizon, tr_last)))

In [3]:
import tensorflow.keras as tfk 
class CustomCallback(tfk.callbacks.Callback):
    """Keras callback that prints the loss every ``print_n_epochs`` epochs.

    When the logs also contain ``val_root_mean_squared_error`` it is printed
    next to the loss; otherwise only the loss is shown.
    """

    def __init__(self, print_n_epochs=10):
        """Store the reporting period (in epochs)."""
        super().__init__()
        self.print_n_epochs = print_n_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Print the metrics of ``epoch`` when it is a multiple of the period."""
        if epoch % self.print_n_epochs != 0:
            return

        logs = logs or {}
        # Look the optional validation metric up explicitly instead of relying
        # on a bare ``except`` (which would also swallow KeyboardInterrupt).
        val_rmse = logs.get('val_root_mean_squared_error')
        if val_rmse is None:
            print(f"E: {epoch} loss : {logs['loss']:.2f}")
        else:
            print(f"E: {epoch} loss : {logs['loss']:.2f}, vrmse :{val_rmse:.2f}")


In [4]:
import gc
import warnings
list_preds = list()
gc.collect()
warnings.filterwarnings('ignore')  # TF floods the output with warnings

# Loop invariants: categorical inputs and the start of the training window
# (only the 364 days before the first forecast day are used for training).
cat_feats = ['wday', 'quarter']
train_window_start = fday - timedelta(days=364)

# One model per (store_id, dept_id) group.
for _, df_gp in dataframe.groupby(['store_id', 'dept_id']):

    n_items = len(df_gp['item_id'].drop_duplicates())
    print(f'{n_items} in the hierachie')

    # Ids ordered by item_id, i.e. in the same order as the pivoted per-item
    # columns below.
    ids = df_gp[['id', 'item_id']].drop_duplicates()\
                                  .sort_values('item_id')['id']\
                                  .tolist()

    # One row per day, one column per (value, item_id) pair.
    X = df_gp[
        ['d', 'item_id', 'wday', 'quarter', 'date', 'rmean_28_28', 'sales']
    ].pivot_table(index=['d', 'date', 'wday', 'quarter'],
                  columns=['item_id'],
                  values=['rmean_28_28', 'sales']).fillna(0)

    num_feats = ['_'.join(list(map(str, c))) for c in X.columns.tolist()]
    X.columns = num_feats

    # The first n_items columns are the rmean_28_28 inputs, the remaining ones
    # are the sales targets.
    target_feats = num_feats[n_items:]
    num_feats = num_feats[:n_items]
    X = X.reset_index()

    # Same row selection for the training inputs and the training targets.
    train_rows = (X['date'] < fday) & (X['date'] >= train_window_start)
    test_rows = X['date'] >= fday

    X_train = X[train_rows][num_feats + cat_feats]
    X_test = X[test_rows][num_feats + cat_feats]

    input_dict_train = {'input_%s' % c: X_train[c] for c in num_feats + cat_feats}
    input_dict_test = {'input_%s' % c: X_test[c] for c in num_feats + cat_feats}

    cardinality = X[cat_feats].nunique()

    y_train = X[train_rows][target_feats].values

    mlp = create_mlp_softmax(layers_list=[8, 16, ],
                             emb_dim=3,
                             output_count=n_items,
                             cat_feats=cat_feats,
                             cardinality=cardinality,
                             num_feats=num_feats)
    print(f"{mlp.count_params()} parameters")

    training_params = {
        'x': input_dict_train,
        'y': y_train,
        'batch_size': 32,
        'epochs': 300,
        'callbacks': [CustomCallback(49)],
        'shuffle': True,
        'verbose': 0
    }

    mlp.fit(**training_params)
    preds = mlp.predict(input_dict_test)
    # Rows become ids, columns the 28 forecast days F1..F28.
    preds = pd.DataFrame(preds,
                         index=['F%s' % c for c in range(1, 29)],
                         columns=ids).T
    list_preds.append(preds)

    # Free the model before building the next one. Keras keeps global state
    # for every model created, which ``del`` + ``gc.collect()`` alone does not
    # release, so the session is cleared as well (``tfk`` is imported in the
    # CustomCallback cell).
    del mlp
    tfk.backend.clear_session()
    gc.collect()

preds = pd.concat(list_preds)

416 in the hierachie
17514 parameters
E: 0 loss : 6.03
E: 49 loss : 5.29
E: 98 loss : 5.28
E: 147 loss : 5.27
E: 196 loss : 5.27
149 in the hierachie
6567 parameters
E: 0 loss : 5.04
E: 49 loss : 4.58
E: 98 loss : 4.53
E: 147 loss : 4.52
E: 196 loss : 4.49
532 in the hierachie
22270 parameters
E: 0 loss : 6.26
E: 49 loss : 5.77
E: 98 loss : 5.76
E: 147 loss : 5.76
E: 196 loss : 5.75
515 in the hierachie
21573 parameters
E: 0 loss : 6.24
E: 49 loss : 5.79
E: 98 loss : 5.78
E: 147 loss : 5.77
E: 196 loss : 5.76
216 in the hierachie
9314 parameters
E: 0 loss : 5.39
E: 49 loss : 4.85
E: 98 loss : 4.84
E: 147 loss : 4.83
E: 196 loss : 4.82
398 in the hierachie
16776 parameters
E: 0 loss : 5.98
E: 49 loss : 5.41
E: 98 loss : 5.40
E: 147 loss : 5.39
E: 196 loss : 5.38
823 in the hierachie
34201 parameters
E: 0 loss : 6.69
E: 49 loss : 5.94
E: 98 loss : 5.93
E: 147 loss : 5.92
E: 196 loss : 5.91
416 in the hierachie
17514 parameters
E: 0 loss : 6.02
E: 49 loss : 5.39
E: 98 loss : 5.38
E: 147 l

E: 147 loss : 5.27
E: 196 loss : 5.26
149 in the hierachie
6567 parameters
E: 0 loss : 5.05
E: 49 loss : 4.59
E: 98 loss : 4.52
E: 147 loss : 4.49
E: 196 loss : 4.47
532 in the hierachie
22270 parameters
E: 0 loss : 6.26
E: 49 loss : 5.58
E: 98 loss : 5.56
E: 147 loss : 5.56
E: 196 loss : 5.55
515 in the hierachie
21573 parameters
E: 0 loss : 6.23
E: 49 loss : 5.67
E: 98 loss : 5.65
E: 147 loss : 5.63
E: 196 loss : 5.63
216 in the hierachie
9314 parameters
E: 0 loss : 5.38
E: 49 loss : 4.58
E: 98 loss : 4.55
E: 147 loss : 4.55
E: 196 loss : 4.53
398 in the hierachie
16776 parameters
E: 0 loss : 5.97
E: 49 loss : 5.38
E: 98 loss : 5.35
E: 147 loss : 5.34
E: 196 loss : 5.34
823 in the hierachie
34201 parameters
E: 0 loss : 6.70
E: 49 loss : 5.69
E: 98 loss : 5.66
E: 147 loss : 5.65
E: 196 loss : 5.64


In [5]:
# Stack the per-group predictions and expose the row labels as an 'id' column.
preds = pd.concat(list_preds).rename_axis('id').reset_index()

# Write the result next to the other external inputs, without the row index.
preds.to_csv(os.path.join(EXTERNAL_PATH, "tf_weights_%s.csv" % horizon), index=False)

In [None]:
# Run the validation scoring script in a shell subprocess.
# NOTE(review): presumably it reads the tf_weights_validation.csv written by
# the previous cell — confirm against score_submission_validation.py.
!python score_submission_validation.py