In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import tqdm
import re
from sklearn.multioutput import MultiOutputRegressor
import seaborn as sns
import matplotlib.pyplot as plt
import re

from warnings import simplefilter
simplefilter('ignore')

import matplotlib.pyplot as plt
import lightgbm as lgb

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

import seaborn as sns; sns.set()
import gc

from sklearn import preprocessing

from typing import Union
from tqdm.notebook import tqdm_notebook as tqdm

In [2]:
# Store identifiers iterated by the per-store training loop, one row per state.
groups = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]

In [3]:
# Disabled experiment, kept as a bare string literal so none of it executes; the
# cell merely echoes the string. If enabled it would read ../data/weight_df.csv
# and build a dict keyed by '<item_id>_<store_id>_evaluation' holding each row's weight.
# NOTE(review): `weight_dict` is not referenced by any other visible cell.
'''
weight_df = pd.read_csv('../data/weight_df.csv')
weight_df.rename({'0': 'weight'}, axis=1, inplace=True)
weight_dict = {
    f'{item_id}_{store_id}_evaluation': weight for item_id, store_id, weight in weight_df.values
}
weight_dict
'''

"\nweight_df = pd.read_csv('../data/weight_df.csv')\nweight_df.rename({'0': 'weight'}, axis=1, inplace=True)\nweight_dict = {\n    f'{item_id}_{store_id}_evaluation': weight for item_id, store_id, weight in weight_df.values\n}\nweight_dict\n"

In [4]:
class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over the twelve aggregation levels in ``group_ids``.

    On construction the evaluator aggregates the training and validation
    sales at every level and stores, for each 1-based level ``k``:

    * ``lv{k}_train_df`` / ``lv{k}_valid_df`` - summed ``d_*`` columns per group,
    * ``lv{k}_scale`` - per-group scaling denominators (see ``__init__``),
    * ``lv{k}_weight`` - per-group weights normalised to sum to one.

    ``score`` then compares predictions against the stored validation sums.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, prices: pd.DataFrame):
        """Pre-compute per-level aggregates, scales and weights.

        Parameters
        ----------
        train_df : pd.DataFrame
            Wide sales frame. Columns whose name starts with ``d_`` are the
            daily series; every other column is treated as an identifier.
            The identifier columns named in ``group_ids`` (``state_id``,
            ``store_id``, ``cat_id``, ``dept_id``, ``item_id``) are required.
        valid_df : pd.DataFrame
            Frame holding the ``d_*`` columns to score against. If it lacks
            any identifier column, the identifiers of ``train_df`` are
            attached column-wise (aligned on the index).
        calendar : pd.DataFrame
            Needs columns ``d`` and ``wm_yr_wk``.
        prices : pd.DataFrame
            Needs columns ``item_id``, ``store_id``, ``wm_yr_wk`` and
            ``sell_price``.
        """
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        # Weights are computed from (at most) the last 28 training days.
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        # Constant key used for the lv1 (grand total) aggregation. ``assign``
        # returns a new frame, so the caller's DataFrame is left untouched
        # instead of silently gaining an ``all_id`` column.
        train_df = train_df.assign(all_id='all')

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')]\
                     .columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')]\
                               .columns.tolist()

        if not all(c in valid_df.columns for c in id_columns):
            valid_df = pd.concat([train_df[id_columns], valid_df], 
                                 axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Drop the leading run of zeros, then take the mean squared
                # day-over-day difference. An all-zero row keeps its full
                # length (argmax of all-False is 0) and yields a scale of 0.
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)\
                    [valid_target_columns].sum())

            # Group weight = share of the summed weight columns at this level.
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return the identifier columns plus sales * sell_price per weight day.

        Each (item_id, store_id, day) value in ``weight_columns`` is multiplied
        by the ``sell_price`` of the week that ``calendar`` maps the day to.
        The result has one row per row of ``train_df``, in the same order.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns]\
                    .set_index(['item_id', 'store_id'])
        # Long format: one row per (item, store, day).
        weight_df = weight_df.stack().reset_index()\
                   .rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left',
                                    on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Back to wide format, re-ordered to match train_df row by row. The
        # key pairs are materialised into a list so the indexer does not
        # depend on how .loc treats a one-shot iterator.
        row_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd'])\
                    .unstack(level=2)['value']\
                    .loc[row_keys, :]\
                    .reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns],
                               weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the per-group RMSSE at level ``lv`` (1-based).

        ``valid_preds`` must already be aggregated to that level; it is
        subtracted from ``lv{lv}_valid_df`` with pandas index/column alignment.
        The mean squared error per group is divided by ``lv{lv}_scale``
        positionally (the scale is a plain array) before the square root.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt) 

    def score(self, valid_preds: Union[pd.DataFrame, 
                                       np.ndarray]) -> float:
        """Return the mean over all levels of the weighted per-level RMSSE.

        Parameters
        ----------
        valid_preds : pd.DataFrame or np.ndarray
            Predictions with the same shape as the validation target columns.
            An array is labelled with ``valid_target_columns``; a DataFrame is
            joined to the identifier columns on its index, so its index must
            line up with ``valid_df``.

        Side effects: stores ``lv{k}_valid_preds`` and ``lv{k}_scores`` for
        every level and the list of per-level totals in ``all_scores``.

        Raises
        ------
        AssertionError
            If the shape of ``valid_preds`` does not match the targets.
        """
        assert self.valid_df[self.valid_target_columns].shape \
               == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, 
                                       columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], 
                                 valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):

            valid_preds_grp = valid_preds.groupby(group_id)[self.valid_target_columns].sum()
            setattr(self, f'lv{i + 1}_valid_preds', valid_preds_grp)
            
            lv_scores = self.rmsse(valid_preds_grp, i + 1)
            setattr(self, f'lv{i + 1}_scores', lv_scores)
            
            # Weighted sum of the group scores at this level.
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, 
                                  sort=False).prod(axis=1)
            
            all_scores.append(lv_scores.sum())
            
        self.all_scores = all_scores

        return np.mean(all_scores)

In [5]:
def get_wrmsse_score(train_df, pred_df, split_day, last_day=1941, horizon=28):
    """Score ``horizon``-day forecasts made at ``split_day`` with WRMSSEEvaluator.

    Parameters
    ----------
    train_df : pd.DataFrame
        Wide sales frame, sliced positionally: the last ``last_day - split_day``
        columns are treated as the days after ``split_day``, so the daily
        columns must be the trailing columns and end at day ``last_day``.
    pred_df : pd.DataFrame
        Forecasts with an ``id`` column and columns ``F1`` .. ``F{horizon}``.
        The frame is not modified.
    split_day : int
        Last day of the training fold; ``F{i}`` is scored as day ``split_day + i``.
    last_day : int, default 1941
        Day number of the final column of ``train_df``.
    horizon : int, default 28
        Number of forecast days.

    Returns
    -------
    float
        The value returned by ``WRMSSEEvaluator.score``.

    Raises
    ------
    ValueError
        If fewer than ``horizon`` days follow ``split_day``.

    Notes
    -----
    Reads the notebook-level ``submission``, ``calendar`` and ``prices``
    frames. Predictions are put into the row order of ``submission`` and then
    matched to ``train_df`` by position, not by id.
    """
    delta = last_day - split_day
    if delta < horizon:
        raise ValueError(
            f'split_day={split_day} leaves {delta} day(s) before day {last_day}; '
            f'at least {horizon} are needed'
        )

    train_fold_df = train_df.iloc[:, :-delta]
    # First `horizon` columns after the split. Slicing in two steps avoids the
    # "-delta + horizon == 0" end index that would give an empty slice when
    # the validation window is exactly the tail of the frame.
    valid_fold_df = train_df.iloc[:, -delta:].iloc[:, :horizon]

    # Non-inplace rename: the caller's prediction frame keeps its F-columns.
    pred_df = pred_df.rename(
        columns={f'F{i}': f'd_{split_day + i}' for i in range(1, horizon + 1)}
    )

    pred_df = submission[['id']].merge(pred_df, on='id')

    evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
    day_columns = [col for col in pred_df.columns if re.match(r'd_\d{1,4}', col)]
    return evaluator.score(pred_df[day_columns])

In [6]:
# Raw inputs, read in this order: wide sales history, calendar, weekly sell
# prices and the sample submission (used for its `id` column when scoring).
train_data, calendar, prices, submission = map(
    pd.read_csv,
    (
        '../data/sales_train_evaluation.csv',
        '../data/calendar.csv',
        '../data/sell_prices.csv',
        '../data/sample_submission.csv',
    ),
)

In [7]:
# Earlier parameter set, kept as an inert string literal for reference only.
'''
lgb_params = {
    #"boosting_type": "goss",
    "n_estimators": 500,
    "boosting_type": "gbdt",
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    "learning_rate": 0.01,
    #"num_leaves": 2 ** 5 - 1,
    #"min_data_in_leaf": 2 ** 12 - 1,
    "feature_fraction": 0.5,
    #"max_bin": 100,
    "boost_from_average": False,
    #"num_boost_round": 1400,
    "verbose": -1,
    #"num_threads": os.cpu_count(),
    "force_row_wise": True,
    "seed": 42
}
'''

# Active settings, unpacked into lgb.LGBMRegressor by the training loop.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # subsample=0.5,
    # subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2 ** 11 - 1,
    min_data_in_leaf=2 ** 12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    boost_from_average=False,
    num_boost_round=1400,
    verbose=-1,
    num_threads=os.cpu_count(),
    force_row_wise=True,
)

In [8]:
def make_multistep_target(df, steps):
    """Build one lead column of ``sales`` per forecast step.

    Parameters
    ----------
    df : pd.DataFrame
        Long frame with an ``id`` column and a ``sales`` column. Rows of the
        same ``id`` are shifted in their existing order, so they are expected
        to already be in chronological order.
    steps : int
        Number of steps ahead to generate.

    Returns
    -------
    pd.DataFrame
        Columns ``F1`` .. ``F{steps}`` on the index of ``df``; ``F{i}`` holds
        the ``sales`` value ``i`` rows later within the same ``id`` and is NaN
        where no such row exists.
    """
    # Group once and reuse it for every step instead of regrouping per column.
    sales_by_id = df.groupby('id').sales
    return pd.DataFrame(
        {f'F{i}': sales_by_id.shift(-i) for i in range(1, steps + 1)}
    )

In [12]:
%%time
all_subs = pd.DataFrame()
for store_id in groups:
    feature_df = pd.read_feather(f'../data/store_data/grid_full_store_{store_id}_1941_to_1948.feather')
    store_train_data = train_data[train_data.store_id==store_id]
    
    target_df = make_multistep_target(feature_df, 28)
    
    full_df = pd.concat([feature_df, target_df], axis=1)
    
    train_df = full_df[full_df['d']<=1913]
    
    X_columns = train_df.columns.drop(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])\
                    .drop(target_df.columns).drop(['event_name_1','event_type_1','event_name_2','event_type_2'])
    
    X = train_df[X_columns].fillna(0)
    y = train_df[target_df.columns]
    
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    #model = LinearRegression()
    #model.fit(X, y, sample_weight=sample_weights.values)
    
    model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
    model.fit(X, y)

    #y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
    #y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
    
    #train_rmse = mean_squared_error(y_train, y_fit, squared=False)
    #test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    #print((f"Train RMSE: {train_rmse:.2f}\n" f"Test RMSE: {test_rmse:.2f}"))
    
    y_fore = pd.DataFrame(model.predict(full_df[full_df.d==1941][X_columns]), columns=target_df.columns)
    y_fore = y_fore.clip(0)
    y_fore['id'] = full_df[full_df.d==1941].reset_index().id
    
    y_valid = pd.DataFrame(model.predict(full_df[full_df.d==1913][X_columns]), columns=target_df.columns)
    y_valid = y_valid.clip(0)
    y_valid['id']  = full_df[full_df.d==1941].reset_index().id
    y_valid['id'] = y_valid['id'].apply(lambda x: x.replace('_evaluation', '_validation'))
    
    all_sub = pd.concat([y_valid, y_fore], axis=0)
    all_sub.to_csv(f'../result/cat_and_store/lgb_{store_id}_submission.csv', index=False)
    
    all_subs = pd.concat([all_sub, all_subs], axis=0)
    
    wrmsse_score = get_wrmsse_score(store_train_data.reset_index(), y_valid, 1913)
    print(f"WRMSSE: {wrmsse_score:.5f}")
    
    print(f'{store_id} done')



  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.36786
CA_1 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.41210
CA_2 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.37316
CA_3 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.49820
CA_4 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.46647
TX_1 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.39868
TX_2 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.51876
TX_3 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.38263
WI_1 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.55377
WI_2 done


  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.41407
WI_3 done
CPU times: user 11d 20h 31min 21s, sys: 5d 11h 15min 43s, total: 17d 7h 47min 4s
Wall time: 5d 20h 47min 21s


In [13]:
# Persist the stacked validation + evaluation forecasts of every store.
# (Plain string: the path has no placeholders, so the f-prefix was superfluous.)
all_subs.to_csv('../result/lgb_store_submission.csv', index=False)

In [14]:
# Flag validation rows (id contains '_validation') with 1, everything else with 0,
# then score only the flagged rows across all stores at once.
all_subs['is_valid'] = all_subs['id'].apply(lambda sub_id: int('_validation' in sub_id))
validation_rows = all_subs['is_valid'] == 1
pred_data = all_subs.loc[validation_rows].reset_index()

wrmsse_score = get_wrmsse_score(train_data, pred_data, 1913)
print(f"WRMSSE: {wrmsse_score:.5f}")

  0%|          | 0/12 [00:00<?, ?it/s]

WRMSSE: 0.39572


In [11]:
# NOTE(review): `sample_weights` is not assigned in any cell visible in this
# notebook; its only other appearance is the commented-out LinearRegression fit
# inside the per-store training loop. This cell's execution count (11) is also
# out of sequence with its neighbours, so on a fresh kernel this line presumably
# raises NameError - confirm, then either define `sample_weights` or drop the cell.
sample_weights.values

array([6.00131256e-05, 2.11836142e-06, 1.26781530e-05, ...,
       7.76376791e-07, 1.40868366e-06, 7.76376791e-07])

In [31]:
# Mean importance of every feature across the fitted sub-estimators of `model`,
# highest first. `model` and `X_columns` are whatever is currently bound in the
# kernel - the last store processed if this runs right after the training loop.
feature_importance = (
    pd.Series(
        np.stack([est.feature_importances_ for est in model.estimators_]).mean(axis=0),
        index=X_columns,
    )
    .sort_values(ascending=False)
)

In [32]:
# Top 20 entries of the already descending-sorted importance series.
feature_importance.iloc[:20]

enc_item_id_std     89695.071429
d                   85629.785714
enc_item_id_mean    83978.642857
tm_w                82800.142857
tm_d                64432.357143
release             61290.535714
rolling_std_180     55818.928571
rolling_mean_180    53205.285714
rolling_std_60      46744.178571
price_momentum_m    45408.714286
rolling_mean_60     45042.107143
price_std           43160.785714
item_nunique        42746.821429
rolling_std_30      42138.500000
price_max_cent      37710.714286
price_min_cent      37326.000000
rolling_std_14      34405.285714
rolling_mean_30     34343.000000
moon                34121.892857
price_min           33872.892857
dtype: float64