In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit,StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold,train_test_split,GroupShuffleSplit,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error,log_loss,confusion_matrix
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
#from bayes_opt import BayesianOptimization
import re
from string import punctuation
from scipy.spatial import Voronoi
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
from tqdm.notebook import tqdm
from numba import jit
from collections import Counter
import json
import joblib
import multiprocessing
import time
from scipy.sparse import csr_matrix
import gc

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast int32/int64/float64 columns of `df` in place to smaller dtypes.

    Integer columns become int16 or int32 when their observed min/max fit
    (bounds inclusive); float64 columns become float32 when their min/max lie
    inside the float32 range. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place and also returned.
    verbose : bool
        When true, print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    numerics = ['int32', 'int64', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest candidate first. Bounds are inclusive so that a column
            # touching a dtype limit (e.g. max == 32767) is still downcast.
            for candidate in (np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN min/max (empty or all-NaN column) fail both comparisons and
            # the column stays float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Read the four raw competition tables in one pass.
sample_submission, sell_prices, sales_train, calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/sample_submission.csv',
        '../data/sell_prices.csv',
        '../data/sales_train_validation.csv',
        '../data/calendar.csv',
    )
)

In [4]:
# Wide (one column per day) -> long (one row per id and day).
sales_train_long_format = sales_train.melt(
    id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],
    var_name='day_num',
    value_name='sale',
)

In [5]:
def transform_day_to_num(str1):
    """Return the integer that follows the first two characters of `str1` (e.g. 'd_1913' -> 1913)."""
    day_digits = str1[2:]
    return int(day_digits)
# Turn the day labels into integers on both tables so they share a join key.
sales_train_long_format['day_num'] = sales_train_long_format['day_num'].map(transform_day_to_num)
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['day_num'] = calendar['d'].map(transform_day_to_num)

# Lookup Series: day number -> calendar date.
map_day_date = calendar.set_index('day_num')['date']
sales_train_long_format['date'] = sales_train_long_format['day_num'].map(map_day_date)

# Calendar columns carried over to the sales rows (plus the join key).
list1 = [
    'wm_yr_wk',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'day_num',
]
sales_train_long_format = pd.merge(sales_train_long_format, calendar[list1], on='day_num', how='left')

In [6]:
# Left join keeps every sales row; rows without a matching price get NaN in the price columns.
sales_train_long_format = pd.merge(
    sales_train_long_format,
    sell_prices,
    on=['store_id','item_id','wm_yr_wk'],
    how='left',
)
#sales_train_long_format = reduce_mem_usage(sales_train_long_format)

In [29]:
# `sales_train_long_format` must stay alive here: the "fast metric" cell right
# below slices it into `data_for_weight`. Deleting it at this point (this cell
# was originally executed out of order, as In [29]) makes a top-to-bottom run
# fail with NameError. It can be deleted once `data_for_weight` has been built.
gc.collect()

27

### metric
#### fast metric

In [15]:
# Rows for days 1886..1913 (both ends inclusive) are the basis of the sales weights.
weight_window_rows = sales_train_long_format['day_num'].between(1886, 1913)
data_for_weight = sales_train_long_format.loc[weight_window_rows].copy()
# Units sold times sell price, stored under the (misspelt) key `sale_used` that get_w reads.
data_for_weight['sale_used'] = data_for_weight['sale'].mul(data_for_weight['sell_price'])
sales = pd.read_csv('../data/sales_train_validation.csv')

In [16]:
# Grouping keys for aggregation levels 1..11 (level 0, "all", is added separately).
dummies_list = [
    sales.state_id,
    sales.store_id,
    sales.cat_id,
    sales.dept_id,
    sales.state_id + sales.cat_id,
    sales.state_id + sales.dept_id,
    sales.store_id + sales.cat_id,
    sales.store_id + sales.dept_id,
    sales.item_id,
    sales.state_id + sales.item_id,
    sales.id,
]

# Level 0: a single row of ones that sums every series.
all_sales_row = pd.DataFrame(
    np.ones(sales.shape[0]).astype(np.int8),
    index=sales.index,
    columns=['all'],
).T

# One transposed indicator frame per grouping key (rows = groups, columns = series).
dummies_df_list = [all_sales_row] + [
    pd.get_dummies(group_key, drop_first=False).astype(np.int8).T
    for group_key in dummies_list
]

# Stack all levels; the concat keys become the `level` part of the index.
roll_mat_df = pd.concat(
    dummies_df_list,
    keys=list(range(12)),
    names=['level','id'],
).astype(np.int8, copy=False)

# Keep the index for reference and the values as a sparse matrix.
roll_index = roll_mat_df.index
roll_mat_csr = csr_matrix(roll_mat_df.values)
roll_mat_csr.shape

(42840, 30490)

In [17]:
# Function to calculate S weights:
def get_s(drop_days=0):
    """Return the per-series scale S: mean squared day-to-day difference.

    Uses the module-level `sales` frame (columns 'd_1'..'d_1913') and the
    roll-up matrix `roll_mat_csr`.

    Parameters
    ----------
    drop_days : int
        Number of trailing days to leave out. 0 (default) uses all 1913 days;
        28 means the last 28 days won't be used in calculating S.

    Returns
    -------
    numpy.ndarray
        One value per row of `roll_mat_csr`.
    """
    n_days = 1913 - drop_days

    # Rollup sales:
    d_name = ['d_' + str(i + 1) for i in range(n_days)]
    sales_train_val = roll_mat_csr * sales[d_name].values

    # Index of the first day with a positive value (0 when a row has none).
    start_no = np.argmax(sales_train_val > 0, axis=1)

    # Mask the days before the first sale. An integer comparison replaces the
    # former dense diag-matrix product: it is far cheaper, and it is exact --
    # the float form (1/k)*k can round below 1 (e.g. k == 49) and then also
    # masked the first sale day itself.
    flag = np.arange(n_days)[None, :] < start_no[:, None]
    sales_train_val = np.where(flag, np.nan, sales_train_val)

    # Denominator of RMSSE. The count of differences follows the number of
    # days actually used, so it stays correct when drop_days != 0.
    weight1 = np.nansum(np.diff(sales_train_val, axis=1) ** 2, axis=1) / (n_days - start_no - 1)

    return weight1

In [18]:
# Scale per aggregated series, computed over the full history (no days dropped).
S = get_s(0)
S.shape

(42840,)

In [19]:
def get_w(sale_usd):
    """Return normalised sales weights for every row of `roll_mat_csr`.

    `sale_usd` needs the columns 'id' and 'sale_used'. Values are summed per
    id in order of first appearance (sort=False), rolled up with the
    module-level `roll_mat_csr`, and divided by their grand total.
    """
    # Total per id, kept in first-appearance order.
    per_id_totals = sale_usd.groupby(['id'],sort=False)['sale_used'].apply(np.sum)
    total_sales_usd = per_id_totals.values

    # Roll the per-id totals up to every aggregation level.
    weight2 = roll_mat_csr * total_sales_usd

    grand_total = np.sum(weight2)
    return weight2/grand_total

In [20]:
# Only the id and the value column are passed to get_w.
W = get_w(data_for_weight.loc[:, ['id','sale_used']])
W.shape

(42840,)

In [21]:
# Combined factor: weight divided by the square root of the scale.
SW = np.divide(W, np.sqrt(S))

In [22]:
# Function to do quick rollups:
def rollup(v):
    '''
    Aggregate bottom-level series with the module-level `roll_mat_csr`.

    v - np.array with one row per column of roll_mat_csr and n day columns
    returns - np.array with one row per row of roll_mat_csr and n day columns
    '''
    return roll_mat_csr.dot(v)


# Function to calculate WRMSSE:
def wrmsse(preds, y_true, score_only=False, s = S, w = W, sw=SW):
    '''
    preds - Predictions: pd.DataFrame, one row per bottom-level series, N day columns
    y_true - True values: pd.DataFrame of the same shape as preds
    score_only - if True return only the scalar score
    s - scale per rolled-up series (defaults to the module-level S)
    w - weight per rolled-up series (defaults to the module-level W)
    sw - w / sqrt(s) (defaults to the module-level SW)

    Returns the score when score_only is True, otherwise (score, score_matrix).
    '''
    # Squared errors after rolling the differences up to every level.
    squared_rollup = np.square(rollup(preds.values-y_true.values))

    if score_only:
        per_series_rmse = np.sqrt(np.mean(squared_rollup, axis=1))
        return np.sum(per_series_rmse * sw)

    score_matrix = (squared_rollup * np.square(w)[:, None]) / s[:, None]
    score = np.sum(np.sqrt(np.mean(score_matrix,axis=1)))
    return score, score_matrix

In [23]:
class WRMSSE_Evalator_super_version():
    """WRMSSE evaluation metric built from a roll-up matrix and per-series factors.

    sw           - array with one factor per row of `roll_mat_csr`
    roll_mat_csr - sparse matrix mapping bottom-level series (columns) to
                   aggregated series (rows)
    """
    def __init__(self,sw,roll_mat_csr):
        self.sw = sw
        self.roll_mat_csr = roll_mat_csr
    def rollup(self,v):
        '''
        v - np.array with one row per column of self.roll_mat_csr and n day columns
        returns - np.array with one row per row of self.roll_mat_csr and n day columns
        '''
        # Use the matrix handed to the constructor; the previous version read
        # the module-level `roll_mat_csr` and silently ignored this attribute.
        return self.roll_mat_csr.dot(v)
    def feval(self,y_true,y_pred):
        """
        y_true,y_pred: flat np.ndarray laid out day by day (all series of the
        first day, then all series of the second day, ...), which is what the
        column-major reshape below expects.

        Returns the tuple ('WRMSSE', score, False).
        """
        # Number of bottom-level series comes from the matrix (30490 for the
        # full data set) instead of a hard-coded constant.
        n_series = self.roll_mat_csr.shape[1]
        errors = y_pred.reshape(n_series,-1,order = 'F')-y_true.reshape(n_series,-1,order = 'F')
        per_series_rmse = np.sqrt(np.mean(np.square(self.rollup(errors)),axis=1))
        return 'WRMSSE',np.sum(per_series_rmse * self.sw),False

In [24]:
# Metric object used as the LightGBM evaluation function (arguments in constructor order: sw, roll_mat_csr).
evaluator_super = WRMSSE_Evalator_super_version(SW, roll_mat_csr)

#### my metric

### add features

In [25]:
# Load the base feature table and the part-4 feature table.
train1, train4 = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data_part1.pkl', 'data_part4.pkl')
)
#train5 = pd.read_pickle('data_part5.pkl')

In [26]:
# Print both shapes so the row counts can be compared by eye.
for loaded_frame in (train1, train4):
    print(loaded_frame.shape)
#print(train5.shape)

(60034810, 96)
(60034810, 98)


In [27]:
# Boolean mask over train4's columns: True where the name ends with one of the statistic suffixes.
cond4 = np.logical_or.reduce([
    train4.columns.str.endswith(suffix)
    for suffix in ('_max', '_min', '_mean', '_std', '_median')
])


In [37]:
# `train5` is not loaded in this part of the notebook (its read_pickle call is
# commented out in the loading cell), so building this mask raised NameError
# on a top-to-bottom run. Kept for reference, matching the commented-out
# train5 loop further down:
# cond5 = (train5.columns.str.endswith('_max'))|(train5.columns.str.endswith('_min'))|(train5.columns.str.endswith('_mean'))|(train5.columns.str.endswith('_std'))|(train5.columns.str.endswith('_median'))

In [28]:
# Copy every selected statistic column from train4 into train1, then release train4.
selected_train4_columns = train4.columns[cond4]
for feature_name in selected_train4_columns:
    print(feature_name)
    train1[feature_name] = train4[feature_name]
del train4
gc.collect()

state_id_rolling_7_sale_mean
state_id_rolling_7_sale_std
state_id_rolling_7_sale_median
state_id_rolling_7_sale_max
state_id_rolling_7_sale_min
state_id_rolling_14_sale_mean
state_id_rolling_14_sale_std
state_id_rolling_14_sale_median
state_id_rolling_14_sale_max
state_id_rolling_14_sale_min
state_id_rolling_28_sale_mean
state_id_rolling_28_sale_std
state_id_rolling_28_sale_median
state_id_rolling_28_sale_max
state_id_rolling_28_sale_min
store_id_rolling_7_sale_mean
store_id_rolling_7_sale_std
store_id_rolling_7_sale_median
store_id_rolling_7_sale_max
store_id_rolling_7_sale_min
store_id_rolling_14_sale_mean
store_id_rolling_14_sale_std
store_id_rolling_14_sale_median
store_id_rolling_14_sale_max
store_id_rolling_14_sale_min
store_id_rolling_28_sale_mean
store_id_rolling_28_sale_std
store_id_rolling_28_sale_median
store_id_rolling_28_sale_max
store_id_rolling_28_sale_min
cat_id_rolling_7_sale_mean
cat_id_rolling_7_sale_std
cat_id_rolling_7_sale_median
cat_id_rolling_7_sale_max
cat_id_r

156

In [None]:
# for _ in train5.columns[cond5]:
#     print(_)
#     train1[_] = train5[_]
#     train5.drop([_],axis=1)
# del train5
# gc.collect()

In [30]:
# Collapse the three per-state SNAP flags into one `snap` column holding the
# flag of the row's own state.
for state_code in ['CA','TX','WI']:
    state_rows = train1.state_id == state_code
    # `.values` assigns positionally, exactly like the former `list(...)`, but
    # without building a Python list with one element per selected row.
    train1.loc[state_rows, 'snap'] = train1.loc[state_rows, f'snap_{state_code}'].values

In [31]:
# Columns removed from the feature matrix before fitting.
drop_col = [
    'day_num', 'date', 'wm_yr_wk',
    'snap_CA', 'snap_TX', 'snap_WI',
    'sale',
]
# Columns that get label-encoded.
cat_col = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'id',
]

In [32]:
# Replace each categorical column with integer codes; values are cast to str
# first, so missing values are encoded as their string form.
for col_name in cat_col:
    lbl = preprocessing.LabelEncoder()
    text_values = train1[col_name].astype(str)
    train1[col_name] = lbl.fit_transform(text_values)
    print(col_name)

item_id
dept_id
cat_id
store_id
state_id
event_name_1
event_type_1
event_name_2
event_type_2
id


In [33]:
# Replace every missing value in train1 with the sentinel -999, in place.
# This also hits the `sale` column; the fold cells below map -999 back to 0
# in the targets.
train1.fillna(-999,inplace=True)

In [34]:
def func_custom_1_2(scalar):
    """Guarded inverse square root used by the custom objective.

    Returns 0.0 when `scalar` is exactly 0, 1e4 when it is at most 1e-8
    (negative values included), and scalar ** -0.5 otherwise.

    The two constant branches return floats on purpose: np.vectorize infers
    its output dtype from the first call, and integer results there made the
    whole output integer, truncating every later value.
    """
    if scalar == 0:
        return 0.0
    elif scalar <= 10**(-8):
        return 1e4
    else:
        return scalar ** (-1/2)

In [35]:
# Element-wise version of func_custom_1_2. The output dtype is fixed to
# long double: without `otypes`, np.vectorize takes the dtype from the first
# element's result (an integer for the constant branches, which truncated all
# later values) and fails on empty input. Long double matches the float128
# arrays that custom_obj passes in.
func1 = np.vectorize(func_custom_1_2, otypes=[np.longdouble])

In [36]:
def custom_obj(y_true, y_pred):
    """Custom objective: return (grad, hess), one value per input row.

    The residuals are reshaped to (d, 30490), rolled up with the module-level
    `roll_mat_csr`, scaled by 1/sqrt(S) and by `func1` of the per-series sum
    of squares, then mapped back to the bottom level.

    NOTE(review): the reshape assumes the rows come in consecutive blocks of
    30490 (presumably one block per day) -- confirm against how the training
    frame is ordered.
    NOTE(review): the weights `W` used by the WRMSSE metric do not appear
    here -- confirm that this is intended.
    NOTE(review): np.float128 does not exist on every platform.
    """
    residual = (y_true - y_pred).astype(np.float128)
    residual = residual.reshape((-1,30490)) 
    # d = number of 30490-row blocks.
    d = residual.shape[0]    
    # Residuals summed up to each row of roll_mat_csr, per block.
    residual_42840 = residual * roll_mat_csr.T
    # Despite the name this is a sum (not a mean) of squares over the blocks.
    mean_square_42840 = np.square(residual_42840).sum(axis = 0)
    # Guarded inverse square root of that sum.
    temp_1 = func1(mean_square_42840)
    part1 = ((1/np.sqrt(S)) * temp_1)/np.sqrt(d)
    part_all = np.multiply(part1,residual_42840) 
    # Map back to the bottom level and flatten to the input's row order.
    grad = -(part_all * roll_mat_csr).reshape((-1))
    hess_part1 = np.multiply(-(1/np.sqrt(S))/np.sqrt(d)*temp_1**3,np.square(residual_42840))
    hess_part2 = (1/np.sqrt(S))/np.sqrt(d)*temp_1
    hess = ((hess_part1 + hess_part2)*roll_mat_csr).reshape((-1))
    return grad,hess

In [37]:
def rmse(y_true, y_pred):
    """Return ('RMSE', root mean squared error, False) for use as an eval metric."""
    errors = (y_pred) - (y_true)
    root_mean_square = np.sqrt(np.mean(np.power(errors, 2)))
    return 'RMSE', root_mean_square, False

#### fold1

In [39]:
# Fold 1: train on days 1520..1885 without 2015-12-25, validate on days 1886..1913.
fold_train_rows = (train1.day_num>=1885-1-364)&(train1.day_num<=1885)&(train1.date!='2015-12-25')
fold_valid_rows = (train1.day_num>=1886)&(train1.day_num<=1913)

X_train = train1[fold_train_rows].drop(drop_col,axis=1).copy()
y_train = train1.loc[fold_train_rows,'sale']
X_valid = train1[fold_valid_rows].drop(drop_col,axis=1).copy()
y_valid = train1.loc[fold_valid_rows,'sale']

# Targets that were filled with the -999 sentinel count as zero sales.
y_train[y_train==-999] = 0
y_valid[y_valid==-999] = 0

In [40]:
# Model settings gathered in one mapping; the built-in metric is disabled
# (metric='None') because the metrics are supplied at fit time.
lgb_settings = dict(
    n_estimators=1000,
    random_state=51,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    importance_type='gain',
    max_depth=-1,
    num_leaves=2**8-1,
    metric='None',
    bagging_freq=1,
    n_jobs=-1,
    first_metric_only=True,
    objective=custom_obj,
    min_data_in_leaf=2**8-1,
)
lgb_re2 = lgb.LGBMRegressor(**lgb_settings)

In [41]:
def _wrmsse_then_rmse(y_true, y_pred):
    """Evaluation metrics for fitting: WRMSSE first, RMSE second."""
    return [evaluator_super.feval(y_true, y_pred), rmse(y_true, y_pred)]

lgb_re2.fit(
    X_train,
    y_train,
    eval_set=[(X_valid,y_valid)],
    eval_metric=_wrmsse_then_rmse,
    verbose=10,
    early_stopping_rounds=50,
)#,categorical_feature=cat_col)

Training until validation scores don't improve for 50 rounds
[10]	valid_0's WRMSSE: 2.39383	valid_0's RMSE: 2.45954
[20]	valid_0's WRMSSE: 1.1484	valid_0's RMSE: 2.03346
[30]	valid_0's WRMSSE: 0.705997	valid_0's RMSE: 1.93085
[40]	valid_0's WRMSSE: 0.581926	valid_0's RMSE: 1.90718
[50]	valid_0's WRMSSE: 0.553732	valid_0's RMSE: 1.9013
[60]	valid_0's WRMSSE: 0.545427	valid_0's RMSE: 1.89835
[70]	valid_0's WRMSSE: 0.541784	valid_0's RMSE: 1.89753
[80]	valid_0's WRMSSE: 0.54126	valid_0's RMSE: 1.89712
[90]	valid_0's WRMSSE: 0.538332	valid_0's RMSE: 1.89603
[100]	valid_0's WRMSSE: 0.539212	valid_0's RMSE: 1.89519
[110]	valid_0's WRMSSE: 0.537804	valid_0's RMSE: 1.89441


KeyboardInterrupt: 

In [40]:
# Features whose importance is looked up individually.
inspected_features = [
    'rolling_id_non_zero_mean_7', 'rolling_id_non_zero_std_7',
    'rolling_id_non_zero_quantile25_7', 'rolling_id_non_zero_quantile75_7',
    'rolling_id_non_zero_max_7', 'rolling_id_non_zero_min_7',
    'rolling_sale_zero_trans_7', 'rolling_sale_zero_trans_14',
    'rolling_sale_zero_trans_28', 'rolling_sale_zero_trans_56',
    'rolling_sale_ratio_zero_7', 'rolling_sale_ratio_zero_14',
    'rolling_sale_ratio_zero_28', 'rolling_sale_ratio_zero_56',
]
importance_table = pd.DataFrame(lgb_re2.feature_importances_,index=X_train.columns)
importance_table.sort_values(0,ascending=False).loc[inspected_features]

Unnamed: 0,0
rolling_id_non_zero_mean_7,261.665622
rolling_id_non_zero_std_7,75.702637
rolling_id_non_zero_quantile25_7,102.048913
rolling_id_non_zero_quantile75_7,144.286515
rolling_id_non_zero_max_7,36.68682
rolling_id_non_zero_min_7,25.019205
rolling_sale_zero_trans_7,71.542361
rolling_sale_zero_trans_14,700.089435
rolling_sale_zero_trans_28,389.540604
rolling_sale_zero_trans_56,961.806516


In [41]:
# All feature importances, largest first.
importance_table = pd.DataFrame(lgb_re2.feature_importances_,index=X_train.columns)
importance_table.sort_values(0,ascending=False)

Unnamed: 0,0
rolling_sale_28_mean,120904.455225
rolling_sale_14_mean,31834.401007
rolling_sale_7_mean,17581.530088
rolling_sale_dayofweek_52_mean,4338.107614
dayofweek,2722.952652
rolling_sale_ratio_zero_56,2426.918972
rolling_sale_dayofweek_52_quantile75,2415.357305
lag_sale_1,2251.747651
rolling_sale_364_mean,2201.572040
rolling_sale_ratio_zero_28,2113.103439


#### fold2

In [None]:
# Fold 2: fold-1 windows moved back by 27 days -> train on days 1493..1858
# (2015-12-25 and 2014-12-25 excluded), validate on days 1859..1886.
# NOTE(review): a shift of 27 makes day 1886 part of both this validation
# window and fold 1's -- confirm whether 28 was intended.
fold_train_rows = (train1.day_num>=1885-1-364- 27)&(train1.day_num<=1885- 27)&(train1.date!='2015-12-25')&(train1.date!='2014-12-25')
fold_valid_rows = (train1.day_num>=1886- 27)&(train1.day_num<=1913 - 27)

X_train = train1[fold_train_rows].drop(drop_col,axis=1).copy()
y_train = train1.loc[fold_train_rows,'sale']
X_valid = train1[fold_valid_rows].drop(drop_col,axis=1).copy()
y_valid = train1.loc[fold_valid_rows,'sale']

# Targets that were filled with the -999 sentinel count as zero sales.
y_train[y_train==-999] = 0
y_valid[y_valid==-999] = 0

#### fold3

In [42]:
# Fold 3: one year earlier -> train on days 1184..1548 (2015-12-25 and
# 2014-12-25 excluded), validate on the 56 days 1549..1604.
fold_train_rows = (train1.day_num>=1913 - 365 - 364)&(train1.day_num<=1913 - 365)&(train1.date!='2015-12-25')&(train1.date!='2014-12-25')
fold_valid_rows = (train1.day_num>=1914 - 365)&(train1.day_num<=1969 - 365)

X_train = train1[fold_train_rows].drop(drop_col,axis=1).copy()
y_train = train1.loc[fold_train_rows,'sale']
X_valid = train1[fold_valid_rows].drop(drop_col,axis=1).copy()
y_valid = train1.loc[fold_valid_rows,'sale']

# Targets that were filled with the -999 sentinel count as zero sales.
y_train[y_train==-999] = 0
y_valid[y_valid==-999] = 0

In [43]:
# Same settings as the fold-1 model except for n_jobs=12.
lgb_settings = dict(
    n_estimators=1000,
    random_state=51,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    importance_type='gain',
    max_depth=-1,
    num_leaves=2**8-1,
    metric='None',
    bagging_freq=1,
    n_jobs=12,
    first_metric_only=True,
    objective=custom_obj,
    min_data_in_leaf=2**8-1,
)
lgb_re2 = lgb.LGBMRegressor(**lgb_settings)

In [None]:
def _wrmsse_then_rmse(y_true, y_pred):
    """Evaluation metrics for fitting: WRMSSE first, RMSE second."""
    return [evaluator_super.feval(y_true, y_pred), rmse(y_true, y_pred)]

lgb_re2.fit(
    X_train,
    y_train,
    eval_set=[(X_valid,y_valid)],
    eval_metric=_wrmsse_then_rmse,
    verbose=10,
    early_stopping_rounds=50,
)#,categorical_feature=cat_col)

Training until validation scores don't improve for 50 rounds
[10]	valid_0's WRMSSE: 2.19171	valid_0's RMSE: 2.55022
[20]	valid_0's WRMSSE: 1.1347	valid_0's RMSE: 2.17706
[30]	valid_0's WRMSSE: 0.741193	valid_0's RMSE: 2.07299
[40]	valid_0's WRMSSE: 0.617495	valid_0's RMSE: 2.04253
[50]	valid_0's WRMSSE: 0.575052	valid_0's RMSE: 2.0306
[60]	valid_0's WRMSSE: 0.557291	valid_0's RMSE: 2.02407
[70]	valid_0's WRMSSE: 0.545979	valid_0's RMSE: 2.01969
[80]	valid_0's WRMSSE: 0.539343	valid_0's RMSE: 2.0158
[90]	valid_0's WRMSSE: 0.535779	valid_0's RMSE: 2.01304
[100]	valid_0's WRMSSE: 0.532817	valid_0's RMSE: 2.01098
[110]	valid_0's WRMSSE: 0.530269	valid_0's RMSE: 2.00906
[120]	valid_0's WRMSSE: 0.527545	valid_0's RMSE: 2.00724
[130]	valid_0's WRMSSE: 0.525801	valid_0's RMSE: 2.00582
[140]	valid_0's WRMSSE: 0.524013	valid_0's RMSE: 2.00479
[150]	valid_0's WRMSSE: 0.523379	valid_0's RMSE: 2.00406
[160]	valid_0's WRMSSE: 0.52224	valid_0's RMSE: 2.00369
[170]	valid_0's WRMSSE: 0.521381	valid_0's

### part5

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit,StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold,train_test_split,GroupShuffleSplit,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error,log_loss,confusion_matrix
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
#from bayes_opt import BayesianOptimization
import re
from string import punctuation
from scipy.spatial import Voronoi
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
from tqdm.notebook import tqdm
from numba import jit
from collections import Counter
import json
import joblib
import multiprocessing
import time
from scipy.sparse import csr_matrix
import gc

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast int32/int64/float64 columns of `df` in place to smaller dtypes.

    Integer columns become int16 or int32 when their observed min/max fit
    (bounds inclusive); float64 columns become float32 when their min/max lie
    inside the float32 range. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place and also returned.
    verbose : bool
        When true, print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    numerics = ['int32', 'int64', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest candidate first. Bounds are inclusive so that a column
            # touching a dtype limit (e.g. max == 32767) is still downcast.
            for candidate in (np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN min/max (empty or all-NaN column) fail both comparisons and
            # the column stays float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Read the four raw competition tables in one pass.
sample_submission, sell_prices, sales_train, calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/sample_submission.csv',
        '../data/sell_prices.csv',
        '../data/sales_train_validation.csv',
        '../data/calendar.csv',
    )
)

In [4]:
# Wide (one column per day) -> long (one row per id and day).
sales_train_long_format = sales_train.melt(
    id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],
    var_name='day_num',
    value_name='sale',
)

In [5]:
def transform_day_to_num(str1):
    """Return the integer that follows the first two characters of `str1` (e.g. 'd_1913' -> 1913)."""
    day_digits = str1[2:]
    return int(day_digits)
# Turn the day labels into integers on both tables so they share a join key.
sales_train_long_format['day_num'] = sales_train_long_format['day_num'].map(transform_day_to_num)
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['day_num'] = calendar['d'].map(transform_day_to_num)

# Lookup Series: day number -> calendar date.
map_day_date = calendar.set_index('day_num')['date']
sales_train_long_format['date'] = sales_train_long_format['day_num'].map(map_day_date)

# Calendar columns carried over to the sales rows (plus the join key).
list1 = [
    'wm_yr_wk',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'day_num',
]
sales_train_long_format = pd.merge(sales_train_long_format, calendar[list1], on='day_num', how='left')

In [6]:
# Left join keeps every sales row; rows without a matching price get NaN in the price columns.
sales_train_long_format = pd.merge(
    sales_train_long_format,
    sell_prices,
    on=['store_id','item_id','wm_yr_wk'],
    how='left',
)
#sales_train_long_format = reduce_mem_usage(sales_train_long_format)

In [7]:
# del sales_train_long_format
# gc.collect()

### metric
#### fast metric

In [8]:
# Rows for days 1886..1913 (both ends inclusive) are the basis of the sales weights.
weight_window_rows = sales_train_long_format['day_num'].between(1886, 1913)
data_for_weight = sales_train_long_format.loc[weight_window_rows].copy()
# Units sold times sell price, stored under the (misspelt) key `sale_used` that get_w reads.
data_for_weight['sale_used'] = data_for_weight['sale'].mul(data_for_weight['sell_price'])
sales = pd.read_csv('../data/sales_train_validation.csv')

In [9]:
# Grouping keys for aggregation levels 1..11 (level 0, "all", is added separately).
dummies_list = [
    sales.state_id,
    sales.store_id,
    sales.cat_id,
    sales.dept_id,
    sales.state_id + sales.cat_id,
    sales.state_id + sales.dept_id,
    sales.store_id + sales.cat_id,
    sales.store_id + sales.dept_id,
    sales.item_id,
    sales.state_id + sales.item_id,
    sales.id,
]

# Level 0: a single row of ones that sums every series.
all_sales_row = pd.DataFrame(
    np.ones(sales.shape[0]).astype(np.int8),
    index=sales.index,
    columns=['all'],
).T

# One transposed indicator frame per grouping key (rows = groups, columns = series).
dummies_df_list = [all_sales_row] + [
    pd.get_dummies(group_key, drop_first=False).astype(np.int8).T
    for group_key in dummies_list
]

# Stack all levels; the concat keys become the `level` part of the index.
roll_mat_df = pd.concat(
    dummies_df_list,
    keys=list(range(12)),
    names=['level','id'],
).astype(np.int8, copy=False)

# Keep the index for reference and the values as a sparse matrix.
roll_index = roll_mat_df.index
roll_mat_csr = csr_matrix(roll_mat_df.values)
roll_mat_csr.shape

(42840, 30490)

In [10]:
# Function to calculate S weights:
def get_s(drop_days=0):
    """Return the per-series scale S: mean squared day-to-day difference.

    Uses the module-level `sales` frame (columns 'd_1'..'d_1913') and the
    roll-up matrix `roll_mat_csr`.

    Parameters
    ----------
    drop_days : int
        Number of trailing days to leave out. 0 (default) uses all 1913 days;
        28 means the last 28 days won't be used in calculating S.

    Returns
    -------
    numpy.ndarray
        One value per row of `roll_mat_csr`.
    """
    n_days = 1913 - drop_days

    # Rollup sales:
    d_name = ['d_' + str(i + 1) for i in range(n_days)]
    sales_train_val = roll_mat_csr * sales[d_name].values

    # Index of the first day with a positive value (0 when a row has none).
    start_no = np.argmax(sales_train_val > 0, axis=1)

    # Mask the days before the first sale. An integer comparison replaces the
    # former dense diag-matrix product: it is far cheaper, and it is exact --
    # the float form (1/k)*k can round below 1 (e.g. k == 49) and then also
    # masked the first sale day itself.
    flag = np.arange(n_days)[None, :] < start_no[:, None]
    sales_train_val = np.where(flag, np.nan, sales_train_val)

    # Denominator of RMSSE. The count of differences follows the number of
    # days actually used, so it stays correct when drop_days != 0.
    weight1 = np.nansum(np.diff(sales_train_val, axis=1) ** 2, axis=1) / (n_days - start_no - 1)

    return weight1

In [11]:
# Scale per aggregated series, computed over the full history (no days dropped).
S = get_s(0)
S.shape

(42840,)

In [12]:
def get_w(sale_usd):
    """Return normalised sales weights for every row of `roll_mat_csr`.

    `sale_usd` needs the columns 'id' and 'sale_used'. Values are summed per
    id in order of first appearance (sort=False), rolled up with the
    module-level `roll_mat_csr`, and divided by their grand total.
    """
    # Total per id, kept in first-appearance order.
    per_id_totals = sale_usd.groupby(['id'],sort=False)['sale_used'].apply(np.sum)
    total_sales_usd = per_id_totals.values

    # Roll the per-id totals up to every aggregation level.
    weight2 = roll_mat_csr * total_sales_usd

    grand_total = np.sum(weight2)
    return weight2/grand_total

In [13]:
# Only the id and the value column are passed to get_w.
W = get_w(data_for_weight.loc[:, ['id','sale_used']])
W.shape

(42840,)

In [14]:
# Combined factor: weight divided by the square root of the scale.
SW = np.divide(W, np.sqrt(S))

In [15]:
# Function to do quick rollups:
def rollup(v):
    '''
    Aggregate bottom-level series with the module-level `roll_mat_csr`.

    v - np.array with one row per column of roll_mat_csr and n day columns
    returns - np.array with one row per row of roll_mat_csr and n day columns
    '''
    return roll_mat_csr.dot(v)


# Function to calculate WRMSSE:
def wrmsse(preds, y_true, score_only=False, s = S, w = W, sw=SW):
    '''
    preds - Predictions: pd.DataFrame, one row per bottom-level series, N day columns
    y_true - True values: pd.DataFrame of the same shape as preds
    score_only - if True return only the scalar score
    s - scale per rolled-up series (defaults to the module-level S)
    w - weight per rolled-up series (defaults to the module-level W)
    sw - w / sqrt(s) (defaults to the module-level SW)

    Returns the score when score_only is True, otherwise (score, score_matrix).
    '''
    # Squared errors after rolling the differences up to every level.
    squared_rollup = np.square(rollup(preds.values-y_true.values))

    if score_only:
        per_series_rmse = np.sqrt(np.mean(squared_rollup, axis=1))
        return np.sum(per_series_rmse * sw)

    score_matrix = (squared_rollup * np.square(w)[:, None]) / s[:, None]
    score = np.sum(np.sqrt(np.mean(score_matrix,axis=1)))
    return score, score_matrix

In [16]:
class WRMSSE_Evalator_super_version():
    """WRMSSE evaluation metric built from a roll-up matrix and per-series factors.

    sw           - array with one factor per row of `roll_mat_csr`
    roll_mat_csr - sparse matrix mapping bottom-level series (columns) to
                   aggregated series (rows)
    """
    def __init__(self,sw,roll_mat_csr):
        self.sw = sw
        self.roll_mat_csr = roll_mat_csr
    def rollup(self,v):
        '''
        v - np.array with one row per column of self.roll_mat_csr and n day columns
        returns - np.array with one row per row of self.roll_mat_csr and n day columns
        '''
        # Use the matrix handed to the constructor; the previous version read
        # the module-level `roll_mat_csr` and silently ignored this attribute.
        return self.roll_mat_csr.dot(v)
    def feval(self,y_true,y_pred):
        """
        y_true,y_pred: flat np.ndarray laid out day by day (all series of the
        first day, then all series of the second day, ...), which is what the
        column-major reshape below expects.

        Returns the tuple ('WRMSSE', score, False).
        """
        # Number of bottom-level series comes from the matrix (30490 for the
        # full data set) instead of a hard-coded constant.
        n_series = self.roll_mat_csr.shape[1]
        errors = y_pred.reshape(n_series,-1,order = 'F')-y_true.reshape(n_series,-1,order = 'F')
        per_series_rmse = np.sqrt(np.mean(np.square(self.rollup(errors)),axis=1))
        return 'WRMSSE',np.sum(per_series_rmse * self.sw),False

In [17]:
# Metric object used as the LightGBM evaluation function (arguments in constructor order: sw, roll_mat_csr).
evaluator_super = WRMSSE_Evalator_super_version(SW, roll_mat_csr)

#### my metric

### add features

In [18]:
# Load the base feature table and the part-5 feature table.
train1 = pd.read_pickle('data_part1.pkl')
#train4 = pd.read_pickle('data_part4.pkl')
train5 = pd.read_pickle('data_part5.pkl')

In [34]:
# Re-read the part-5 table only when it is not in memory any more (this cell
# was run after `train5` had been deleted further down). On a top-to-bottom
# run the previous cell has just loaded it, and reading the pickle a second
# time is pure overhead.
if 'train5' not in globals():
    train5 = pd.read_pickle('data_part5.pkl')

In [35]:
# First rows of the key columns, to compare row order with train1.
train5.loc[:, ['id','day_num']].head()

Unnamed: 0,id,day_num
0,HOBBIES_1_001_CA_1_validation,1
1,HOBBIES_1_002_CA_1_validation,1
2,HOBBIES_1_003_CA_1_validation,1
3,HOBBIES_1_004_CA_1_validation,1
4,HOBBIES_1_005_CA_1_validation,1


In [36]:
# First rows of the key columns, to compare row order with train5.
train1.loc[:, ['id','day_num']].head()

Unnamed: 0,id,day_num
0,14370,1
1,14380,1
2,14390,1
3,14400,1
4,14410,1


In [19]:
# Print both shapes so the row counts can be compared by eye.
for loaded_frame in (train1, train5):
    print(loaded_frame.shape)
#print(train5.shape)

(60034810, 96)
(60034810, 68)


In [27]:
# `train4` is not loaded in this part of the notebook (its read_pickle call is
# commented out in the loading cell), so building this mask raised NameError
# on a top-to-bottom run. Kept for reference only:
# cond4 = (train4.columns.str.endswith('_max'))|(train4.columns.str.endswith('_min'))|(train4.columns.str.endswith('_mean'))|(train4.columns.str.endswith('_std'))|(train4.columns.str.endswith('_median'))


In [20]:
# Boolean mask over train5's columns: True where the name ends with one of the statistic suffixes.
cond5 = np.logical_or.reduce([
    train5.columns.str.endswith(suffix)
    for suffix in ('_max', '_min', '_mean', '_std', '_median')
])

In [21]:
# Copy every selected statistic column from train5 into train1, then release train5.
selected_train5_columns = train5.columns[cond5]
for feature_name in selected_train5_columns:
    print(feature_name)
    train1[feature_name] = train5[feature_name].copy()
del train5
gc.collect()

state_id_cat_id_rolling_7_sale_mean
state_id_cat_id_rolling_7_sale_std
state_id_cat_id_rolling_7_sale_median
state_id_cat_id_rolling_14_sale_mean
state_id_cat_id_rolling_14_sale_std
state_id_cat_id_rolling_14_sale_median
state_id_cat_id_rolling_28_sale_mean
state_id_cat_id_rolling_28_sale_std
state_id_cat_id_rolling_28_sale_median
state_id_dept_id_rolling_7_sale_mean
state_id_dept_id_rolling_7_sale_std
state_id_dept_id_rolling_7_sale_median
state_id_dept_id_rolling_14_sale_mean
state_id_dept_id_rolling_14_sale_std
state_id_dept_id_rolling_14_sale_median
state_id_dept_id_rolling_28_sale_mean
state_id_dept_id_rolling_28_sale_std
state_id_dept_id_rolling_28_sale_median
state_id_item_id_rolling_7_sale_mean
state_id_item_id_rolling_7_sale_std
state_id_item_id_rolling_7_sale_median
state_id_item_id_rolling_14_sale_mean
state_id_item_id_rolling_14_sale_std
state_id_item_id_rolling_14_sale_median
state_id_item_id_rolling_28_sale_mean
state_id_item_id_rolling_28_sale_std
state_id_item_id_rollin

191

In [None]:
# for _ in train5.columns[cond5]:
#     print(_)
#     train1[_] = train5[_]
#     train5.drop([_],axis=1)
# del train5
# gc.collect()

In [22]:
# Collapse the three per-state SNAP flags into one `snap` column holding the
# flag of the row's own state.
for state_code in ['CA','TX','WI']:
    state_rows = train1.state_id == state_code
    # `.values` assigns positionally, exactly like the former `list(...)`, but
    # without building a Python list with one element per selected row.
    train1.loc[state_rows, 'snap'] = train1.loc[state_rows, f'snap_{state_code}'].values

In [23]:
# Columns removed from the feature matrix before fitting.
drop_col = [
    'day_num', 'date', 'wm_yr_wk',
    'snap_CA', 'snap_TX', 'snap_WI',
    'sale',
]
# Columns that get label-encoded.
cat_col = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'id',
]

In [24]:
# Replace each categorical column with integer codes; values are cast to str
# first, so missing values are encoded as their string form.
for col_name in cat_col:
    lbl = preprocessing.LabelEncoder()
    text_values = train1[col_name].astype(str)
    train1[col_name] = lbl.fit_transform(text_values)
    print(col_name)

item_id
dept_id
cat_id
store_id
state_id
event_name_1
event_type_1
event_name_2
event_type_2
id


In [25]:
# Replace every missing value in train1 with the sentinel -999, in place.
# This also hits the `sale` column; the fold cells below map -999 back to 0
# in the targets.
train1.fillna(-999,inplace=True)

In [26]:
def func_custom_1_2(scalar):
    """Guarded inverse square root used by the custom objective.

    Returns 0.0 when `scalar` is exactly 0, 1e4 when it is at most 1e-8
    (negative values included), and scalar ** -0.5 otherwise.

    The two constant branches return floats on purpose: np.vectorize infers
    its output dtype from the first call, and integer results there made the
    whole output integer, truncating every later value.
    """
    if scalar == 0:
        return 0.0
    elif scalar <= 10**(-8):
        return 1e4
    else:
        return scalar ** (-1/2)

In [27]:
# Element-wise version of func_custom_1_2. The output dtype is fixed to
# long double: without `otypes`, np.vectorize takes the dtype from the first
# element's result (an integer for the constant branches, which truncated all
# later values) and fails on empty input. Long double matches the float128
# arrays that custom_obj passes in.
func1 = np.vectorize(func_custom_1_2, otypes=[np.longdouble])

In [28]:
def custom_obj(y_true, y_pred):
    """Custom objective: return (grad, hess), one value per input row.

    The residuals are reshaped to (d, 30490), rolled up with the module-level
    `roll_mat_csr`, scaled by 1/sqrt(S) and by `func1` of the per-series sum
    of squares, then mapped back to the bottom level.

    NOTE(review): the reshape assumes the rows come in consecutive blocks of
    30490 (presumably one block per day) -- confirm against how the training
    frame is ordered.
    NOTE(review): the weights `W` used by the WRMSSE metric do not appear
    here -- confirm that this is intended.
    NOTE(review): np.float128 does not exist on every platform.
    """
    residual = (y_true - y_pred).astype(np.float128)
    residual = residual.reshape((-1,30490)) 
    # d = number of 30490-row blocks.
    d = residual.shape[0]    
    # Residuals summed up to each row of roll_mat_csr, per block.
    residual_42840 = residual * roll_mat_csr.T
    # Despite the name this is a sum (not a mean) of squares over the blocks.
    mean_square_42840 = np.square(residual_42840).sum(axis = 0)
    # Guarded inverse square root of that sum.
    temp_1 = func1(mean_square_42840)
    part1 = ((1/np.sqrt(S)) * temp_1)/np.sqrt(d)
    part_all = np.multiply(part1,residual_42840) 
    # Map back to the bottom level and flatten to the input's row order.
    grad = -(part_all * roll_mat_csr).reshape((-1))
    hess_part1 = np.multiply(-(1/np.sqrt(S))/np.sqrt(d)*temp_1**3,np.square(residual_42840))
    hess_part2 = (1/np.sqrt(S))/np.sqrt(d)*temp_1
    hess = ((hess_part1 + hess_part2)*roll_mat_csr).reshape((-1))
    return grad,hess

In [29]:
def rmse(y_true, y_pred):
    """Return ('RMSE', root mean squared error, False) for use as an eval metric."""
    errors = (y_pred) - (y_true)
    root_mean_square = np.sqrt(np.mean(np.power(errors, 2)))
    return 'RMSE', root_mean_square, False

#### fold1

In [30]:
# Fold 1: train on days 1520..1885 without 2015-12-25, validate on days 1886..1913.
fold_train_rows = (train1.day_num>=1885-1-364)&(train1.day_num<=1885)&(train1.date!='2015-12-25')
fold_valid_rows = (train1.day_num>=1886)&(train1.day_num<=1913)

X_train = train1[fold_train_rows].drop(drop_col,axis=1).copy()
y_train = train1.loc[fold_train_rows,'sale']
X_valid = train1[fold_valid_rows].drop(drop_col,axis=1).copy()
y_valid = train1.loc[fold_valid_rows,'sale']

# Targets that were filled with the -999 sentinel count as zero sales.
y_train[y_train==-999] = 0
y_valid[y_valid==-999] = 0

In [31]:
# Model settings gathered in one mapping; the built-in metric is disabled
# (metric='None') because the metrics are supplied at fit time.
lgb_settings = dict(
    n_estimators=1000,
    random_state=51,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    importance_type='gain',
    max_depth=-1,
    num_leaves=2**8-1,
    metric='None',
    bagging_freq=1,
    n_jobs=-1,
    first_metric_only=True,
    objective=custom_obj,
    min_data_in_leaf=2**8-1,
)
lgb_re2 = lgb.LGBMRegressor(**lgb_settings)

In [32]:
def _wrmsse_then_rmse(y_true, y_pred):
    """Evaluation metrics for fitting: WRMSSE first, RMSE second."""
    return [evaluator_super.feval(y_true, y_pred), rmse(y_true, y_pred)]

lgb_re2.fit(
    X_train,
    y_train,
    eval_set=[(X_valid,y_valid)],
    eval_metric=_wrmsse_then_rmse,
    verbose=10,
    early_stopping_rounds=50,
)#,categorical_feature=cat_col)

Training until validation scores don't improve for 50 rounds
[10]	valid_0's WRMSSE: 2.3815	valid_0's RMSE: 2.45523
[20]	valid_0's WRMSSE: 1.14096	valid_0's RMSE: 2.03145
[30]	valid_0's WRMSSE: 0.696838	valid_0's RMSE: 1.92929
[40]	valid_0's WRMSSE: 0.566141	valid_0's RMSE: 1.90465
[50]	valid_0's WRMSSE: 0.532615	valid_0's RMSE: 1.89747
[60]	valid_0's WRMSSE: 0.520829	valid_0's RMSE: 1.89366
[70]	valid_0's WRMSSE: 0.512417	valid_0's RMSE: 1.89137
[80]	valid_0's WRMSSE: 0.510078	valid_0's RMSE: 1.89051
[90]	valid_0's WRMSSE: 0.508062	valid_0's RMSE: 1.88899
[100]	valid_0's WRMSSE: 0.507391	valid_0's RMSE: 1.8878
[110]	valid_0's WRMSSE: 0.506783	valid_0's RMSE: 1.88696
[120]	valid_0's WRMSSE: 0.506058	valid_0's RMSE: 1.88561
[130]	valid_0's WRMSSE: 0.504761	valid_0's RMSE: 1.88441
[140]	valid_0's WRMSSE: 0.503712	valid_0's RMSE: 1.88341
[150]	valid_0's WRMSSE: 0.502045	valid_0's RMSE: 1.88221
[160]	valid_0's WRMSSE: 0.501938	valid_0's RMSE: 1.88145
[170]	valid_0's WRMSSE: 0.50272	valid_0'

LGBMRegressor(bagging_freq=1, boosting_type='gbdt', class_weight=None,
       colsample_bytree=0.8, first_metric_only=True,
       importance_type='gain', learning_rate=0.05, max_depth=-1,
       metric='None', min_child_samples=20, min_child_weight=0.001,
       min_data_in_leaf=255, min_split_gain=0.0, n_estimators=1000,
       n_jobs=-1, num_leaves=255,
       objective=<function custom_obj at 0x7fc933132ea0>, random_state=51,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=0)

In [33]:
# All feature importances, largest first.
importance_table = pd.DataFrame(lgb_re2.feature_importances_,index=X_train.columns)
importance_table.sort_values(0,ascending=False)

Unnamed: 0,0
rolling_sale_28_mean,92077.813811
rolling_sale_14_mean,55404.009067
rolling_sale_7_mean,16143.368563
rolling_sale_dayofweek_52_mean,5737.871371
rolling_sale_28_std,5680.055175
rolling_sale_91_mean,3781.508862
rolling_sale_364_mean,2815.231363
state_id_item_id_rolling_7_sale_mean,2523.861683
dayofweek,2521.206346
rolling_sale_dayofweek_52_quantile75,2389.773350
