In [19]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [20]:
# dtypes handed to pd.read_csv for the calendar file, grouped by target dtype:
# text columns are read as categoricals, the calendar counters as int16 and the
# snap_* columns as float32.
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# dtypes handed to pd.read_csv for the price file.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [21]:
# Input CSV files, resolved relative to the working directory
train_path = "./sales_train_validation.csv"
cal_path = "./calendar.csv"
prices_path = "./sell_prices.csv"

tr_last = 1913  # number of the last "d_<n>" day column read from the training file
max_lags = 57  # days of history kept in front of each day that is predicted
h = 28  # forecast horizon in days
fday = datetime(year=2016, month=4, day=25)  # first date that is predicted (the prediction loop starts here)

In [22]:
def pre_process(is_train = True, nrows = None, first_day = 1200):
    """Load the price, calendar and sales CSVs and join them into one long table.

    Categorical columns are converted to int16 category codes. Missing values
    get code -1 from pandas; subtracting the column minimum shifts them to 0.

    Parameters
    ----------
    is_train : bool
        If True, day columns from ``first_day`` to ``tr_last`` are loaded.
        If False, loading starts at ``max(tr_last - max_lags, first_day)`` and
        ``h`` extra day columns filled with NaN are appended after ``tr_last``
        so they can be filled with predictions.
    nrows : int or None
        Passed to ``pd.read_csv`` for the sales file (number of series to read).
    first_day : int
        Smallest ``d_<n>`` day column that may be loaded.

    Returns
    -------
    pandas.DataFrame
        One row per series (``id``) and day, with the calendar and price
        columns merged in. Both merges use the default inner join, so rows
        without a calendar or price match are dropped.
    """
    # Read price data and transform datatypes. The category list and the code
    # offset of every categorical key are remembered so the sales table can be
    # encoded with exactly the same codes further down.
    prices = pd.read_csv(prices_path, dtype = PRICE_DTYPES)
    price_keys = {}
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            codes = prices[col].cat.codes.astype("int16")
            offset = codes.min()
            price_keys[col] = (prices[col].cat.categories, offset)
            prices[col] = codes - offset
    print(prices.shape)

    # Read calendar data and transform datatypes
    cal = pd.read_csv(cal_path, dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    print(cal.shape)

    # First day to use
    start_day = max(1 if is_train else tr_last - max_lags, first_day)

    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})

    # Read training data and transform datatypes
    dt = pd.read_csv(train_path, nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    for col in catcols:
        if col == "id":
            continue
        if col in price_keys:
            # Merge keys must carry the same codes as in the price table. Encoding
            # them from this table's own categories breaks as soon as it holds only
            # a subset of the stores/items (e.g. when nrows is set).
            categories, offset = price_keys[col]
            dt[col] = dt[col].cat.set_categories(categories).cat.codes.astype("int16") - offset
        else:
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    print(dt.shape)

    if not is_train:
        # Placeholder columns for the h days that will be predicted
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    # Transform horizontal data (day columns) to vertical data (1 row per product per day)
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    # Merge training data with prices and calendar
    dt = dt.merge(cal, on = "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [23]:
def build_features(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    ``dt`` needs the columns ``id``, ``sales`` and ``date`` (datetime). Rows are
    expected to be in chronological order within each ``id``, because lags are
    computed with a positional ``shift``.

    Columns written:
      * ``lag_7``, ``lag_28``: sales shifted by 7 / 28 rows within each id.
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows of ``lag_<lag>``,
        for lag and win in {7, 28}.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
        int16 when the column already exists, otherwise derived from ``date``.

    Returns None; the frame is modified in place.
    """
    # Defining the lag features
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    # Defining rolling mean features on lags
    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # Transform date information: feature name -> attribute of the .dt accessor
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
            # yields the same ISO week number. Older pandas without isocalendar
            # falls through to the attribute lookup below.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [28]:
FIRST_DAY = 800 # First "d_<n>" day column loaded for training; set it to 1 to load the full history (high risk of running out of memory!)

66

In [29]:
%%time

# Build the long-format training table, starting at day FIRST_DAY
df = pre_process(is_train=True, first_day= FIRST_DAY)
df.shape

(6841121, 4)
(1969, 14)
(30490, 1120)
CPU times: user 19.6 s, sys: 5.83 s, total: 25.4 s
Wall time: 25.4 s


(31522396, 22)

In [30]:
%%time

# Adds the lag, rolling-mean and date feature columns to df in place
build_features(df)
df.shape

CPU times: user 2min 35s, sys: 17.7 s, total: 2min 52s
Wall time: 2min 52s


(31522396, 31)

In [31]:
# Remove rows with missing values in any column, e.g. rows whose lag /
# rolling-mean features could not be computed because there is not enough history
df.dropna(inplace = True)
df.shape

(29845446, 31)

In [32]:
# Columns LightGBM is told to treat as categorical
cat_feats = [
    "item_id", "dept_id", "store_id", "cat_id", "state_id",
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Identifier, target and raw calendar columns that are not fed to the model
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
keep_mask = ~df.columns.isin(useless_cols)
train_cols = df.columns[keep_mask]
X_train = df[train_cols]  # feature matrix
y_train = df["sales"]  # target values
# The full frame is no longer needed once features and target are split off
del df
gc.collect()

22

In [33]:
# Sanity check: (number of training rows, number of feature columns)
X_train.shape

(29845446, 25)

In [34]:
%%time

np.random.seed(777)

# The hold-out rows are a uniform random sample of the training rows;
# no time-series train/test split is applied here.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

# Options shared by both LightGBM datasets
dataset_options = {"categorical_feature": cat_feats, "free_raw_data": False}
train_data = lgb.Dataset(X_train.loc[train_inds],
                         label = y_train.loc[train_inds],
                         **dataset_options)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds],
                              label = y_train.loc[fake_valid_inds],
                              **dataset_options)

CPU times: user 14.8 s, sys: 2.02 s, total: 16.8 s
Wall time: 16.8 s


In [35]:
# Drop the notebook-level references to the frames and index arrays that are no longer used
del X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

66

In [38]:
# LightGBM parameters.
# "metric" used to be listed twice in this literal; a repeated key in a dict
# literal is silently overridden by the last occurrence, so it is kept once.
params = {
    "boosting": "goss",
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    "force_row_wise": True,
    "learning_rate": 0.05,
    # Row subsampling is left disabled:
    #"sub_row" : 0.5,
    #"bagging_freq" : 1,
    "lambda_l2": 0.1,
    "verbosity": 1,
    "num_iterations": 1400,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.6,
}

In [39]:
%%time

# Training a model with LightGBM, logging the validation metric every 20 rounds.
# The `verbose_eval` argument was removed from lgb.train in LightGBM 4.0 in favour
# of the log_evaluation callback; releases without that callback still take verbose_eval.
if hasattr(lgb, "log_evaluation"):
    m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data],
                      callbacks = [lgb.log_evaluation(period = 20)])
else:
    m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval = 20)
m_lgb.save_model("model.lgb")
# To reuse the saved model instead of retraining:
# m_lgb = lgb.Booster(model_file='model.lgb')

[20]	valid_0's rmse: 2.77269
[40]	valid_0's rmse: 2.42385
[60]	valid_0's rmse: 2.35279
[80]	valid_0's rmse: 2.33247
[100]	valid_0's rmse: 2.32144
[120]	valid_0's rmse: 2.31421
[140]	valid_0's rmse: 2.30749
[160]	valid_0's rmse: 2.30226
[180]	valid_0's rmse: 2.29561
[200]	valid_0's rmse: 2.29073
[220]	valid_0's rmse: 2.28528
[240]	valid_0's rmse: 2.28289
[260]	valid_0's rmse: 2.27964
[280]	valid_0's rmse: 2.27676
[300]	valid_0's rmse: 2.27278
[320]	valid_0's rmse: 2.27073
[340]	valid_0's rmse: 2.26691
[360]	valid_0's rmse: 2.26395
[380]	valid_0's rmse: 2.26141
[400]	valid_0's rmse: 2.25837
[420]	valid_0's rmse: 2.25682
[440]	valid_0's rmse: 2.25447
[460]	valid_0's rmse: 2.25241
[480]	valid_0's rmse: 2.25062
[500]	valid_0's rmse: 2.24868
[520]	valid_0's rmse: 2.24783
[540]	valid_0's rmse: 2.24601
[560]	valid_0's rmse: 2.24471
[580]	valid_0's rmse: 2.24356
[600]	valid_0's rmse: 2.24196
[620]	valid_0's rmse: 2.24121
[640]	valid_0's rmse: 2.24075
[660]	valid_0's rmse: 2.23959
[680]	valid_0'

<lightgbm.basic.Booster at 0x7f395039d590>

In [40]:
%%time

# Recursive day-by-day forecast, repeated once per multiplier in `alphas`; the
# per-alpha forecasts are averaged with `weights` into the final submission.
alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)

# Submission columns F1..F<h>
cols = [f"F{i}" for i in range(1, h + 1)]

# The prediction frame does not depend on alpha: read the CSVs once and give
# every run its own copy instead of re-reading them in each iteration.
te_base = pre_process(False)

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = te_base.copy()

    for tdelta in range(0, h):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Only the last max_lags days are needed to compute the features of `day`
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        build_features(tst)
        tst = tst.loc[tst.date == day , train_cols]
        # Predictions are written back into `te` so later days can use them as lags
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev

    # Reshape the predicted days to one row per id with columns F1..F<h>
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    # Every te_sub is sorted by id with a fresh 0..n-1 index, so rows line up when summed
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# Duplicate the rows under the "evaluation" ids. regex=True is required: the
# pattern relies on the `$` anchor, and pandas 2.0 changed the default of
# Series.str.replace to a literal (non-regex) match, which would rename nothing.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

(6841121, 4)
(1969, 14)
(30490, 64)
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.028 0.3333333333333333
(6841121, 4)
(1969, 14)
(30490, 64)
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
1