In [1]:
!pip install lightgbm



In [2]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [3]:
# dtypes handed to pd.read_csv for calendar.csv. Columns read as "category"
# are converted to int16 codes inside create_dt.
CAL_DTYPES = {
    **{col: "category" for col in ("event_name_1", "event_name_2",
                                   "event_type_1", "event_type_2", "weekday")},
    **{col: "int16" for col in ("wm_yr_wk", "wday", "month", "year")},
    **{col: "float32" for col in ("snap_CA", "snap_TX", "snap_WI")},
}

# dtypes handed to pd.read_csv for sell_prices.csv (same convention as above).
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [4]:
# Forecasting configuration shared by the cells below.
tr_last = 1913                                # last `d_<n>` sales column that is read
h = 28                                        # length of one forecast block, in days
max_lags = 366                                # days of history kept when rebuilding features at predict time
fday = datetime(year=2016, month=4, day=25)   # first date that gets a prediction
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [5]:
def create_dt(is_train = True, nrows = None, first_day = 1200, data_dir = "/project_data/data_asset"):
    """Load sales, calendar and prices and join them into one long-format frame.

    Parameters
    ----------
    is_train : bool
        True: read the sales columns ``d_<max(1, first_day)>`` .. ``d_<tr_last>``.
        False: read from ``d_<max(tr_last - max_lags, first_day)>`` and append
        ``2 * h`` all-NaN day columns after ``tr_last`` as placeholders for
        predictions.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file only.
    first_day : int
        Lower bound for the first day column that is read.
    data_dir : str
        Directory containing ``sell_prices.csv``, ``calendar.csv`` and
        ``sales_train_validation.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (id, d) pair that has a matching calendar row and a
        matching price row (both merges use pandas' default inner join).
        Categorical columns are replaced by int16 codes; ``id`` is left as read.
    """
    import os  # local import: only needed to build the three file paths

    prices = pd.read_csv(os.path.join(data_dir, "sell_prices.csv"), dtype = PRICE_DTYPES)
    # Remember how each categorical price column was encoded so that the sales
    # frame can reuse exactly the same value -> code mapping for the merge keys.
    # Encoding the two files independently only agrees when both contain the
    # same set of values, which is not the case when `nrows` truncates sales.
    price_key_encoding = {}
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            categories = prices[col].cat.categories
            codes = prices[col].cat.codes.astype("int16")
            offset = codes.min()  # -1 if the column has missing values, else 0
            prices[col] = codes - offset
            price_key_encoding[col] = (categories, offset)

    cal = pd.read_csv(os.path.join(data_dir, "calendar.csv"), dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            # Missing values get code -1; shift so that codes start at 0.
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(os.path.join(data_dir, "sales_train_validation.csv"),
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col == "id":
            continue
        if col in price_key_encoding:
            # Merge key: encode with the price file's categories and offset.
            categories, offset = price_key_encoding[col]
            dt[col] = dt[col].cat.set_categories(categories).cat.codes.astype("int16") - offset
        else:
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the days that will be predicted.
        for day in range(tr_last+1, tr_last+ 2*h +1):
            dt[f"d_{day}"] = np.nan

    # Wide (one column per day) -> long (one row per id and day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [6]:
def _date_part(dates, attr_name):
    """Return the ``.dt`` component ``attr_name`` of a datetime Series.

    ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0,
    so the ISO week is taken from ``isocalendar()`` whenever that is available.
    """
    if attr_name == "weekofyear" and hasattr(dates.dt, "isocalendar"):
        return dates.dt.isocalendar().week
    return getattr(dates.dt, attr_name)


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    ``dt`` must contain the columns ``id``, ``sales`` and ``date``; rows are
    shifted in their existing order within each ``id``. Nothing is returned.

    Columns written:
      * ``lag_7``, ``lag_28``: ``sales`` shifted by 7 / 28 rows within each id.
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows of ``lag_<lag>``,
        for every combination of lag and window in {7, 28}.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
        int16 if the column already exists, otherwise derived from ``date``.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    sales_by_id = dt.groupby("id")["sales"]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = sales_by_id.shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt.groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # feature name -> attribute of the ``.dt`` accessor used when the column
    # is not already present in the frame.
    # Optional extras (disabled): "ime": "is_month_end", "ims": "is_month_start"
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = _date_part(dt["date"], date_feat_func).astype("int16")

In [7]:
FIRST_DAY = 800 # First day index (d_<n>) loaded for training. Set it to 1 to load the full history -- at a high risk of running out of memory!

In [8]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

CPU times: user 38 s, sys: 3.58 s, total: 41.6 s
Wall time: 28.9 s


In [9]:
%%time

# create_fea mutates df in place: it adds the lag, rolling-mean and date columns.
create_fea(df)
df.shape

CPU times: user 3min 37s, sys: 11.9 s, total: 3min 49s
Wall time: 2min 44s


In [10]:
# Drop every row that still has a missing value in any column
# (e.g. rows whose lag / rolling window is not yet complete).
df = df.dropna()
df.shape

(29845446, 31)

In [11]:
# Columns LightGBM should treat as categorical.
cat_feats = [
    "item_id", "dept_id", "store_id", "cat_id", "state_id",
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Identifier, target and redundant calendar columns excluded from the model input.
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
y_train = df["sales"]
X_train = df[train_cols]

In [12]:
train_data = lgb.Dataset(X_train, label = y_train, categorical_feature=cat_feats, free_raw_data=False)
# Draw the monitoring rows from a seeded generator so that the logged metric is
# comparable between runs (sampling is with replacement, as np.random.choice defaults to).
fake_valid_inds = np.random.RandomState(42).choice(len(X_train), 1000000)
fake_valid_data = lgb.Dataset(X_train.iloc[fake_valid_inds], label = y_train.iloc[fake_valid_inds], categorical_feature=cat_feats,
                             free_raw_data=False)   # This is just a subsample of the training set, not a real validation set !

In [13]:
# LightGBM training parameters.
# "metric" used to appear twice in this literal ("rmse" and ["rmse"]); Python
# keeps only the last value, so the single effective entry is kept here.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,
    "sub_row": 0.75,        # row subsampling, re-drawn every iteration (bagging_freq = 1)
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    # "nthread": 4,
    "verbosity": 1,
    "num_iterations": 1500,
}

In [14]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=50) 



[50]	valid_0's rmse: 2.6985
[100]	valid_0's rmse: 2.58533
[150]	valid_0's rmse: 2.54934
[200]	valid_0's rmse: 2.51367
[250]	valid_0's rmse: 2.48897
[300]	valid_0's rmse: 2.46486
[350]	valid_0's rmse: 2.44728
[400]	valid_0's rmse: 2.43307
[450]	valid_0's rmse: 2.41975
[500]	valid_0's rmse: 2.40879
[550]	valid_0's rmse: 2.39863
[600]	valid_0's rmse: 2.38852
[650]	valid_0's rmse: 2.38081
[700]	valid_0's rmse: 2.37381
[750]	valid_0's rmse: 2.36657
[800]	valid_0's rmse: 2.36169
[850]	valid_0's rmse: 2.35508
[900]	valid_0's rmse: 2.35049
[950]	valid_0's rmse: 2.34265
[1000]	valid_0's rmse: 2.33698
[1050]	valid_0's rmse: 2.33373
[1100]	valid_0's rmse: 2.32192
[1150]	valid_0's rmse: 2.30795
[1200]	valid_0's rmse: 2.30005
[1250]	valid_0's rmse: 2.29339
[1300]	valid_0's rmse: 2.28772
[1350]	valid_0's rmse: 2.27656
[1400]	valid_0's rmse: 2.27355
[1450]	valid_0's rmse: 2.26406
[1500]	valid_0's rmse: 2.25938
CPU times: user 6h 54min 44s, sys: 1min 7s, total: 6h 55min 52s
Wall time: 26min 46s


In [15]:
%%time

te = create_dt(False)
te.shape

CPU times: user 28.2 s, sys: 1.54 s, total: 29.7 s
Wall time: 14.4 s


In [16]:
%%time

for i in range(0, 28):
    day = fday + timedelta(days=i)
    print(i, day)
    tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
    create_fea(tst)
    tst = tst.loc[tst.date == day , train_cols]
    te.loc[te.date == day, "sales"] = 1.02*m_lgb.predict(tst) # magic multiplier by kyakovlev

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
CPU times: user 1h 26min 24s, sys: 2min 47s, total: 1h 29min 11s
Wall time: 55min 49s


In [18]:
# NOTE(review): project_lib is presumably the IBM Watson Studio project library -- confirm.
# The `project` handle is used further down to save the submission file.
from project_lib import Project
project = Project.access()

In [21]:
%%time

te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
                                                                      "id"].str.replace("validation$", "evaluation")
te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][[f"F{i}" for i in range(1,29)]].reset_index()
te_sub.fillna(0., inplace = True)

CPU times: user 15.6 s, sys: 251 ms, total: 15.8 s
Wall time: 4.32 s


In [23]:
# Serialise the submission table (without the index) and store it as a project asset.
submission_csv = te_sub.to_csv(index=False)
project.save_data("my_kaggle_submission.csv", submission_csv, overwrite=True)
te_sub.shape

(60980, 29)