In [2]:
import time
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from itertools import cycle
from sklearn.svm import SVR
import statsmodels.api as sm
from pmdarima import auto_arima
import matplotlib.pyplot as plt
from datetime import datetime,timedelta
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

In [3]:
%matplotlib inline
plt.style.use('bmh')
sns.set_style('whitegrid')
plt.rc('xtick',labelsize=15)
plt.rc('ytick',labelsize=15)
warnings.filterwarnings("ignore")
pd.set_option('max_colwidth',100)
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle=cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [4]:
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [5]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [6]:
pd.options.display.max_columns = 50

In [7]:
h = 28 
max_lags = 57
tr_last = 1913
fday = datetime(2016,4, 25) 
fday

datetime.datetime(2016, 4, 25, 0, 0)

### Data preprocessing to form a better df

In [8]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        #Find categorical feature
        if col_dtype == "category":
            #Convert into type int16
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("sales_train_validation.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt

In [9]:
def create_fea(dt):
    #What does lags as well as lag_cols mean
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    #Features about date
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }
    
#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [10]:
FIRST_DAY = 350 # If you want to load all the data set it to '1' -->  Great  memory overflow  risk !

In [11]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

Wall time: 30.2 s


(40718219, 22)

In [12]:
df['id'].unique()

array(['HOBBIES_1_002_CA_1_validation', 'HOBBIES_1_004_CA_1_validation',
       'HOBBIES_1_005_CA_1_validation', ...,
       'HOUSEHOLD_1_400_CA_4_validation',
       'HOUSEHOLD_1_386_WI_1_validation',
       'HOUSEHOLD_1_020_WI_2_validation'], dtype=object)

In [16]:
df_350 = df
df_350

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.50
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40718214,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.98
40718215,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
40718216,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
40718217,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.00


In [None]:
fig,ax = plt.subplots(figsize=(15,3))
plot_acf(df['sales'].tolist(),lags=60,ax=ax)

In [None]:
t0 = time.time()
model_name = 'Simple Exponential Smoothing'
span = 7
alpha = 2/(span+1)

ses_model = simpleExpSmooth_model.forecast(28).values

fig,ax = plt.subplots(figsize=(25,4))
df.plot(x='date',y=sales,)

**This method create_fea generate critical features regarding lags and wins.**

**Now create feature**

In [None]:
%%time

create_fea(df)
df.shape

In [11]:
df.dropna(inplace=True)
df.shape

(39041269, 31)

In [15]:
df.sort_values("d")

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
13990303,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1000,0.0,2013-10-24,11339,4,6,10,2013,0,0,0,0,0.0,0.0,0.0,8.26,0.0,0.0,0.285714,0.571429,0.500000,0.464286,43,4,24
14132340,FOODS_3_724_WI_1_validation,2947,6,7,2,2,d_1000,0.0,2013-10-24,11339,4,6,10,2013,0,0,0,0,0.0,0.0,0.0,6.48,0.0,0.0,0.142857,0.142857,0.142857,0.071429,43,4,24
14064209,HOUSEHOLD_1_064_TX_1_validation,626,2,4,1,1,d_1000,0.0,2013-10-24,11339,4,6,10,2013,0,0,0,0,0.0,0.0,0.0,5.78,0.0,1.0,0.142857,0.142857,0.142857,0.107143,43,4,24
14064202,HOUSEHOLD_1_063_TX_1_validation,625,2,4,1,1,d_1000,0.0,2013-10-24,11339,4,6,10,2013,0,0,0,0,0.0,0.0,0.0,10.97,0.0,0.0,0.000000,0.000000,0.000000,0.000000,43,4,24
14132347,FOODS_3_725_WI_1_validation,2948,6,7,2,2,d_1000,0.0,2013-10-24,11339,4,6,10,2013,0,0,0,0,0.0,0.0,0.0,2.88,0.0,0.0,0.000000,0.000000,0.000000,0.000000,43,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14021319,FOODS_3_222_CA_2_validation,2446,6,1,2,0,d_999,0.0,2013-10-23,11339,6,5,10,2013,0,0,0,0,0.0,0.0,0.0,3.98,2.0,0.0,1.142857,0.285714,0.785714,0.500000,43,4,23
14138205,HOUSEHOLD_1_329_WI_2_validation,887,2,8,1,2,d_999,0.0,2013-10-23,11339,6,5,10,2013,0,0,0,0,0.0,0.0,0.0,4.97,0.0,0.0,0.142857,0.142857,0.142857,0.642857,43,4,23
14056466,FOODS_3_144_CA_4_validation,2368,6,3,2,0,d_999,0.0,2013-10-23,11339,6,5,10,2013,0,0,0,0,0.0,0.0,0.0,1.88,1.0,4.0,2.428571,3.000000,2.714286,2.607143,43,4,23
14138198,HOUSEHOLD_1_327_WI_2_validation,885,2,8,1,2,d_999,9.0,2013-10-23,11339,6,5,10,2013,0,0,0,0,0.0,0.0,0.0,0.97,27.0,6.0,11.571428,14.000000,10.607142,11.000000,43,4,23


In [14]:
#Feed lightgbm with selected features
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[train_cols]
y_train = df["sales"]

In [29]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)
# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

Wall time: 23.4 s


In [30]:
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

20

In [31]:
params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [32]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 2.92019
[40]	valid_0's rmse: 2.60363


KeyboardInterrupt: 