In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#For demonstration with google colab
#!pip install lightgbm

import lightgbm as lgb

from sklearn.model_selection import KFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, mean_squared_error

import time
import datetime

import os

## Import data as pandas Dataframe

In [2]:
#change this to your folder containing the csv files
path = 'Store Item Demand Forecasting/data/'

In [3]:
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
sub = pd.read_csv(f'{path}sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [5]:
test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [6]:
train.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [7]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


## Create date features

In [8]:
for df in train, test:
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True) # format="%Y-%m-%d"

In [9]:
for df in train, test:
    df['year']   = df['date'].dt.year
    df['month']  = df['date'].dt.month
    df['day']    = df['date'].dt.dayofweek
    df.head()

## Creating Validation set (3 months like test set)

In [10]:
val_idx = np.flatnonzero((train.date<datetime.datetime(2018,1,1)) & (train.date>=datetime.datetime(2017,10,1)))

train_idx = train[~train.index.isin(val_idx)].index

len(val_idx), len(train_idx)

(46000, 867000)

## Split columns into categorical/numerical and factorize categoricals

In [11]:
target = 'sales'

excluded_features = ['sales']

categorical_features = [_f for _f in train.columns if (_f not in excluded_features)]

categorical_features

['date', 'store', 'item', 'year', 'month', 'day']

In [12]:
for f in categorical_features:
    train[f], indexer = pd.factorize(train[f])
    test[f] = indexer.get_indexer(test[f])

In [13]:
y_reg = train['sales']

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 7 columns):
date     913000 non-null int64
store    913000 non-null int64
item     913000 non-null int64
sales    913000 non-null int64
year     913000 non-null int64
month    913000 non-null int64
day      913000 non-null int64
dtypes: int64(7)
memory usage: 48.8 MB


## Fitting Lightgbm model

In [15]:
train_features = [_f for _f in train.columns if _f not in excluded_features]
print(train_features)

importances = pd.DataFrame()

oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

trn_x, trn_y = train[train_features].iloc[train_idx], y_reg.iloc[train_idx]
val_x, val_y = train[train_features].iloc[val_idx], y_reg.iloc[val_idx]

reg = lgb.LGBMRegressor(
    num_leaves=31,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=5000,
    random_state=1
)
reg.fit(
    trn_x, trn_y,
    eval_set=[(val_x, val_y)],
    early_stopping_rounds=50,
    verbose=100,
    eval_metric='mape'
)
imp_df = pd.DataFrame()
imp_df['feature'] = train_features
imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')

importances = pd.concat([importances, imp_df], axis=0)

oof_reg_preds[val_idx] = reg.predict(val_x, num_iteration=reg.best_iteration_)
sub_reg_preds += reg.predict(test[train_features], num_iteration=reg.best_iteration_)

#mape
ape = 2 * np.abs(oof_reg_preds - y_reg) / (np.abs(oof_reg_preds) + np.abs(y_reg))
e = ape.mean() 
print('Full validation score %.4f' %e)

['date', 'store', 'item', 'year', 'month', 'day']
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's l2: 102.432	valid_0's mape: 0.178061
[200]	valid_0's l2: 70.8012	valid_0's mape: 0.146107
[300]	valid_0's l2: 65.7614	valid_0's mape: 0.139208
[400]	valid_0's l2: 62.8373	valid_0's mape: 0.13495
[500]	valid_0's l2: 61.7528	valid_0's mape: 0.133492
[600]	valid_0's l2: 61.1156	valid_0's mape: 0.13271
[700]	valid_0's l2: 60.4071	valid_0's mape: 0.131763
[800]	valid_0's l2: 59.9749	valid_0's mape: 0.131336
[900]	valid_0's l2: 59.8499	valid_0's mape: 0.131126
Early stopping, best iteration is:
[920]	valid_0's l2: 59.8143	valid_0's mape: 0.131026
Full validation score 1.9056


In [16]:
e

1.9056122105013995

In [17]:
sub_reg_preds

array([ 8.4299134 , 10.67868529, 10.81766705, ..., 54.76780439,
       59.5291008 , 62.35073684])

## Same model using sklearns TimeSeriesSplit for folding

In [18]:
folds = TimeSeriesSplit(n_splits=5)

train_features = [_f for _f in train.columns if _f not in excluded_features]
print(train_features)

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(train)):
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    print("fold: " + str(fold_))
    
    reg = lgb.LGBMRegressor(
        num_leaves=10,
        max_depth=4,
        learning_rate=0.01,
        n_estimators=2000,
        random_state=1
    )
    reg.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='mape'
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_
    importances = pd.concat([importances, imp_df], axis=0)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    sub_reg_preds += _preds / folds.get_n_splits()

#mape
ape = 2 * np.abs(oof_reg_preds - y_reg) / (np.abs(oof_reg_preds) + np.abs(y_reg))
e = ape.mean() 
print('Full validation score %.4f' %e)

['date', 'store', 'item', 'year', 'month', 'day']
fold: 0
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[8]	valid_0's l2: 1174.81	valid_0's mape: 0.472889
fold: 1
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1]	valid_0's l2: 874.372	valid_0's mape: 0.503948
fold: 2
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[16]	valid_0's l2: 770.268	valid_0's mape: 0.618888
fold: 3
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's l2: 802.422	valid_0's mape: 0.85967
[200]	valid_0's l2: 717.313	valid_0's mape: 0.778031
[300]	valid_0's l2: 681.613	valid_0's mape: 0.70437
[400]	valid_0's l2: 663.281	valid_0's mape: 0.643154
[500]	valid_0's l2: 652.354	valid_0's mape: 0.573206
Early stopping, best iteration is:
[490]	valid_0's l2: 651.914	valid_0's mape: 0.580303
fold: 4
Training until validation scores don't improve for 5

In [19]:
sub_reg_preds.mean(), oof_reg_preds[val_idx].mean(), y_reg.mean()

(45.5835258118452, 40.61884879576772, 52.250286966046005)

In [20]:
sub['sales'] = sub_reg_preds
sub.to_csv(f'{path}sub_lgb_baseline.csv', index=False)

In [21]:
sub['sales'].head()

0    35.975753
1    36.595098
2    36.595098
3    36.682939
4    37.052162
Name: sales, dtype: float64