In [41]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

!pip install lightgbm
import lightgbm as lgb

from sklearn.model_selection import KFold, GroupKFold, TimeSeriesSplit, StratifiedKFold #only classification
from sklearn.metrics import roc_auc_score, mean_squared_error

import time
import datetime

import os



In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ssub = pd.read_csv('sample_submission.csv')

In [43]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [44]:
test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [45]:
train.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [46]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [0]:
#train[train.columns[train.dtypes=='float']].describe()

In [0]:
for df in train, test:
  df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True) # format="%Y-%m-%d"

In [0]:
val_idx = np.flatnonzero((train.date<datetime.datetime(2018,1,1)) & (train.date>=datetime.datetime(2017,10,1)))

train_idx = train[~train.index.isin(val_idx)].index

len(val_idx), len(train_idx)

In [0]:
for df in train, test:
  df['year']   = df['date'].dt.year
  df['month']  = df['date'].dt.month
  df['day']    = df['date'].dt.dayofweek
  df.drop('date', axis=1, inplace=True)
  df.head()

In [53]:
target = 'sales'

excluded_features = [
    'sales' #, 'store', 'item'
]

categorical_features = [
    _f for _f in train.columns
    if (_f not in excluded_features)
]
categorical_features

['store', 'item', 'year', 'month', 'day']

In [0]:
for f in categorical_features:
    train[f], indexer = pd.factorize(train[f])
    test[f] = indexer.get_indexer(test[f])

In [0]:
y_reg = train['sales']

In [57]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 6 columns):
store    913000 non-null int64
item     913000 non-null int64
sales    913000 non-null int64
year     913000 non-null int64
month    913000 non-null int64
day      913000 non-null int64
dtypes: int64(6)
memory usage: 41.8 MB


In [72]:
train_features = [_f for _f in train.columns if _f not in excluded_features]
print(train_features)

importances = pd.DataFrame()

oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

trn_x, trn_y = train[train_features].iloc[train_idx], y_reg.iloc[train_idx]
val_x, val_y = train[train_features].iloc[val_idx], y_reg.iloc[val_idx]

reg = lgb.LGBMRegressor(
    num_leaves=31,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=5000,
    random_state=1
)
reg.fit(
    trn_x, trn_y,
    eval_set=[(val_x, val_y)],
    early_stopping_rounds=50,
    verbose=100,
    eval_metric='mape'
)
imp_df = pd.DataFrame()
imp_df['feature'] = train_features
imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')

importances = pd.concat([importances, imp_df], axis=0)

oof_reg_preds[val_idx] = reg.predict(val_x, num_iteration=reg.best_iteration_)
#oof_reg_preds[oof_reg_preds < 0] = 0
sub_reg_preds += reg.predict(test[train_features], num_iteration=reg.best_iteration_)
#sub_reg_preds[sub_reg_preds < 0] = 0

e = 2 * abs(y_reg - oof_reg_preds) / ( abs(y_reg)+abs(oof_reg_preds) )
e = e.mean()
print('Full validation score %.4f' %e)

['store', 'item', 'year', 'month', 'day']
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's l2: 76.9954	valid_0's mape: 0.154413
[200]	valid_0's l2: 68.9737	valid_0's mape: 0.144144
[300]	valid_0's l2: 65.5503	valid_0's mape: 0.139974
[400]	valid_0's l2: 64.0055	valid_0's mape: 0.13852
[500]	valid_0's l2: 62.9236	valid_0's mape: 0.138069
[600]	valid_0's l2: 62.0417	valid_0's mape: 0.137686
[700]	valid_0's l2: 61.7915	valid_0's mape: 0.137565
[800]	valid_0's l2: 61.4941	valid_0's mape: 0.137251
Early stopping, best iteration is:
[800]	valid_0's l2: 61.4941	valid_0's mape: 0.137251
Full validation score 1.9057


In [73]:
e

1.9056957356485946

In [74]:
sub_reg_preds

array([ 7.90795264,  8.95799105,  9.49416442, ..., 56.22357369,
       60.96068862, 64.1459715 ])

In [0]:
'''folds = KFold(n_splits=5, random_state=1)

train_features = [_f for _f in train.columns if _f not in excluded_features]
print(train_features)

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (train_, val_) in enumerate(folds.split(train)):
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    print("fold: " + str(fold_))
    
    reg = lgb.LGBMRegressor(
        num_leaves=10,
        max_depth=4,
        learning_rate=0.2,
        n_estimators=1000,
        random_state=1
    )
    reg.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='mape'
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_
    importances = pd.concat([importances, imp_df], axis=0)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    #oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    #_preds[_preds < 0] = 0
    sub_reg_preds += _preds / folds.get_n_splits()
      

mean_squared_error(y_reg, oof_reg_preds)'''

In [75]:
sub_reg_preds.mean(), oof_reg_preds[val_idx].mean(), y_reg.mean()

(33.87400990710704, 55.604312212431154, 52.250286966046005)

In [0]:
sub = pd.read_csv('sample_submission.csv')
sub['sales'] = sub_reg_preds
sub.to_csv('sub_lgb_baseline.csv', index=False)

In [69]:
sub['sales'].head()

0     7.907953
1     8.957991
2     9.494164
3    10.952682
4    11.360785
Name: sales, dtype: float64