In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Notebook-wide plotting / printing configuration
plt.style.use('ggplot')
np.set_printoptions(precision=4)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
# Spell out the full option key: pandas resolves abbreviated keys by pattern
# matching, and a bare 'precision' stops being unique once other
# '*precision*' options exist (newer pandas then raises OptionError).
pd.set_option('display.precision', 6)

### Load the dataset

In [4]:
# Read the three raw tables; 'Date' is parsed to datetime while loading.
# StateHoliday is forced to text: later code has to map the integer 0 to the
# string '0', i.e. the column is otherwise inferred with mixed types (which
# also triggers a dtype warning while reading).
train = pd.read_csv('train.csv', parse_dates=['Date'], dtype={'StateHoliday': str})
test = pd.read_csv('test.csv', parse_dates=['Date'], dtype={'StateHoliday': str})
store = pd.read_csv('store.csv')

print('train.shape: {}'.format(train.shape))
print('test.shape: {}'.format(test.shape))
print('store.shape: {}'.format(store.shape))

  interactivity=interactivity, compiler=compiler, result=result)


train.shape: (1017209, 9)
test.shape: (41088, 8)
store.shape: (1115, 10)


### Prepare the store table

In [5]:
# Remove the training rows where the store is flagged open but sold nothing
open_without_sales = (train.Open == 1) & (train.Sales == 0)
train = train.drop(train.index[open_without_sales], axis=0)

# For each store, year, and month, drop the abnormal sales

In [6]:
# Convert Competition date information to datetime for comparing
def convert_competition_open(row):
    """Build the competitor opening date from a store row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'CompetitionOpenSinceYear' and 'CompetitionOpenSinceMonth'.

    Returns
    -------
    pandas.Timestamp or float
        The first day of the given year/month, or ``np.nan`` when either field
        is missing or the pair does not form a valid date.

    Raises
    ------
    KeyError
        If the row lacks one of the two fields. This is deliberately not
        swallowed, so a schema problem cannot silently yield an all-NaN column.
    """
    year = row['CompetitionOpenSinceYear']
    month = row['CompetitionOpenSinceMonth']
    if pd.isnull(year) or pd.isnull(month):
        return np.nan
    try:
        # Zero-padded 'YYYY-MM' parses unambiguously
        return pd.to_datetime('{}-{:02d}'.format(int(year), int(month)))
    except (TypeError, ValueError):
        return np.nan
# Derive the opening date row by row, then discard the two source columns
store = (
    store
    .assign(CompetitionOpen=store.apply(convert_competition_open, axis=1))
    .drop(['CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth'], axis=1)
)

# Convert Promo2 information to datetime for comparing
def convert_promo2(row):
    """Build the promo2 start date from a store row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Promo2SinceYear' and 'Promo2SinceWeek'.

    Returns
    -------
    pandas.Timestamp or float
        The Monday of the given week of the given year (format '%Y%W%w'),
        or ``np.nan`` when either field is missing or cannot be parsed.

    Raises
    ------
    KeyError
        If the row lacks one of the two fields. This is deliberately not
        swallowed, so a schema problem cannot silently yield an all-NaN column.
    """
    year = row['Promo2SinceYear']
    week = row['Promo2SinceWeek']
    if pd.isnull(year) or pd.isnull(week):
        return np.nan
    try:
        # Zero-pad the week so the '%W' digits cannot be confused with the
        # trailing '%w' digit; that trailing '1' selects Monday.
        date = '{}{:02d}1'.format(int(year), int(week))
        return pd.to_datetime(date, format='%Y%W%w')
    except (TypeError, ValueError):
        return np.nan
# Derive the promo2 start date row by row, then discard the raw promo2 columns
store = (
    store
    .assign(Promo2Since=store.apply(convert_promo2, axis=1))
    .drop(['Promo2', 'Promo2SinceYear', 'Promo2SinceWeek'], axis=1)
)

# Add 12 attributes of the months each store is running promo2
# Month label -> month number. The labels are the tokens looked up in the
# comma-separated PromoInterval strings (note the four-letter 'Sept').
months = dict(zip(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec'],
    range(1, 13),
))
def add_promo2_month(interval, month):
    """Flag whether `month` is listed in a comma-separated interval string.

    Returns ``np.nan`` when `interval` is null; otherwise 1 if `month` equals
    one of the comma-separated entries of `interval`, else 0.
    """
    if pd.isnull(interval):
        return np.nan
    return 1 if month in interval.split(',') else 0

# One indicator column per calendar month, suffixed with the month number
for month_name, month_number in months.items():
    column = 'Promo2_on_month' + '_' + str(month_number)
    store[column] = store.PromoInterval.apply(add_promo2_month, args=(month_name,))
store = store.drop('PromoInterval', axis=1)


In [7]:
# Summarise the prepared store table (column dtypes and non-null counts)
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 18 columns):
Store                  1115 non-null int64
StoreType              1115 non-null object
Assortment             1115 non-null object
CompetitionDistance    1112 non-null float64
CompetitionOpen        761 non-null datetime64[ns]
Promo2Since            571 non-null datetime64[ns]
Promo2_on_month_1      571 non-null float64
Promo2_on_month_2      571 non-null float64
Promo2_on_month_3      571 non-null float64
Promo2_on_month_4      571 non-null float64
Promo2_on_month_5      571 non-null float64
Promo2_on_month_6      571 non-null float64
Promo2_on_month_7      571 non-null float64
Promo2_on_month_8      571 non-null float64
Promo2_on_month_9      571 non-null float64
Promo2_on_month_10     571 non-null float64
Promo2_on_month_11     571 non-null float64
Promo2_on_month_12     571 non-null float64
dtypes: datetime64[ns](2), float64(13), int64(1), object(2)
memory usage: 156.9+ KB


### Drop the outliers

In [8]:
# Drop all the rows with store open but zero sales
train = train.drop(train[(train.Sales==0) & (train.Open==1)].index, axis=0)

# Per store, drop the abnormally high sales: rows whose z-score against that
# store's own mean/std is >= 3. The statistics are pooled over all dates of a
# store (they are NOT split by year or month), and only the upper tail is cut.
# groupby().transform returns values aligned with train's index, so the row
# labels to drop can be read off directly.
store_sales = train.groupby('Store')['Sales']
z_score = (train['Sales'] - store_sales.transform('mean')) / store_sales.transform('std')

# Rows with an undefined z-score (NaN) compare False and are therefore kept
index_to_drop = train.index[z_score >= 3]
train = train.drop(index_to_drop, axis=0)

### Combine the data for easy processing

In [9]:
# Tag every row with its origin so the two sets can be separated again later
train['istestset'] = 0
test['istestset'] = 1

# Stack train (without Sales/Customers) on top of test (without Id) under a
# fresh 0..n-1 index
frames = [
    train.drop(['Sales', 'Customers'], axis=1),
    test.drop('Id', axis=1),
]
combine = pd.concat(frames, axis=0, ignore_index=True)

### Add the store table information

In [10]:
# Left join: every combined row is kept and gains the attributes of its Store
combine = combine.merge(store, how='left', on='Store')

In [11]:
# Inspect dtypes and non-null counts after joining the store attributes
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1057131 entries, 0 to 1057130
Data columns (total 25 columns):
Store                  1057131 non-null int64
DayOfWeek              1057131 non-null int64
Date                   1057131 non-null datetime64[ns]
Open                   1057120 non-null float64
Promo                  1057131 non-null int64
StateHoliday           1057131 non-null object
SchoolHoliday          1057131 non-null int64
istestset              1057131 non-null int64
StoreType              1057131 non-null object
Assortment             1057131 non-null object
CompetitionDistance    1054398 non-null float64
CompetitionOpen        718946 non-null datetime64[ns]
Promo2Since            532356 non-null datetime64[ns]
Promo2_on_month_1      532356 non-null float64
Promo2_on_month_2      532356 non-null float64
Promo2_on_month_3      532356 non-null float64
Promo2_on_month_4      532356 non-null float64
Promo2_on_month_5      532356 non-null float64
Promo2_on_month_6     

### Fill in missing values

In [12]:
# Missing Open flags are filled with 1.0
combine['Open'] = combine['Open'].fillna(1.0)

# Every promo2-related column gets -1 for missing values; -1 is the marker
# for the special case "store does not participate in promo2"
promo2_related_feats = [column for column in combine.columns if 'Promo2' in column]
for feat in promo2_related_feats:
    combine[feat] = combine[feat].fillna(-1)

# A missing CompetitionDistance becomes (largest observed distance + 1000),
# standing for "no nearby competition"
no_competition_distance = combine['CompetitionDistance'].max() + 1000
combine['CompetitionDistance'] = combine['CompetitionDistance'].fillna(no_competition_distance)

# CompetitionOpen is turned into a categorical attribute relative to the
# row's Date: 0 = not open yet, 1 = opened, -1 = opening date unknown
def convert_CompetitionOpen_cat(row):
    """Classify a row by comparing its 'Date' with its 'CompetitionOpen' date.

    Returns -1 when 'CompetitionOpen' is null, 1 when 'Date' is on or after
    'CompetitionOpen', and 0 otherwise.
    """
    opened_on = row['CompetitionOpen']
    if pd.isnull(opened_on):
        return -1
    return 1 if row['Date'] >= opened_on else 0
# Overwrite the datetime column with the -1/0/1 category, computed row by row
# (axis=1 passes each row to the function as a Series)
combine['CompetitionOpen'] = combine.apply(convert_CompetitionOpen_cat, axis=1)

In [13]:
# Re-check the non-null counts after filling the missing values
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1057131 entries, 0 to 1057130
Data columns (total 25 columns):
Store                  1057131 non-null int64
DayOfWeek              1057131 non-null int64
Date                   1057131 non-null datetime64[ns]
Open                   1057131 non-null float64
Promo                  1057131 non-null int64
StateHoliday           1057131 non-null object
SchoolHoliday          1057131 non-null int64
istestset              1057131 non-null int64
StoreType              1057131 non-null object
Assortment             1057131 non-null object
CompetitionDistance    1057131 non-null float64
CompetitionOpen        1057131 non-null int64
Promo2Since            1057131 non-null object
Promo2_on_month_1      1057131 non-null float64
Promo2_on_month_2      1057131 non-null float64
Promo2_on_month_3      1057131 non-null float64
Promo2_on_month_4      1057131 non-null float64
Promo2_on_month_5      1057131 non-null float64
Promo2_on_month_6      1057131 n

### Feature engineering

In [14]:
# DayOfWeek
combine['DayOfWeek'] = combine.DayOfWeek.astype(int)

# Date: split into calendar components
combine['Date'] = pd.to_datetime(combine.Date)
combine['Year'] = combine.Date.dt.year.astype(int)
combine['Month'] = combine.Date.dt.month.astype(int)
combine['Day'] = combine.Date.dt.day.astype(int)
# Week of the year. `Series.dt.week` is deprecated (and later removed) in
# newer pandas in favour of `Series.dt.isocalendar().week`; use the
# replacement when the installed pandas offers it, else fall back.
date_accessor = combine.Date.dt
if hasattr(date_accessor, 'isocalendar'):
    combine['Week'] = date_accessor.isocalendar().week.astype(int)
else:
    combine['Week'] = date_accessor.week.astype(int)

# Open ...
# Promo ...

# StateHoliday: unify the integer 0 with the string '0', then encode the
# categories as integer codes
combine['StateHoliday'] = combine.StateHoliday.replace({0:'0'}).astype('category').cat.codes

# SchoolHoliday ...

# StoreType: encode the categories as integer codes
combine['StoreType'] = combine.StoreType.astype('category').cat.codes

# Assortment: encode the categories as integer codes
combine['Assortment'] = combine.Assortment.astype('category').cat.codes

# Promo2 is converted into a categorical feature:
# 0 = not running, 1 = running, -1 = store does not participate
def create_promo2(row):
    """Classify the promo2 state of one row.

    Returns -1 when 'Promo2Since' equals -1. Otherwise returns 1 when the
    indicator column of the row's 'Month' equals 1 and 'Date' is strictly
    later than 'Promo2Since', and 0 in every other case.
    """
    promo2_start = row['Promo2Since']
    if promo2_start == -1:
        return -1
    month_flag = row['Promo2_on_month_' + str(row['Month'])]
    return 1 if (month_flag == 1 and row['Date'] > promo2_start) else 0
# Compute the categorical Promo2 feature row by row
combine['Promo2'] = combine.apply(create_promo2, axis=1)

# The promo2 helper columns and the raw Date are no longer needed
columns_to_remove = list(promo2_related_feats) + ['Date']
combine = combine.drop(columns_to_remove, axis=1)

combine = pd.get_dummies(combine)

In [15]:
# Inspect the feature table after feature engineering
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1057131 entries, 0 to 1057130
Data columns (total 16 columns):
Store                  1057131 non-null int64
DayOfWeek              1057131 non-null int32
Open                   1057131 non-null float64
Promo                  1057131 non-null int64
StateHoliday           1057131 non-null int8
SchoolHoliday          1057131 non-null int64
istestset              1057131 non-null int64
StoreType              1057131 non-null int8
Assortment             1057131 non-null int8
CompetitionDistance    1057131 non-null float64
CompetitionOpen        1057131 non-null int64
Year                   1057131 non-null int32
Month                  1057131 non-null int32
Day                    1057131 non-null int32
Week                   1057131 non-null int32
Promo2                 1057131 non-null int64
dtypes: float64(2), int32(5), int64(6), int8(3)
memory usage: 95.8 MB


### Prepare the data for modeling

In [16]:
# Split the combined feature table back into its training and test parts
data_train = combine[combine.istestset==0].drop('istestset', axis=1)
data_test = combine[combine.istestset==1].drop('istestset', axis=1)

# `combine` was rebuilt under a fresh 0..n-1 index (ignore_index=True), while
# `train` still carries the gappy index left by the row drops. Reset the label
# index so that features and labels agree by label as well as by position;
# otherwise any index-aligned operation on the pair would silently mismatch.
label_train = train['Sales'].reset_index(drop=True)
label_train_log = np.log1p(label_train)

assert len(data_train) == len(label_train), 'features and labels are out of sync'

### Find the basic setting of the model and Set up the metrics

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

#### I tried the following four settings for the model and chose the best one:

- Default objective function without taking log of the target
- Default objective function with taking log of the target
- Custom objective function without taking log of the target
- Custom objective function with taking log of the target

In [18]:
# One call splits the features and both label versions with the same shuffle
(X_train, X_eval,
 y_train, y_eval,
 y_train_log, y_eval_log) = train_test_split(
    data_train, label_train, label_train_log, test_size=0.1, random_state=0)

# LightGBM datasets for the raw target ...
dataset_train = lgb.Dataset(X_train, y_train)
dataset_eval = lgb.Dataset(X_eval, y_eval, reference=dataset_train)

# ... and for the log1p-transformed target
dataset_train_log = lgb.Dataset(X_train, y_train_log)
dataset_eval_log = lgb.Dataset(X_eval, y_eval_log, reference=dataset_train_log)

In [19]:
# Shared settings for the lgb.train calls in this notebook
num_rounds = 2000  # passed as the number of boosting rounds
verb_rounds = 100  # passed as verbose_eval
earlystop_rounds = 100  # passed as early_stopping_rounds

### Default objective function without taking log of the target

In [20]:
def loss(y_pred, y):
    """RMSPE evaluation metric in the (preds, dataset) form used as `feval`.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predictions on the original target scale.
    y : object with ``get_label()``
        Dataset whose labels are the true values on the same scale.

    Returns
    -------
    tuple
        ``('rmspe', value, False)``; the trailing False marks the metric as
        "lower is better".

    Rows with a zero label are excluded from the metric. The mean is taken
    over the remaining rows only, so excluded rows no longer dilute the score.
    When no row has a non-zero label the value is 0.0.
    """
    y = y.get_label()
    ind = (y != 0)
    n = ind.sum()
    if n == 0:
        return 'rmspe', 0.0, False
    y_pred, y = y_pred[ind], y[ind]
    dis_percent_squared = ((y - y_pred) / y) ** 2
    return 'rmspe', np.sqrt(dis_percent_squared.sum() / n), False

# lgb.train({}, dataset_train, num_rounds, 
#           valid_sets=[dataset_train, dataset_eval], valid_names=['training set', 'validation set'], 
#           feval=loss, verbose_eval=verb_rounds, early_stopping_rounds=earlystop_rounds)

### Default objective function with taking log of the target

In [21]:
def loss_log(y_pred, y):
    """RMSPE metric for a log1p-transformed target, in `feval` form.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predictions on the log1p scale.
    y : object with ``get_label()``
        Dataset whose labels are the true values on the log1p scale.

    Returns
    -------
    tuple
        ``('rmspe_log', value, False)``; the trailing False marks the metric
        as "lower is better".

    Both inputs are mapped back with ``exp(x) - 1`` before the percentage
    error is computed. Rows whose back-transformed label is zero are excluded,
    and the mean is taken over the remaining rows only. When no row remains
    the value is 0.0.
    """
    y = np.exp(y.get_label()) - 1
    y_pred = np.exp(y_pred) - 1
    ind = (y != 0)
    n = ind.sum()
    if n == 0:
        return 'rmspe_log', 0.0, False
    y_pred, y = y_pred[ind], y[ind]
    dis_percent_squared = ((y - y_pred) / y) ** 2
    return 'rmspe_log', np.sqrt(dis_percent_squared.sum() / n), False

# lgb.train({}, dataset_train_log, num_rounds, 
#           valid_sets = [dataset_train_log, dataset_eval_log], valid_names=['training set', 'validation set'], 
#           feval=loss_log, verbose_eval=verb_rounds, early_stopping_rounds=earlystop_rounds)

### Custom objective function without taking log of the target

In [22]:
def rmspe_loss(y_pred, y):
    """Squared-percentage-error objective in the (preds, dataset) form.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Current predictions.
    y : object with ``get_label()``
        Dataset providing the true labels.

    Returns
    -------
    (gradient, hessian) : tuple of numpy.ndarray
        Per-row first and second derivatives of ``((y - y_pred) / y) ** 2``
        with respect to ``y_pred``. Rows with a zero label get 0 for both,
        i.e. they do not contribute.

    The hessian ``2 / y**2`` does not depend on the prediction, so it is set
    for every non-zero label, including rows that are predicted exactly
    (their gradient is 0 by the formula itself).
    """
    y = y.get_label()
    ind = (y != 0)
    gradient = np.zeros(y.shape)
    hessian = np.zeros(y.shape)
    gradient[ind] = 2 * (y_pred[ind] - y[ind]) / y[ind] ** 2
    hessian[ind] = 2 / y[ind] ** 2
    return gradient, hessian

# lgb.train({}, dataset_train, num_rounds, 
#           valid_sets = [dataset_train, dataset_eval], 
#           valid_names=['training set', 'validation set'], 
#           fobj=rmspe_loss, feval=loss, 
#           verbose_eval=verb_rounds, early_stopping_rounds=earlystop_rounds)

### Custom objective function with taking log of the target

In [23]:
def rmspe_loss(y_pred, y):
    """Squared-percentage-error objective in the (preds, dataset) form.

    The formula is applied to whatever scale the dataset labels are on; in
    this section the labels passed in are the log1p-transformed target.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Current predictions.
    y : object with ``get_label()``
        Dataset providing the true labels.

    Returns
    -------
    (gradient, hessian) : tuple of numpy.ndarray
        Per-row first and second derivatives of ``((y - y_pred) / y) ** 2``
        with respect to ``y_pred``. Rows with a zero label get 0 for both,
        i.e. they do not contribute.

    The hessian ``2 / y**2`` does not depend on the prediction, so it is set
    for every non-zero label, including rows that are predicted exactly
    (their gradient is 0 by the formula itself).
    """
    y = y.get_label()
    ind = (y != 0)
    gradient = np.zeros(y.shape)
    hessian = np.zeros(y.shape)
    gradient[ind] = 2 * (y_pred[ind] - y[ind]) / y[ind] ** 2
    hessian[ind] = 2 / y[ind] ** 2
    return gradient, hessian

# lgb.train({}, dataset_train_log, num_rounds, 
#           valid_sets = [dataset_train_log, dataset_eval_log], 
#           valid_names=['training set', 'validation set'], 
#           fobj=rmspe_loss, feval=loss_log, 
#           verbose_eval=verb_rounds, early_stopping_rounds=earlystop_rounds)

### Grid search for the best parameters

In [24]:
# KFold for consistent cv: the same shuffled 4-fold split (fixed seed) is
# shared by every cross-validation call below
kf = KFold(n_splits=4, shuffle=True, random_state=0)

# Custom scoring function
def loss_log(y, y_pred):
    """RMSPE for a log1p-transformed target, in (y_true, y_pred) form.

    Parameters
    ----------
    y : array-like
        True values on the log1p scale.
    y_pred : array-like
        Predictions on the log1p scale.

    Returns
    -------
    float
        RMSPE computed after mapping both inputs back with ``exp(x) - 1``.
        Rows whose back-transformed true value is zero are excluded, and the
        mean is taken over the remaining rows only. 0.0 when no row remains.
    """
    # Work on plain arrays so that a pandas index on `y` cannot interfere
    # with the positional pairing of true values and predictions
    y = np.exp(np.asarray(y, dtype=float)) - 1
    y_pred = np.exp(np.asarray(y_pred, dtype=float)) - 1
    ind = (y != 0)
    n = ind.sum()
    if n == 0:
        return 0.0
    y_pred, y = y_pred[ind], y[ind]
    dis_percent_squared = ((y - y_pred) / y) ** 2
    return np.sqrt(dis_percent_squared.sum() / n)
# Wrap the metric as a scorer; greater_is_better=False makes the scorer return
# the negated loss, which is why the reporting code below flips the sign back
mt = make_scorer(loss_log, greater_is_better=False)

# Simple performance measure function
def performance(model, n_jobs=4):
    """Cross-validate `model` on the training split and print the score.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    n_jobs : int, default 4
        Number of parallel workers; lower it when memory is tight.

    Prints the mean (sign flipped back to a positive loss) and the standard
    deviation of the per-fold scores. Returns nothing.
    """
    # The scorer `mt` exponentiates both the true values and the predictions,
    # so the model must be fitted and scored on the log1p target
    # (`y_train_log`), not on the raw sales.
    scores = cross_val_score(model, X_train, y_train_log, cv=kf, scoring=mt,
                             n_jobs=n_jobs, verbose=True)
    score_mean = scores.mean()
    score_std = scores.std()
    print('score mean: {}'.format(-score_mean))
    print('score std: {}'.format(score_std))

# Grid search function
def grid_search(model, params, n_jobs=4):
    """Grid-search `params` for `model` on the training split.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict or list of dict
        Parameter grid to explore.
    n_jobs : int, default 4
        Number of parallel workers; lower it when memory is tight.

    Returns
    -------
    estimator
        The best estimator found (``grid.best_estimator_``). The best score
        (sign flipped back to a positive loss) and parameters are printed.
    """
    # The scorer `mt` exponentiates both the true values and the predictions,
    # so the search must run on the log1p target (`y_train_log`).
    grid = GridSearchCV(model, params, cv=kf, scoring=mt, n_jobs=n_jobs, verbose=True)
    grid = grid.fit(X_train, y_train_log)
    print('grid.best_score_: {}'.format(-grid.best_score_))
    print('grid.best_params_: \n{}'.format(grid.best_params_))
    return grid.best_estimator_

# Custom objective function for sklearn api
def rmspe_loss(y, y_pred):
    """Squared-percentage-error objective in (y_true, y_pred) form.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Current predictions.

    Returns
    -------
    (gradient, hessian) : tuple of numpy.ndarray
        Per-row first and second derivatives of ``((y - y_pred) / y) ** 2``
        with respect to ``y_pred``. Rows with a zero true value get 0 for
        both, i.e. they do not contribute.

    The hessian ``2 / y**2`` does not depend on the prediction, so it is set
    for every non-zero true value, including rows that are predicted exactly
    (their gradient is 0 by the formula itself).
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ind = (y != 0)
    gradient = np.zeros(y.shape)
    hessian = np.zeros(y.shape)
    gradient[ind] = 2 * (y_pred[ind] - y[ind]) / y[ind] ** 2
    hessian[ind] = 2 / y[ind] ** 2
    return gradient, hessian

In [27]:
# Persist the engineered feature table (the row index is not written)
combine.to_csv('combine.csv', index=False)

In [25]:
# Cross-validated score of an LGBMRegressor constructed with default arguments.
# NOTE(review): the recorded run of this cell ended in a MemoryError while the
# 4 parallel workers were running - confirm it completes before relying on it.
performance(LGBMRegressor())

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    7.2s remaining:    7.2s


MemoryError: 

In [None]:
# Train the final model on the whole training set with the log1p target.
#
# By this point `rmspe_loss` and `loss_log` have been redefined with the
# (y_true, y_pred) argument order and a bare-float return for the
# scikit-learn API. lgb.train instead calls fobj/feval as (preds, dataset)
# and expects feval to return (name, value, is_higher_better), so the two
# functions are adapted here rather than passed directly.
def _lgb_rmspe_objective(y_pred, train_data):
    """Adapter: (preds, dataset) -> gradient/hessian from `rmspe_loss`."""
    return rmspe_loss(train_data.get_label(), y_pred)


def _lgb_rmspe_log_metric(y_pred, eval_data):
    """Adapter: (preds, dataset) -> ('rmspe_log', value, lower-is-better)."""
    return 'rmspe_log', loss_log(eval_data.get_label(), y_pred), False


dataset = lgb.Dataset(data_train, label_train_log)
# NOTE(review): the only evaluation set is the training set itself, so
# early stopping has no held-out signal here - confirm this is intended.
reg = lgb.train({}, dataset, num_rounds,
                valid_sets=[dataset],
                valid_names=['training set'],
                fobj=_lgb_rmspe_objective, feval=_lgb_rmspe_log_metric,
                verbose_eval=verb_rounds, early_stopping_rounds=earlystop_rounds)

In [None]:
# Horizontal bar chart: one bar per feature name, bar length = its importance
# as reported by the trained booster
sns.barplot(x=reg.feature_importance(), y=reg.feature_name())

In [None]:
# The model predicts on the log1p scale; map the predictions back to sales
sales_pred = np.exp(reg.predict(data_test)) - 1
y_pred = pd.Series(sales_pred, name='Sales')

# Pair every prediction with its test Id and write the submission file
submission_columns = [test['Id'], y_pred]
sub = pd.concat(submission_columns, axis=1)
sub.to_csv('sub.csv', index=False)

**Since the custom objective function combined with the log-transformed target performs best, I chose this setting.**

In [None]:
# reg = LGBMRegressor().fit(X_train, y_train)
# y_pred = np.exp(reg.predict(X_test)) - 1
# y_pred = pd.Series(y_pred, name='Sales')
# sub = pd.concat([test['Id'], y_pred], axis=1)
# sub.to_csv('sub.csv', index=False)