In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor, plot_importance
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('input/train.csv')
train_df['is_train'] = 1
test_df = pd.read_csv('input/test.csv')
test_df['is_train'] = 0
all_data_df = pd.concat([train_df,test_df])
all_data_df['sales'] = all_data_df['sales'].fillna(47)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """


In [3]:
all_data_df['date'] = pd.to_datetime(all_data_df['date'])
all_data_df['year'] = all_data_df['date'].dt.year
all_data_df['month'] = all_data_df['date'].dt.month
all_data_df['weekday'] = all_data_df['date'].dt.dayofweek
all_data_df['week_of_year']  = all_data_df['date'].dt.weekofyear
all_data_df['quater'] = all_data_df['year'] * 4 + all_data_df['month'] // 3 - 8051
#all_data_df['dayofyear'] = dataset['date'].dt.dayofyear

all_data_df['number-of-month'] = 12 * (all_data_df['year'] - 2013) + all_data_df['month']
all_data_df['date'] = all_data_df['date'] - pd.to_datetime('2013-01-01')

all_data_df['date'] = all_data_df['date'].apply(lambda x: x.days)

# Additional date features
# all_data_df['log_dayofyear'] = np.log(all_data_df['dayofyear'])
# all_data_df['day_power_year'] = np.log((np.log(all_data_df['dayofyear'] + 1)) ** (all_data_df['year'] - 2000))
# all_data_df['day_week_power_year'] = np.log(np.log(all_data_df['dayofyear'] + 1) * (np.log(all_data_df['weekofyear'] + 1)) ** (all_data_df['year'] - 2000))

In [4]:
all_data_df["median-store_item-month"] = all_data_df.groupby(["month","item","store"])["sales"].transform("median")
all_data_df["median-store_item-week"] = all_data_df.groupby(["week_of_year","item","store"])["sales"].transform("median")
all_data_df["median-store_item-weekday"] = all_data_df.groupby(["weekday","item","store"])["sales"].transform("median")

all_data_df["mean-store_item-month"] = all_data_df.groupby(["month","item","store"])["sales"].transform("mean")
all_data_df["mean-store_item-week"] = all_data_df.groupby(["week_of_year","item","store"])["sales"].transform("mean")
all_data_df["mean-store_item-weekday"] = all_data_df.groupby(["weekday","item","store"])["sales"].transform("mean")

all_data_df["std-store_item-month"] = all_data_df.groupby(["month","item","store"])["sales"].transform("std")
all_data_df["std-store_item-week"] = all_data_df.groupby(["week_of_year","item","store"])["sales"].transform("std")
all_data_df["std-store_item-weekday"] = all_data_df.groupby(["weekday","item","store"])["sales"].transform("std")

all_data_df["item-sum-month"] = all_data_df.groupby(["month","item"])["sales"].transform("sum")
all_data_df["item-sum-week"] = all_data_df.groupby(["week_of_year","item"])["sales"].transform("sum")
all_data_df["item-sum-weekday"] = all_data_df.groupby(["weekday","item"])["sales"].transform("sum")

all_data_df["store-sum-month"] = all_data_df.groupby(["month","store"])["sales"].transform("sum")
all_data_df["store-sum-week"] = all_data_df.groupby(["week_of_year","store"])["sales"].transform("sum")
all_data_df["store-sum-weekday"] = all_data_df.groupby(["weekday","store"])["sales"].transform("sum")

all_data_df["item-max-month"] = all_data_df.groupby(["month","item"])["sales"].transform("max")
all_data_df["item-max-week"] = all_data_df.groupby(["week_of_year","item"])["sales"].transform("max")
all_data_df["item-max-weekday"] = all_data_df.groupby(["weekday","item"])["sales"].transform("max")

# Adding rolling mean feature to train
# df = train.groupby(['item'])['sales'].rolling(10).mean().reset_index().drop('level_1', axis=1)
# train['rolling_mean'] = df['sales']

all_data_df['store_item_shifted-95'] = all_data_df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(95)) #3 months ago
all_data_df['store_item_shifted-120'] = all_data_df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(120)) #4 months ago
all_data_df['store_item_shifted-180'] = all_data_df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(180)) #6 months ago

In [5]:
for i in range (4, 11):
    tmp = all_data_df.groupby(["number-of-month", "store", 'item'], as_index=False)[["sales"]].mean()
    tmp['number-of-month'] = tmp['number-of-month'] + i
    all_data_df = pd.merge(all_data_df, tmp, how='left', on = ["number-of-month", "store", 'item'], suffixes=('','_' + str(i) + '-month-ago'))

In [8]:
all_data_df = all_data_df[all_data_df['number-of-month'] > 10]
col = all_data_df.columns.drop(['id', 'sales', 'is_train', 'year'])

In [9]:
y = 'sales'
print("old train",train_df.shape)
train_df = all_data_df.loc[all_data_df['is_train'] == 1]
print("new train",train_df.shape)
test_df = all_data_df.loc[all_data_df['is_train'] != 1]

train_x = train_df[train_df['number-of-month'] < 58][col]
train_y = train_df[train_df['number-of-month'] < 58]['sales']
train_cv = train_df[train_df['number-of-month'] > 57][col]
train_cv_y = train_df[train_df['number-of-month'] > 57]['sales']

old train (913000, 40)
new train (761000, 40)


In [10]:
def smape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    res = 0
    for y_t , y_p in zip(y_true, y_pred):
        diff = abs(y_t - y_p)
        summ = abs(y_t) + abs(y_p)
        summ /= 2
        res += diff/summ
        
    return 100*res/len(y_true)

In [20]:
catBoost = CatBoostRegressor()
# catBoost.fit(train_x, train_y)
# smape_catBoost = smape(train_cv_y, catBoost.predict(train_cv))
# print(smape_catBoost)

In [23]:
min = 11.975
for i in range(1, 10, 1):
    lightGBM = LGBMRegressor(max_depth=11, 
                             n_estimators=640, 
                             learning_rate=0.021, 
                             num_leaves=82, 
                             min_child_samples=94, 
                             colsample_bytree=0.9)
    lightGBM.fit(train_x, train_y)
    smape_lightGBM = smape(train_cv_y, lightGBM.predict(train_cv))
    print('i ' + str(i / 10))
    if(smape_lightGBM < min):
        min = smape_lightGBM
        print(smape_lightGBM)

i 0.1
i 0.2
i 0.3
i 0.4
i 0.5
i 0.6
i 0.7
i 0.8
i 0.9
11.973502379640593


In [14]:
min = 12.035
for i in range(10, 21, 1):
    lightGBM = LGBMRegressor(max_depth=11, colsample_bytree=1)
    lightGBM.fit(train_x, train_y)
    smape_lightGBM = smape(train_cv_y, lightGBM.predict(train_cv))
    print('i ' + str(i))
    if(smape_lightGBM < min):
        min = smape_lightGBM
        print(smape_lightGBM)

i 10
12.03477482930951


LightGBMError: Check failed: feature_fraction <=1.0 at /home/travis/build/Microsoft/LightGBM/python-package/compile/src/io/config_auto.cpp, line 280 .


In [9]:
lightGBM = LGBMRegressor()
lightGBM.fit(train_x, train_y)
smape_lightGBM = smape(train_cv_y, lightGBM.predict(train_cv))
smape_lightGBM

11.814919545775256

In [10]:
ax = plot_importance(lightGBM, figsize = (15, 20))
plt.show()

NotFittedError: No booster found. Need to call fit beforehand.

smape_score = smape(train_cv_y, 0.6 * lightGBM.predict(train_cv) + 0.4 * catBoost.predict(train_cv))
print(smape_score)

In [32]:
best_score = 12.09408312052163
lgbm = 12.1145
lgbm_1 = 12.0328
lightGBM.fit(train_df[col], train_df[y])
y_test = lightGBM.predict(test_df[col])
sample = pd.read_csv('input/sample_submission.csv')
sample['sales'] = y_test
sample.to_csv('simple_starter.csv', index=False)