In [35]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
import warnings

# Apply seaborn's dark-grid theme to every figure drawn later in the notebook.
sns.set_style('darkgrid')
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by pandas, Prophet and LightGBM — consider narrowing it to
# the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [36]:
def prophet_features(df, periods=92):
    """Fit a Prophet model on daily sales and return its forecast table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``date`` and ``sales``. The index is reset before the
        two columns are selected, so ``date`` may also live in the index.
        The caller's frame is left untouched.
    periods : int, default 92
        Number of future rows requested from ``make_future_dataframe`` after
        the end of the fitted history. The default reproduces the horizon
        that used to be hard-coded here.

    Returns
    -------
    The object returned by ``Prophet.predict`` for the frame built by
    ``make_future_dataframe(periods=periods)``.
    """
    # Build the two-column frame Prophet is fitted on (``ds`` / ``y``).
    # A non-inplace rename is used on purpose: renaming a column-sliced frame
    # in place is the pattern pandas flags as chained assignment.
    history = (
        df.reset_index()[['date', 'sales']]
        .rename(columns={'date': 'ds', 'sales': 'y'})
    )

    # Additive model with linear trend; daily and weekly seasonality on,
    # yearly seasonality off, 95% uncertainty interval.
    m = Prophet(
        growth='linear',
        seasonality_mode='additive',
        interval_width=0.95,
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=False
    )
    m.fit(history)

    future = m.make_future_dataframe(periods=periods)
    return m.predict(future)

In [37]:
# Load the raw training data from disk.
# NOTE(review): the later train split pairs Prophet forecast rows with these
# rows by position, which presumes one row per date — confirm against the CSV.
df = pd.read_csv('train.csv')
# Fit Prophet on the frame as read and keep its forecast table as the
# feature source for the LightGBM model.
data = prophet_features(df)
# Dates are parsed only after the Prophet call, so prophet_features receives
# the `date` column exactly as it came out of read_csv.
df['date'] = pd.to_datetime(df['date'])
data

11:45:06 - cmdstanpy - INFO - Chain [1] start processing
11:45:06 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,daily,daily_lower,daily_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2013-01-01,13.368550,1.921355,25.115606,13.368550,13.368550,-0.316725,-0.316725,-0.316725,1.48895,1.48895,1.48895,-1.805675,-1.805675,-1.805675,0.0,0.0,0.0,13.051825
1,2013-01-02,13.377836,2.823782,24.263662,13.377836,13.377836,0.371677,0.371677,0.371677,1.48895,1.48895,1.48895,-1.117273,-1.117273,-1.117273,0.0,0.0,0.0,13.749513
2,2013-01-03,13.387122,2.777532,25.310127,13.387122,13.387122,0.935054,0.935054,0.935054,1.48895,1.48895,1.48895,-0.553896,-0.553896,-0.553896,0.0,0.0,0.0,14.322176
3,2013-01-04,13.396408,5.579483,27.286662,13.396408,13.396408,2.672100,2.672100,2.672100,1.48895,1.48895,1.48895,1.183150,1.183150,1.183150,0.0,0.0,0.0,16.068508
4,2013-01-05,13.405694,6.736638,28.654151,13.405694,13.405694,4.421248,4.421248,4.421248,1.48895,1.48895,1.48895,2.932298,2.932298,2.932298,0.0,0.0,0.0,17.826942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2017-12-27,23.036124,12.349524,35.353730,22.989805,23.087121,0.371677,0.371677,0.371677,1.48895,1.48895,1.48895,-1.117273,-1.117273,-1.117273,0.0,0.0,0.0,23.407801
1822,2017-12-28,23.041361,12.108617,35.894902,22.994176,23.093143,0.935053,0.935053,0.935053,1.48895,1.48895,1.48895,-0.553897,-0.553897,-0.553897,0.0,0.0,0.0,23.976415
1823,2017-12-29,23.046599,15.169711,37.287123,22.998516,23.099148,2.672100,2.672100,2.672100,1.48895,1.48895,1.48895,1.183150,1.183150,1.183150,0.0,0.0,0.0,25.718699
1824,2017-12-30,23.051837,16.385046,38.847565,23.002889,23.105153,4.421248,4.421248,4.421248,1.48895,1.48895,1.48895,2.932298,2.932298,2.932298,0.0,0.0,0.0,27.473084


In [51]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [52]:
# Add one column per lag holding Prophet's point forecast (`yhat`) shifted
# down by that many rows; the first `lag` rows of each column are NaN.
lags = [7, 14, 15, 30, 60, 90]
data = data.assign(
    **{f'yhat_lag_{lag}': data['yhat'].shift(lag) for lag in lags}
)
data

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,daily,...,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat,yhat_lag_7,yhat_lag_14,yhat_lag_15,yhat_lag_30,yhat_lag_60,yhat_lag_90
0,2013-01-01,13.368550,1.921355,25.115606,13.368550,13.368550,-0.316725,-0.316725,-0.316725,1.48895,...,0.0,0.0,0.0,13.051825,,,,,,
1,2013-01-02,13.377836,2.823782,24.263662,13.377836,13.377836,0.371677,0.371677,0.371677,1.48895,...,0.0,0.0,0.0,13.749513,,,,,,
2,2013-01-03,13.387122,2.777532,25.310127,13.387122,13.387122,0.935054,0.935054,0.935054,1.48895,...,0.0,0.0,0.0,14.322176,,,,,,
3,2013-01-04,13.396408,5.579483,27.286662,13.396408,13.396408,2.672100,2.672100,2.672100,1.48895,...,0.0,0.0,0.0,16.068508,,,,,,
4,2013-01-05,13.405694,6.736638,28.654151,13.405694,13.405694,4.421248,4.421248,4.421248,1.48895,...,0.0,0.0,0.0,17.826942,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2017-12-27,23.036124,12.349524,35.353730,22.989805,23.087121,0.371677,0.371677,0.371677,1.48895,...,0.0,0.0,0.0,23.407801,23.371137,23.334474,22.640834,19.956905,27.143113,23.499790
1822,2017-12-28,23.041361,12.108617,35.894902,22.994176,23.093143,0.935053,0.935053,0.935053,1.48895,...,0.0,0.0,0.0,23.976415,23.939751,23.903088,23.334474,22.567507,27.988490,25.242074
1823,2017-12-29,23.046599,15.169711,37.287123,22.998516,23.099148,2.672100,2.672100,2.672100,1.48895,...,0.0,0.0,0.0,25.718699,25.682036,25.645372,23.903088,23.261147,19.810251,26.996459
1824,2017-12-30,23.051837,16.385046,38.847565,23.002889,23.105153,4.421248,4.421248,4.421248,1.48895,...,0.0,0.0,0.0,27.473084,27.436421,27.399757,25.645372,23.829761,22.420853,27.841836


In [53]:
# Training features: forecast rows dated before 2017-09-30, with the first
# column (`ds`) dropped by position.
X_train = data[data["ds"] < "2017-09-30"].iloc[:, 1:]
# Training target: observed sales for the same date range, as a NumPy array.
Y_train = df.loc[df["date"] < "2017-09-30", 'sales'].values
# Show both shapes to check that features and target have matching row counts.
(X_train.shape, Y_train.shape)

((1733, 24), (1733,))

In [54]:
# Hold-out features: forecast rows dated strictly after 2017-09-30, with the
# first column (`ds`) dropped by position.
# NOTE(review): the training split uses `ds < "2017-09-30"` and this one uses
# `ds > "2017-09-30"`, so the 2017-09-30 row lands in neither split — confirm
# that this matches the rows in test.csv.
X_test = data.loc[(data["ds"] > "2017-09-30"), :].iloc[:,1:]

In [55]:
def smape(preds, target):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Parameters
    ----------
    preds, target : array-like
        Predicted and actual values, paired by position. Any array-like is
        accepted (NumPy arrays, lists, pandas Series); both are converted to
        float NumPy arrays first. Without that conversion a plain list makes
        ``preds == 0`` a single ``False`` and the mask degenerates into the
        index ``-1``, silently scoring only the last element.

    Returns
    -------
    float
        ``200 * sum(|p - t| / (|p| + |t|)) / n`` where ``n`` is the number of
        input pairs. Pairs where both values are zero are excluded from the
        sum (avoiding 0/0) but still counted in ``n``, so they contribute an
        error of zero. Returns NaN for empty input.
    """
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)

    # n is taken before masking on purpose: zero/zero pairs count as perfect.
    n = len(preds)
    if n == 0:
        # Nothing to average; the unguarded formula would divide by zero.
        return np.nan

    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]

    ratio = np.abs(preds - target) / (np.abs(preds) + np.abs(target))
    return (200 * np.sum(ratio)) / n

def lgbm_smape(preds, train_data):
    """Custom-metric wrapper that scores predictions with ``smape``.

    The labels are read from ``train_data.get_label()``. Both labels and
    predictions are passed through ``np.expm1`` before scoring.
    NOTE(review): that presumably targets a model trained on ``log1p`` of the
    sales — the training cell in this notebook fits the raw values, so confirm
    before wiring this in as an evaluation function.

    Returns
    -------
    tuple
        ``('SMAPE', value, False)``; the trailing ``False`` is the third
        element of the returned triple (presumably "higher is better" — confirm
        against the LightGBM custom-metric contract).
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    return 'SMAPE', smape(predicted, actual), False

In [63]:
# Hyper-parameters passed to lgb.train in the training cell.
# NOTE(review): LightGBM's parameter docs state that row subsampling
# (`subsample` / `bagging_fraction`) only takes effect when a non-zero bagging
# frequency is also set; none is set here — confirm whether that is intended.
# NOTE(review): no `objective` key is given, so the library default applies.
lgb_params = dict(
    metric={'mae'},
    seed=0,
    num_leaves=32,
    learning_rate=0.02,
    feature_fraction=0.8,
    max_depth=16,
    verbose=0,
    subsample=0.8,
    nthread=-1,
)

In [64]:
# Wrap the training features and target in a LightGBM Dataset, then boost for
# 300 rounds with the parameters defined above. No validation set is passed.
lgbdata = lgb.Dataset(X_train, label=Y_train)
model = lgb.train(
    params=lgb_params,
    train_set=lgbdata,
    num_boost_round=300,
)

You can set `force_col_wise=true` to remove the overhead.


In [65]:
# Score the hold-out feature rows with the trained booster.
# NOTE(review): the training call passes no validation set or early stopping,
# so `model.best_iteration` is presumably unset and every boosting round is
# used — confirm against the LightGBM `Booster.predict` documentation.
test_preds = model.predict(X_test, num_iteration=model.best_iteration)

In [47]:
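# Disabled alternative, kept for reference: the scikit-learn-style
# LGBMRegressor with only `random_state` set. Not part of the current run.
# model = LGBMRegressor(random_state=0)
# model.fit(X_train, Y_train)
# predictions = model.predict(X_test)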

In [66]:
# NOTE(review): this import is repeated in each evaluation cell; it could live
# in the notebook's import cell instead.
from sklearn.metrics import mean_absolute_percentage_error

# MAPE of the LightGBM predictions made in this notebook against the sales
# column of test.csv; the two arrays are paired by position.
actual_sales = pd.read_csv('test.csv')['sales'].values
mean_absolute_percentage_error(actual_sales, test_preds)

0.4223312498279564

In [45]:
# NOTE(review): this import is repeated in each evaluation cell; it could live
# in the notebook's import cell instead.
from sklearn.metrics import mean_absolute_percentage_error

# MAPE of the `sales` column stored in lgb.csv against test.csv.
# NOTE(review): lgb.csv is presumably a saved set of LightGBM predictions
# produced outside this notebook — confirm its provenance.
actual_sales = pd.read_csv('test.csv')['sales'].values
lgb_csv_sales = pd.read_csv('lgb.csv')['sales'].values
mean_absolute_percentage_error(actual_sales, lgb_csv_sales)

0.25984471290340005

In [46]:
# NOTE(review): this import is repeated in each evaluation cell; it could live
# in the notebook's import cell instead.
from sklearn.metrics import mean_absolute_percentage_error

# MAPE of the `sales` column stored in prophet.csv against test.csv.
# NOTE(review): prophet.csv is presumably a saved set of Prophet-only
# predictions produced outside this notebook — confirm its provenance.
actual_sales = pd.read_csv('test.csv')['sales'].values
prophet_csv_sales = pd.read_csv('prophet.csv')['sales'].values
mean_absolute_percentage_error(actual_sales, prophet_csv_sales)

0.2379936834116205