In [143]:
import pandas as pd
import numpy as np

import pmdarima as pm
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from plotly.subplots import make_subplots

from time import sleep
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load train
df_train = pd.read_csv(r'D:\Project\Pet_Project\Demand_Forecast\Data\sales_train.csv')
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [3]:
# Convert train date to date format
df_train['date'] = pd.to_datetime(df_train.date, format = '%d.%m.%Y')
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [4]:
# Check missing values in train
df_train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [5]:
# Get Year month aggr
df_train['date_block_num'] = pd.to_datetime(df_train['date']).dt.to_period('M')
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,2013-01,59,22154,999.0,1.0
1,2013-01-03,2013-01,25,2552,899.0,1.0
2,2013-01-05,2013-01,25,2552,899.0,-1.0
3,2013-01-06,2013-01,25,2554,1709.05,1.0
4,2013-01-15,2013-01,25,2555,1099.0,1.0


In [15]:
# Group by Year - Month - ItemId
df_train_aggr = df_train.groupby( ["date_block_num", "shop_id","item_id"], as_index=False ).agg({"item_price" : "mean",
                                                                       "item_cnt_day" : "sum",})
df_train_aggr.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01,0,32,221.0,6.0
1,2013-01,0,33,347.0,3.0
2,2013-01,0,35,247.0,1.0
3,2013-01,0,43,221.0,1.0
4,2013-01,0,51,128.5,2.0


In [9]:
# Load test
df_test = pd.read_csv(r'D:\Project\Pet_Project\Demand_Forecast\Data\test.csv')
df_test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [10]:
# Check missing values in train
df_test.isnull().sum()

ID         0
shop_id    0
item_id    0
dtype: int64

# Moving Average (MA)

In [67]:
submission = pd.DataFrame(index=np.arange(0, max(df_test.index)), columns=('ID', 'item_cnt_month') )

for index, row in tqdm(df_test.iterrows()):
    tmp = df_train_aggr[(df_train_aggr['shop_id'] == row['shop_id']) &
                        (df_train_aggr['item_id'] == row['item_id'])]
    if len(tmp.index) == 0:
        submission.loc[index] = [row['ID'],
                                 0]
    elif (len(tmp.index) > 0) & (len(tmp.index) < 2):
        submission.loc[index] = [row['ID'],
                                 tmp['item_cnt_day'].values[0]]
    else:
        submission.loc[index] = [row['ID'], tmp.tail(2)['item_cnt_day'].mean()]


214200it [31:53, 111.97it/s]


In [68]:
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,2.0
1,1,0.0
2,2,2.0
3,3,1.0
4,4,0.0


In [70]:
submission.dtypes

ID                object
item_cnt_month    object
dtype: object

In [69]:
submission.to_csv(r"D:\Project\Pet_Project\Demand_Forecast\Results\submission_MA.csv", index=False)

# ARIMA

In [238]:
submission = pd.DataFrame(index=np.arange(0, max(df_test.index)), columns=('ID', 'item_cnt_month') )

for index, row in tqdm(df_test.iterrows()):

    tmp = df_train_aggr[(df_train_aggr['shop_id'] == row['shop_id']) &
                        (df_train_aggr['item_id'] == row['item_id'])]

    if len(tmp.index) == 0:
        
        submission.loc[index] = [row['ID'],
                                 0]

    elif (len(tmp.index) > 0) & (len(tmp.index) < 24):

        submission.loc[index] = [row['ID'],
                                 tmp['item_cnt_day'].values[-1]]

    elif (len(tmp.index) >=24 ):
        model = pm.auto_arima(tmp.item_cnt_day, start_p=1, start_q=1, max_p=3, max_q=3, m=12,
                             start_P=0, seasonal=False,
                             d=1, D=1, trace=False,
                             error_action='ignore',  # don't want to know if an order does not work
                             suppress_warnings=True,  # don't want convergence warnings
                             stepwise=True)

        submission.loc[index] = [row['ID'],
                               round(model.predict(n_periods=1, return_conf_int=False)[0], 1)]

214200it [50:14, 71.05it/s]


In [239]:
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.0
1,1,0.0
2,2,1.0
3,3,1.0
4,4,0.0


In [240]:
submission[submission < 0] = 0

In [241]:
submission.to_csv(r"D:\Project\Pet_Project\Demand_Forecast\Results\submission_ARIMA.csv", index=False)

# Holt Winter’s Exponential Smoothing

In [242]:
submission = pd.DataFrame(index=np.arange(0, max(df_test.index)), columns=('ID', 'item_cnt_month') )

for index, row in tqdm(df_test.iterrows()):

    tmp = df_train_aggr[(df_train_aggr['shop_id'] == row['shop_id']) &
                        (df_train_aggr['item_id'] == row['item_id'])]

    if len(tmp.index) == 0:

        submission.loc[index] = [row['ID'],
                                 0]
    elif (len(tmp.index) > 0) & (len(tmp.index) < 12):

        submission.loc[index] = [row['ID'],
                                 tmp['item_cnt_day'].values[-1]]
    else:

        model = ExponentialSmoothing(tmp.item_cnt_day)
        model_fit = model.fit()

        submission.loc[index] = [row['ID'],
                                 round(model_fit.predict(len(tmp.item_cnt_day), len(tmp.item_cnt_day)),1).values[0]]

214200it [31:50, 112.09it/s]


In [243]:
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.0
1,1,0.0
2,2,1.0
3,3,1.0
4,4,0.0


In [244]:
submission[submission < 0] = 0

In [245]:
submission.to_csv(r"D:\Project\Pet_Project\Demand_Forecast\Results\submission_HW.csv", index=False)