In [1]:
# Data processing
import pandas as pd
import numpy as np
from datetime import datetime
from pandas import Series

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from plotly.subplots import make_subplots
from prophet.plot import plot_plotly
from prophet.plot import plot_plotly, plot_components_plotly
import plotly.graph_objs as go

#model
from prophet import Prophet
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import sklearn.metrics

#autres
from typing import List
from typing import Tuple
import random


  from .autonotebook import tqdm as notebook_tqdm


# Data import & preprocessing

In [2]:
#data import
df_main = pd.read_csv('../Data/df_main_v4.csv')

#group by date
df = df_main.groupby(by='date').agg({'sales': 'sum','onpromotion':'sum', 'dcoilwtico':'mean','is_holiday':'mean'})

#Round the price of the oil
df['dcoilwtico']=df['dcoilwtico'].astype('int64')

#reset index
df.reset_index(inplace=True)
df['date'] = pd.to_datetime(df['date'])

#Name columns as prophet wants
df.rename(columns={'date': 'ds', 'sales':'y'}, inplace=True)

#suppresion of the last dates - just for our project
df = df.iloc[:-46]

#Creation of df train and test
df_train = df.iloc[:-91]
df_test = df.iloc[-91:]

# Prophet Model

In [3]:
#best params have been selected with a gridsearch before

params_grid = {'seasonality_mode':(['additive']),
               'changepoint_range':[0.8],
               'changepoint_prior_scale':[0.7],
               'seasonality_prior_scale':[0.4],
               'prior_scale_is_holiday':[0.1],
               'prior_scale_onpromotion':[0.002],
               'prior_scale_dcoilwtico':[0.0001],
               'fourier_order':[5]
              }

m = Prophet(changepoint_prior_scale = 0.7,
            seasonality_prior_scale = 0.4,
            changepoint_range=0.8,
            seasonality_mode = 'additive',
            weekly_seasonality=True,
            daily_seasonality = False,
            yearly_seasonality = True,
            interval_width=0.95)
m.add_regressor('is_holiday', prior_scale =0.1)
m.add_regressor('onpromotion', prior_scale = 0.002)
m.add_regressor('dcoilwtico', prior_scale = 0.0001)
m.add_seasonality(name='monthly', period=30.417, fourier_order=5)
model=m.fit(df_train[['ds', 'y', 'is_holiday', 'onpromotion', 'dcoilwtico']])

#forecast from prophet
df_loop = df
future=m.make_future_dataframe(periods=91, freq= 'D')
future.insert(1, 'onpromotion', df_loop['onpromotion'])
future.insert(2, 'dcoilwtico', df_loop['dcoilwtico'])
future.insert(3, 'is_holiday', df_loop['is_holiday'])


forecast = m.predict(future)

17:36:33 - cmdstanpy - INFO - Chain [1] start processing
17:36:35 - cmdstanpy - INFO - Chain [1] done processing


In [4]:
def mape(y_true,y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
y_true = np.array(df_loop['y'].tail(91))
y_pred = np.array(forecast['yhat'].tail(91))
mape = mape(y_true,y_pred)
mape

6.709137005302478

# XGBoost Model on Prophet

## Lag Data preperation for XGB

In [6]:
#Merging the forecast results of Prophet with the initial df in order to have an y

df_lags = pd.merge(left=forecast, right=df,left_on='ds', right_on='ds' )
df_lags.set_index('ds', inplace=True)

#Adding 3 lags to our model
for lag in range(0,7):
    df_lags[f'yhat_lag_{lag}']=df_lags['yhat'].shift(lag)
    
#Dropping the NAN due to the lags
df_lags = df_lags.dropna(axis=0,how='any')


#Creating an X --> everything but y and an Y --> y or Sales in this case
X_lags = df_lags.drop(columns='y')
y_lags = df_lags['y']

#Creating a train and test dataset

X_train = X_lags.iloc[:-91]
X_test = X_lags.iloc[-91:]
y_test = y_lags.iloc[-91:]
y_train = y_lags.iloc[:-91]


## XGB Model

In [7]:
model = xgb.XGBRegressor()
model.fit(X_train,y_train)
predictions = model.predict(X_test)

In [8]:
y_test = np.array(y_test)
y_true, y_pred = np.array(y_test), np.array(predictions)
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
mape

6.8306440049036645

In [9]:
mape_mean = []
lag_limit = 10
lag = 1
while lag < lag_limit:
    df_lags = pd.merge(left=forecast, right=df,left_on='ds', right_on='ds' )
    df_lags.set_index('ds', inplace=True)
    df_lags_copy = df_lags.copy()
    compteur = 0
    mape_list = []
    while compteur < 10:
        print(lag,compteur)
        for current_lag in range(1, lag):
            df_lags_copy[f'yhat_lag_{current_lag}']=df_lags_copy['yhat'].shift(current_lag)

        df_lags_copy = df_lags_copy.dropna(axis=0,how='any')
        X_lags = df_lags_copy.drop(columns= 'y',axis=1)
        y_lags = df_lags_copy['y']

        X_train = X_lags.iloc[:-91]
        X_test = X_lags.iloc[-91:]
        y_test = y_lags.iloc[-91:]
        y_train = y_lags.iloc[:-91]
        model = xgb.XGBRegressor()
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)
        y_test = np.array(y_test)
        y_true, y_pred = np.array(y_test), np.array(predictions)
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        mape_list.append(mape)
        compteur +=1
    print('boucle finie')
    lag += 1
    a = sum(mape_list)/len(mape_list)
    mape_mean.append(a)
        
      

1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
boucle finie
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
boucle finie
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
boucle finie
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
boucle finie
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
boucle finie
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
boucle finie
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
boucle finie
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
boucle finie
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
boucle finie


In [10]:
mape_mean

[7.894618662957528,
 7.6333570269196995,
 7.293662576318975,
 7.11306974138482,
 6.882823654430806,
 7.247578855032662,
 6.998052474426918,
 6.933588566792151,
 7.092776973433125]