In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 50)

from statsmodels.tsa.api import VAR
from statsmodels.tsa.statespace.varmax import VARMAX, VARMAXResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error,mean_absolute_error
from pmdarima import auto_arima
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split

# Data Preprocessing

In [2]:
Grab = pd.read_csv('GrabAssignment.csv')

In [3]:
Grab.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [4]:
Grab.shape

(4206321, 4)

In [5]:
Grab.columns

Index(['geohash6', 'day', 'timestamp', 'demand'], dtype='object')

In [6]:
Grab.dtypes

geohash6      object
day            int64
timestamp     object
demand       float64
dtype: object

In [7]:
# Each day should have 96 timestamp (15 min), here to check out which day didnt have complete timestamp cycle
a_list = []
for c in range(1,61):
    lens = len(Grab[Grab["day"] == c].timestamp.value_counts())
    if lens != 96:
        a_list.append(c)
print(a_list)

[18]


In [8]:
#Regroup data to make the day and timestamp in order and unstack geohash
GrabFull = Grab.groupby(['geohash6', 'day', 'timestamp'])['demand'].mean().unstack('geohash6')

In [9]:
GrabFull.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0djv', 'qp0djw', 'qp0djy', 'qp0dn0', 'qp0dn1', 'qp0dn4', 'qp0dn5',
       'qp0dnh', 'qp0dnj', 'qp0dnn'],
      dtype='object', name='geohash6', length=1329)

In [10]:
# Creating the DataFrame to fillup the NaN value in Raw Data to complete timestamp cycle for timeseries forecasting
fillup_ts = pd.DataFrame(GrabFull.columns)
fillup_ts["day"] = 18
fillup_ts["demand"] = np.NaN

In [11]:
fillup_ts

Unnamed: 0,geohash6,day,demand
0,qp02yc,18,
1,qp02yf,18,
2,qp02yu,18,
3,qp02yv,18,
4,qp02yy,18,
5,qp02yz,18,
6,qp02z1,18,
7,qp02z3,18,
8,qp02z4,18,
9,qp02z5,18,


In [12]:
fillup_ts_str = pd.Series([str("9:45"),str("10:0"),str("10:15"),str("12:45"),str("11:30"),str("11:45"),str("12:0"),str("12:15"),str("12:30")])

In [13]:
fillup_ts_full = pd.DataFrame()
for ts in fillup_ts_str:
    fillup_ts["timestamp"] = ts
    fillup_ts_full = pd.concat([fillup_ts_full,fillup_ts], ignore_index=True)
    

In [14]:
fillup_ts_full

Unnamed: 0,geohash6,day,demand,timestamp
0,qp02yc,18,,9:45
1,qp02yf,18,,9:45
2,qp02yu,18,,9:45
3,qp02yv,18,,9:45
4,qp02yy,18,,9:45
5,qp02yz,18,,9:45
6,qp02z1,18,,9:45
7,qp02z3,18,,9:45
8,qp02z4,18,,9:45
9,qp02z5,18,,9:45


In [15]:
Grab_done = pd.concat([Grab,fillup_ts_full], ignore_index=True, sort=False)

In [16]:
Grab_done

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468
5,qp03tu,1,12:15,0.023843
6,qp096d,25,3:30,0.007460
7,qp03nr,51,20:45,0.000293
8,qp093r,48,6:15,0.054170
9,qp03r2,4,22:15,0.123463


In [17]:
Grab_done['timestamp'] = pd.to_datetime(Grab_done['timestamp'], format='%H:%M').dt.time

In [18]:
grab_train = Grab_done.groupby(['geohash6', 'day', 'timestamp'])['demand'].mean().unstack('geohash6')

In [19]:
grab_train.shape

(5856, 1329)

In [20]:
grab_train.isnull().sum().sum()

3576303

In [21]:
#Merge the consecutive day and timestamp into one time series with a assigned datetime and 15min frequency
grab_train = grab_train.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5856, freq='15min'))

In [22]:
grab_train = grab_train.fillna(0)

In [23]:
grab_train.head(10)

geohash6,qp02yc,qp02yf,qp02yu,qp02yv,qp02yy,qp02yz,qp02z1,qp02z3,qp02z4,qp02z5,qp02z6,qp02z7,qp02z9,qp02zc,qp02zd,qp02ze,qp02zf,qp02zg,qp02zh,qp02zj,qp02zk,qp02zm,qp02zn,qp02zp,qp02zq,...,qp0djb,qp0djc,qp0djd,qp0dje,qp0djf,qp0djg,qp0djh,qp0djj,qp0djk,qp0djm,qp0djn,qp0djq,qp0djs,qp0djt,qp0dju,qp0djv,qp0djw,qp0djy,qp0dn0,qp0dn1,qp0dn4,qp0dn5,qp0dnh,qp0dnj,qp0dnn
2019-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009482,0.0,0.0,0.0,0.003641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0006,0.0,0.0,0.0,0.038979,0.0,0.010554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012865,0.0,0.0,0.0,0.001304,0.0,0.002454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004056,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00491,0.0,0.0,0.0,0.020167,0.0,0.0,0.0,0.0,0.000503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009381,0.0
2019-01-01 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070525,0.001295,0.0,0.0,0.008541,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083947,0.0,0.045482,0.047912,0.006503,0.0,0.0,0.0,0.049005,0.047575,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009687,0.0,0.051739,0.0,0.010428,0.0,0.0,0.0,0.0,0.0,0.008253,0.0,0.0,0.0,0.0,0.0,0.002701,0.0
2019-01-01 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045083,0.0,0.0,0.031414,0.0,0.0,0.0,0.0,0.0,0.0,0.060752,0.0,0.0,...,0.0,0.0,0.011394,0.0,0.0,0.0,0.009599,0.017358,0.0,0.0,0.0,0.020719,0.0,0.0,0.0,0.0,0.0,0.006971,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 01:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.099317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04836,0.000954,0.0,0.0,0.0,0.0,0.0,0.104351,0.075672,0.00063,0.066387,...,0.0,0.0,0.0,0.000471,0.012844,0.0,0.027026,0.029316,0.007121,0.005744,0.0,0.0,0.000993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019563,0.0
2019-01-01 01:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011855,0.003631,0.0,0.0,0.039394,0.0,0.0,0.0,0.0,0.019048,0.03685,0.079985,0.055821,0.071413,...,0.0,0.0,0.0,0.0,0.003883,0.0,0.004794,0.0,0.019596,0.031227,0.0,0.02307,0.000721,0.0,0.0,0.0,0.0,0.001858,0.0,0.0,0.0,0.0,0.0,0.007193,0.0
2019-01-01 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.029813,0.0,0.0,0.0,0.0,0.0,0.045931,0.0,0.0,0.0,0.0,0.02393,0.0,0.082367,0.0,0.019131,0.022062,0.036493,0.012899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016685,0.024294,0.034116,0.018435,0.0,0.011025,0.005066,0.011381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 02:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.099991,0.0,0.0,0.0,0.0,0.019614,0.019863,0.0,0.016487,0.0,0.0,0.044045,0.014075,0.237685,0.0,0.042676,0.124104,0.038605,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000553,0.006702,0.032528,0.007979,0.0,0.0,0.019812,0.002717,0.0,0.0,0.0,0.008936,0.0,0.0,0.0,0.0,0.0,0.007224,0.0


# Dimensionality Reduction, PCA to 5D with retained 80% variance

In [24]:
from sklearn.decomposition import PCA

In [25]:
pca = PCA(n_components=5)

In [26]:
train_2D = pca.fit_transform(grab_train[0:-672])

In [27]:
pca.explained_variance_ratio_.sum()

0.8085070179723179

In [28]:
pca.explained_variance_ratio_

array([0.65180352, 0.06130845, 0.04398342, 0.02706075, 0.02435087])

In [29]:
train_2D = pd.DataFrame(data = train_2D, columns=["z1","z2","z3","z4","z5"])

In [30]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [31]:
train_2D.shape

(5184, 5)

# 1. Modelling VAR(96, trend="ctt"), Train(5184), Test(672), PCA > 5D, RMSE = 0.04694127

In [45]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.statespace.varmax import VARMAX, VARMAXResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error,mean_absolute_error
from pmdarima import auto_arima
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split

In [None]:
def adf_test(series,title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data
    
    labels = ['ADF test statistic','p-value','# lags used','# observations']
    out = pd.Series(result[0:4],index=labels)

    for key,val in result[4].items():
        out[f'critical value ({key})']=val
        
    print(out.to_string())          # .to_string() removes the line "dtype: float64"
    
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

In [None]:
for c in train_2D.columns: # All z1-z5 data stationary
    adf_test(train_2D[c],title=c)

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF = train_2D[:-nobs]
testF = grab_train[-nobs:]

In [None]:
model_VAR = VAR(trainF)

In [None]:
aic_dict = {}
for i in range(94,97):
    results = model_VAR.fit(i)
    aic_dict[i] = results.aic

In [None]:
model_VAR.endog_names

In [None]:
# To return the key_values, where it value are smallest
min(aic_dict, key=aic_dict.get)

In [None]:
# To return the max & min value in a dictionary object
key_max = max(aic_dict.keys(), key=(lambda k: aic_dict[k]))
key_min = min(aic_dict.keys(), key=(lambda k: aic_dict[k]))
print('Maximum Value: ',aic_dict[key_max])
print('Minimum Value: ',aic_dict[key_min])

In [None]:
results_VAR = model_VAR.fit(maxlags=96, trend="ctt") 
results_VAR.summary()

In [None]:
results_VAR.k_ar

In [None]:
results_VAR.aic

In [None]:
lagged_values_VAR = trainF.values[-96:]

In [None]:
z = results_VAR.forecast(y=lagged_values_VAR, steps=672) 
df_forecast_VAR = pca.inverse_transform(z)

In [None]:
#Set the time frame to be same as test dataset
idxH = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR = pd.DataFrame(df_forecast_VAR, index=idxH, columns=testF.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR[df_forecast_VAR < 0] = 0
df_forecast_VAR[df_forecast_VAR > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF,df_forecast_VAR)))
print(mean_absolute_error(testF,df_forecast_VAR)*100)
print(testF.mean().sum())

In [None]:
#The model residual mean
#Residual mean close to zero which is a good indication for the forecasting
np.mean(results_VAR.resid).sum()

In [None]:
#Visualise the forecast values and dataset values for all of the geohash6
df_forecast_VAR.iplot()
testF.iplot()

# 2. Modelling VAR(96, trend="ctt"), Train(-3360:-672), Test(672), PCA > 5D, RMSE = 0.05020833

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainHalf = train_2D[-3360:-nobs]
testHalf = grab_train[-nobs:]

In [None]:
model_VAR_H = VAR(trainHalf)

In [None]:
results_VAR_H = model_VAR_H.fit(maxlags=96, trend="ctt") 
results_VAR_H.summary()

In [None]:
results_VAR_H.k_ar

In [None]:
results_VAR_H.aic

In [None]:
lagged_values_VAR_H = trainHalf.values[-96:]

In [None]:
z_H = results_VAR_H.forecast(y=lagged_values_VAR_H, steps=672) 
df_forecast_VAR_H = pca.inverse_transform(z_H)

In [None]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_H = pd.DataFrame(df_forecast_VAR_H, index=idxH_H, columns=testHalf.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_H[df_forecast_VAR_H < 0] = 0
df_forecast_VAR_H[df_forecast_VAR_H > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testHalf,df_forecast_VAR_H)))
print(mean_absolute_error(testHalf,df_forecast_VAR_H)*100)
print(testHalf.mean().sum())

In [None]:
#Visualise the forecast values and dataset values for all of the geohash6
df_forecast_VAR_H.iplot()
testHalf.iplot()

# 3. Modelling VAR(96, trend="ctt",exog= Weekday), Train(5184), Test(672), PCA > 5D, RMSE = 0.04711870

In [33]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = train_2D[:-nobs]
testF_Exo = grab_train[-nobs:]

In [34]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [35]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [36]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [37]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [38]:
trainF_Exo_feature.columns

Index(['z1', 'z2', 'z3', 'z4', 'z5', 'Weekday', 'Weekday_Friday',
       'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday',
       'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object')

In [39]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [40]:
trainF_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-02-11 06:30:00,0,1,0,0,0,0,0
2019-01-09 00:30:00,0,0,0,0,0,0,1
2019-01-07 07:45:00,0,1,0,0,0,0,0
2019-02-02 07:30:00,0,0,1,0,0,0,0
2019-02-09 10:30:00,0,0,1,0,0,0,0


In [41]:
model_VAR_F_Exo = VAR(trainF_Exo, exog=trainF_Exo_feature)

In [42]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=96, trend="ctt") 
results_model_VAR_F_Exo.summary()


invalid value encountered in sqrt


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 31, Aug, 2019
Time:                     19:22:38
--------------------------------------------------------------------
No. of Equations:         5.00000    BIC:                   -21.3489
Nobs:                     4416.00    HQIC:                  -23.6448
Log likelihood:           26089.7    FPE:                1.54844e-11
AIC:                     -24.8958    Det(Omega_mle):     9.14958e-12
--------------------------------------------------------------------
Results for equation z1
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const            0.005682              NAN              NAN             NAN
trend           -0.000025         0.000011           -2.288           0.022
trend**2         0.000000         0.000000            2.671           0.008
exog0  

In [43]:
results_model_VAR_F_Exo.aic

-24.895778184771164

In [44]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-96:]

In [46]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672, exog_future=testF_Exo_feature) 
df_forecast_VAR_F_Exo = pca.inverse_transform(z_F_Exo)

In [47]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(df_forecast_VAR_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [48]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [49]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

0.04711869748444822
2.2579737205471724
81.90959266258383


# 4. Modelling VAR(96, trend="ctt",exog= Weekday), Train(-3360:-672), Test(672), PCA > 5D, RMSE = 0.04785709

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = train_2D[-3360:-nobs]
testF_Exo = grab_train[-nobs:]

In [None]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [None]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [None]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [None]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [None]:
trainF_Exo_feature.columns

In [None]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [None]:
trainF_Exo_feature.sample(5)

In [None]:
model_VAR_F_Exo = VAR(trainF_Exo, exog=trainF_Exo_feature)

In [None]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=96, trend="ctt") 
results_model_VAR_F_Exo.summary()

In [None]:
results_model_VAR_F_Exo.aic

In [None]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-96:]

In [None]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672, exog_future=testF_Exo_feature) 
df_forecast_VAR_F_Exo = pca.inverse_transform(z_F_Exo)

In [None]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(df_forecast_VAR_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

# 5. Modelling VAR(7, trend="ctt"), Train(5184), Test(672), Full- Dimension, RMSE = 0.07916411

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainHalf = grab_train[0:-nobs]
testHalf = grab_train[-nobs:]

In [None]:
np.fill_diagonal(trainHalf.values, trainHalf + 1)

In [None]:
model_VAR_H = VAR(trainHalf)

In [None]:
results_VAR_H = model_VAR_H.fit(maxlags=7, trend="ctt")

In [None]:
results_VAR_H.k_ar

In [None]:
lagged_values_VAR_H = trainHalf.values[-7:]

In [None]:
z_H = results_VAR_H.forecast(y=lagged_values_VAR_H, steps=672)

In [None]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_H = pd.DataFrame(z_H, index=idxH_H, columns=testHalf.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_H[df_forecast_VAR_H < 0] = 0
df_forecast_VAR_H[df_forecast_VAR_H > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testHalf,df_forecast_VAR_H)))
print(mean_absolute_error(testHalf,df_forecast_VAR_H)*100)
print(testHalf.mean().sum())

# 6. Modelling VAR(7, trend="ctt"), Train(-3360:-672), Test(672), Full- Dimension, RMSE = 0.08025244

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672
trainHalf = grab_train[-3360:-nobs]
testHalf = grab_train[-nobs:]

In [None]:
np.fill_diagonal(trainHalf.values, trainHalf + 1)

In [None]:
model_VAR_H = VAR(trainHalf)

In [None]:
results_VAR_H = model_VAR_H.fit(maxlags=7, trend="ctt")

In [None]:
results_VAR_H.k_ar

In [None]:
lagged_values_VAR_H = trainHalf.values[-7:]

In [None]:
z_H = results_VAR_H.forecast(y=lagged_values_VAR_H, steps=672)

In [None]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_H = pd.DataFrame(z_H, index=idxH_H, columns=testHalf.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_H[df_forecast_VAR_H < 0] = 0
df_forecast_VAR_H[df_forecast_VAR_H > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testHalf,df_forecast_VAR_H)))
print(mean_absolute_error(testHalf,df_forecast_VAR_H)*100)
print(testHalf.mean().sum())

# 7. Modelling VAR(7, trend="ctt",exog= Weekday), Train(5184), Test(672), Full-Dimension, RMSE = 0.57290424

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = grab_train[0:-nobs]
testF_Exo = grab_train[-nobs:]

In [None]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [None]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [None]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [None]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [None]:
trainF_Exo_feature.columns

In [None]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [None]:
trainF_Exo_feature.sample(5)

In [None]:
model_VAR_F_Exo = VAR(trainF_Exo, exog=trainF_Exo_feature)

In [None]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=7, trend="ctt") 

In [None]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-7:]

In [None]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672, exog_future=testF_Exo_feature) 

In [None]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(z_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

# 8. Modelling VAR(7, trend="ctt",exog= Weekday), Train(-3360:-672), Test(672), Full-Dimension, RMSE = 0.55819723

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = grab_train[-3360:-nobs]
testF_Exo = grab_train[-nobs:]

In [None]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [None]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [None]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [None]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [None]:
trainF_Exo_feature.columns

In [None]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [None]:
trainF_Exo_feature.sample(5)

In [None]:
model_VAR_F_Exo = VAR(trainF_Exo, exog=trainF_Exo_feature)

In [None]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=7, trend="ctt") 

In [None]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-7:]

In [None]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672, exog_future=testF_Exo_feature) 

In [None]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(z_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

In [None]:
#Visualise the forecast values and dataset values for all of the geohash6
df_forecast_VAR_F_Exo.iplot()
testF_Exo.iplot()

# 9. Modelling VARMA(order = (3, 1)), Train(5184), Test(672), 5D-Dimension, RMSE = 0.07555706

In [50]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = train_2D[:-nobs]
testF_Exo = grab_train[-nobs:]

In [51]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [52]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [53]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [54]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [55]:
trainF_Exo_feature.columns

Index(['z1', 'z2', 'z3', 'z4', 'z5', 'Weekday', 'Weekday_Friday',
       'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday',
       'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object')

In [56]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [57]:
trainF_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-01-05 19:30:00,0,0,1,0,0,0,0
2019-01-29 10:00:00,0,0,0,0,0,1,0
2019-02-01 23:00:00,1,0,0,0,0,0,0
2019-01-11 16:00:00,1,0,0,0,0,0,0
2019-01-03 19:45:00,0,0,0,0,1,0,0


In [None]:
auto_arima(trainF_Exo['z1'],seasonal=True,m = 12)

In [58]:
model_VAR_F_Exo = VARMAX(trainF_Exo, order=(3,1))


Estimation of VARMA(p,q) models is not generically robust, due especially to identification issues.



In [59]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=3, disp=False) 
results_model_VAR_F_Exo.summary()


Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.


Maximum Likelihood optimization failed to converge. Check mle_retvals



0,1,2,3
Dep. Variable:,"['z1', 'z2', 'z3', 'z4', 'z5']",No. Observations:,4512.0
Model:,"VARMA(3,1)",Log Likelihood,21815.821
,+ intercept,AIC,-43391.643
Date:,"Sat, 31 Aug 2019",BIC,-42621.903
Time:,19:25:34,HQIC,-43120.453
Sample:,01-01-2019,,
,- 02-16-2019,,
Covariance Type:,opg,,

0,1,2,3
Ljung-Box (Q):,"698.61, 300.43, 156.80, 794.30, 303.24",Jarque-Bera (JB):,"16435938.45, 1117.93, 12945.35, 1139.39, 324.99"
Prob(Q):,"0.00, 0.00, 0.00, 0.00, 0.00",Prob(JB):,"0.00, 0.00, 0.00, 0.00, 0.00"
Heteroskedasticity (H):,"1.24, 1.32, 1.14, 1.03, 1.04",Skew:,"-4.19, 0.37, -0.27, 0.31, -0.01"
Prob(H) (two-sided):,"0.00, 0.00, 0.01, 0.54, 0.50",Kurtosis:,"298.56, 5.32, 11.28, 5.38, 4.31"

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0055,0.009,0.621,0.535,-0.012,0.023
L1.z1,1.2698,0.347,3.661,0.000,0.590,1.950
L1.z2,-0.0012,1.142,-0.001,0.999,-2.239,2.237
L1.z3,0.3305,0.431,0.766,0.444,-0.515,1.176
L1.z4,0.0405,1.521,0.027,0.979,-2.940,3.021
L1.z5,0.0868,1.305,0.067,0.947,-2.470,2.644
L2.z1,-0.1845,0.542,-0.340,0.734,-1.247,0.878
L2.z2,0.1135,1.570,0.072,0.942,-2.963,3.190
L2.z3,-0.1329,0.342,-0.389,0.697,-0.802,0.537

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0011,0.003,-0.320,0.749,-0.008,0.006
L1.z1,0.0309,0.130,0.238,0.812,-0.224,0.286
L1.z2,1.3576,0.451,3.013,0.003,0.475,2.241
L1.z3,0.0802,0.165,0.486,0.627,-0.243,0.404
L1.z4,-0.0084,0.597,-0.014,0.989,-1.178,1.162
L1.z5,-0.0533,0.514,-0.104,0.917,-1.060,0.954
L2.z1,-0.0627,0.206,-0.305,0.761,-0.466,0.341
L2.z2,-0.2774,0.621,-0.447,0.655,-1.494,0.940
L2.z3,0.0466,0.142,0.329,0.742,-0.231,0.324

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-9.487e-05,0.003,-0.029,0.977,-0.007,0.006
L1.z1,0.0049,0.128,0.038,0.970,-0.247,0.257
L1.z2,-0.0699,0.432,-0.162,0.871,-0.917,0.777
L1.z3,1.2136,0.166,7.315,0.000,0.888,1.539
L1.z4,0.0805,0.582,0.138,0.890,-1.061,1.222
L1.z5,-0.1316,0.496,-0.265,0.791,-1.104,0.841
L2.z1,0.0222,0.202,0.110,0.912,-0.374,0.419
L2.z2,0.1764,0.595,0.297,0.767,-0.989,1.342
L2.z3,-0.2821,0.130,-2.170,0.030,-0.537,-0.027

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0013,0.003,0.452,0.652,-0.004,0.007
L1.z1,0.0082,0.108,0.075,0.940,-0.204,0.220
L1.z2,-0.1050,0.366,-0.287,0.774,-0.823,0.613
L1.z3,0.1485,0.148,1.002,0.316,-0.142,0.439
L1.z4,1.2341,0.488,2.531,0.011,0.278,2.190
L1.z5,0.1337,0.425,0.315,0.753,-0.699,0.966
L2.z1,-0.0005,0.169,-0.003,0.998,-0.332,0.331
L2.z2,0.1465,0.504,0.290,0.771,-0.842,1.135
L2.z3,-0.1623,0.119,-1.362,0.173,-0.396,0.071

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0024,0.003,-0.715,0.475,-0.009,0.004
L1.z1,-0.1570,0.127,-1.239,0.216,-0.406,0.091
L1.z2,-0.0696,0.445,-0.156,0.876,-0.941,0.802
L1.z3,-0.1379,0.163,-0.848,0.397,-0.457,0.181
L1.z4,0.2422,0.588,0.412,0.680,-0.910,1.394
L1.z5,1.2584,0.506,2.489,0.013,0.267,2.250
L2.z1,0.1785,0.202,0.882,0.378,-0.218,0.575
L2.z2,0.0475,0.613,0.078,0.938,-1.153,1.248
L2.z3,0.0458,0.145,0.317,0.751,-0.238,0.329

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sqrt.var.z1,0.1777,0.001,159.868,0.000,0.176,0.180
sqrt.cov.z1.z2,0.0227,0.001,15.592,0.000,0.020,0.026
sqrt.var.z2,0.0852,0.001,110.921,0.000,0.084,0.087
sqrt.cov.z1.z3,-0.0125,0.001,-11.429,0.000,-0.015,-0.010
sqrt.cov.z2.z3,-0.0270,0.001,-24.319,0.000,-0.029,-0.025
sqrt.var.z3,0.0765,0.001,91.435,0.000,0.075,0.078
sqrt.cov.z1.z4,0.0089,0.001,6.458,0.000,0.006,0.012
sqrt.cov.z2.z4,-0.0101,0.001,-10.166,0.000,-0.012,-0.008
sqrt.cov.z3.z4,0.0116,0.001,12.030,0.000,0.010,0.014


In [60]:
results_model_VAR_F_Exo.aic

-43391.64262866739

In [61]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-3:]

In [62]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672) 
df_forecast_VAR_F_Exo = pca.inverse_transform(z_F_Exo)


Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.


Estimation of VARMA(p,q) models is not generically robust, due especially to identification issues.



In [63]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(df_forecast_VAR_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [64]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [65]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

0.07555749375261353
3.5794621930408757
81.90959266258383


# 10. Modelling SARIMAX(order(3,0,1), seasonal_order(2,0,0,4)) with Exog, Train(5184), Test(672), 1D-Dimension, RMSE = 0.07861528

In [66]:
pca = PCA(n_components=1)

In [67]:
train_2D = pca.fit_transform(grab_train[0:-672])

In [68]:
pca.explained_variance_ratio_.sum()

0.651803520982282

In [69]:
pca.explained_variance_ratio_

array([0.65180352])

In [70]:
train_2D = pd.DataFrame(data = train_2D, columns=["z1"])

In [71]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [72]:
train_2D.shape

(5184, 1)

In [73]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = train_2D[:-nobs]
testF_Exo = grab_train[-nobs:]

In [74]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [75]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [76]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [77]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [78]:
trainF_Exo_feature.columns

Index(['z1', 'Weekday', 'Weekday_Friday', 'Weekday_Monday', 'Weekday_Saturday',
       'Weekday_Sunday', 'Weekday_Thursday', 'Weekday_Tuesday',
       'Weekday_Wednesday'],
      dtype='object')

In [79]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [80]:
trainF_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-01-03 21:15:00,0,0,0,0,1,0,0
2019-01-26 06:15:00,0,0,1,0,0,0,0
2019-02-13 23:30:00,0,0,0,0,0,0,1
2019-01-14 12:15:00,0,1,0,0,0,0,0
2019-01-02 06:00:00,0,0,0,0,0,0,1


In [None]:
auto_arima(trainF_Exo['z1'],seasonal=True,m = 4, exogenous=trainF_Exo_feature)

In [90]:
model_VAR_F_Exo = SARIMAX(trainF_Exo['z1'],exog=trainF_Exo_feature,order=(3,0,1),seasonal_order=(2,0,0,4))
results_model_VAR_F_Exo = model_VAR_F_Exo.fit()
results_model_VAR_F_Exo.summary()


Maximum Likelihood optimization failed to converge. Check mle_retvals



0,1,2,3
Dep. Variable:,z1,No. Observations:,4512.0
Model:,"SARIMAX(3, 0, 1)x(2, 0, 0, 4)",Log Likelihood,1265.489
Date:,"Sat, 31 Aug 2019",AIC,-2502.978
Time:,19:28:54,BIC,-2413.175
Sample:,01-01-2019,HQIC,-2471.339
,- 02-16-2019,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Weekday_Friday,-0.1180,0.224,-0.526,0.599,-0.557,0.321
Weekday_Monday,0.1577,0.230,0.684,0.494,-0.294,0.609
Weekday_Saturday,-0.2550,0.225,-1.133,0.257,-0.696,0.186
Weekday_Sunday,-0.1869,0.231,-0.810,0.418,-0.639,0.265
Weekday_Thursday,-0.0587,0.225,-0.261,0.794,-0.500,0.382
Weekday_Tuesday,0.0264,0.228,0.116,0.908,-0.421,0.474
Weekday_Wednesday,-0.0282,0.227,-0.125,0.901,-0.472,0.416
ar.L1,1.4049,0.027,51.898,0.000,1.352,1.458
ar.L2,-0.2106,0.035,-5.989,0.000,-0.280,-0.142

0,1,2,3
Ljung-Box (Q):,446.94,Jarque-Bera (JB):,13044136.12
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.21,Skew:,-4.38
Prob(H) (two-sided):,0.0,Kurtosis:,266.26


In [91]:
results_model_VAR_F_Exo.aic

-2502.978198876587

In [83]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-3:]

In [92]:
z_F_Exo = results_model_VAR_F_Exo.forecast(steps=672, exog=testF_Exo_feature) 



Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.



In [93]:
z_F_Exo = pd.DataFrame(z_F_Exo)

In [94]:
df_forecast_VAR_F_Exo = pca.inverse_transform(z_F_Exo)

In [95]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(df_forecast_VAR_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [96]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [97]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo)))
print(mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100)
print(testF_Exo.mean().sum())

0.07871060181732027
3.7267366878644252
81.90959266258383


# 11. Dimensionality Reduction, PCA with retained 95% variance, Modelling VAR(96, trend="ctt",exog= Weekday), Train(5184), Test(672), RMSE = 0.04711870

In [24]:
from sklearn.decomposition import PCA

In [25]:
pca = PCA(0.95)

In [26]:
train_2D = pca.fit_transform(grab_train[:-672])

In [27]:
pca.explained_variance_ratio_.sum()

0.9501405036970398

In [28]:
pca.explained_variance_ratio_

array([6.51803521e-01, 6.13084516e-02, 4.39834237e-02, 2.70607501e-02,
       2.43508716e-02, 1.75100875e-02, 1.29762158e-02, 8.40748785e-03,
       6.61011213e-03, 6.24463943e-03, 4.81676383e-03, 4.72410334e-03,
       3.87567708e-03, 3.64940584e-03, 3.12047035e-03, 2.69894027e-03,
       2.48680843e-03, 2.22087156e-03, 2.08501822e-03, 2.01848902e-03,
       1.88598121e-03, 1.63494691e-03, 1.52169704e-03, 1.50807216e-03,
       1.41445432e-03, 1.32655326e-03, 1.28530012e-03, 1.20351730e-03,
       1.16560559e-03, 1.12709808e-03, 1.10392382e-03, 1.06164220e-03,
       9.99131901e-04, 9.80129237e-04, 9.58076150e-04, 9.36822241e-04,
       8.99536398e-04, 8.77785092e-04, 8.50523352e-04, 8.15951215e-04,
       7.82523108e-04, 7.78746326e-04, 7.59420882e-04, 7.41447503e-04,
       7.20550428e-04, 7.11871780e-04, 6.90015780e-04, 6.76431463e-04,
       6.67984519e-04, 6.57261678e-04, 6.38505613e-04, 6.21709019e-04,
       6.09059280e-04, 5.98429760e-04, 5.90245775e-04, 5.80509339e-04,
      

In [29]:
pca.explained_variance_ratio_.shape

(117,)

In [32]:
train_2D = pd.DataFrame(train_2D)

In [33]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [34]:
train_2D.shape

(5184, 117)

In [36]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
trainF_Exo = train_2D
testF_Exo = grab_train[-nobs:]

In [37]:
trainF_Exo_feature = trainF_Exo.copy()
testF_Exo_feature = testF_Exo.copy()

In [38]:
trainF_Exo_feature['Weekday'] = trainF_Exo_feature.index.weekday_name
testF_Exo_feature['Weekday'] = testF_Exo_feature.index.weekday_name

In [39]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [40]:
trainF_Exo_feature = create_dummies(trainF_Exo_feature,'Weekday')
testF_Exo_feature = create_dummies(testF_Exo_feature,'Weekday')

In [42]:
testF_Exo_feature.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0dnj', 'qp0dnn', 'Weekday', 'Weekday_Friday', 'Weekday_Monday',
       'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday',
       'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object', length=1337)

In [43]:
trainF_Exo_feature = trainF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
testF_Exo_feature = testF_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [44]:
trainF_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-01-22 19:45:00,0,0,0,0,0,1,0
2019-01-12 15:15:00,0,0,1,0,0,0,0
2019-02-06 22:45:00,0,0,0,0,0,0,1
2019-01-29 14:45:00,0,0,0,0,0,1,0
2019-01-15 13:00:00,0,0,0,0,0,1,0


In [50]:
model_VAR_F_Exo = VAR(trainF_Exo, exog=trainF_Exo_feature)

In [60]:
results_model_VAR_F_Exo = model_VAR_F_Exo.fit(maxlags=672, trend="ctt") 

In [61]:
results_model_VAR_F_Exo.aic

-2672.2772384458553

In [62]:
lagged_values_VAR_F_Exo = trainF_Exo.values[-672:]

In [63]:
z_F_Exo = results_model_VAR_F_Exo.forecast(y=lagged_values_VAR_F_Exo, steps=672, exog_future=testF_Exo_feature) 
df_forecast_VAR_F_Exo = pca.inverse_transform(z_F_Exo)

In [64]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_F_Exo = pd.DataFrame(df_forecast_VAR_F_Exo, index=idxH_F_Exo, columns=testF_Exo.columns)

In [65]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo < 0] = 0
df_forecast_VAR_F_Exo[df_forecast_VAR_F_Exo > 1] = 1

In [82]:
#The RMSE for the forecasted values and test dataset
RMSE = np.sqrt(mean_squared_error(testF_Exo,df_forecast_VAR_F_Exo))
MAE = mean_absolute_error(testF_Exo,df_forecast_VAR_F_Exo)*100
Test_dataset_mean = testF_Exo.mean().sum()
RMSE_divide_Test_mean = (RMSE / Test_dataset_mean) * 100
relative_error_in_100_percentage = 100 - RMSE_divide_Test_mean
print(RMSE)
print(MAE)
print(Test_dataset_mean)
print(RMSE_divide_Test_mean)
print(relative_error_in_100_percentage)

0.041840895707627154
2.1388077664917358
81.90959266258383
0.05108180171275593
99.94891819828725
