# Summary
## Experiments with different models to forecast next 672 timestamp values and compare models results with the evaluation metric RMSE. Results show as follow:

| Features Set | Model (4512 training dataset( | Model (2688 training dataset) |
| --- | --- | --- |
| PCA > 5 Dimension only | VAR(96, "ctt") | VAR(96, "ctt") |
| RMSE: | 0.04906459| 0.05020833 |
| PCA > 5 Dimension only, with Exog:Weekday | VAR(96, "ctt")| VAR(96, "ctt") |
| RMSE: | 0.04711870| 0.04785709 |
| Full(Original) Dimension | VAR(14, "ctt")| VAR(14, "ctt") |
| RMSE: | 0.06077872| 0.06136112 |
| Full(Original) Dimension, with Exog:Weekday | VAR(14, "ctt")| VAR(14, "ctt") |
| RMSE: | 0.50961934| 0.41309167 |
| PCA > 5 Dimension only | VARMA(3,1)|
| RMSE: | 0.07555706|
| PCA > 1 Dimension only | SARIMAX(3,0,1)x(2,0,0,4)|
| RMSE: | 0.07861528|
| PCA with retained 95% variation, with Exog:Weekday | VAR(672, "ctt")|
| RMSE: | 0.04184089|
| PCA with retained 95% variation | VAR(672, "ctt")|
| RMSE: | 0.04187778|


In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from sklearn.decomposition import PCA
from statsmodels.tsa.api import VAR
from statsmodels.tsa.statespace.varmax import VARMAX, VARMAXResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error,mean_absolute_error
from pmdarima import auto_arima
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 50)

# Data Preprocessing

In [2]:
Grab = pd.read_csv('GrabAssignment.csv')

In [3]:
Grab.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [4]:
Grab.shape

(4206321, 4)

In [5]:
Grab.columns

Index(['geohash6', 'day', 'timestamp', 'demand'], dtype='object')

In [6]:
Grab.dtypes

geohash6      object
day            int64
timestamp     object
demand       float64
dtype: object

In [7]:
# Each day should have 96 timestamp (15 min), here to check out which day didnt have complete timestamp cycle
a_list = []
for c in range(1,61):
    lens = len(Grab[Grab["day"] == c].timestamp.value_counts())
    if lens != 96:
        a_list.append(c)
print(a_list)

[18]


In [8]:
#Regroup data to make the day and timestamp in order and unstack geohash
GrabFull = Grab.groupby(['geohash6', 'day', 'timestamp'])['demand'].mean().unstack('geohash6')

In [9]:
GrabFull.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0djv', 'qp0djw', 'qp0djy', 'qp0dn0', 'qp0dn1', 'qp0dn4', 'qp0dn5',
       'qp0dnh', 'qp0dnj', 'qp0dnn'],
      dtype='object', name='geohash6', length=1329)

In [10]:
# Creating the DataFrame to fillup the NaN value in Raw Data to complete timestamp cycle for timeseries forecasting
fillup_ts = pd.DataFrame(GrabFull.columns)
fillup_ts["day"] = 18
fillup_ts["demand"] = np.NaN

In [11]:
fillup_ts

Unnamed: 0,geohash6,day,demand
0,qp02yc,18,
1,qp02yf,18,
2,qp02yu,18,
3,qp02yv,18,
4,qp02yy,18,
5,qp02yz,18,
6,qp02z1,18,
7,qp02z3,18,
8,qp02z4,18,
9,qp02z5,18,


In [12]:
fillup_ts_str = pd.Series([str("9:45"),str("10:0"),str("10:15"),str("12:45"),str("11:30"),str("11:45"),str("12:0"),str("12:15"),str("12:30")])

In [13]:
fillup_ts_full = pd.DataFrame()
for ts in fillup_ts_str:
    fillup_ts["timestamp"] = ts
    fillup_ts_full = pd.concat([fillup_ts_full,fillup_ts], ignore_index=True)
    

In [14]:
fillup_ts_full

Unnamed: 0,geohash6,day,demand,timestamp
0,qp02yc,18,,9:45
1,qp02yf,18,,9:45
2,qp02yu,18,,9:45
3,qp02yv,18,,9:45
4,qp02yy,18,,9:45
5,qp02yz,18,,9:45
6,qp02z1,18,,9:45
7,qp02z3,18,,9:45
8,qp02z4,18,,9:45
9,qp02z5,18,,9:45


In [15]:
Grab_done = pd.concat([Grab,fillup_ts_full], ignore_index=True, sort=False)

In [16]:
Grab_done

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468
5,qp03tu,1,12:15,0.023843
6,qp096d,25,3:30,0.007460
7,qp03nr,51,20:45,0.000293
8,qp093r,48,6:15,0.054170
9,qp03r2,4,22:15,0.123463


In [17]:
Grab_done['timestamp'] = pd.to_datetime(Grab_done['timestamp'], format='%H:%M').dt.time

In [18]:
grab_train = Grab_done.groupby(['geohash6', 'day', 'timestamp'])['demand'].mean().unstack('geohash6')

In [19]:
grab_train.shape

(5856, 1329)

In [20]:
grab_train.isnull().sum().sum()

3576303

In [21]:
#Merge the consecutive day and timestamp into one time series with a assigned datetime and 15min frequency
grab_train = grab_train.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5856, freq='15min'))

In [22]:
grab_train = grab_train.fillna(0)

In [23]:
grab_train.head(10)

geohash6,qp02yc,qp02yf,qp02yu,qp02yv,qp02yy,qp02yz,qp02z1,qp02z3,qp02z4,qp02z5,qp02z6,qp02z7,qp02z9,qp02zc,qp02zd,qp02ze,qp02zf,qp02zg,qp02zh,qp02zj,qp02zk,qp02zm,qp02zn,qp02zp,qp02zq,...,qp0djb,qp0djc,qp0djd,qp0dje,qp0djf,qp0djg,qp0djh,qp0djj,qp0djk,qp0djm,qp0djn,qp0djq,qp0djs,qp0djt,qp0dju,qp0djv,qp0djw,qp0djy,qp0dn0,qp0dn1,qp0dn4,qp0dn5,qp0dnh,qp0dnj,qp0dnn
2019-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009482,0.0,0.0,0.0,0.003641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0006,0.0,0.0,0.0,0.038979,0.0,0.010554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012865,0.0,0.0,0.0,0.001304,0.0,0.002454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004056,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00491,0.0,0.0,0.0,0.020167,0.0,0.0,0.0,0.0,0.000503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009381,0.0
2019-01-01 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070525,0.001295,0.0,0.0,0.008541,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083947,0.0,0.045482,0.047912,0.006503,0.0,0.0,0.0,0.049005,0.047575,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009687,0.0,0.051739,0.0,0.010428,0.0,0.0,0.0,0.0,0.0,0.008253,0.0,0.0,0.0,0.0,0.0,0.002701,0.0
2019-01-01 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045083,0.0,0.0,0.031414,0.0,0.0,0.0,0.0,0.0,0.0,0.060752,0.0,0.0,...,0.0,0.0,0.011394,0.0,0.0,0.0,0.009599,0.017358,0.0,0.0,0.0,0.020719,0.0,0.0,0.0,0.0,0.0,0.006971,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 01:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.099317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04836,0.000954,0.0,0.0,0.0,0.0,0.0,0.104351,0.075672,0.00063,0.066387,...,0.0,0.0,0.0,0.000471,0.012844,0.0,0.027026,0.029316,0.007121,0.005744,0.0,0.0,0.000993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019563,0.0
2019-01-01 01:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011855,0.003631,0.0,0.0,0.039394,0.0,0.0,0.0,0.0,0.019048,0.03685,0.079985,0.055821,0.071413,...,0.0,0.0,0.0,0.0,0.003883,0.0,0.004794,0.0,0.019596,0.031227,0.0,0.02307,0.000721,0.0,0.0,0.0,0.0,0.001858,0.0,0.0,0.0,0.0,0.0,0.007193,0.0
2019-01-01 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.029813,0.0,0.0,0.0,0.0,0.0,0.045931,0.0,0.0,0.0,0.0,0.02393,0.0,0.082367,0.0,0.019131,0.022062,0.036493,0.012899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016685,0.024294,0.034116,0.018435,0.0,0.011025,0.005066,0.011381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-01-01 02:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.099991,0.0,0.0,0.0,0.0,0.019614,0.019863,0.0,0.016487,0.0,0.0,0.044045,0.014075,0.237685,0.0,0.042676,0.124104,0.038605,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000553,0.006702,0.032528,0.007979,0.0,0.0,0.019812,0.002717,0.0,0.0,0.0,0.008936,0.0,0.0,0.0,0.0,0.0,0.007224,0.0


# Dimensionality Reduction, PCA to 5D with retained 80% variance

In [24]:
from sklearn.decomposition import PCA

In [25]:
pca = PCA(n_components=5)

In [26]:
train_2D = pca.fit_transform(grab_train[0:-672])

In [27]:
pca.explained_variance_ratio_.sum()

0.8085070179723178

In [28]:
pca.explained_variance_ratio_

array([0.65180352, 0.06130845, 0.04398342, 0.02706075, 0.02435087])

In [29]:
train_2D = pd.DataFrame(data = train_2D, columns=["z1","z2","z3","z4","z5"])

In [30]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [31]:
train_2D.shape

(5184, 5)

# 1. Model VAR(96, trend="ctt"), Train(4512), Test(672), PCA > 5D, RMSE = 0.04906459

In [32]:
def adf_test(series,title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data
    
    labels = ['ADF test statistic','p-value','# lags used','# observations']
    out = pd.Series(result[0:4],index=labels)

    for key,val in result[4].items():
        out[f'critical value ({key})']=val
        
    print(out.to_string())          # .to_string() removes the line "dtype: float64"
    
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

In [33]:
for c in train_2D.columns: # All z1-z5 data stationary
    adf_test(train_2D[c],title=c)

Augmented Dickey-Fuller Test: z1
ADF test statistic     -1.518328e+01
p-value                 6.154571e-28
# lags used             3.200000e+01
# observations          5.151000e+03
critical value (1%)    -3.431620e+00
critical value (5%)    -2.862101e+00
critical value (10%)   -2.567069e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
Augmented Dickey-Fuller Test: z2
ADF test statistic     -1.008157e+01
p-value                 1.184835e-17
# lags used             2.800000e+01
# observations          5.155000e+03
critical value (1%)    -3.431619e+00
critical value (5%)    -2.862101e+00
critical value (10%)   -2.567069e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
Augmented Dickey-Fuller Test: z3
ADF test statistic     -1.263919e+01
p-value                 1.447476e-23
# lags used             3.300000e+01
# observations          5.150000e+03
critical valu

In [34]:
#Let the test dataset be one week dataset (672 samples in 15min timestamps) as it is the cycle of weekday & weekend
nobs=672    
train1 = train_2D
test1 = grab_train[-nobs:]

In [35]:
model_VAR1 = VAR(train1)

In [36]:
#Iterative loop the lag parameter around 1 weeks time, in order to choose a better lag parameters
aic_dict = {}
for i in range(94,97):
    results = model_VAR1.fit(i)
    aic_dict[i] = results.aic

In [37]:
model_VAR1.endog_names

['z1', 'z2', 'z3', 'z4', 'z5']

In [38]:
# To return the key_values, where it value are smallest
min(aic_dict.keys(), key=aic_dict.get)

96

In [39]:
# To return the max & min value in a dictionary object
key_max = max(aic_dict.keys(), key=(lambda k: aic_dict[k]))
key_min = min(aic_dict.keys(), key=(lambda k: aic_dict[k]))
print('Maximum Value: ',aic_dict[key_max])
print('Minimum Value: ',aic_dict[key_min])

Maximum Value:  -24.90994772167914
Minimum Value:  -24.921510414060247


In [40]:
results_VAR1 = model_VAR1.fit(maxlags=96, trend="ctt") 
results_VAR1.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Sep, 2019
Time:                     21:16:06
--------------------------------------------------------------------
No. of Equations:         5.00000    BIC:                   -21.8211
Nobs:                     5088.00    HQIC:                  -23.8366
Log likelihood:           29720.7    FPE:                1.50463e-11
AIC:                     -24.9228    Det(Omega_mle):     9.56091e-12
--------------------------------------------------------------------
Results for equation z1
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const           -0.001221         0.006817           -0.179           0.858
trend           -0.000012         0.000007           -1.663           0.096
trend**2         0.000000         0.000000            2.309           0.021
L1.z1  

In [41]:
results_VAR1.k_ar

96

In [42]:
results_VAR1.aic

-24.922756654609923

In [43]:
lagged_values_VAR1 = train1.values[-96:]

In [44]:
z = results_VAR1.forecast(y=lagged_values_VAR1, steps=672) 
df_forecast_VAR1 = pca.inverse_transform(z)

In [45]:
#Set the time frame to be same as test dataset
idxH = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR1 = pd.DataFrame(df_forecast_VAR1, index=idxH, columns=test1.columns)

In [46]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR1[df_forecast_VAR1 < 0] = 0
df_forecast_VAR1[df_forecast_VAR1 > 1] = 1

In [47]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test1,df_forecast_VAR1)))
print(mean_absolute_error(test1,df_forecast_VAR1)*100)
print(test1.mean().sum())

0.04694127654459222
2.30220419391297
81.90959266258383


In [48]:
#The model residual mean
#Residual mean close to zero which is a good indication for the forecasting
np.mean(results_VAR1.resid).sum()

-5.518164463923828e-12

In [None]:
#Visualise the forecast values and dataset values for all of the geohash6
df_forecast_VAR1.iplot()
test1.iplot()

# 2. Model VAR(96, trend="ctt"), Train(2688), Test(672), PCA > 5D, RMSE = 0.05020833

In [49]:
#Let the train dataset be recently 1 month dataset to see whether perform better or not
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train2 = train_2D[-2688:]
test2 = grab_train[-nobs:]

In [50]:
model_VAR_2 = VAR(train2)

In [51]:
results_VAR_2 = model_VAR_2.fit(maxlags=96, trend="ctt") 
results_VAR_2.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Sep, 2019
Time:                     21:16:08
--------------------------------------------------------------------
No. of Equations:         5.00000    BIC:                   -19.8070
Nobs:                     2592.00    HQIC:                  -23.2884
Log likelihood:           16771.6    FPE:                1.08704e-11
AIC:                     -25.2670    Det(Omega_mle):     4.62590e-12
--------------------------------------------------------------------
Results for equation z1
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const           -0.027750         0.009559           -2.903           0.004
trend            0.000062         0.000019            3.322           0.001
trend**2        -0.000000         0.000000           -2.941           0.003
L1.z1  

In [52]:
results_VAR_2.k_ar

96

In [53]:
results_VAR_2.aic

-25.26700327847004

In [54]:
lagged_values_VAR_2 = train2.values[-96:]

In [55]:
z_2 = results_VAR_2.forecast(y=lagged_values_VAR_2, steps=672) 
df_forecast_VAR_2 = pca.inverse_transform(z_2)

In [56]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_2 = pd.DataFrame(df_forecast_VAR_2, index=idxH_H, columns=test2.columns)

In [57]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_2[df_forecast_VAR_2 < 0] = 0
df_forecast_VAR_2[df_forecast_VAR_2 > 1] = 1

In [58]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test2,df_forecast_VAR_2)))
print(mean_absolute_error(test2,df_forecast_VAR_2)*100)
print(test2.mean().sum())

0.05226352718881218
2.449121726183137
81.90959266258383


In [None]:
#Visualise the forecast values and dataset values for all of the geohash6
df_forecast_VAR_2.iplot()
test2.iplot()

# 3. Model VAR(96, trend="ctt",exog= Weekday), Train(4512), Test(672), PCA > 5D, RMSE = 0.04711870

In [59]:
nobs=672    
train_3 = train_2D
test_3 = grab_train[-nobs:]

In [60]:
train_3_Exo_feature = train_3.copy()
test_3_Exo_feature = test_3.copy()

In [61]:
train_3_Exo_feature['Weekday'] = train_3_Exo_feature.index.weekday_name
test_3_Exo_feature['Weekday'] = test_3_Exo_feature.index.weekday_name

In [62]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [63]:
train_3_Exo_feature = create_dummies(train_3_Exo_feature,'Weekday')
test_3_Exo_feature = create_dummies(test_3_Exo_feature,'Weekday')

In [64]:
train_3_Exo_feature.columns

Index(['z1', 'z2', 'z3', 'z4', 'z5', 'Weekday', 'Weekday_Friday',
       'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday',
       'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object')

In [65]:
train_3_Exo_feature = train_3_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_3_Exo_feature = test_3_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [66]:
train_3_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-01-04 08:15:00,1,0,0,0,0,0,0
2019-01-01 15:30:00,0,0,0,0,0,1,0
2019-02-01 01:00:00,1,0,0,0,0,0,0
2019-02-05 03:30:00,0,0,0,0,0,1,0
2019-02-15 03:00:00,1,0,0,0,0,0,0


In [67]:
model_VAR_3 = VAR(train_3, exog=train_3_Exo_feature)

In [68]:
results_model_VAR_3 = model_VAR_3.fit(maxlags=96, trend="ctt") 
results_model_VAR_3.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Sep, 2019
Time:                     21:16:11
--------------------------------------------------------------------
No. of Equations:         5.00000    BIC:                   -21.7938
Nobs:                     5088.00    HQIC:                  -23.8385
Log likelihood:           29800.5    FPE:                1.47857e-11
AIC:                     -24.9404    Det(Omega_mle):     9.33653e-12
--------------------------------------------------------------------
Results for equation z1
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const       484237.909627    228705.732735            2.117           0.034
trend           -0.000009         0.000008           -1.119           0.263
trend**2         0.000000         0.000000            1.649           0.099
exog0  

In [69]:
results_model_VAR_3.aic

-24.94035334934034

In [70]:
lagged_values_VAR_3 = train_3.values[-96:]

In [71]:
z_F_3 = results_model_VAR_3.forecast(y=lagged_values_VAR_3, steps=672, exog_future=test_3_Exo_feature) 
df_forecast_VAR_3 = pca.inverse_transform(z_F_3)

In [72]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_3 = pd.DataFrame(df_forecast_VAR_3, index=idxH_F_Exo, columns=test_3.columns)

In [73]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_3[df_forecast_VAR_3 < 0] = 0
df_forecast_VAR_3[df_forecast_VAR_3 > 1] = 1

In [74]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_3,df_forecast_VAR_3)))
print(mean_absolute_error(test_3,df_forecast_VAR_3)*100)
print(test_3.mean().sum())

0.04466204988745655
2.1836061461977474
81.90959266258383


# 4. Model VAR(96, trend="ctt",exog= Weekday), Train(2688), Test(672), PCA > 5D, RMSE = 0.04785709

In [75]:
#Let the train dataset be recently 1 month dataset to see whether perform better or not
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_4 = train_2D[-2688:]
test_4 = grab_train[-nobs:]

In [76]:
train_4_Exo_feature = train_4.copy()
test_4_Exo_feature = test_4.copy()

In [77]:
train_4_Exo_feature['Weekday'] = train_4_Exo_feature.index.weekday_name
test_4_Exo_feature['Weekday'] = test_4_Exo_feature.index.weekday_name

In [78]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [79]:
train_4_Exo_feature = create_dummies(train_4_Exo_feature,'Weekday')
test_4_Exo_feature = create_dummies(test_4_Exo_feature,'Weekday')

In [80]:
train_4_Exo_feature.columns

Index(['z1', 'z2', 'z3', 'z4', 'z5', 'Weekday', 'Weekday_Friday',
       'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday',
       'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object')

In [81]:
train_4_Exo_feature = train_4_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_4_Exo_feature = test_4_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [82]:
train_4_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-02-07 08:00:00,0,0,0,0,1,0,0
2019-02-20 20:00:00,0,0,0,0,0,0,1
2019-02-07 16:30:00,0,0,0,0,1,0,0
2019-01-29 07:15:00,0,0,0,0,0,1,0
2019-02-23 18:30:00,0,0,1,0,0,0,0


In [83]:
model_VAR_4 = VAR(train_4, exog=train_4_Exo_feature)

In [84]:
results_model_VAR_4 = model_VAR_4.fit(maxlags=96, trend="ctt") 
results_model_VAR_4.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Sep, 2019
Time:                     21:16:13
--------------------------------------------------------------------
No. of Equations:         5.00000    BIC:                   -19.7446
Nobs:                     2592.00    HQIC:                  -23.2765
Log likelihood:           16828.3    FPE:                1.07001e-11
AIC:                     -25.2838    Det(Omega_mle):     4.50192e-12
--------------------------------------------------------------------
Results for equation z1
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const           -0.037605    172539.969283           -0.000           1.000
trend            0.000098         0.000024            4.116           0.000
trend**2        -0.000000         0.000000           -3.797           0.000
exog0  

In [85]:
results_model_VAR_4.aic

-25.283787918655364

In [86]:
lagged_values_VAR_4 = train_4.values[-96:]

In [87]:
z_F_4 = results_model_VAR_4.forecast(y=lagged_values_VAR_4, steps=672, exog_future=test_4_Exo_feature) 
df_forecast_VAR_4 = pca.inverse_transform(z_F_4)

In [88]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_4 = pd.DataFrame(df_forecast_VAR_4, index=idxH_F_Exo, columns=test_4.columns)

In [89]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_4[df_forecast_VAR_4 < 0] = 0
df_forecast_VAR_4[df_forecast_VAR_4 > 1] = 1

In [90]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_4,df_forecast_VAR_4)))
print(mean_absolute_error(test_4,df_forecast_VAR_4)*100)
print(test_4.mean().sum())

0.04768886022222595
2.2528338981212914
81.90959266258383


# 5. Model VAR(14, trend="ctt"), Train(4512), Test(672), Full- Dimension, RMSE = 0.06077872

In [91]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_5 = grab_train[0:-nobs]
test_5 = grab_train[-nobs:]

In [92]:
# adding noise into diagonal to prevent matrix non-invertible issues, as the geohash had linear dependent relationship
np.fill_diagonal(train_5.values, train_5 + 1)

In [93]:
model_VAR_5 = VAR(train_5)

In [94]:
results_VAR_5 = model_VAR_5.fit(maxlags=14, trend="ctt")

In [95]:
results_VAR_5.k_ar

14

In [96]:
lagged_values_VAR_5 = train_5.values[-14:]

In [97]:
z_H_5 = results_VAR_5.forecast(y=lagged_values_VAR_5, steps=672)

In [98]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_5 = pd.DataFrame(z_H_5, index=idxH_H, columns=test_5.columns)

In [99]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_5[df_forecast_VAR_5 < 0] = 0
df_forecast_VAR_5[df_forecast_VAR_5 > 1] = 1

In [100]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_5,df_forecast_VAR_5)))
print(mean_absolute_error(test_5,df_forecast_VAR_5)*100)
print(test_5.mean().sum())

0.05836734824621788
2.881066645784354
81.90959266258383


# 6. Model VAR(14, trend="ctt"), Train(-3360:-672), Test(672), Full- Dimension, RMSE = 0.06136112

In [101]:
#Let the train dataset be recently 1 month dataset to see whether perform better or not
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672
train_6 = grab_train[-3360:-nobs]
test_6 = grab_train[-nobs:]

In [102]:
np.fill_diagonal(train_6.values, train_6 + 1)

In [103]:
model_VAR_6 = VAR(train_6)

In [104]:
results_VAR_6 = model_VAR_6.fit(maxlags=14, trend="ctt")

In [105]:
results_VAR_6.k_ar

14

In [106]:
lagged_values_VAR_6 = train_6.values[-14:]

In [107]:
z_H_6 = results_VAR_6.forecast(y=lagged_values_VAR_6, steps=672)

In [108]:
#Set the time frame to be same as test dataset
idxH_H = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_6 = pd.DataFrame(z_H_6, index=idxH_H, columns=test_6.columns)

In [109]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_6[df_forecast_VAR_6 < 0] = 0
df_forecast_VAR_6[df_forecast_VAR_6 > 1] = 1

In [110]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_6,df_forecast_VAR_6)))
print(mean_absolute_error(test_6,df_forecast_VAR_6)*100)
print(test_6.mean().sum())

0.06136112899637492
2.959571260362577
81.90959266258383


# 7. Model VAR(14, trend="ctt",exog= Weekday), Train(4512), Test(672), Full-Dimension, RMSE = 0.50961934

In [32]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_7 = grab_train[0:-nobs]
test_7 = grab_train[-nobs:]

In [33]:
train_7_Exo_feature = train_7.copy()
test_7_Exo_feature = test_7.copy()

In [34]:
train_7_Exo_feature['Weekday'] = train_7_Exo_feature.index.weekday_name
test_7_Exo_feature['Weekday'] = test_7_Exo_feature.index.weekday_name

In [35]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [36]:
train_7_Exo_feature = create_dummies(train_7_Exo_feature,'Weekday')
test_7_Exo_feature = create_dummies(test_7_Exo_feature,'Weekday')

In [37]:
train_7_Exo_feature.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0dnj', 'qp0dnn', 'Weekday', 'Weekday_Friday', 'Weekday_Monday',
       'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday',
       'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object', length=1337)

In [38]:
train_7_Exo_feature = train_7_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_7_Exo_feature = test_7_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [39]:
train_7_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-02-23 23:45:00,0,0,1,0,0,0,0
2019-01-01 19:30:00,0,0,0,0,0,1,0
2019-01-01 12:15:00,0,0,0,0,0,1,0
2019-02-12 14:30:00,0,0,0,0,0,1,0
2019-01-02 18:45:00,0,0,0,0,0,0,1


In [40]:
model_VAR_7 = VAR(train_7, exog=train_7_Exo_feature)

In [41]:
results_model_VAR_7 = model_VAR_7.fit(maxlags=14, trend="ctt") 

In [42]:
lagged_values_VAR_7 = train_7.values[-14:]

In [43]:
z_F_7 = results_model_VAR_7.forecast(y=lagged_values_VAR_7, steps=672, exog_future=test_7_Exo_feature) 

In [44]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_7 = pd.DataFrame(z_F_7, index=idxH_F_Exo, columns=test_7.columns)

In [45]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_7[df_forecast_VAR_7 < 0] = 0
df_forecast_VAR_7[df_forecast_VAR_7 > 1] = 1

In [46]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_7,df_forecast_VAR_7)))
print(mean_absolute_error(test_7,df_forecast_VAR_7)*100)
print(test_7.mean().sum())

0.47956625321448615
27.76232582544965
81.90959266258383


# 8. Model VAR(14, trend="ctt",exog= Weekday), Train(-3360:-672), Test(672), Full-Dimension, RMSE = 0.41309167

In [59]:
#Let the train dataset be recently 1 month dataset to see whether perform better or not
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_8 = grab_train[-3360:-nobs]
test_8 = grab_train[-nobs:]

In [60]:
np.fill_diagonal(train_8.values, train_8 + 1)

In [61]:
train_8_Exo_feature = train_8.copy()
test_8_Exo_feature = test_8.copy()

In [62]:
train_8_Exo_feature['Weekday'] = train_8_Exo_feature.index.weekday_name
test_8_Exo_feature['Weekday'] = test_8_Exo_feature.index.weekday_name

In [63]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [64]:
train_8_Exo_feature = create_dummies(train_8_Exo_feature,'Weekday')
test_8_Exo_feature = create_dummies(test_8_Exo_feature,'Weekday')

In [65]:
train_8_Exo_feature.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0dnj', 'qp0dnn', 'Weekday', 'Weekday_Friday', 'Weekday_Monday',
       'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday',
       'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object', length=1337)

In [66]:
train_8_Exo_feature = train_8_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_8_Exo_feature = test_8_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [67]:
train_8_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-02-10 08:15:00,0,0,0,1,0,0,0
2019-02-14 18:00:00,0,0,0,0,1,0,0
2019-01-28 14:30:00,0,1,0,0,0,0,0
2019-02-02 12:45:00,0,0,1,0,0,0,0
2019-02-22 15:30:00,1,0,0,0,0,0,0


In [68]:
model_VAR_8 = VAR(train_8, exog=train_8_Exo_feature)

In [69]:
results_model_VAR_8 = model_VAR_8.fit(maxlags=14, trend="ctt") 

In [70]:
lagged_values_VAR_8 = train_8.values[-14:]

In [71]:
z_F_8 = results_model_VAR_8.forecast(y=lagged_values_VAR_8, steps=672, exog_future=test_8_Exo_feature) 

In [72]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_8 = pd.DataFrame(z_F_8, index=idxH_F_Exo, columns=test_8.columns)

In [73]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_8[df_forecast_VAR_8 < 0] = 0
df_forecast_VAR_8[df_forecast_VAR_8 > 1] = 1

In [74]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_8,df_forecast_VAR_8)))
print(mean_absolute_error(test_8,df_forecast_VAR_8)*100)
print(test_8.mean().sum())

0.41309167092070576
22.380996626660288
81.90959266258383


# 9. Model VARMA(order = (3, 1)), Train(4512), Test(672), 5D-Dimension, RMSE = 0.07555706

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_9 = train_2D
test_9 = grab_train[-nobs:]

In [None]:
train_9_Exo_feature = train_9.copy()
test_9_Exo_feature = test_9.copy()

In [None]:
train_9_Exo_feature['Weekday'] = train_9_Exo_feature.index.weekday_name
test_9_Exo_feature['Weekday'] = test_9_Exo_feature.index.weekday_name

In [None]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [None]:
train_9_Exo_feature = create_dummies(train_9_Exo_feature,'Weekday')
test_9_Exo_feature = create_dummies(test_9_Exo_feature,'Weekday')

In [None]:
train_9_Exo_feature.columns

In [None]:
train_9_Exo_feature = train_9_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_9_Exo_feature = test_9_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [None]:
train_9_Exo_feature.sample(5)

In [None]:
model_VARMA_9 = VARMAX(train_9, order=(672,1), enforce_stationarity=False, exog=train_9_Exo_feature)

In [None]:
results_model_VARMA_9 = model_VARMA_9.fit(maxlags= 672) 

In [None]:
results_model_VARMA_9.aic

In [None]:
lagged_values_VARMA_9 = train_9.values[-672:]

In [None]:
z_F_9 = results_model_VARMA_9.forecast(y=lagged_values_VARMA_9, steps=672) 
df_forecast_VARMA_9 = pca.inverse_transform(z_F_9)

In [None]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VARMA_9 = pd.DataFrame(df_forecast_VARMA_9, index=idxH_F_Exo, columns=test_9.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VARMA_9[df_forecast_VARMA_9 < 0] = 0
df_forecast_VARMA_9[df_forecast_VARMA_9 > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_9,df_forecast_VARMA_9)))
print(mean_absolute_error(test_9,df_forecast_VARMA_9)*100)
print(test_9.mean().sum())

# 10. Model SARIMAX(order(3,0,1), seasonal_order(2,0,0,4)) with Exog, Train(4512), Test(672), 1D-Dimension, RMSE = 0.07861528

In [None]:
pca = PCA(n_components=1)

In [None]:
train_2D = pca.fit_transform(grab_train[0:-672])

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
pca.explained_variance_ratio_

In [None]:
train_2D = pd.DataFrame(data = train_2D, columns=["z1"])

In [None]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [None]:
train_2D.shape

In [None]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_10 = train_2D
test_10 = grab_train[-nobs:]

In [None]:
train_10_Exo_feature = train_10.copy()
test_10_Exo_feature = test_10.copy()

In [None]:
train_10_Exo_feature['Weekday'] = train_10_Exo_feature.index.weekday_name
test_10_Exo_feature['Weekday'] = test_10_Exo_feature.index.weekday_name

In [None]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [None]:
train_10_Exo_feature = create_dummies(train_10_Exo_feature,'Weekday')
test_10_Exo_feature = create_dummies(test_10_Exo_feature,'Weekday')

In [None]:
train_10_Exo_feature.columns

In [None]:
train_10_Exo_feature = train_10_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_10_Exo_feature = test_10_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [None]:
train_10_Exo_feature.sample(5)

In [None]:
auto_arima(train_10['z1'],seasonal=True,m = 4, exogenous=train_10_Exo_feature)

In [None]:
model_SARIMAX_10 = SARIMAX(train_10['z1'],exog=train_10_Exo_feature,order=(3,0,1),seasonal_order=(2,0,0,4))
results_model_SARIMAX_10 = model_SARIMAX_10.fit()
results_model_SARIMAX_10.summary()

In [None]:
results_model_SARIMAX_10.aic

In [None]:
lagged_values_SARIMAX_10 = train_10.values[-3:]

In [None]:
z_F_10 = results_model_SARIMAX_10.forecast(steps=672, exog=test_10_Exo_feature) 


In [None]:
z_F_10 = pd.DataFrame(z_F_10)

In [None]:
df_forecast_SARIMAX_10 = pca.inverse_transform(z_F_10)

In [None]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_SARIMAX_10 = pd.DataFrame(df_forecast_SARIMAX_10, index=idxH_F_Exo, columns=test_10.columns)

In [None]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_SARIMAX_10[df_forecast_SARIMAX_10 < 0] = 0
df_forecast_SARIMAX_10[df_forecast_SARIMAX_10 > 1] = 1

In [None]:
#The RMSE for the forecasted values and test dataset
print(np.sqrt(mean_squared_error(test_10,df_forecast_SARIMAX_10)))
print(mean_absolute_error(test_10,df_forecast_SARIMAX_10)*100)
print(test_10.mean().sum())

# 11. Model VAR(672, trend="ctt",exog= Weekday), Dimensionality Reduction, PCA with retained 95% variance, Train(4512), Test(672), RMSE = 0.04184089

In [36]:
from sklearn.decomposition import PCA

In [37]:
pca = PCA(0.95)

In [38]:
train_2D = pca.fit_transform(grab_train[:-672])

In [39]:
pca.explained_variance_ratio_.sum()

0.9501405036970398

In [40]:
pca.explained_variance_ratio_

array([6.51803521e-01, 6.13084516e-02, 4.39834237e-02, 2.70607501e-02,
       2.43508716e-02, 1.75100875e-02, 1.29762158e-02, 8.40748785e-03,
       6.61011213e-03, 6.24463943e-03, 4.81676383e-03, 4.72410334e-03,
       3.87567708e-03, 3.64940584e-03, 3.12047035e-03, 2.69894027e-03,
       2.48680843e-03, 2.22087156e-03, 2.08501822e-03, 2.01848902e-03,
       1.88598121e-03, 1.63494691e-03, 1.52169704e-03, 1.50807216e-03,
       1.41445432e-03, 1.32655326e-03, 1.28530012e-03, 1.20351730e-03,
       1.16560559e-03, 1.12709808e-03, 1.10392382e-03, 1.06164220e-03,
       9.99131901e-04, 9.80129237e-04, 9.58076150e-04, 9.36822241e-04,
       8.99536398e-04, 8.77785092e-04, 8.50523352e-04, 8.15951215e-04,
       7.82523108e-04, 7.78746326e-04, 7.59420882e-04, 7.41447503e-04,
       7.20550428e-04, 7.11871780e-04, 6.90015780e-04, 6.76431463e-04,
       6.67984519e-04, 6.57261678e-04, 6.38505613e-04, 6.21709019e-04,
       6.09059280e-04, 5.98429760e-04, 5.90245775e-04, 5.80509339e-04,
      

In [42]:
pca.explained_variance_ratio_.shape

(117,)

In [43]:
train_2D = pd.DataFrame(train_2D)

In [44]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [45]:
train_2D.shape

(5184, 117)

In [46]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_11 = train_2D
test_11 = grab_train[-nobs:]

In [47]:
train_11_Exo_feature = train_11.copy()
test_11_Exo_feature = test_11.copy()

In [48]:
train_11_Exo_feature['Weekday'] = train_11_Exo_feature.index.weekday_name
test_11_Exo_feature['Weekday'] = test_11_Exo_feature.index.weekday_name

In [49]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [50]:
train_11_Exo_feature = create_dummies(train_11_Exo_feature,'Weekday')
test_11_Exo_feature = create_dummies(test_11_Exo_feature,'Weekday')

In [51]:
test_11_Exo_feature.columns

Index(['qp02yc', 'qp02yf', 'qp02yu', 'qp02yv', 'qp02yy', 'qp02yz', 'qp02z1',
       'qp02z3', 'qp02z4', 'qp02z5',
       ...
       'qp0dnj', 'qp0dnn', 'Weekday', 'Weekday_Friday', 'Weekday_Monday',
       'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday',
       'Weekday_Tuesday', 'Weekday_Wednesday'],
      dtype='object', length=1337)

In [52]:
train_11_Exo_feature = train_11_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']
test_11_Exo_feature = test_11_Exo_feature.loc[:,'Weekday_Friday':'Weekday_Wednesday']

In [53]:
train_11_Exo_feature.sample(5)

Unnamed: 0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
2019-02-19 06:00:00,0,0,0,0,0,1,0
2019-01-05 00:30:00,0,0,1,0,0,0,0
2019-01-26 15:00:00,0,0,1,0,0,0,0
2019-02-15 12:15:00,1,0,0,0,0,0,0
2019-02-18 09:15:00,0,1,0,0,0,0,0


In [54]:
model_VAR_11 = VAR(train_11, exog=train_11_Exo_feature)

In [55]:
results_model_VAR_11 = model_VAR_11.fit(maxlags=672, trend="ctt") 

In [56]:
results_model_VAR_11.aic

-2672.2772384458553

In [57]:
lagged_values_VAR_11 = train_11.values[-672:]

In [58]:
z_F_11 = results_model_VAR_11.forecast(y=lagged_values_VAR_11, steps=672, exog_future=test_11_Exo_feature) 
df_forecast_VAR_11 = pca.inverse_transform(z_F_11)

In [59]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_11 = pd.DataFrame(df_forecast_VAR_11, index=idxH_F_Exo, columns=test_11.columns)

In [60]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_11[df_forecast_VAR_11 < 0] = 0
df_forecast_VAR_11[df_forecast_VAR_11 > 1] = 1

In [61]:
#The RMSE for the forecasted values and test dataset
RMSE = np.sqrt(mean_squared_error(test_11,df_forecast_VAR_11))
MAE = mean_absolute_error(test_11,df_forecast_VAR_11)*100
Test_dataset_mean = test_11.mean().sum()
RMSE_divide_Test_mean = (RMSE / Test_dataset_mean) * 100
relative_error_in_100_percentage = 100 - RMSE_divide_Test_mean
print(RMSE)
print(MAE)
print(Test_dataset_mean)
print(RMSE_divide_Test_mean)
print(relative_error_in_100_percentage)

0.041840895707627154
2.1388077664917358
81.90959266258383
0.05108180171275593
99.94891819828725


# 12. Model VAR(672, trend="ctt"), Dimensionality Reduction, PCA with retained 95% variance, Train(4512), Test(672), RMSE = 0.04187778

In [62]:
from sklearn.decomposition import PCA

In [63]:
pca = PCA(0.95)

In [64]:
train_2D = pca.fit_transform(grab_train[:-672])

In [65]:
pca.explained_variance_ratio_.sum()

0.9501405036970398

In [66]:
pca.explained_variance_ratio_

array([6.51803521e-01, 6.13084516e-02, 4.39834237e-02, 2.70607501e-02,
       2.43508716e-02, 1.75100875e-02, 1.29762158e-02, 8.40748785e-03,
       6.61011213e-03, 6.24463943e-03, 4.81676383e-03, 4.72410334e-03,
       3.87567708e-03, 3.64940584e-03, 3.12047035e-03, 2.69894027e-03,
       2.48680843e-03, 2.22087156e-03, 2.08501822e-03, 2.01848902e-03,
       1.88598121e-03, 1.63494691e-03, 1.52169704e-03, 1.50807216e-03,
       1.41445432e-03, 1.32655326e-03, 1.28530012e-03, 1.20351730e-03,
       1.16560559e-03, 1.12709808e-03, 1.10392382e-03, 1.06164220e-03,
       9.99131901e-04, 9.80129237e-04, 9.58076150e-04, 9.36822241e-04,
       8.99536398e-04, 8.77785092e-04, 8.50523352e-04, 8.15951215e-04,
       7.82523108e-04, 7.78746326e-04, 7.59420882e-04, 7.41447503e-04,
       7.20550428e-04, 7.11871780e-04, 6.90015780e-04, 6.76431463e-04,
       6.67984519e-04, 6.57261678e-04, 6.38505613e-04, 6.21709019e-04,
       6.09059280e-04, 5.98429760e-04, 5.90245775e-04, 5.80509339e-04,
      

In [67]:
pca.explained_variance_ratio_.shape

(117,)

In [69]:
train_2D = pd.DataFrame(train_2D)

In [70]:
train_2D = train_2D.set_index(pd.date_range(datetime(2019, 1, 1, hour=0, minute=0), periods=5184, freq='15min'))

In [71]:
train_2D.shape

(5184, 117)

In [72]:
#Let the test dataset be one week dataset as it is the cycle of weekday & weekend
nobs=672    
train_12 = train_2D
test_12 = grab_train[-nobs:]

In [73]:
model_VAR_12 = VAR(train_12)

In [74]:
results_model_VAR_12 = model_VAR_12.fit(maxlags=672, trend="ctt") 

In [75]:
results_model_VAR_12.aic

-2657.9516952218105

In [76]:
lagged_values_VAR_12 = train_12.values[-672:]

In [77]:
z_F_12 = results_model_VAR_12.forecast(y=lagged_values_VAR_12, steps=672) 
df_forecast_VAR_12 = pca.inverse_transform(z_F_12)

In [78]:
#Set the time frame to be same as test dataset
idxH_F_Exo = pd.date_range(datetime(2019, 2, 24, hour=0, minute=0), periods=672, freq='15min')

#Modify variable z to be a data frame and have the same time frame and columns with the test dataset rather than a array
df_forecast_VAR_12 = pd.DataFrame(df_forecast_VAR_12, index=idxH_F_Exo, columns=test_12.columns)

In [79]:
#Refine the forecast value by setting the demand value between 0 and 1
df_forecast_VAR_12[df_forecast_VAR_12 < 0] = 0
df_forecast_VAR_12[df_forecast_VAR_12 > 1] = 1

In [80]:
#The RMSE for the forecasted values and test dataset
RMSE = np.sqrt(mean_squared_error(test_12,df_forecast_VAR_12))
MAE = mean_absolute_error(test_12,df_forecast_VAR_12)*100
Test_dataset_mean = test_12.mean().sum()
RMSE_divide_Test_mean = (RMSE / Test_dataset_mean) * 100
relative_error_in_100_percentage = 100 - RMSE_divide_Test_mean
print(RMSE)
print(MAE)
print(Test_dataset_mean)
print(RMSE_divide_Test_mean)
print(relative_error_in_100_percentage)

0.04187778452585649
2.1401231362945032
81.90959266258383
0.05112683772994295
99.94887316227006
