In [1]:
# importing packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
import random

In [2]:
periods, divider = 300, 6

# Example 1: Creating a DatetimeIndex with a frequency
start_date = '2023-01-01'
index_freq = '1H'  # Daily frequency

# Create DateTimeIndex at minute frequency
index = pd.date_range(start=start_date, periods=periods, freq=index_freq)
data = np.array([i+random.randint(1,4) for i in range(int(periods/divider)) for j in range(divider)])
df = pd.DataFrame(data, index=index, columns=['Value'])

## 1. Understanding Daily Trends

In [3]:
!pip install pmdarima
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf,  plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima.arima import auto_arima

Collecting pmdarima
  Downloading pmdarima-2.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.3
[0m

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split 
train_df, test_df = train_test_split(df, test_size = 0.2, shuffle=False)
# Perform additive seasonal decomposition
decomposition = sm.tsa.seasonal_decompose(df, model='additive')

# Access the decomposed components
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

fig = go.Figure()
fig.add_trace(go.Scatter(x=seasonal.index, y=seasonal, name='Daily Trends'))
fig.update_layout(
    height=600,
    width=800,
    showlegend=True,
    title='Daily Trends Plot'
)
fig.show()

# 2. Define the right model
Find the right model from the lowest AIC indicates better fit for the model. AIC describe good balance between accuracy in capturing the underlying patterns of the data and simplicity in terms of the number of parameters

In [5]:
from statsmodels.tsa.arima.model import ARIMA

In [6]:
import itertools
limit = 3
pdq = list(itertools.product(range(0, limit), range(1, limit), range(0, limit)))

aic, number = [],[]
number = []
for i in pdq:
    # Training the model
    model = ARIMA(train_df.values, order=(i))
    model_fit = model.fit()
    # Consolidate training label and metrics
    print(f'ARIMA({i}) AIC : {round(model_fit.aic,2)}')
    aic.append(round(model_fit.aic, 2))
    number.append(i)
    
model = ARIMA(train_df.values, order=number[aic.index(min(aic))])
model_fit = model.fit()
prediction = model_fit.forecast(len(test_df))
prediction_value = prediction
prediction_index = list(test_df.index)

ARIMA((0, 1, 0)) AIC : 897.44
ARIMA((0, 1, 1)) AIC : 810.78
ARIMA((0, 1, 2)) AIC : 810.85
ARIMA((0, 2, 0)) AIC : 1157.76



Non-invertible starting MA parameters found. Using zeros as starting parameters.



ARIMA((0, 2, 1)) AIC : 899.61
ARIMA((0, 2, 2)) AIC : 757.51
ARIMA((1, 1, 0)) AIC : 827.29
ARIMA((1, 1, 1)) AIC : 810.7
ARIMA((1, 1, 2)) AIC : 812.41
ARIMA((1, 2, 0)) AIC : 997.13
ARIMA((1, 2, 1)) AIC : 825.18
ARIMA((1, 2, 2)) AIC : 759.51
ARIMA((2, 1, 0)) AIC : 818.37
ARIMA((2, 1, 1)) AIC : 812.54



Maximum Likelihood optimization failed to converge. Check mle_retvals



ARIMA((2, 1, 2)) AIC : 760.87
ARIMA((2, 2, 0)) AIC : 938.69
ARIMA((2, 2, 1)) AIC : 812.06



Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.



ARIMA((2, 2, 2)) AIC : 760.65


In [7]:
model_fit.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,240.0
Model:,"ARIMA(0, 2, 2)",Log Likelihood,-375.755
Date:,"Thu, 06 Jul 2023",AIC,757.511
Time:,06:53:24,BIC,767.928
Sample:,0,HQIC,761.709
,- 240,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ma.L1,-1.9406,1.381,-1.405,0.160,-4.647,0.766
ma.L2,0.9407,1.298,0.725,0.468,-1.602,3.484
sigma2,1.3036,1.851,0.704,0.481,-2.324,4.931

0,1,2,3
Ljung-Box (L1) (Q):,0.02,Jarque-Bera (JB):,8.49
Prob(Q):,0.9,Prob(JB):,0.01
Heteroskedasticity (H):,0.9,Skew:,-0.21
Prob(H) (two-sided):,0.64,Kurtosis:,2.18


In [8]:
import plotly.graph_objects as go

# Create the figure
fig = go.Figure()

# Add traces to the figure
fig.add_trace(go.Scatter(x=train_df.index, y=train_df['Value'], name='Train Data'))
fig.add_trace(go.Scatter(x=prediction_index, y=prediction_value, name='Prediction', line=dict(color='red')))
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['Value'], name='Real Visit Data', line=dict(color='orange', dash='dash')))

# Update the layout
fig.update_layout(
    title='Water Consumption Projections',
    xaxis_title='Date',
    yaxis_title='Value',
    legend=dict(x=0, y=1, traceorder='normal'),
    width=800,
    height=400)

# Show the plot
fig.show()