In [None]:
# Time-series analysis - March 2024 - with Tractor-Sales.csv data

In [None]:
# import needed common libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt 

In [None]:
import os
# Move into the "Documents" folder that holds the data file.
# The path is relative, so only change directory when we are not already
# inside a folder named "Documents"; this keeps the cell safe to re-run.
if os.path.basename(os.getcwd()) != "Documents":
    os.chdir("Documents")
os.getcwd()

In [None]:
# Load the tractor sales data from the CSV file in the working directory
df = pd.read_csv(
    'Tractor-Sales.csv'
)
# Preview the first five rows
df.head(n=5)

In [None]:
# Convert the Date column from 'Mon-YYYY' text to real datetime values.
# The datetime64 dtype is kept on purpose: formatting the result back to
# text with strftime would leave a column of strings, so any index built
# from it would not be date-aware, and re-running this cell would fail
# because the strings no longer match the '%b-%Y' format.
df['Date'] = pd.to_datetime(df['Date'], format='%b-%Y')

df

In [None]:
# Build the time series: sales values indexed by the Date column
sales_ts = pd.Series(
    data=df['Sales'].values,
    index=df['Date'],
    name='Sales',
)
sales_ts

In [None]:
# Visual check of the raw series: does it look stationary?
import matplotlib.pylab as plt
ax = sales_ts.plot()
# Label both axes in one call
ax.set(xlabel='Date', ylabel='Sales')
plt.show()

In [None]:
# Augmented Dickey-Fuller (ADF) test to check whether the series is stationary
# Null hypothesis: the series has a unit root (is non-stationary)
# p-value: a non-significant p-value (e.g. > 0.05) means non-stationarity cannot be rejected
# Test Statistic < Critical Value (i.e. more negative): reject the null hypothesis
# and conclude the series is stationary

from statsmodels.tsa.stattools import adfuller
print('Results of Dickey-Fuller Test:')
# Test every observation: sales_ts has not been differenced, so there is no
# leading missing value to skip.
dftest = adfuller(sales_ts.values, autolag='AIC')
# The first four entries of the result are the statistic, p-value, lag count and sample size
dfoutput = pd.Series(dftest[0:4],
                     index=['Test Statistic', 'p-value', '#lags used', 'Number of observations Used'])

# Entry 4 maps each significance level to its critical value
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' % key] = value
print(dfoutput)


In [None]:
# Stabilise the variance with a base-10 log transform, then plot the result
import numpy as np
sales_ts_log = np.log10(sales_ts)
ax = sales_ts_log.plot()
# Label both axes in one call
ax.set(xlabel='Date', ylabel='Log Transformed Sales')
plt.show()

In [None]:
# Plot the ACF (autocorrelation function) of the differenced log series
# to look for potential seasonality
import statsmodels.tsa.api as smt
# First difference; the leading NaN produced by diff() is dropped
sales_ts_log_diff = sales_ts_log.diff(periods=1).dropna()
smt.graphics.plot_acf(sales_ts_log_diff, lags=30, alpha=0.05)
plt.show()

In [None]:
# Partial autocorrelation (PACF) of the differenced log series:
# shows which lags are significant, with 95% confidence bands (alpha=0.05)
smt.graphics.plot_pacf(
    sales_ts_log_diff,
    lags=30,
    alpha=0.05,
)
plt.show()

In [None]:
# Split the log series chronologically into training and validation parts.
# The final 36 observations (three years of monthly data) are held out.
nValid = 36
nTrain = len(sales_ts_log) - nValid
train_ts, valid_ts = sales_ts_log[:nTrain], sales_ts_log[nTrain:]

In [None]:
# Fit a first, hand-picked SARIMA(1,1,1)x(1,0,1,12) model on the training data
import statsmodels.api as sm
temp_model = sm.tsa.statespace.SARIMAX(
    train_ts,
    order=(1, 1, 1),
    seasonal_order=(1, 0, 1, 12),
    enforce_stationarity=True,
    enforce_invertibility=True,
)
results = temp_model.fit()
# Show the fitted coefficients and diagnostics
print(results.summary())

In [None]:
# Forecast with the fitted model.
# One step per held-out observation, so the forecast always covers exactly
# the validation period (valid_ts).
pred = results.get_forecast(steps=len(valid_ts))

# Get the forecasted values
forecasted_values = pred.predicted_mean

# Get the confidence intervals for the forecasted values.
# The significance level is an argument of conf_int(); alpha=0.05 gives 95% intervals.
confidence_intervals = pred.conf_int(alpha=0.05)

# Print the forecasted values
print("Forecasted values:")
print(forecasted_values)

# Print the confidence intervals
print("\n95% confidence intervals:")
print(confidence_intervals)

In [None]:
# Build the candidate parameter grid for the SARIMA search.
# Each of p, d and q takes the values 0 and 1 (range(0, 2) excludes 2).
import itertools

p = d = q = range(0, 2)

# Every non-seasonal (p, d, q) triplet
pdq = [triplet for triplet in itertools.product(p, d, q)]

# Every seasonal (P, D, Q) triplet, each with a seasonal period of 12
seasonal_pdq = [(sp, sd, sq, 12) for sp, sd, sq in itertools.product(p, d, q)]

print(pdq)
print(seasonal_pdq)


In [None]:
import sys
import warnings

from datetime import datetime
print("Start search time:", datetime.now())

# Set up the initial values
best_aic = np.inf
best_pdq = None
best_seasonal_pdq = None
temp_model = None
# (order, seasonal_order, error) for every combination that could not be fitted
failed_fits = []

import statsmodels.api as sm
# Using loops to search for the best SARIMA model
# Choose the best SARIMA model based on minimum AIC values
# AIC: to balance the model between goodness-of-fit and number of parameters
#
# Warnings are silenced only inside this block, so later cells (for example
# the fit of the final model) still show them.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                temp_model = sm.tsa.statespace.SARIMAX(train_ts,
                                                order=param,
                                                seasonal_order=param_seasonal,
                                                enforce_stationarity=True,
                                                enforce_invertibility=True)
                # A dedicated name keeps the loop from overwriting `results`
                # set by an earlier cell; disp=False suppresses the optimizer's
                # per-fit convergence output.
                candidate_results = temp_model.fit(disp=False)
            except Exception as err:
                # Best-effort search: skip combinations that fail to fit, but
                # remember them instead of swallowing the error silently.
                # Catching Exception (not a bare except) lets Ctrl-C interrupt
                # the search.
                failed_fits.append((param, param_seasonal, err))
                continue

            if candidate_results.aic < best_aic:
                best_aic = candidate_results.aic
                best_pdq = param
                best_seasonal_pdq = param_seasonal

# best_seasonal_pdq already ends with the seasonal period, so it is not repeated here
print("Best SARIMAX{}x{} model - AIC:{}".format(best_pdq, best_seasonal_pdq, best_aic))
if failed_fits:
    print("Skipped {} parameter combination(s) that failed to fit".format(len(failed_fits)))

print("End search time:", datetime.now())

In [None]:
# Fit the selected SARIMA specification on the training data.
# NOTE(review): the orders are hard-coded here, presumably copied from the
# grid-search output -- confirm they match best_pdq / best_seasonal_pdq.
best_model = sm.tsa.statespace.SARIMAX(
    train_ts,
    order=(0, 1, 1),
    seasonal_order=(1, 0, 1, 12),
    enforce_stationarity=True,
    enforce_invertibility=True,
)
best_results = best_model.fit()

# Show the fitted coefficients and diagnostics
print(best_results.summary())


In [None]:
!pip install dmba

In [None]:
# Requires the dmba package (installed in the cell above).
from dmba import regressionSummary

# Score the selected model: forecast from best_results, one step per
# validation observation. A separate name is used so this cell does not rely
# on, or overwrite, whatever `pred` currently holds.
best_forecast = best_results.get_forecast(steps=len(valid_ts))

# The series was log10-transformed, so raise 10 to the power of both the
# actual and the forecast values to report errors in the original sales units.
regressionSummary(np.power(10, valid_ts),
                  np.power(10, best_forecast.predicted_mean))

In [None]:
# Forecast with the best model.
# One step per held-out observation, so the forecast always covers exactly
# the validation period (valid_ts).
pred = best_results.get_forecast(steps=len(valid_ts))

# Get the forecasted values
forecasted_values = pred.predicted_mean

# Get the confidence intervals for the forecasted values.
# The significance level is an argument of conf_int(); alpha=0.05 gives 95% intervals.
confidence_intervals = pred.conf_int(alpha=0.05)

# Print the forecasted values
print("Forecasted values:")
print(forecasted_values)

# Print the confidence intervals
print("\n95% confidence intervals:")
print(confidence_intervals)

In [None]:
# Plot the forecast mean (still on the log10 scale)
ax = pred.predicted_mean.plot()
# Label both axes in one call
ax.set(xlabel='Date', ylabel='Log Transformed Sales')
plt.show()