In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error,
)
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

%config InlineBackend.figure_formats = 'svg'

In [None]:
# Dataset identifier: selects ../data/raw/<DATASET>.csv as the input file and
# prefixes the names of the saved figure and the pickled forecasts.
DATASET = "lzemx"

In [None]:
# Load the raw CSV for the selected dataset.
# NOTE(review): parse_dates=True only asks pandas to parse the *index*, and no
# index_col is given, so it likely has no effect here — confirm whether a date
# column was meant to be used as the index.
raw_csv_path = f"../data/raw/{DATASET}.csv"
df = pd.read_csv(raw_csv_path, parse_dates=True)

In [None]:
# Quick look at the raw closing-price series before modelling.
# The x-axis is the DataFrame index; it is labelled "Day" to match the final
# forecast figure — NOTE(review): confirm that rows are daily observations.
fig, ax = plt.subplots()
ax.plot(df["Close"])
ax.set_xlabel("Day")
ax.set_ylabel("Price")
ax.grid()
plt.show()

In [None]:
# Chronological split (shuffle=False keeps the time order): the last 20% of
# the series becomes the test set, then the last 25% of the remainder (20% of
# the total) becomes the validation set, i.e. roughly 60/20/20.
close_values = df["Close"].values
train_val, test = train_test_split(close_values, shuffle=False, test_size=0.2)
train, val = train_test_split(train_val, shuffle=False, test_size=0.25)

In [None]:
def objective(trial):
    """Optuna objective: validation MSE of one-step-ahead ARIMA(p, 1, q) forecasts.

    Parameters
    ----------
    trial
        Optuna trial object; it supplies the AR order ``p`` and the MA order
        ``q``, each an integer in the inclusive range 0..5.

    Returns
    -------
    float
        Mean squared error between ``val`` and the one-step-ahead predictions.

    Notes
    -----
    Reads the notebook-level arrays ``train`` and ``val``. Parameters are
    estimated on ``train`` only and are never refitted on validation data.
    """
    p = trial.suggest_int("p", 0, 5)
    q = trial.suggest_int("q", 0, 5)
    fitted = SARIMAX(train, order=(p, 1, q)).fit(disp=False)

    # Append the whole validation window once instead of once per observation.
    # With the parameters held fixed (refit=False), the in-sample prediction at
    # time t is the one-step-ahead forecast made from the data up to t-1, i.e.
    # the same value `forecast(steps=1)` returns after appending the first t-1
    # points. A per-point append re-runs the filter over the entire growing
    # series each time, which makes every trial quadratic in len(val).
    extended = fitted.append(val, refit=False)
    preds = extended.predict(start=len(train))
    return mean_squared_error(val, preds)

In [None]:
# Augmented Dickey-Fuller test on the closing-price levels.
# Positions 0, 1 and 4 of the returned tuple are the test statistic, the
# p-value and the critical values respectively.
result = adfuller(df["Close"])
for label, position in (
    ("ADF Statistic:", 0),
    ("p-value:", 1),
    ("Critical Values:", 4),
):
    print(label, result[position])

In [None]:
# Same ADF test on the first difference of the closing price; the leading NaN
# produced by diff() is dropped before testing.
d1 = df["Close"].diff().dropna()
result = adfuller(d1)
for label, position in (
    ("ADF Statistic:", 0),
    ("p-value:", 1),
    ("Critical Values:", 4),
):
    print(label, result[position])
# Visual check of the differenced series.
d1.plot()

In [None]:
# Search (p, q) by minimising the validation MSE returned by `objective`.
# The TPE sampler is seeded so the sequence of sampled trials is reproducible.
# NOTE(review): p and q each range over 0..5, so there are only 36 distinct
# combinations while 50 trials are run — some combinations will be revisited.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=50)

In [None]:
# Pull the best (p, q) found by the study; these orders are used for the final fit.
params = study.best_params
p = params["p"]
q = params["q"]
print(f"Best params: p={p}, q={q}")

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Side-by-side ACF (left) and PACF (right) of the first-differenced series.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (axes[0], plot_acf, "Autocorrelation Function (ACF)"),
    (axes[1], plot_pacf, "Partial Autocorrelation Function (PACF)"),
)
for panel_ax, draw, panel_title in panels:
    draw(d1, ax=panel_ax)
    # Set after drawing so it replaces the plot function's own default title.
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Final model: estimate ARIMA(p, 1, q) on train + validation data, then score
# one-step-ahead forecasts over the held-out test window.
arima = SARIMAX(train_val, order=(p, 1, q))
# disp=False silences the optimizer log, matching the fit inside `objective`.
model = arima.fit(disp=False)

# Append the whole test window once with the parameters held fixed
# (refit=False). The in-sample prediction at time t is then the one-step-ahead
# forecast made from the data up to t-1 — the value `forecast(steps=1)` yields
# after appending the first t-1 test points — without re-running the filter
# over the growing series for every single observation.
model = model.append(test, refit=False)
preds = np.asarray(model.predict(start=len(train_val)))

# Keep one length-1 array per step, which is what `forecast(steps=1)` returns,
# so the metrics, the plot and the pickled `points` keep their original layout.
points = list(preds.reshape(-1, 1))

print("RMSE:", root_mean_squared_error(test, points))
print("MAPE:", mean_absolute_percentage_error(test, points) * 100)

fig, ax = plt.subplots()
ax.plot(test, label="Observed")
ax.plot(points, "--", label="Predicted")
ax.set_xlabel("Day")
ax.set_ylabel("Price")
ax.grid()
ax.legend()
# Save before show(): some backends clear the figure once it has been shown.
fig.savefig(f"../report/images/{DATASET}_arima.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
import pickle

# Persist the test-set forecasts (`points`) next to the project root.
predictions_path = f"../{DATASET}_arima.pkl"
with open(predictions_path, "wb") as out_file:
    pickle.dump(points, out_file)