In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

%config InlineBackend.figure_formats = 'svg'

In [None]:
# Dataset identifier: selects the input file ../data/raw/<DATASET>.csv and is
# used as the prefix of the figure saved at the end of the notebook.
DATASET = "eem"

In [None]:
# Load the raw CSV for the selected dataset into a DataFrame.
# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse the
# index; since no index_col is given here, this presumably parses nothing.
# Confirm whether a date column should be supplied via index_col / parse_dates=[...].
df = pd.read_csv(f"../data/raw/{DATASET}.csv", parse_dates=True)

In [None]:
# Plot the closing prices before any modelling. Explicit axes plus a title and
# axis labels let the figure stand on its own; plt.show() avoids the Line2D repr.
fig, ax = plt.subplots()
ax.plot(df["Close"])
ax.set(title=f"{DATASET.upper()} closing price", xlabel="Day", ylabel="Price")
plt.show()

In [None]:
def create_sequences(data, window_size):
    """Build sliding-window inputs and their next-step targets.

    For every start position ``s`` in ``range(len(data) - window_size)`` the
    input is ``data[s : s + window_size]`` and the target is the element that
    immediately follows it, ``data[s + window_size]``.

    Args:
        data: Indexable, sliceable sequence (e.g. a 1-D NumPy array or list).
        window_size: Number of consecutive elements in each input window.

    Returns:
        Tuple ``(inputs, targets)`` of NumPy arrays built from the collected
        windows and targets. Both are empty when ``data`` has no more than
        ``window_size`` elements.
    """
    n_windows = len(data) - window_size
    windows = [data[start : start + window_size] for start in range(n_windows)]
    targets = [data[start + window_size] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
# Number of consecutive closing prices in each input window. Named once so the
# train and test windows cannot drift apart.
WINDOW_SIZE = 7

# Split without shuffling (shuffle=False) so row order is preserved; 20% of the
# rows go to the test split.
Xy_train, Xy_test = train_test_split(df["Close"].values, test_size=0.2, shuffle=False)
# Windows are built per split, so no window spans the train/test boundary.
X_train, y_train = create_sequences(Xy_train, window_size=WINDOW_SIZE)
X_test, y_test = create_sequences(Xy_test, window_size=WINDOW_SIZE)

In [None]:
# Augmented Dickey-Fuller test on the closing prices.
result = adfuller(df["Close"])
# Report the entries at positions 0, 1 and 4 of the returned result.
for caption, position in (
    ("ADF Statistic:", 0),
    ("p-value:", 1),
    ("Critical Values:", 4),
):
    print(caption, result[position])

In [None]:
# First difference of the closing prices; diff() leaves a NaN in the first
# row, which dropna() removes.
close_diff = df["Close"].diff()
d1 = close_diff.dropna()

# Repeat the Augmented Dickey-Fuller test on the differenced series.
result = adfuller(d1)
adf_statistic, p_value, critical_values = result[0], result[1], result[4]
print("ADF Statistic:", adf_statistic)
print("p-value:", p_value)
print("Critical Values:", critical_values)
d1.plot()

In [None]:
# Ljung-Box test on the differenced series at lag 10. acorr_ljungbox is
# imported in the notebook's top import cell.
ljung_box_result = acorr_ljungbox(d1, lags=[10], return_df=True)
# return_df=True yields a DataFrame; leaving it as the last expression uses the
# notebook's rich table display instead of plain-text print().
ljung_box_result

In [None]:
# Side-by-side ACF and PACF of the differenced series. plot_acf / plot_pacf
# are imported in the notebook's top import cell.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# ACF plot (left panel)
plot_acf(d1, ax=axes[0])
axes[0].set_title("Autocorrelation Function (ACF)")

# PACF plot (right panel)
plot_pacf(d1, ax=axes[1])
axes[1].set_title("Partial Autocorrelation Function (PACF)")

plt.show()

In [None]:
# Train/test split for the ARIMA experiment: same call and arguments as the
# earlier Xy_train / Xy_test split (test_size=0.2, shuffle=False), so row order
# is preserved and 20% of the rows form the test set.
ari_train, ari_test = train_test_split(df["Close"].values, test_size=0.2, shuffle=False)

In [None]:
# Fit an ARIMA(0, 1, 0) model once on the full training series.
arima = SARIMAX(ari_train, order=(0, 1, 0))
model = arima.fit()

# One-step-ahead forecasts, one scalar per test observation.
points = []

# Walk forward through the test set: forecast the next value, then reveal the
# true observation to the model before moving on.
for i, data in enumerate(ari_test):
    # Forecast one step ahead. forecast() returns a length-1 result, so keep
    # only the scalar; `points` then stays a flat sequence aligned with ari_test.
    pred = model.forecast(steps=1)
    points.append(float(np.asarray(pred)[0]))
    # Extend the fitted results with the new observation. refit=False keeps the
    # parameters estimated on the training data instead of re-estimating them.
    model = model.append([data], refit=False)
    if i % 100 == 0:
        print(f"{i} / {len(ari_test)}")

print("RMSE:", root_mean_squared_error(ari_test, points))

# Plot the true vs. predicted values and save the figure for the report.
plt.plot(ari_test, label="True")
plt.plot(points, "--", label="Predicted")
plt.xlabel("Day")
plt.ylabel("Price")
plt.grid()
plt.legend()
plt.savefig(f"../report/images/{DATASET}_arima.png", dpi=300, bbox_inches="tight")
plt.show()