In [None]:
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
from pmdarima.arima import auto_arima
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
def find_best_order(data):
    """Search for a non-seasonal ARIMA order with ``pmdarima.auto_arima``.

    Runs a stepwise search starting from ``p = q = 5`` and allowing each of
    ``p``, ``d`` and ``q`` to grow up to 10, prints the selected order and
    returns it.

    Parameters
    ----------
    data : array-like
        The univariate series handed to ``auto_arima``.

    Returns
    -------
    tuple
        The ``order`` attribute of the fitted model, i.e. ``(p, d, q)``.
    """
    best_fit = auto_arima(
        data,
        seasonal=False,
        suppress_warnings=True,
        stepwise=True,
        # Lower-case ``max_p`` bounds the non-seasonal AR order. The upper-case
        # ``max_P`` is the *seasonal* AR bound, which has no effect when
        # ``seasonal=False`` and left the AR search capped at the default.
        max_p=10,
        max_d=10,
        max_q=10,
        start_p=5,
        start_q=5,
    )
    print(best_fit.order)
    return best_fit.order

In [None]:
def train_arima(data, order=(5, 1, 6), train_fraction=0.9):
    """Fit an ARIMA model on the head of ``data`` and score it on the tail.

    The series is split chronologically: the first ``train_fraction`` of the
    observations are used for fitting and the remainder is held out. The
    fitted model forecasts exactly as many steps as there are held-out
    observations, and MAE / MSE / RMSE are computed between the two.

    Parameters
    ----------
    data : array-like
        Univariate series, ordered in time. ``ARIMA`` applies the ``d``
        component of ``order`` itself, so pass the series un-differenced
        unless additional differencing is really intended.
    order : tuple, optional
        ``(p, d, q)`` passed to ``ARIMA``. Defaults to ``(5, 1, 6)``, the
        values previously hard-coded here (p read off the PACF plot, q off
        the ACF plot, d the number of differencing passes).
    train_fraction : float, optional
        Share of the series used for fitting. Defaults to ``0.9``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse)`` on the hold-out segment.

    Raises
    ------
    ValueError
        If the split leaves the training or the hold-out segment empty.
    """
    train_size = int(len(data) * train_fraction)
    train, test = data[:train_size], data[train_size:]
    if len(train) == 0 or len(test) == 0:
        raise ValueError(
            "train/test split produced an empty segment "
            f"(train={len(train)}, test={len(test)})"
        )

    model_fit = ARIMA(train, order=order).fit()

    print(model_fit.summary())

    # Forecast the whole hold-out horizon: the metric functions need one
    # prediction per test observation, and a single-step forecast only lines
    # up when the hold-out happens to contain exactly one point.
    forecast = model_fit.forecast(steps=len(test))

    mae = mean_absolute_error(test, forecast)
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    return mae, mse, rmse

In [None]:
def prepare_data(file_path):
    """Read a CSV file and return its ``Flow_Kalltveit`` column as an array.

    The file is loaded with pandas and indexed by its ``Datetime`` column
    before the target column is extracted, so a file lacking either column
    fails with pandas' ``KeyError``. Only the raw values are returned; the
    index is not carried along.
    """
    frame = pd.read_csv(file_path).set_index("Datetime")
    return frame["Flow_Kalltveit"].values

In [None]:
def _plot_series(series, ylabel, title):
    """Draw ``series`` as a line plot against its position, then show it."""
    plt.plot(series)
    plt.xlabel("Time")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


def plot_target_data(target_data):
    """Plot the ``Flow_Kalltveit`` series.

    Parameters
    ----------
    target_data : array-like
        Values to draw; the x axis is simply the position in the sequence.
    """
    _plot_series(target_data, "Flow_Kalltveit", "Flow_Kalltveit Time Series")


def plot_diff_target_data(target_data):
    """Plot an already-differenced ``Flow_Kalltveit`` series.

    No differencing happens here: the caller passes the differenced values
    and this function only labels the figure accordingly.

    Parameters
    ----------
    target_data : array-like
        Differenced values to draw.

    Returns
    -------
    array-like
        ``target_data`` itself, unchanged, so the call can be chained.
    """
    _plot_series(
        target_data,
        "Differenced Flow_Kalltveit",
        "Differenced Flow_Kalltveit Time Series",
    )
    return target_data

In [None]:
file_names = [
    "../data/clean_data/cleaned_data_1.csv",
    "../data/clean_data/cleaned_data_2.csv",
    "../data/clean_data/cleaned_data_3.csv",
    "../data/clean_data/cleaned_data_4.csv",
]

# Set to True to re-run the exploratory plots and the automatic order search
# that were used to choose the ARIMA order; they are skipped by default.
RUN_DIAGNOSTICS = False

metrics_rows = []
for i, file_path in enumerate(file_names, 1):
    flow_raw = prepare_data(file_path)

    if RUN_DIAGNOSTICS:
        # Order selection (PACF -> p, ACF -> q) is done on the once-differenced
        # series, matching the d=1 used when fitting.
        flow_diff = np.diff(flow_raw)
        plot_target_data(flow_raw)
        plot_diff_target_data(flow_diff)
        find_best_order(flow_diff)
        plot_pacf(flow_diff, lags=25)
        plt.show()
        plot_acf(flow_diff, lags=25)
        plt.show()

    print(f"Dataset {i}:")
    # Fit on the un-differenced series: train_arima's ARIMA order already has
    # d=1, so the model differences once internally. Feeding it np.diff output
    # as before differenced the data twice and scored the errors on the
    # differenced scale instead of the flow scale.
    mae, mse, rmse = train_arima(flow_raw)
    metrics_rows.append({"dataset": i, "MAE": mae, "MSE": mse, "RMSE": rmse})

# One row per dataset so the runs can be compared side by side.
metrics_summary = pd.DataFrame(metrics_rows).set_index("dataset")
metrics_summary

Dataset 1:
  MAE: 3.4936
  MSE: 58.4553
  RMSE: 7.6456

Dataset 2:
  MAE: 6.2305
  MSE: 57.7023
  RMSE: 7.5962

Dataset 3:
  MAE: 4.0248
  MSE: 48.3744
  RMSE: 6.9552


Dataset 4:
  MAE: 6.1155
  MSE: 47.5642
  RMSE: 6.8967