In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.regression.linear_model import yule_walker
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit



# Load the main dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "../data/input_data/MAIN_DATASET.csv",
    index_col=[0],
)
# df['year'] = pd.DatetimeIndex(df['date_time']).year
# df['month'] = pd.DatetimeIndex(df['date_time']).month

# df.head()

In [None]:
def hypothesis_test(df: pd.DataFrame, area: int, alpha: float = 0.05) -> pd.DataFrame:
    """Run an Augmented Dickey-Fuller test on a price column and difference it if needed.

    The tested column is ``NO{area}_price``. The p-value is compared with
    ``alpha``; the outcome is printed and decides what is returned.

    Args:
        df (pd.DataFrame): Frame containing a ``NO{area}_price`` column.
        area (int): Price-area number used to build the column name.
        alpha (float, optional): Significance level for the ADF p-value.
            Defaults to 0.05, the level quoted in the printed message.

    Returns:
        pd.DataFrame: ``df`` itself when the p-value is below ``alpha``.
        Otherwise a new frame with an added ``NO{area}_price_diff`` column
        (first difference of the price column) and every row that contains
        a NaN in any column dropped.
    """
    price_col = f"NO{area}_price"

    # adfuller returns a tuple; its second element is the p-value.
    p_value = adfuller(df[price_col])[1]

    if p_value < alpha:
        print(f"{p_value=}<{alpha},Data series is theoretically stationary")
        return df

    print(f"{p_value=}, data need differencing")
    # assign() builds a new frame, so the caller's frame is left untouched and
    # re-running the cell does not stack side effects on the input.
    differenced = df.assign(**{f"{price_col}_diff": df[price_col].diff()})
    return differenced.dropna()
         
# Stationarity check for price area 2; the returned frame replaces `df`.
df = hypothesis_test(df, area=2)

df.head()

In [None]:


time_steps = 24  # number of steps forecast after each test window
order = 24  # AR order p; each test window must hold exactly `order` lagged values

# Chronological 90/10 split: the first 90% of rows train the model, the rest is test data.
split_idx = int(len(df) * 0.9)

train = df["NO2_price"][:split_idx].to_numpy()
# train = df["NO2_price_diff"][:-time_steps].to_numpy()

test = df["NO2_price"][split_idx:].to_numpy()
# Drop leading values so the test data divides into equal windows of `order` samples.
test = test[len(test) % order:]

# print(f"{len(train)=}")
# print(f"{len(test)=}")
# Integer division: np.array_split expects an integer number of sections.
test_days = np.array_split(test, len(test) // order)


def AR_model(train: np.ndarray, test: list, time_steps: int, model_order: int, pre_diff=None) -> np.ndarray:
    """Forecast with an autoregressive model whose coefficients come from Yule-Walker.

    For every window in ``test`` the window values are used as the initial
    lags and ``time_steps`` further values are generated recursively with
    ``X_t = sum_p phi[p] * X_{t-(p+1)}``.

    Args:
        train (np.ndarray): Data used to estimate the AR coefficients (phi).
        test (list): Sequence of windows; each window supplies the
            ``model_order`` lagged values a forecast starts from.
        time_steps (int): Number of steps to predict after each window.
        model_order (int): Order p of the model.
        pre_diff ([type], optional): Original data, if the data has been
            differenced. Defaults to None.

    Returns:
        np.ndarray: When ``pre_diff`` is None, a matrix of shape
        ``(len(test), model_order + time_steps)``; each row holds the window
        followed by its forecast. When ``pre_diff`` is given, a 1-D array of
        length ``model_order + time_steps`` for the FIRST window only, with
        the differencing undone by a cumulative sum.

    Side effects:
        Prints the mean squared error between each forecast and the
        observations that follow its window in ``test``, averaged over the
        windows for which a full horizon of observations exists (``nan`` if
        there is none).
    """
    # Yule-Walker estimate of phi. The order comes from the parameter, not
    # from a notebook-level global, so the function is self-contained.
    phi, _ = yule_walker(train, order=model_order)
    phi = np.asarray(phi, dtype=float)

    n_windows = len(test)
    X_hat_matrix = np.zeros(shape=(n_windows, model_order + time_steps))
    # NaN marks windows that cannot be scored (no full horizon of observations after them).
    mse_scores = np.full(n_windows, np.nan)

    # Flatten the windows so the observations following each window can be
    # sliced out as the ground truth for that window's forecast.
    if n_windows:
        observed = np.concatenate([np.atleast_1d(day) for day in test])
        window_ends = np.cumsum([np.size(day) for day in test])
    else:
        observed = np.empty(0)
        window_ends = np.empty(0, dtype=int)

    for count, day in enumerate(test):
        # Lagged values first, then space for the forecasts (zeros until filled).
        X_hat = np.zeros(model_order + time_steps)
        X_hat[:model_order] = day

        for step in range(time_steps):
            # The most recent value pairs with phi[0], so reverse the lag slice.
            lags = X_hat[step:model_order + step][::-1]
            X_hat[model_order + step] = phi @ lags

        if pre_diff is not None:
            # Add the original initial value back and sum cumulatively to undo differencing.
            # NOTE(review): this returns after the first window, so later
            # windows are never forecast in this mode; kept for backward
            # compatibility with the existing 1-D return value.
            original_val = pre_diff[-model_order:]
            X_hat[0] = original_val[0] + X_hat[0]
            return X_hat.cumsum()

        X_hat_matrix[count, :] = X_hat

        # Score the forecast against what was actually observed next.
        # Comparing the window with X_hat[:model_order] would always give 0,
        # because those entries are a copy of the window itself.
        future = observed[window_ends[count]:window_ends[count] + time_steps]
        if time_steps > 0 and future.size == time_steps:
            mse_scores[count] = np.mean((future - X_hat[model_order:]) ** 2)

    scored = mse_scores[~np.isnan(mse_scores)]
    print(scored.mean() if scored.size else float("nan"))
    return X_hat_matrix
    

    
    
# One row per test window: the window's lagged values followed by its forecast.
X_hat_matr = AR_model(
    train=train,
    test=test_days,
    time_steps=time_steps,
    model_order=order,
)


In [None]:
# `X_hat` only exists inside AR_model; the notebook-level result is `X_hat_matr`,
# whose rows hold `n_lags` observed values followed by `time_steps` forecasts.
n_lags = X_hat_matr.shape[1] - time_steps
forecasts = X_hat_matr[:, n_lags:]

# Window i covers test[i*n_lags:(i+1)*n_lags], so its forecast starts at (i+1)*n_lags.
forecast_starts = (np.arange(len(forecasts)) + 1) * n_lags
forecast_x = (forecast_starts[:, None] + np.arange(time_steps)).ravel()

fig, ax = plt.subplots()
ax.plot(forecast_x, forecasts.ravel(), label="Forecast")
ax.plot(np.arange(len(test)), test, label="Actual")
ax.set(xlabel="Time step in test set", ylabel="NO2_price", title="AR forecast vs. actual")
ax.legend();