In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.regression.linear_model import yule_walker
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Read the main dataset, using the first CSV column as the index.
df = pd.read_csv("../data/input_data/MAIN_DATASET.csv", index_col=[0])
# Derive a calendar-year column from `date_time` and keep only the 2021 rows.
df = df.assign(year=pd.DatetimeIndex(df["date_time"]).year)
df = df.loc[df["year"] == 2021]
df.head()


In [None]:
# Restrict the analysis to the id column plus the NO2/NO5 price columns,
# and to the first 200 rows of the 2021 data.
price_columns = ["dato_id", "NO2_price", "NO5_price"]
day_ahead_prices = df[price_columns].head(200)
day_ahead_prices.head()

In [None]:
# Line plot of the selected columns against the frame's index.
ax = day_ahead_prices.plot()
ax.set(ylabel="Day Ahead Price")
plt.show()

### Test whether the price series are stationary (Augmented Dickey-Fuller test)

In [None]:
def hypothesis_test(df: pd.Series, alpha: float = 0.05) -> pd.Series:
    """Difference a series once if the ADF test cannot reject non-stationarity.

    Runs ``adfuller`` on ``df`` and reads the p-value from position 1 of the
    result. The null hypothesis of the Augmented Dickey-Fuller test is that
    the series has a unit root (is non-stationary), so a p-value above
    ``alpha`` means non-stationarity cannot be rejected and the series is
    differenced.

    Parameters
    ----------
    df : pd.Series
        One-dimensional series to test (the notebook passes a single price
        column).
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    pd.Series
        ``df.diff()`` (first element is NaN) when ``p_value > alpha``;
        otherwise the input object itself, unchanged.
    """
    result = adfuller(df)
    p_value = result[1]
    if p_value > alpha:
        # Cannot reject the unit-root null hypothesis -> difference once.
        print(f"{p_value=} > {alpha}, differencing data")
        return df.diff()
    # Unit-root null hypothesis rejected -> treat the series as stationary.
    print(f"{p_value=} <= {alpha}, no need to difference data")
    return df

def plot_original_vs_tested(original, tested, zone):
    """Plot a price series together with the series returned by hypothesis_test.

    Parameters
    ----------
    original : pd.Series
        The untouched price column.
    tested : pd.Series
        The output of ``hypothesis_test(original)``.
    zone : str
        Name used in the title and the legend labels (e.g. ``"NO2"``).

    Returns
    -------
    matplotlib.axes.Axes
        The axes both lines were drawn on.
    """
    fig, ax = plt.subplots()
    # Label both lines so they can be told apart in the legend.
    tested.plot(ax=ax, label=f"{zone} after hypothesis_test")
    original.plot(ax=ax, label=f"{zone} original")
    ax.set_ylabel("Day Ahead Price")
    ax.set_title(f"{zone}: original vs. stationarity-tested series")
    ax.legend()
    plt.show()
    return ax


# Test each price column for stationarity (differencing it if needed) and
# compare the result with the original series.
differenced_NO2 = hypothesis_test(day_ahead_prices["NO2_price"])
ax1 = plot_original_vs_tested(day_ahead_prices["NO2_price"], differenced_NO2, "NO2")

differenced_NO5 = hypothesis_test(day_ahead_prices["NO5_price"])
ax2 = plot_original_vs_tested(day_ahead_prices["NO5_price"], differenced_NO5, "NO5")

In [None]:
# Autocorrelation of the NO5 series returned by hypothesis_test, for lags up to 24.
# Missing values (e.g. the leading NaN produced by differencing) are dropped first.
no5_for_acf = differenced_NO5.dropna()
plot_acf(no5_for_acf, lags=24)
plt.show()

In [None]:
# Partial autocorrelation of the same NO5 series, for lags up to 24,
# computed with the 'ywm' method; missing values are dropped first.
no5_for_pacf = differenced_NO5.dropna()
plot_pacf(no5_for_pacf, lags=24, method='ywm')
plt.show()

In [None]:
def forecast_ar(history, coefficients, n_steps, mean=0.0):
    """Recursively forecast ``n_steps`` values of an AR(p) process.

    Each new value is ``mean + sum_j coefficients[j] * (x[t-1-j] - mean)``,
    i.e. ``coefficients[0]`` multiplies the most recent value. Forecasts are
    fed back in as inputs for the following steps.

    Parameters
    ----------
    history : sequence of float
        Observed values in time order; must hold at least ``len(coefficients)``
        values.
    coefficients : sequence of float
        AR coefficients ordered lag 1, lag 2, ...
    n_steps : int
        Number of values to forecast.
    mean : float, default 0.0
        Mean that was removed from the data before the coefficients were
        estimated; it is subtracted from the lags and added back to the result.

    Returns
    -------
    list of float
        The ``n_steps`` forecast values in time order.

    Raises
    ------
    ValueError
        If ``history`` is shorter than the AR order.
    """
    coefficients = np.asarray(coefficients, dtype=float)
    order = len(coefficients)
    values = [float(v) for v in history]
    if len(values) < order:
        raise ValueError(
            f"need at least {order} observations to forecast, got {len(values)}"
        )
    for _ in range(n_steps):
        # Newest observation first so positions line up with lag 1, lag 2, ...
        lags = np.array(values[len(values) - order:][::-1], dtype=float) - mean
        values.append(mean + float(coefficients @ lags))
    return values[len(values) - n_steps:]


n_steps = 10   # forecast horizon = number of trailing observations held out
ar_order = 24  # AR order, i.e. number of lags used by the model

no5_prices = day_ahead_prices["NO5_price"].reset_index(drop=True)
no5_model_series = differenced_NO5.dropna()
# hypothesis_test returns its input unchanged when it does not difference, so the
# cumulative sum below must only be applied when the series really was differenced.
no5_was_differenced = not differenced_NO5.equals(day_ahead_prices["NO5_price"])

# Hold out the last n_steps observations of the modelled series.
train = list(no5_model_series)[: len(no5_model_series) - n_steps]
# Actual prices over the forecast horizon (kept for evaluation).
test = list(no5_prices)[-n_steps:]

coefficients, sigma = yule_walker(train, order=ar_order)
# yule_walker demeans its input by default, so the coefficients describe
# deviations from the training mean; the forecast has to use the same mean.
train_mean = float(np.mean(train))

forecast = forecast_ar(train, coefficients, n_steps, mean=train_mean)

# Training values followed by the forecast, in time order.
val_list = train + forecast
if no5_was_differenced:
    # Put the first original price back and integrate to undo the differencing.
    val_list = [no5_prices.iloc[0]] + val_list
    new_val_list = pd.Series(val_list).cumsum()
else:
    new_val_list = pd.Series(val_list)

validation = pd.DataFrame({
    'pred': new_val_list,
    'original': no5_prices
     })

# Optional evaluation (r2_score expects y_true first, then y_pred):
# print('Test R2:', r2_score(test, validation["pred"].tail(n_steps)))

# Plot the forecast horizon plus the 12 observations before it.
plot_window = n_steps + 12
window_index = range(len(validation) - plot_window, len(validation))
fig, ax = plt.subplots()
# Explicit labels keep the legend in sync with the order the lines are drawn in.
ax.plot(window_index, validation["original"].tail(plot_window), label="original")
ax.plot(window_index, validation["pred"].tail(plot_window), label="pred")
ax.legend()
ax.set_ylabel('Day Ahead Price')
ax.set_xlabel('t')
plt.show()