In [148]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

import DataRetriever as dr

RETRIEVER = dr.DataRetriever()
CON_ATTRIBUTES = RETRIEVER.get_attributes(file_name='consuming_attributes.pkl')
DATA = RETRIEVER.get_data(file_name='All-Subsystems-hour-Year2.pkl')[CON_ATTRIBUTES].sum(axis=1).clip(lower=0)

TRAIN_SIZE = int(DATA.size * 0.7)
VALIDATION_SIZE = int(DATA.size * 0.9)

TRAIN, VALIDATION, TEST = DATA[0:TRAIN_SIZE].to_numpy(), DATA[TRAIN_SIZE: VALIDATION_SIZE].to_numpy(), DATA[VALIDATION_SIZE: DATA.size].to_numpy()
TRAIN_INDEX, VALIDATION_INDEX, TEST_INDEX = list(range(0, len(TRAIN))), list(range(0, len(VALIDATION))), list(range(0, len(TEST)))

In [149]:
def fourierExtrapolation(data: np.array, number_of_predictions: int, n_sinusoids: int) -> np.array:
    """
    Predict {number_of_predictions} observations after the index data.size of {data}, using {n_sinusoids} sinusoids.
    :param trend:
    :param data: The data on which to train the model. Corresponds to {_x = x_0, x_1, ... x_(n-1)} in the theory
    :param number_of_predictions: The amount of predictions to output. Corresponds to {x_((n-1)+1), x_((n-1)+2), ..., x_((n-1)+{number_of_predictions})}
    :param n_sinusoids: The amount of sinusoids on which to base the predictions. I.e. the data probably contains many sinusoids, but we only wish to make predictions based on the {n_sinusoids} largest frequencies.
    :return: A numpy array of length data.size + number_of_predictions, containing the transformed original data + predictions
    """
    data_size = data.size  # n
    sample_index = np.arange(0, data_size)  # sum-limits (0) to (n-1)

    # Fit a linear regression line to data
    linear_trend = np.polyfit(x=sample_index, y=data, deg=1)
    # Subtract the learned line (linear_trend[0] * sample_index) from original data (data).
        # Here, linear_trend[0] are the coefficients of the linear regression, # and sample_index is equivalent to X;
        # thereby giving the, well-known from STAT, form (beta * X)
    x_data_detrended = data - linear_trend[0] * sample_index  # The series of numbers x = {x1, x2, ...}

    X_frequency_domain = np.fft.fft(x_data_detrended)  # The series of complex numbers X = {X1, X2, ...}
    frequencies = np.fft.fftfreq(data_size)  # Some frequencies, e.g. {4, 3, -7, 8, -5, ...}
    indexes = list(range(data_size))  # {0, 1, ..., n-1}
    indexes.sort(key=lambda idx: np.absolute(X_frequency_domain[idx]), reverse=True)  # ascendingly sort indices by frequency
    
    sample_index = np.arange(0, data_size + number_of_predictions)  # sample_index = {0, 1, ..., n-1, n, n+1, ..., (n-1)+number_of_predictions}
    x_restored_sig = np.zeros(sample_index.size)  # Prepare a numpy array to receive x reconstructed from its Fourier Transform

    for i in indexes[:1 + n_sinusoids]:
        amplitude = np.absolute(X_frequency_domain[i]) / data_size   # amplitude
        phase = np.angle(X_frequency_domain[i])                      # phase
        x_restored_sig += amplitude * np.cos(2 * np.pi * frequencies[i] * sample_index + phase)

    x_restored = x_restored_sig + linear_trend[0] * sample_index

    return x_restored[:data_size], x_restored[data_size:-TEST.size], x_restored[:TEST.size]

In [150]:
x = TRAIN
n_predict = VALIDATION.size
n_harmonics = 100

predictions = fourierExtrapolation(data=x,
                                   number_of_predictions=n_predict + TEST.size,
                                   n_sinusoids=n_harmonics)

fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(predictions[0].size),
                         y=predictions[0],
                         name='Prediction [Train]',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(start=predictions[0].size, stop=predictions[0].size + predictions[1].size),
                         y=predictions[1],
                         name='Prediction [Validation]',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(TRAIN.size),
                         y=TRAIN,
                         name='Training Data',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(start=TRAIN.size, stop=DATA.size),
                         y=VALIDATION,
                         name='Validation Data',
                         mode='lines'))


fig.show()

# Hyperparameter tuning

In [151]:
from sklearn.metrics import mean_squared_error

In [152]:
x = TRAIN
n_predict = VALIDATION.size

In [153]:
data_loss = []
for sinusoid in list(range(0, 51)):
    predictions = fourierExtrapolation(data=x,
                                       number_of_predictions=n_predict + TEST.size,
                                       n_sinusoids=sinusoid)
    train_loss = np.sqrt(mean_squared_error(predictions[0], TRAIN))
    validation_loss = np.sqrt(mean_squared_error(predictions[1], VALIDATION))
    test_loss = np.sqrt(mean_squared_error(predictions[2], TEST))

    data_loss.append([sinusoid, train_loss, validation_loss, test_loss])

df_loss = pd.DataFrame(data=data_loss, columns=["Sinusoid", "Train RMSE", "Validation RMSE", "Test RMSE"])
df_loss

Unnamed: 0,Sinusoid,Train RMSE,Validation RMSE,Test RMSE
0,0,795.052644,721.225374,850.364331
1,1,770.03492,746.18106,861.263892
2,2,761.513056,779.773447,874.421716
3,3,742.731181,818.49744,869.778946
4,4,736.364107,866.245791,881.399417
5,5,719.930074,850.07078,864.216329
6,6,714.368054,843.097909,856.054896
7,7,697.512765,848.27201,865.465149
8,8,691.803082,862.661506,883.804916
9,9,677.849377,899.95788,882.064168


In [154]:
fig = px.line(df_loss, x="Sinusoid", y=["Train RMSE", "Validation RMSE"], labels={"variable": ""}, color_discrete_sequence=["rgba(234,143,129,0.5)", "rgb(32,115,171)"])


fig.update_layout(
    xaxis_title="Sinusoids",
    yaxis_title="Loss [RMSE]",
    legend=dict(orientation="h",
                yanchor="top",
                y=1.1,
                xanchor="left",
                x=0.0,),

)

fig.show()

In [155]:
predictions = fourierExtrapolation(data=x,
                                   number_of_predictions=n_predict + TEST.size,
                                   n_sinusoids=1)
train_loss = np.sqrt(mean_squared_error(predictions[0], TRAIN))
validation_loss = np.sqrt(mean_squared_error(predictions[1], VALIDATION))

In [156]:
predictions[2]

array([1678.47064402, 1678.2841035 , 1678.0974698 , 1677.91074307,
       1677.72392344, 1677.53701106, 1677.35000609, 1677.16290866,
       1676.97571892, 1676.78843701, 1676.60106309, 1676.4135973 ,
       1676.22603978, 1676.03839069, 1675.85065016, 1675.66281834,
       1675.47489539, 1675.28688144, 1675.09877665, 1674.91058116,
       1674.72229511, 1674.53391866, 1674.34545195, 1674.15689513,
       1673.96824835, 1673.77951175, 1673.59068548, 1673.40176969,
       1673.21276453, 1673.02367014, 1672.83448667, 1672.64521427,
       1672.45585309, 1672.26640328, 1672.07686498, 1671.88723834,
       1671.69752352, 1671.50772065, 1671.3178299 , 1671.1278514 ,
       1670.93778531, 1670.74763177, 1670.55739093, 1670.36706295,
       1670.17664797, 1669.98614614, 1669.79555761, 1669.60488253,
       1669.41412105, 1669.22327332, 1669.03233949, 1668.84131971,
       1668.65021413, 1668.45902289, 1668.26774616, 1668.07638407,
       1667.88493678, 1667.69340444, 1667.50178721, 1667.31008