In [112]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

import DataRetriever as dr

RETRIEVER = dr.DataRetriever()
CON_ATTRIBUTES = RETRIEVER.get_attributes(file_name='consuming_attributes.pkl')
DATA = RETRIEVER.get_data(file_name='All-Subsystems-hour-Year2.pkl')[CON_ATTRIBUTES].sum(axis=1).clip(lower=0)

TRAIN_SIZE = int(DATA.size * 0.7)
VALIDATION_SIZE = int(DATA.size * 0.9)

TRAIN, VALIDATION, TEST = DATA[0:TRAIN_SIZE].to_numpy(), DATA[TRAIN_SIZE: VALIDATION_SIZE].to_numpy(), DATA[VALIDATION_SIZE: DATA.size].to_numpy()
TRAIN_INDEX, VALIDATION_INDEX, TEST_INDEX = list(range(0, len(TRAIN))), list(range(0, len(VALIDATION))), list(range(0, len(TEST)))

In [113]:
def fourierExtrapolation(data: np.array, number_of_predictions: int, n_sinusoids: int) -> np.array:
    """
    Predict {number_of_predictions} observations after the index data.size of {data}, using {n_sinusoids} sinusoids.
    :param data: The data on which to train the model. Corresponds to {_x = x_0, x_1, ... x_(n-1)} in the theory
    :param number_of_predictions: The amount of predictions to output. Corresponds to {x_((n-1)+1), x_((n-1)+2), ..., x_((n-1)+{number_of_predictions})}
    :param n_sinusoids: The amount of sinusoids on which to base the predictions. I.e. the data probably contains many sinusoids, but we only wish to make predictions based on the {n_sinusoids} largest frequencies.
    :return: A numpy array of length data.size + number_of_predictions, containing the transformed original data + predictions
    """
    data_size = data.size  # n
    sample_index = np.arange(0, data_size)  # sum-limits (0) to (n-1)

    # Fit a linear regression line to data
    linear_trend = np.polyfit(x=sample_index, y=data, deg=1)
    # Subtract the learned line (linear_trend[0] * sample_index) from original data (data).
        # Here, linear_trend[0] are the coefficients of the linear regression, # and sample_index is equivalent to X;
        # thereby giving the, well-known from STAT, form (beta * X)
    x_data_detrended = data - linear_trend[0] * sample_index  # The series of numbers x = {x1, x2, ...}

    X_frequency_domain = np.fft.fft(x_data_detrended)  # The series of complex numbers X = {X1, X2, ...}
    frequencies = np.fft.fftfreq(data_size)  # Some frequencies, e.g. {4, 3, -7, 8, -5, ...}
    indexes = list(range(data_size))  # {0, 1, ..., n-1}
    indexes.sort(key=lambda idx: np.absolute(X_frequency_domain[idx]), reverse=True)  # ascendingly sort indices by frequency
    
    sample_index = np.arange(0, data_size + number_of_predictions)  # sample_index = {0, 1, ..., n-1, n, n+1, ..., (n-1)+number_of_predictions}
    x_restored_sig = np.zeros(sample_index.size)  # Prepare a numpy array to receive x reconstructed from its Fourier Transform


    if n_sinusoids == 0:
        return x_restored_sig + data.mean() + linear_trend[0] * sample_index

    for i in indexes[:n_sinusoids]:
        amplitude = np.absolute(X_frequency_domain[i])
        phase = np.angle(X_frequency_domain[i])
        x_restored_sig += amplitude * np.cos(2 * np.pi * frequencies[i] * sample_index + phase)

    x_restored = 1 / data_size * x_restored_sig + linear_trend[0] * sample_index

    return x_restored

In [119]:
x = TRAIN
n_predict = VALIDATION.size
n_harmonics = 100

predictions = fourierExtrapolation(data=x,
                                   number_of_predictions=n_predict + TEST.size,
                                   n_sinusoids=n_harmonics)

fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(predictions[:TRAIN.size].size),
                         y=predictions[:DATA.size],
                         name='Prediction [Train]',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(start=predictions[:TRAIN.size].size, stop=predictions[:TRAIN.size].size + predictions[TRAIN.size:-TEST.size].size),
                         y=predictions[TRAIN.size:-TEST.size],
                         name='Prediction [Validation]',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(TRAIN.size),
                         y=TRAIN,
                         name='Training Data',
                         mode='lines'))

fig.add_trace(go.Scatter(x=np.arange(start=TRAIN.size, stop=DATA.size),
                         y=VALIDATION,
                         name='Validation Data',
                         mode='lines'))


fig.show()

# Hyperparameter tuning

In [115]:
from sklearn.metrics import mean_squared_error

In [116]:
x = TRAIN
n_predict = VALIDATION.size

In [120]:
data_loss = []
for sinusoid in list(range(0, 51)):
    predictions = fourierExtrapolation(data=x,
                                       number_of_predictions=n_predict + TEST.size,
                                       n_sinusoids=sinusoid)
    train_loss = np.sqrt(mean_squared_error(predictions[:TRAIN.size], TRAIN))
    validation_loss = np.sqrt(mean_squared_error(predictions[TRAIN.size:-TEST.size], VALIDATION))
    test_loss = np.sqrt(mean_squared_error(predictions[:TEST.size], TEST))

    data_loss.append([sinusoid, train_loss, validation_loss, test_loss])

df_loss = pd.DataFrame(data=data_loss, columns=["Sinusoid", "Train RMSE", "Validation RMSE", "Test RMSE"])
df_loss

Unnamed: 0,Sinusoid,Train RMSE,Validation RMSE,Test RMSE
0,0,808.69612,710.947135,876.8681
1,1,795.052644,721.225374,850.364331
2,2,770.03492,746.18106,861.263892
3,3,761.513056,779.773447,874.421716
4,4,742.731181,818.49744,869.778946
5,5,736.364107,866.245791,881.399417
6,6,719.930074,850.07078,864.216329
7,7,714.368054,843.097909,856.054896
8,8,697.512765,848.27201,865.465149
9,9,691.803082,862.661506,883.804916


In [121]:
fig = px.line(df_loss, x="Sinusoid", y=["Train RMSE", "Validation RMSE"], labels={"variable": ""}, color_discrete_sequence=["rgba(234,143,129,0.5)", "rgb(32,115,171)"])


fig.update_layout(
    xaxis_title="Sinusoids",
    yaxis_title="Loss [RMSE]",
    legend=dict(orientation="h",
                yanchor="top",
                y=1.1,
                xanchor="left",
                x=0.0,),

)

fig.show()