In [446]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

import DataRetriever as dr

RETRIEVER = dr.DataRetriever()
CON_ATTRIBUTES = RETRIEVER.get_attributes(file_name='consuming_attributes.pkl')
PV_ATTRIBUTES = RETRIEVER.get_attributes(file_name='producing_attributes.pkl')
FLEX_ATTRIBUTES = ["Load_ClothesWasherPowerWithStandby", "Elec_PowerDishwasher", "Load_DryerPowerTotal"]
FIXED_ATTRIBUTES = list(set(CON_ATTRIBUTES) - set(FLEX_ATTRIBUTES))

DATA = RETRIEVER.get_data(file_name='All-Subsystems-hour-Year2.pkl')[FIXED_ATTRIBUTES].sum(axis=1).clip(lower=0)

# DATA_INIT = DATA[0]
#
# DATA = DATA - DATA.shift(1).bfill()

In [447]:
def fourierExtrapolation(data: np.array, number_of_predictions: int, n_sinusoids: int) -> np.array:
    """
    Predict {number_of_predictions} observations after the index data.size of {data}, using {n_sinusoids} sinusoids.
    :param data: The data on which to train the model. Corresponds to {_x = x_0, x_1, ... x_(n-1)} in the theory
    :param number_of_predictions: The amount of predictions to output. Corresponds to {x_((n-1)+1), x_((n-1)+2), ..., x_((n-1)+{number_of_predictions})}
    :param n_sinusoids: The amount of sinusoids on which to base the predictions. I.e. the data probably contains many sinusoids, but we only wish to make predictions based on the {n_sinusoids} largest frequencies.
    :return: A numpy array of length data.size + number_of_predictions, containing the transformed original data + predictions
    """
    data_size = data.size  # n
    sample_index = np.arange(0, data_size)  # sum-limits (0) to (n-1)

    # Fit a linear regression line to data
    linear_trend = np.polyfit(x=sample_index, y=data, deg=1)
    # Subtract the learned line (linear_trend[0] * sample_index) from original data (data).
        # Here, linear_trend[0] are the coefficients of the linear regression, # and sample_index is equivalent to X;
        # thereby giving the, well-known from STAT, form (beta * X)
    x_data_detrended = data - linear_trend[0] * sample_index  # The series of numbers x = {x1, x2, ...}

    X_frequency_domain = list(np.fft.fft(x_data_detrended))  # The series of complex numbers X = {X1, X2, ...}
    frequencies = list(np.fft.fftfreq(data_size, d=1))  # Some frequencies, e.g. {4, 3, -7, 8, -5, ...}


    # # Experimental!!!
    # for dom, freq, idx in zip(reversed(X_frequency_domain), reversed(frequencies), reversed(sample_index)):
    #     if abs(freq) < 0.25:
    #         X_frequency_domain.pop(idx)
    #         frequencies.pop(idx)


    indexes = list(range(len(X_frequency_domain)))  # {0, 1, ..., n-1}
    indexes.sort(key=lambda idx: np.absolute(X_frequency_domain[idx]), reverse=True)  # ascendingly sort indices by frequency
    
    sample_index = np.arange(0, data_size + number_of_predictions)  # sample_index = {0, 1, ..., n-1, n, n+1, ..., (n-1)+number_of_predictions}
    x_restored_sig = np.zeros(sample_index.size)  # Prepare a numpy array to receive x reconstructed from its Fourier Transform

    if n_sinusoids == 0:
        return x_restored_sig + data.mean() + linear_trend[0] * sample_index

    for i in indexes[:n_sinusoids]:
        amplitude = np.absolute(X_frequency_domain[i])
        phase = np.angle(X_frequency_domain[i])
        x_restored_sig += amplitude * np.cos(2 * np.pi * frequencies[i] * sample_index + phase)

    x_restored = 1 / data_size * x_restored_sig + linear_trend[0] * sample_index

    return x_restored

In [448]:
def r2(a, b):
    return np.corrcoef(a, b)[0, 1]**2

In [449]:
DATA_SIZE = DATA.size

WEEK = 24 * 7

test_range = WEEK
VALIDATION_RANGE = 24 * 3

SINUSOIDS = list(range(0, 50)) + list(range(50, 210, 10))

In [450]:
np.seterr(invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [451]:
cv_DataFrame = pd.DataFrame(index=SINUSOIDS)
fold = 0
while test_range + VALIDATION_RANGE <= DATA_SIZE:
    sinusoids_r2 = []
    for n_sinusoids in SINUSOIDS:
        fft = fourierExtrapolation(data=DATA[test_range - WEEK:test_range], number_of_predictions=VALIDATION_RANGE, n_sinusoids=n_sinusoids)
        prediction = pd.Series(data=fft[WEEK:], index=DATA[test_range:test_range + VALIDATION_RANGE].index)

        actual = DATA[test_range:test_range+VALIDATION_RANGE]

        # Rework impossible values
        for idx, i in prediction.iteritems():
            if idx.hour > 18 or idx.hour < 6 or i < 0:
                prediction.at[idx] = 0

        sinusoids_r2.append(r2(prediction, actual))

    cv_DataFrame[f"Fold_{fold}"] = sinusoids_r2

    test_range += 24 * 14
    fold += 1

In [452]:
cv_DataFrame.mean(axis=1).idxmax()

160

In [453]:
cv_DataFrame.mean(axis=1)

0      0.121556
1      0.122551
2      0.128755
3      0.133546
4      0.141850
         ...   
160    0.310828
170    0.310782
180    0.310782
190    0.310782
200    0.310782
Length: 66, dtype: float64

In [454]:
fft = fourierExtrapolation(data=DATA[0:WEEK], number_of_predictions=VALIDATION_RANGE, n_sinusoids=cv_DataFrame.mean(axis=1).idxmax())
prediction = fft[WEEK:]
pred_test = fft[:WEEK]

actual = DATA[WEEK:WEEK+VALIDATION_RANGE]

In [455]:
# fft = fourierExtrapolation(data=DATA[0:WEEK], number_of_predictions=VALIDATION_RANGE, n_sinusoids=np.argmax(cv_DataFrame.mean(axis=1)))
# prediction = pd.Series(fft[WEEK:], index=DATA[WEEK:WEEK+VALIDATION_RANGE].index)
# pred_test = fft[:WEEK]
#
# # Rework impossible values
# for idx, i in prediction.iteritems():
#     if idx.hour > 18 or idx.hour < 6 or i < 0:
#         prediction.at[idx] = 18
#
# actual = DATA[WEEK:WEEK+VALIDATION_RANGE]

In [456]:
fig = go.Figure()

# fig.add_trace(go.Scatter(x=np.arange(start=0, stop=WEEK),
#                          y=pred_test,
#                          name='Training Data [Prediction]',
#                          mode='lines',
#                          line=dict(color='rgb(84, 0, 84)')))

fig.add_trace(go.Scatter(x=np.arange(start=0, stop=WEEK),
                         y=DATA[0:WEEK],
                         name='Training Data',
                         mode='lines',
                         line=dict(color='rgb(84, 84, 84)')))

fig.add_trace(go.Scatter(x=np.arange(start=WEEK, stop=WEEK + VALIDATION_RANGE),
                         y=actual,
                         name='Actual',
                         mode='lines',
                         line=dict(color='rgb(234,143,129)')))

fig.add_trace(go.Scatter(x=np.arange(start=WEEK, stop=WEEK + VALIDATION_RANGE),
                         y=prediction,
                         name='Predicted',
                         mode='lines',
                         line=dict(color='rgb(32,115,171)')))

fig.update_layout(
    xaxis_title="Index",
    yaxis_title="Consumption [kWh]",
    legend=dict(orientation="h",
                yanchor="top",
                y=1.11,
                xanchor="right",
                x=1))

fig.show()

In [457]:
set([1, 2, 3, 4]) - set([2, 4])

{1, 3}