In [None]:
import os
import sys
sys.path.append("../")

import math

import pandas as pd
import numpy as np
import datetime as dt
import re

import pymc3 as pm

import altair as alt
alt.data_transformers.disable_max_rows()

from src.preprocess.preprocess import load_data, split_last
from src.plot.altair import plot_total, plot_estimate
from src.plot.formatting import dummy_forecast
from src.utils.preprocess import MinMaxScaler, downcast
from src.model.model import det_dot, drift_model, seasonality_model

In [None]:
# Box to analyse: the trailing index picks one id out of the candidate list
# (index 3 selects "HVT.111153-1"); change the index to switch boxes.
boxid = [
    "ESD.000088-1",
    "063.623-1",
    "VRY.CHOPS-1",
    "HVT.111153-1",
    "TTR.251049-1",
    "BGL.CROLA-1",
][3]

In [None]:
# Load the data and metadata for the selected box.
df_data, df_meta = load_data(boxid=boxid)
# Optional: restrict the analysis to a single extreme before splitting.
# extreme = "max"
# df_data = df_data.query(f"extreme == '{extreme}'")
# Split a copy of the data so df_data itself is not handed to split_last
# (presumably the most recent observations become df_test; confirm in split_last).
df_train, df_test = split_last(df_data.copy())
# Label the held-out rows with the same period value that extrapolate_data
# assigns to the extrapolated weeks.
df_test["period"] = "future"
plot_total(df_data=df_train, df_meta=df_meta)

## functions

In [None]:
def extrapolate_data(df, horizon=dt.timedelta(weeks=26)):
    """Build a frame of weekly future dates following the last date in ``df``.

    The first future date is one week after ``df["date"].max()``. The stop
    value handed to ``np.arange`` is exclusive and is pushed one extra week
    past ``horizon``, so the result holds ``horizon / 1 week + 1`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``date``, ``boxid``, ``l`` and ``extreme``; the
        latter three are copied from the first row.
    horizon : datetime.timedelta
        How far beyond the first future date the range extends.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``boxid``, ``l``, ``extreme``, ``period`` (always
        ``"future"``), ``year`` and ``week`` (ISO calendar).
    """
    one_week = dt.timedelta(weeks=1)
    first_future = df["date"].max() + one_week
    future_dates = np.arange(first_future, first_future + horizon + one_week, one_week)

    # identifying columns are constant for the series, so the first row suffices
    constants = {col: df[col].iloc[0] for col in ("boxid", "l", "extreme")}
    df_extra = pd.DataFrame({"date": future_dates}).assign(**constants, period="future")

    # isocalendar() yields (year, week, day); only the first two are kept
    iso_parts = df_extra["date"].dt.isocalendar()
    df_extra[["year", "week"]] = iso_parts[["year", "week"]]

    return df_extra

In [None]:
def create_model(t, y, p_fourier, n_fourier=5, n_polynomial=2):
    """Build the PyMC3 model: drift plus yearly seasonality with Normal noise.

    Parameters
    ----------
    t : array-like
        Time axis handed to ``drift_model`` and ``seasonality_model``.
    y : array-like
        Observations the likelihood ``Σ`` is conditioned on.
    p_fourier : float
        Period passed to ``seasonality_model`` as ``p``.
    n_fourier : int, default 5
        Currently not forwarded anywhere.
        NOTE(review): presumably meant as the number of Fourier terms of
        ``seasonality_model`` -- wire it up once that signature is confirmed.
    n_polynomial : int, default 2
        Forwarded to ``drift_model`` as ``n`` (presumably the polynomial
        order of the drift -- confirm against ``drift_model``).

    Returns
    -------
    pm.Model
        Model with the observed variable ``Σ`` and the noise scale ``σ_ε``.
    """
    with pm.Model() as m:

        # previously hard-coded to n=2, which silently ignored n_polynomial
        drift = drift_model(t, n=n_polynomial)
        yearly = seasonality_model(t, p=p_fourier)

        # noise scale with a Uniform(0, 1) prior
        σ_ε = pm.Uniform('σ_ε', lower=0, upper=1)
        # likelihood: registered on the model by name, no local binding needed
        pm.Normal("Σ", mu=drift + yearly, sd=σ_ε, observed=y)

    return m

In [None]:
def make_quantile_bands(df_base, samples, quantiles=[5, 15, 50, 85, 95]):
    """Summarise samples as quantile bands in long format.

    Quantiles are paired from the outside in (first with last, second with
    second-to-last, ...). Each pair becomes one band with an ``upper`` and a
    ``lower`` boundary. With an odd number of quantiles the middle one is
    paired with itself; that band is named ``"median"`` when the quantile is 50.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Frame the boundaries are attached to; it is copied per band and not
        modified. Its row count has to match the quantile array computed
        along axis 0 of ``samples``.
    samples : array-like
        Draws with the sample dimension on axis 0.
    quantiles : sequence of numbers
        Percentiles on a 0-100 scale. They are expected in ascending order,
        since the later entry of each pair is used as the upper boundary.
        The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_base`` plus ``band``, ``boundary``
        (``"upper"``/``"lower"``) and ``value``.

    Raises
    ------
    ValueError
        If ``quantiles`` is empty.
    """
    if len(quantiles) == 0:
        raise ValueError("quantiles must contain at least one value")

    # get quantiles
    q_data = np.quantile(samples, [q / 100 for q in quantiles], axis=0)
    boundaries = ["upper", "lower"]

    # create bands from two quantile boundaries (median: upper=lower);
    # collected in a list so the frames are concatenated only once
    bands = []
    for ci in range(math.ceil(len(quantiles) / 2)):
        # name band
        band_range = f"Q{quantiles[ci]}-Q{quantiles[-ci-1]}".replace(
            "Q50-Q50", "median"
        )

        df_band = df_base.copy()
        # row -ci-1 is the upper quantile of the pair, row ci the lower one
        df_band[boundaries] = q_data[[-ci - 1, ci]].T
        df_band["band"] = band_range
        bands.append(df_band)

    df_bands = pd.concat(bands, axis=0)

    # in long format
    df_bands = df_bands.melt(
        id_vars=df_bands.columns.difference(boundaries),
        value_vars=boundaries,
        var_name="boundary",
        value_name="value",
    )

    return df_bands

In [None]:
def format_model_estimates(df_base, pp):
    """Turn posterior predictive samples into one long frame of quantile bands.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Base frame handed to ``make_quantile_bands`` for every variable.
    pp : mapping
        Model variable name -> samples for that variable.

    Returns
    -------
    pandas.DataFrame
        The quantile bands of all variables stacked row-wise, with the
        variable name in the ``model_var`` column. Empty when ``pp`` is empty.
    """
    # get per model variable the quantile bands; frames are gathered first so
    # they are concatenated once instead of re-copying the result per variable
    frames = [
        make_quantile_bands(df_base, samples=samples).assign(model_var=var)
        for var, samples in pp.items()
    ]
    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame
        return pd.DataFrame()

    return pd.concat(frames, axis=0)

In [None]:
def determine_estimates(df_observed, random_seed=None):
    """Fit the drift + seasonality model to one series and return quantile bands.

    The observed frame is extended with the future weeks produced by
    ``extrapolate_data``; the model is sampled on the combined range and the
    posterior predictive samples of ``drift``, ``yearly`` and ``Σ`` are
    summarised with ``format_model_estimates``.

    Parameters
    ----------
    df_observed : pandas.DataFrame
        Observations of a single series. Besides the columns required by
        ``extrapolate_data`` it must contain ``value``, ``processed_on`` and
        ``model_var`` (the latter three are dropped from the base frame).
    random_seed : int or list of int, optional
        Forwarded to ``pm.sample`` and ``pm.sample_posterior_predictive`` so a
        run can be reproduced. ``None`` (default) keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Long-format quantile bands per model variable for the observed plus
        extrapolated date range.
    """
    # extend date
    df_future = extrapolate_data(df_observed)
    df_full_range = pd.concat([df_observed, df_future])

    # scale t, y, p
    # NOTE(review): MinMaxScaler is a project helper; presumably it maps onto a
    # range starting at `lower` -- confirm in src.utils.preprocess.
    t_scaler = MinMaxScaler(lower=0)
    t = t_scaler.fit_transform(X=df_full_range["date"])
    y_scaler = MinMaxScaler(lower=0)
    # the extrapolated rows have no "value", so they are NaN after the concat;
    # NOTE(review): presumably PyMC3 treats these as missing observations --
    # confirm (explicit masking was tried before, see the line below).
    y_observed = y_scaler.fit_transform(X=df_full_range["value"])
    # y_observed = np.ma.masked_invalid(y_observed)
    # the point 52.1775 weeks (one year) after the scaler's minimum, on the scaled axis
    p = t_scaler.transform(t_scaler.min + dt.timedelta(weeks=52.1775))

    # create model
    m = create_model(t=t, y=y_observed, p_fourier=p)
    # display(pm.model_to_graphviz(m))
    # tune model
    trace = pm.sample(
        model=m, draws=500, tune=500, init="adapt_diag", random_seed=random_seed
    )
    # extract posterior predictions
    pp = pm.sample_posterior_predictive(
        model=m,
        trace=trace,
        samples=1000,
        var_names=["drift", "yearly", "Σ"],
        random_seed=random_seed,
    )

    # inverse scale samples
    for k in pp.keys():
        pp[k] = y_scaler.inverse_transform(pp[k])
        if k == "yearly":
            # NOTE(review): presumably removes the offset that inverse_transform
            # adds, so the seasonal component is not shifted by the data
            # minimum -- confirm against MinMaxScaler.
            pp[k] -= y_scaler.min

    # create base df and join it with posterior predictive quantiles
    df_base = df_full_range.drop(columns=["value", "processed_on", "model_var"]).copy()
    df_estimates = format_model_estimates(df_base, pp)

    return df_estimates

## forecasts

In [None]:
# Fit the model separately for each extreme. The estimate frames are gathered
# first and concatenated once, instead of re-copying df_total on every pass.
estimate_frames = []
for extreme in ["min", "max"]:
    df_observed = df_train.query(f"extreme == '{extreme}'")
    df_estimates = determine_estimates(df_observed)
    estimate_frames.append(df_estimates)

# observations (train + held-out) followed by the estimates per extreme
df_total = pd.concat([df_train, df_test, *estimate_frames], axis=0)

In [None]:
# Inspect the combined frame: train and test observations plus the model estimates.
df_total

In [None]:
# Observed rows together with the future part of the full-model (Σ) estimates.
plot_total(df_data=df_total.query("model_var=='observed' | (model_var=='Σ' & period=='future')"), df_meta=df_meta)

In [None]:
# Estimates of the drift component only.
plot_estimate(df_total.query("model_var=='drift'"), legend=None)

In [None]:
def plot_decomp(df):
    """Stack the estimate plots of ``Σ``, ``drift`` and ``yearly`` vertically.

    Parameters
    ----------
    df : pandas.DataFrame
        Estimates with a ``model_var`` column; one panel is drawn per variable
        from the rows matching that variable.

    Returns
    -------
    Altair chart
        The result of ``alt.vconcat`` over the three panels, each titled with
        its variable name and 100 high.
    """
    panels = [
        plot_estimate(df.query(f"model_var=='{name}'"), legend=None).properties(
            title=name, height=100
        )
        for name in ("Σ", "drift", "yearly")
    ]
    return alt.vconcat(*panels)

In [None]:
# Decomposition into Σ, drift and yearly panels for the 'max' series.
plot_decomp(df_total.query("extreme == 'max'"))