In [None]:
import os
import sys
sys.path.append("../")

import pandas as pd
import numpy as np
import datetime as dt
import re

import logging
logger = logging.getLogger('SPARK')
logger.setLevel(level=logging.WARNING)

import altair as alt
alt.data_transformers.disable_max_rows()

from src.data.query_snowflake import update_week_extremes, read_week_extremes, read_meta
from src.plot.altair import plot_limits, plot_history
from src.plot.formatting import format_limits, format_history

In [None]:
# Read the metadata without a boxid filter. The sampling cell further down draws
# random "boxid" values from this frame.
# NOTE(review): presumably this covers every known box - confirm against read_meta.
df_meta_full = read_meta()

In [None]:
# sample_box = df_meta_full.sample()["boxid"].squeeze()
# sample_box = 'TBB.581265-1'

In [None]:
# %%time
# df_meta = read_meta(sample_box)
# df_limits = format_limits(df_meta=df_meta)
# df_data = read_week_extremes(boxid=sample_box, L="sumli")
# df_plot = format_history(df_data)
# plot_limits(df_limits) + plot_history(df_plot)

## clean data

In [None]:
def too_short(df_data, threshold=52):
    """Tell whether ``df_data`` contains fewer than ``threshold`` rows.

    Args:
        df_data: Any sized collection; only ``len(df_data)`` is inspected.
        threshold: Minimum number of rows required.

    Returns:
        True when the row count is strictly below ``threshold``, else False.
        The check itself and a positive outcome are logged at INFO level.
    """
    logger.info(f"checking number of data points (<{threshold})")
    row_count = len(df_data)
    if not row_count < threshold:
        return False
    logger.info(f"number of data points ({len(df_data)}) under threshold ({threshold})")
    return True

In [None]:
def too_small(df_data, capacity, threshold=0.25):
    """Tell whether the largest absolute extreme stays below a capacity fraction.

    Args:
        df_data: Frame with numeric ``"max"`` and ``"min"`` columns.
        capacity: Reference value the extremes are compared against.
        threshold: Fraction of ``capacity`` that must be reached.

    Returns:
        True when the largest absolute value found in the ``"max"`` and
        ``"min"`` columns is strictly below ``capacity * threshold``, else
        False. The check and a positive outcome are logged at INFO level.
    """
    logger.info(f"checking absolute values (<{threshold})")
    largest_abs = df_data[["max", "min"]].abs().max().max()
    required = capacity * threshold
    if not largest_abs < required:
        return False
    logger.info(
        f"value of data points are smaller than {threshold} times capacity ({capacity})"
    )
    return True

In [None]:
def remove_leading_idling(df_data, capacity, threshold=0.01):
    """Drop the leading rows in which the box is still idling.

    Rows are ordered by ``"year"`` and ``"week"``. A row counts as active when
    the larger of ``|max|`` and ``|min|`` exceeds ``capacity * threshold``.
    Everything before the first active row is removed; from that row onward all
    rows are kept, including later rows that fall below the threshold again.

    Args:
        df_data: Frame with ``"year"``, ``"week"``, ``"max"`` and ``"min"`` columns.
        capacity: Reference value the extremes are compared against.
        threshold: Fraction of ``capacity`` below which a row counts as idle.

    Returns:
        The sorted frame without its leading idle rows. If no row is active
        (or the input is empty) the result is empty.
    """
    logger.info(f"removing leading low values (<{threshold})")
    df_sorted = df_data.sort_values(["year", "week"])
    is_active = df_sorted[["max", "min"]].abs().max(axis=1) > capacity * threshold
    # A running maximum over the boolean mask flips to True at the first active
    # row and stays True afterwards. Unlike an argmax()-based slice it stays
    # all-False when nothing is active, and it does not fail on empty input.
    keep = is_active.cummax().astype(bool)
    return df_sorted.loc[keep]

In [None]:
# remove_idle_values(df_data, capacity, threshold=0.01).reset_index(drop=True)["max"].plot()

In [None]:
def load_data(boxid, min_rows=52 * 2, max_loading=0.50, threshold_idling=0.01):
    """Load and clean the weekly extremes of one box, or return None if unusable.

    Steps, each of which can reject the box:

    1. Read the metadata for ``boxid``; reject when it is empty.
    2. Read the weekly extremes (``L="sumli"``); reject when they are empty.
    3. Reject when there are fewer than ``min_rows`` rows.
    4. Reject when the largest absolute extreme stays below
       ``max_loading * capacity``.
    5. Strip leading idle rows (below ``threshold_idling * capacity``) and
       reject when fewer than ``min_rows`` rows remain.

    Args:
        boxid: Identifier passed to ``read_meta`` and ``read_week_extremes``.
        min_rows: Minimum number of weekly rows required (default 104).
        max_loading: Fraction of capacity the data must reach at least once.
        threshold_idling: Fraction of capacity below which leading rows count
            as idle.

    Returns:
        ``(df_data, df_meta)`` when every check passes, otherwise ``None``.
    """
    # Metadata first: without it there is no capacity to compare against.
    df_meta = read_meta(boxid=boxid)
    if len(df_meta) == 0:
        logger.info(f"no meta data available for boxid: {boxid}")
        return None

    df_data = read_week_extremes(boxid=boxid, L="sumli")
    if len(df_data) == 0:
        logger.info(f"no week extreme data available for boxid: {boxid}")
        return None

    # NOTE(review): "vermogen_nominaal" is presumably the nominal power of the
    # box; squeeze() only yields a scalar when df_meta has exactly one row.
    capacity = df_meta["vermogen_nominaal"].squeeze()

    if too_short(df_data, threshold=min_rows):
        return None
    if too_small(df_data, capacity, threshold=max_loading):
        return None

    df_data = remove_leading_idling(df_data, capacity, threshold=threshold_idling)
    # Removing the idle lead-in can push the series below the minimum length.
    if too_short(df_data, threshold=min_rows):
        return None

    return df_data, df_meta

In [None]:
# Box to try first so the notebook is reproducible; set to None to rely on
# random sampling only.
pinned_box = "GNG.ts0521-1"
# Upper bound on the number of boxes tried before giving up.
max_attempts = 25

result = None
for attempt in range(max_attempts):
    # Only the first attempt uses the pinned box. If it fails the data checks,
    # later attempts draw random boxes, so the loop cannot spin forever on a
    # box that load_data keeps rejecting.
    if attempt == 0 and pinned_box is not None:
        sample_box = pinned_box
    else:
        sample_box = df_meta_full.sample()["boxid"].squeeze()
    result = load_data(boxid=sample_box)
    if result is not None:
        break
else:
    raise RuntimeError(f"no usable box found in {max_attempts} attempts")

df_data, df_meta = result
df_limits = format_limits(df_meta=df_meta)
df_plot = format_history(df_data)
display(plot_limits(df_limits) + plot_history(df_plot))
print(sample_box)

In [None]:
def plot_total(df_data=None, df_meta=None, df_forecast=None):
    """Layer the forecast, history and limits charts into one interactive chart.

    Each argument is optional; a layer is only built for the inputs that are
    given. Layers are stacked in the order forecast, history, limits.

    Args:
        df_data: History data, formatted with ``format_history`` and drawn
            with ``plot_history``.
        df_meta: Metadata, formatted with ``format_limits`` and drawn with
            ``plot_limits``.
        df_forecast: Forecast data, formatted with ``format_forecast`` and
            drawn with ``plot_forecast``.

    Returns:
        An interactive layered Altair chart with independent colour and shape
        scales per layer.

    Raises:
        ValueError: If all three inputs are None, since there is nothing to plot.
    """
    plots = []

    if df_forecast is not None:
        # NOTE(review): plot_forecast and format_forecast are not among the
        # names imported at the top of this notebook; confirm they exist and
        # are imported before passing df_forecast.
        plots.append(plot_forecast(format_forecast(df_forecast)))

    if df_data is not None:
        plots.append(plot_history(format_history(df_data)))

    if df_meta is not None:
        plots.append(plot_limits(format_limits(df_meta=df_meta, df_data=None)))

    if not plots:
        # Fail early with a message that names the real problem, instead of
        # letting the empty layer chart fail further down in Altair.
        raise ValueError(
            "plot_total needs at least one of df_data, df_meta or df_forecast"
        )

    return (
        alt.layer(*plots)
        .resolve_scale(color="independent", shape="independent")
        .interactive()
    )

In [None]:
# Inspect the cleaned weekly data that load_data returned for the sampled box.
df_data

In [None]:
# NOTE: this first expression is evaluated but not shown; a notebook cell only
# displays the value of its last expression.
dt.timedelta(weeks=26)
# Calendar date 26 weeks before today.
dt.datetime.now().date() - dt.timedelta(weeks=26)

In [None]:
# The same 26-week offset, anchored on the latest "date" in df_data instead of
# today. split_last computes this same value as its cut-off with the default period.
df_data["date"].max() - dt.timedelta(weeks=26)

In [None]:
def split_last(df_data, period=dt.timedelta(weeks=26)):
    """Split ``df_data`` into a training part and a trailing test part.

    The cut-off is the latest value in the ``"date"`` column minus ``period``.
    Rows dated strictly before the cut-off go to the training frame; rows dated
    on or after it go to the test frame.

    Args:
        df_data: Frame with a ``"date"`` column.
        period: Length of the trailing test window.

    Returns:
        A ``(df_train, df_test)`` tuple.
    """
    dates = df_data["date"]
    cutoff = dates.max() - period
    is_train = dates < cutoff
    is_test = dates >= cutoff
    return df_data[is_train], df_data[is_test]

In [None]:
# Hold out the trailing window (split_last's default period of 26 weeks) as
# df_test; all earlier rows become df_train.
df_train, df_test = split_last(df_data)

In [None]:
# Build a placeholder forecast in long format from the held-out weeks.
# The wide test frame is re-derived from df_data instead of being read from
# df_test, so this cell can be re-run even though it overwrites df_test below.
extremes = ["max", "min"]
_, df_test_wide = split_last(df_data)
df_long = df_test_wide.melt(
    id_vars=list(df_test_wide.columns.difference(extremes)),
    value_vars=extremes,
    var_name="extreme",
)

# "Q10-Q90" rows: a band of +/- 50 around the observed value.
df_band = df_long.assign(
    forecast="Q10-Q90",
    upper=df_long["value"] + 50,
    lower=df_long["value"] - 50,
)

# "median" rows: a zero-width band that sits on the observed value.
df_median = df_long.assign(
    forecast="median",
    upper=df_long["value"],
    lower=df_long["value"],
)

df_test = pd.concat([df_band, df_median])

In [None]:
# Inspect the long-format placeholder forecast ("Q10-Q90" band rows followed by
# "median" rows).
df_test

In [None]:
# df_test_max = (
#     df_test
#     .assign(Q10=df_test["max"] - 50)
#     .assign(Q90=df_test["max"] + 50)
#     .assign(Q50=df_test["max"])
#     .assign(limit="max")
# )
# df_test_min = (
#     df_test
#     .assign(Q10=df_test["min"] - 50)
#     .assign(Q90=df_test["min"] + 50)
#     .assign(Q50=df_test["min"])
#     .assign(limit="min")
# )
# df_test = pd.concat([df_test_min, df_test_max])
# df_test= df_test[["boxid", "date", "Q10", "Q50", "Q90", "limit"]]
# df_test.sample(5)

In [None]:
# Melt every quantile column (names matching Q + 2-3 digits) into long format.
# NOTE(review): this expects the wide quantile layout (boxid, date, limit,
# Q10/Q50/Q90) that only the commented-out cell above produces. The df_test
# built in this notebook has extreme/value/forecast/upper/lower columns instead,
# so unless read_week_extremes already supplies a "limit" column this melt has
# nothing to work on - TODO confirm and either restore that cell or drop this one.
# The result is displayed only; it is not assigned.
df_test.melt(
    id_vars=["boxid", "date", "limit"],
    value_vars=[c for c in df_test.columns if re.match(r"Q\d{2,3}", c)],
)

In [None]:
# Display df_test again; the melt in the previous cell was not assigned, so the
# frame is unchanged.
df_test

In [None]:
# df_test = df_test.query("extreme =='max'")
# alt_band = alt.Chart(df_test).mark_area(line=True).encode(
#     x=alt.X("date:T"),
#     y=alt.Y("lower:Q", stack=None, title=""),
#     y2=alt.Y2("upper:Q", title=""),
#     opacity=alt.Opacity("forecast:N", scale=alt.Scale(domain=["Q10-Q90", "median"], range=[.3, .8])),
#     detail="extreme:N"
# ).interactive()
# alt_band

In [None]:
# df_test is in long format at this point: one row per date, extreme and
# forecast kind, with "value", "lower" and "upper" columns. The band is drawn
# from the "Q10-Q90" rows and the line from the "median" rows; the Q10/Q50/Q90
# and "limit" columns of the earlier wide layout are not present in this frame.
band_rows = df_test[df_test["forecast"] == "Q10-Q90"]
median_rows = df_test[df_test["forecast"] == "median"]

# Shaded area between the lower and upper bound, one area per extreme.
alt_band = alt.Chart(band_rows).mark_area(opacity=0.3).encode(
    x=alt.X("date:T"),
    y=alt.Y("lower:Q", stack=None, title=""),
    y2=alt.Y2("upper:Q", title=""),
    detail="extreme:N",
).interactive()

# Median line, one line per extreme.
alt_median = alt.Chart(median_rows).mark_line().encode(
    x=alt.X("date:T"),
    y=alt.Y("value:Q"),
    detail="extreme:N",
)

alt_band + alt_median + plot_limits(df_limits)

In [None]:
def dummy_forecast(df_data, margin=50):
    """Build a placeholder forecast in long format from observed extremes.

    The ``"max"`` and ``"min"`` columns are melted into an ``"extreme"`` /
    ``"value"`` pair, with all other columns kept as identifiers. Each melted
    row is then emitted twice:

    * ``forecast == "Q10-Q90"``: ``upper``/``lower`` are ``value`` +/- ``margin``.
    * ``forecast == "median"``: ``upper`` and ``lower`` both equal ``value``.

    Args:
        df_data: Wide frame with ``"max"`` and ``"min"`` columns. It is not
            modified.
        margin: Half-width of the placeholder band.

    Returns:
        A new frame with the band rows followed by the median rows. The index
        is not reset, so index labels repeat.
    """
    extremes = ["max", "min"]
    # Identifier columns come from the argument itself, not from a notebook global.
    id_columns = list(df_data.columns.difference(extremes))
    df_long = df_data.melt(
        id_vars=id_columns,
        value_vars=extremes,
        var_name="extreme",
    )

    df_band = df_long.assign(
        forecast="Q10-Q90",
        upper=df_long["value"] + margin,
        lower=df_long["value"] - margin,
    )
    df_median = df_long.assign(
        forecast="median",
        upper=df_long["value"],
        lower=df_long["value"],
    )
    return pd.concat([df_band, df_median])

In [None]:
# dummy_forecast melts the "max"/"min" columns, so it needs the wide test frame.
# df_test has already been reshaped to long format by an earlier cell, so the
# wide frame is taken from a fresh split of df_data.
df_forecast = dummy_forecast(split_last(df_data)[1])

In [None]:
# Inspect the frame returned by dummy_forecast.
df_forecast

In [None]:
# Layer forecast, history and limits for the sampled box.
# NOTE(review): df_test is passed as the forecast although df_forecast was built
# just above; confirm which of the two is intended here.
plot_total(df_data=df_data, df_meta=df_meta, df_forecast=df_test)

In [None]:
# Inspect format_limits when df_data is supplied as well; the earlier calls in
# this notebook pass only df_meta (or df_data=None). The result is displayed,
# not stored.
format_limits(df_meta=df_meta, df_data=df_data)

In [None]:
# Prototype of a two-point limits frame: take two weeks from the sorted data,
# convert each to the Monday of its ISO week, and attach constant lower/upper
# values.
# NOTE(review): iloc[[1, -1]] selects the *second* and the last row. If the goal
# is to span the full date range, [0, -1] is probably intended - confirm.
boundary_rows = df_data.sort_values(["year", "week"]).iloc[[1, -1]]
df_plot = pd.DataFrame(columns=["date"], data=boundary_rows.apply(
    lambda row: dt.datetime.fromisocalendar(row["year"], row["week"], 1), axis=1
))
df_plot["lower"] = -1
df_plot["upper"] = 1
# var_name must be a scalar for ordinary (non-MultiIndex) columns; newer pandas
# versions reject a list here.
df_plot.melt(id_vars=["date"], value_vars=["lower", "upper"], var_name="limit")

In [None]:
# Hand-picked box ids, kept for reference; the list is displayed but not
# assigned to a name. "GNG.ts0521-1" is the id pinned in the sampling cell above.
# NOTE(review): presumably candidates to pin as sample_box - confirm.
[
    "ESD.000088-1",
    "063.623-1",
    "VRY.CHOPS-1",
    "HVT.111153-1",
    "TTR.251049-1",
    "BGL.CROLA-1",
    "ESD.000028-1",
    "HVT.371157-1",
    "GNG.ts0521-1",
    "166.631-1",
]