# Overview

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os.path import abspath

from darts import TimeSeries
from darts.utils.missing_values import fill_missing_values
from darts.utils.statistics import extract_trend_and_seasonality, ModelMode

from utils import find_flat_periods

## Loading data

In [None]:
# Locations of the raw observation files and of the merged output file.
path_to_inflow = "../observations/wwtp.csv"
path_to_dmi_data = "../observations/dmi.csv"
path_to_merged_data = "../processed/data.csv"

# Semicolon-separated file; the first column is parsed as the datetime index.
inflow = pd.read_csv(
    abspath(path_to_inflow),
    sep=";",
    index_col=0,
    parse_dates=True,
)
inflow.index.name = "time"

# Interpret the naive timestamps as Copenhagen local time (ambiguous
# wall-clock times become NaT), then express them in UTC.
local_index = inflow.index.tz_localize("Europe/Copenhagen", ambiguous="NaT")
inflow.index = local_index.tz_convert("utc")

DMI always provides data in UTC (https://opendatadocs.dmi.govcloud.dk/en/Data/Climate_Data)

In [None]:
# Comma-separated file; the first column is parsed as the datetime index.
climate = pd.read_csv(
    abspath(path_to_dmi_data),
    sep=",",
    index_col=0,
    parse_dates=True,
)
# Tag the naive climate timestamps as UTC so they align with the inflow index.
climate.index = climate.index.tz_localize("utc")

# Outer join on the timestamp index keeps rows present in either source.
# NOTE(review): the last row of the merge is dropped; the reason is not
# visible in this notebook -- confirm it is still needed.
data = (
    inflow.merge(climate, how="outer", left_index=True, right_index=True)
    .iloc[:-1]
)
# Strip the timezone information from the index (values remain in UTC).
data.index = data.index.tz_convert(None)

## Gaps

All flow measurements that are zero or only slightly above zero (i.e. below the threshold defined in the next cell) are set to NaN, since they are treated as missing values.

In [None]:
# Flow readings strictly below this value are treated as missing.
threshold_flow = 2
# Blank out only the "flow" column. Assigning through a boolean mask on the
# whole frame (data[mask] = nan) would overwrite every column on those rows,
# wiping the climate observations as well.
data.loc[data["flow"] < threshold_flow, "flow"] = np.nan

There are also periods in which the flow is a flat line; we treat those values as NaN as well.

In [None]:
period_length = 6  # hours
min_delta = 5  # flow

# Detect flat stretches in the flow signal with the project helper.
stable_periods = find_flat_periods(
    data["flow"],
    min_span_length=period_length,
    min_delta=min_delta,
)

# Each detected period is unpacked into range(), so it is presumably a
# (start, stop) pair of positional row indices -- verify against the helper.
stable_idx = []
for bounds in stable_periods:
    stable_idx.extend(range(*bounds))

# Translate positions into index labels and blank the flow on those rows.
flat_timestamps = data.index[stable_idx]
data.loc[flat_timestamps, "flow"] = np.nan

In [None]:
# Wrap the merged frame in an hourly darts TimeSeries without inserting
# any missing timestamps.
hourly = TimeSeries.from_dataframe(data, fill_missing_dates=False, freq="h")

# Linear interpolation of missing values; the extra keyword arguments
# (limit=3, limit_area="inside") are forwarded to the interpolation routine.
ts = fill_missing_values(hourly, limit_area="inside", limit=3, method="linear")

In [None]:
# One stacked panel per component, all sharing the time axis.
fig, axes = plt.subplots(ts.n_components, 1, figsize=(20, 10), sharex="col")

for ax, component in zip(axes, ts.components):
    series = ts[component]
    series.plot(ax=ax, linewidth=0.8)
    # Shade every remaining gap of this component.
    for _, gap in series.gaps(mode="any").iterrows():
        ax.axvspan(xmin=gap["gap_start"], xmax=gap["gap_end"], color="tomato")
    ax.grid(visible=False)
    ax.set_xlabel("")

- Saving data after merging

In [None]:
# Write the merged, interpolated series to the processed-data CSV file.
ts.to_csv(path_to_merged_data)

## Trends

The following plot shows the daily pattern of the inflow for each month. Note that the flow values are normalized within each month to cancel the effects of a potential yearly trend.

In [None]:
# Daily inflow profile: one panel per calendar month of 2024, showing the
# mean standardised flow for each hour of the day.
df = data[["flow"]].copy()
df["hour"] = df.index.hour

df = df.loc[df.index.year == 2024, :].copy()

ncols = 6
fig, axes = plt.subplots(2, ncols, sharex=True, sharey=True, figsize=(11, 5))

# axes.flat walks the 2 x ncols grid row by row, matching month order.
for month, ax in zip(range(1, 13), axes.flat):
    df_i = df.loc[df.index.month == month]

    # Standardise the flow only. Standardising the whole frame would also
    # rescale the "hour" column, so the x-axis would show z-scored values
    # instead of the hour of day (0-23).
    flow = df_i[["flow"]]
    flow = (flow - flow.mean()) / flow.std()  # Normalizing

    # Group by the untouched hour-of-day values.
    daily_pattern = flow.groupby(df_i["hour"]).mean()

    ax.plot(daily_pattern.index, daily_pattern.values.ravel())
    ax.set_title(f"month: {month}")

plt.suptitle("Daily pattern by month")
plt.tight_layout()

## Correlations

In [None]:
# Pairwise correlation matrix of the columns of `data`, displayed as a
# colour-graded table ("coolwarm" colormap) with values shown to 3 decimals.
data.corr().style.background_gradient(cmap="coolwarm").format(precision=3)

## Precipitation pre-processing

As shown above, the hourly accumulated precipitation is highly correlated with the inflow, but the effect of precipitation on the runoff, and eventually on the inflow, can have slower dynamics. To extract all the information contained in the precipitation series, we suggest creating additional features that better capture the delayed effect of the precipitation. For example, here we compute the rolling sum of the precipitation over the last 24 hours. We invite the reader to test other aggregation periods as well. The figure below compares the daily aggregated rain to the water inflow.

In [None]:
# Build an hourly series from flow and precipitation only, then keep its
# longest contiguous slice (mode="any" is passed to darts; presumably a
# time step counts as a gap when any component is missing -- confirm).
flow_and_rain = data[["flow", "acc_precip"]]
subts = TimeSeries.from_dataframe(flow_and_rain, freq="h")
subts = subts.longest_contiguous_slice(mode="any")

In [None]:
# Width of the rolling precipitation window, in time steps of the series.
n_hours = 24

# Additive STL decomposition of the flow component.
trend, seasonality = extract_trend_and_seasonality(
    subts["flow"], method="STL", model=ModelMode.ADDITIVE
)

# Rolling sum of the precipitation over the last n_hours steps.
rolling_precip = subts["acc_precip"].to_series().rolling(n_hours).sum()

comparison = pd.DataFrame(
    {
        "flow (trend)": trend.values().ravel(),
        "flow (observed)": subts["flow"].values().ravel(),
        "daily_precip": rolling_precip.values,
    },
    index=subts.time_index,
)

# Flow on the primary y-axis, aggregated precipitation on the secondary one.
fig, ax = plt.subplots(1, 1, figsize=(10, 3))
comparison.plot(ax=ax, secondary_y="daily_precip");