# Feature engineering

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures

from darts import TimeSeries
from darts.utils.missing_values import extract_subseries

In [3]:
def holt_smoother(x: np.ndarray, alpha: float) -> np.ndarray:
    """Smooth a signal with simple exponential smoothing.

    The recursion is ``y[0] = x[0]`` and, for ``t >= 1``,
    ``y[t] = alpha * x[t] + (1 - alpha) * y[t - 1]``. It runs along the
    first axis of ``x``.

    Args:
        x: Observations, ordered in time along the first axis. Any
            array-like accepted by ``np.asarray`` works.
        alpha: Weight given to the newest observation. ``alpha = 1`` returns
            the input unchanged; smaller values smooth more strongly.

    Returns:
        Array with the same shape as ``x`` holding the smoothed values.
        Floating-point (and complex) inputs keep their dtype; any other dtype
        (e.g. integers) is promoted to ``float64``. An empty input yields an
        empty output.
    """
    # https://empslocal.ex.ac.uk/people/staff/dbs202/cag/courses/MT37C/course/node102.html
    x = np.asarray(x)

    # The output must be able to hold fractional values: inheriting an
    # integer dtype from ``x`` would silently truncate every smoothed sample.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    y = np.zeros(x.shape, dtype=out_dtype)

    # Nothing to initialise the recursion with.
    if len(x) == 0:
        return y

    y[0] = x[0]
    for t in range(1, len(x)):
        y[t] = alpha * x[t] + (1 - alpha) * y[t - 1]

    return y


def generate_datetime_ts(ts: TimeSeries) -> TimeSeries:
    """Generates time series of date time features using one hot encoding

    The day of week and the hour of day of every timestamp in ``ts`` are each
    expanded into indicator columns named ``dow_<value>`` and
    ``hour_<value>``. Only values that actually occur in the time index get a
    column. The returned series shares the time index of ``ts``.
    """
    time_index = ts.time_index

    dow_dummies = (
        pd.get_dummies(time_index.day_of_week)
        .add_prefix("dow_")
        .set_index(time_index)
    )
    hour_dummies = (
        pd.get_dummies(time_index.hour)
        .add_prefix("hour_")
        .set_index(time_index)
    )

    # Both frames carry the same time index, so joining on it simply places
    # the two groups of indicator columns side by side.
    encoded = pd.merge(
        dow_dummies, hour_dummies, left_index=True, right_index=True
    )

    # Indicators are stored as integers (0/1) in the resulting series.
    return TimeSeries.from_dataframe(encoded.astype(int))


def generate_poly_ts(ts: TimeSeries, degree: int) -> TimeSeries:
    """Generates time series of polynomial features of the components of ``ts``

    Args:
        ts: Series whose values are expanded; its time index and frequency
            are reused for the result.
        degree: Maximum degree of the polynomial terms, passed to
            ``PolynomialFeatures``. The constant (bias) column is left out.

    Returns:
        A series on the same time index as ``ts`` whose components are named
        ``feature_0``, ``feature_1``, ... in the column order produced by
        ``PolynomialFeatures``.
    """
    poly_values = PolynomialFeatures(degree, include_bias=False).fit_transform(
        ts.values()
    )
    poly_features = pd.DataFrame(
        poly_values,
        columns=[f"feature_{i}" for i in range(poly_values.shape[1])],
        index=ts.time_index,
    )

    # Reuse the frequency of the input rather than assuming hourly data: a
    # hard-coded "h" would make darts re-index any non-hourly series onto an
    # hourly grid.
    return TimeSeries.from_dataframe(
        poly_features,
        freq=ts.freq_str,
    )

In [None]:
# Load the processed data; the first CSV column is parsed as a datetime index.
# Column "FB20F11_81" is renamed to the shorter "flow".
data = pd.read_csv("../processed/data.csv", index_col=0, parse_dates=True).rename(
    columns={"FB20F11_81": "flow"}
)

# These variables have a big gap at the end, so we discard them
data = data.drop(columns=["temp_grass", "temp_soil_30"])

- We transform the target (`flow`) and the accumulated precipitation (`acc_precip`) with $\log(1 + x)$, which is also defined when a value is zero. This step _squeezes_ their values into a narrower range, which can help during training.

In [None]:
# log(1 + x) compresses large values and stays finite when a value is zero.
log_columns = ["flow", "acc_precip"]
data[log_columns] = np.log1p(data[log_columns])

- For convenience, we convert the data to a darts `TimeSeries`. Then we extract the subseries that contain no gaps and discard those that are too short (less than 7 days of hourly data).

In [None]:
# For later convenience, we will work with darts TimeSeries
ts = TimeSeries.from_dataframe(data, freq="h")

# Shortest chunk worth keeping: one week of hourly samples.
min_length = 7 * 24

# mode="any" splits the series wherever at least one column is missing, so
# every chunk kept here is complete in all columns.
subseries = [
    chunk for chunk in extract_subseries(ts, mode="any") if len(chunk) >= min_length
]

- We split each subseries into the target (`flow`) and the covariates (all remaining columns).

In [7]:
# Target: the "flow" component of every gap-free chunk.
target_subseries = [chunk["flow"] for chunk in subseries]

# Covariates: everything that remains once "flow" is removed.
covariates_subseries = [chunk.drop_columns("flow") for chunk in subseries]

- The observed precipitation varies faster than its effect on the target. We therefore smooth the precipitation with a simple exponential smoother, $y_t = \alpha x_t + (1 - \alpha) y_{t-1}$ (implemented in `holt_smoother`); the value of the parameter $\alpha$ is chosen based on the correlation study in the overview [notebook](./overview.ipynb).

In [8]:
# Weight of the newest sample in the exponential smoother (smaller = smoother).
alpha = 0.2
for i, series in enumerate(subseries):
    # Smooth the precipitation of this chunk and wrap it as a one-component
    # series on the chunk's own time index.
    lp_precip = holt_smoother(series["acc_precip"].values().ravel(), alpha)
    ts_lp = TimeSeries.from_series(
        pd.Series(lp_precip, index=series.time_index, name="smooth_precip")
    )
    subseries[i] = series.concatenate(ts_lp, axis=1)

    # The covariates were split off from `subseries` before this cell, so the
    # new component has to be appended to them as well; otherwise the
    # smoothed precipitation never reaches the feature expansion that is
    # built from `covariates_subseries`.
    # NOTE: re-running this cell appends the component again; re-run the
    # notebook from the subseries extraction cell instead.
    covariates_subseries[i] = covariates_subseries[i].concatenate(ts_lp, axis=1)

- We include polynomial functions of the weather features
- We include datetime features, using one hot encoding

In [9]:
deg = 3  # polynomial degree

# One polynomial-feature series and one one-hot datetime series per chunk.
poly_covariates = [generate_poly_ts(cov, deg) for cov in covariates_subseries]
datetime_covariates = list(map(generate_datetime_ts, covariates_subseries))

# Stack both feature groups component-wise (axis=1), chunk by chunk.
expanded_covariates = [
    poly_ts.concatenate(calendar_ts, axis=1)
    for poly_ts, calendar_ts in zip(poly_covariates, datetime_covariates)
]