## Time series processing

A time series is a sequence of data points recorded or measured at successive points in time, typically at equally spaced intervals. Time series are a fundamental source of information on dynamical systems across the different water domains.

In this notebook we will cover some data pre-processing techniques that are specific to time series data and can help structure the data in a way that improves the performance of machine learning models.

**1. Load libraries and data**

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt


In [None]:
# Load the dataset directly from the course repository.
# The first CSV column is used as the index and parsed as dates.
file_url = 'https://github.com/DHI/Intro_ML_course/raw/main/module_5/danube_discharge.csv'
df = pd.read_csv(
    file_url,
    index_col=0,
    parse_dates=True,
)

# Preview the first rows
df.head()

In [None]:
# Preview the last rows of the loaded dataset
df.tail()

The dataset contains daily values recorded between 1998 and 2008:
* average discharge in m3/s (Ceatal Izmail station, https://www.hidro.ro/)
* average daily precipitation in mm/day (large area above lower Danube catchment, from ECMWF Reanalysis v5 (ERA5) - https://www.ecmwf.int/en/forecasts/dataset/ecmwf-reanalysis-v5)
* average daily temperature in degrees Celsius (same area and source as precipitation)

In [None]:
# One figure with three stacked rows sharing the same time axis
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.07)

# (column, label) per row, top to bottom; the label is used both as the
# trace name and as the y-axis title
panels = [
    ('discharge', 'Discharge (m3/s)'),
    ('precipitation', 'Precipitation (mm)'),
    ('temperature', 'Temperature (C)'),
]

for row, (column, label) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df.index, y=df[column], name=label), row=row, col=1)
    fig.update_yaxes(title_text=label, row=row, col=1)

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), showlegend=False)

fig.show()

**2. Data transformations**

Some common pre-processing techniques for timeseries data include:
* Differencing: consists of subtracting the previous observation from the current observation. It is used to transform a time series so that it becomes stationary.
* Rolling windows: used to perform moving aggregate computations, such as moving averages, sums, or other statistics.
* Sin/Cos transformations of cyclic features: trigonometric functions are used to encode cyclic features (e.g. the month of the year) in a way that preserves their cyclical nature.

In [None]:
# First-order differencing: each value minus the previous one
# (the first row has no predecessor and is therefore NaN)
df['discharge_diff'] = df['discharge'].diff(periods=1)

# 2x2 grid: time series in the left column, distribution of the same
# values in the right column (y-axes shared within each row)
fig = make_subplots(rows=2, cols=2, shared_yaxes=True, shared_xaxes=True, vertical_spacing=0.07, horizontal_spacing=0.03)

# (column, label) per row; the label doubles as trace name and y-axis title
panels = [
    ('discharge', 'Discharge (m3/s)'),
    ('discharge_diff', 'Discharge diff (m3/s)'),
]

# Left column: the series over time
for row, (column, label) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df.index, y=df[column], name=label), row=row, col=1)

# Right column: horizontal histograms, added after the line traces so the
# order of traces in the figure is scatter, scatter, histogram, histogram
for row, (column, label) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(y=df[column], name=label), row=row, col=2)

# Only the left-hand y-axes get a title
for row, (_, label) in enumerate(panels, start=1):
    fig.update_yaxes(title_text=label, row=row, col=1)

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), showlegend=False)

fig.show()

In [None]:
# Rolling window - sum over 30 consecutive rows (about one month of daily data).
# The first 29 rows are NaN because the window is not yet full.
df['precip_monthly_sum'] = df['precipitation'].rolling(window=30).sum()

# Daily values on top, rolling sum below, on a shared time axis
fig = make_subplots(rows=2, cols=1, shared_yaxes=True, shared_xaxes=True, vertical_spacing=0.07, horizontal_spacing=0.03)

# (column, y-axis title) per row
panels = [
    ('precipitation', "Precip. daily (mm)"),
    ('precip_monthly_sum', "Precip. monthly (mm)"),
]

for row, (column, axis_title) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df.index, y=df[column]), row=row, col=1)
    fig.update_yaxes(title_text=axis_title, row=row, col=1)

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), showlegend=False)

fig.show()

In [None]:
# Rolling window - mean over 7 consecutive rows (one week of daily data).
# The first 6 rows are NaN because the window is not yet full.
df['temperature_weekly_mean'] = df['temperature'].rolling(window=7).mean()

# Raw values on top, rolling mean below, on a shared time axis
fig = make_subplots(rows=2, cols=1, shared_yaxes=True, shared_xaxes=True, vertical_spacing=0.07, horizontal_spacing=0.03)

# (column, y-axis title) per row
panels = [
    ('temperature', "Temp. (C)"),
    ('temperature_weekly_mean', "Temp. weekly mean (C)"),
]

for row, (column, axis_title) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df.index, y=df[column]), row=row, col=1)
    fig.update_yaxes(title_text=axis_title, row=row, col=1)

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), showlegend=False)

fig.show()

In [None]:
# Cyclic features - sin and cos transform of the calendar month.
# The month number (1-12) is mapped to an angle on the unit circle, so that
# December and January end up next to each other.
month_angle = 2*np.pi*df.index.month/12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Raw month number on top, then its sine and cosine encodings
fig = make_subplots(rows=3, cols=1, shared_yaxes=True, shared_xaxes=True, vertical_spacing=0.07, horizontal_spacing=0.03)

# (values, y-axis title) per row
panels = [
    (df.index.month, "Month"),
    (df['month_sin'], "Sin(month)"),
    (df['month_cos'], "Cos(month)"),
]

for row, (values, axis_title) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df.index, y=values), row=row, col=1)
    fig.update_yaxes(title_text=axis_title, row=row, col=1)

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), showlegend=False)

fig.show()

**3. Data alignment**

When dealing with time series, it's important that the input and target features are correctly aligned in the time dimension to meet the desired objective. See the pre-recorded presentation for additional details on the motivation behind these data alignments.

In [None]:
# Step-wise alignment: pair every observation x_t with the next one, x_t+1.
# shift(-1) moves the series one row up, so row t holds the value from row t+1.
df_step = df[['discharge']].rename(columns={'discharge': 'x_t'})
df_step = df_step.assign(**{'x_t+1': df_step['x_t'].shift(-1)})

df_step.head(6)

In [None]:
# Last 6 rows: the final row has no following observation, so its x_t+1 is NaN
df_step.tail(6)

In [None]:
# Window-input alignment: each row holds the two previous values, the current
# value and the next value. Positive shifts look back in time, negative
# shifts look ahead. Columns are created directly in their final order.
discharge = df['discharge']
df_window = pd.DataFrame({
    'x_t-2': discharge.shift(2),
    'x_t-1': discharge.shift(1),
    'x_t': discharge,
    'x_t+1': discharge.shift(-1),
})

df_window.head(6)

In [None]:
# Last 6 rows: the final row has no following observation, so its x_t+1 is NaN
df_window.tail(6)

In [None]:
# Forecast horizon alignment: each row holds the values from t-2 up to t
# followed by the next three values (t+1 .. t+3).
# Column name -> number of rows to shift by, listed in the final column order
# (positive = look back, negative = look ahead, 0 = the series itself).
horizon_shifts = {
    'x_t-2': 2,
    'x_t-1': 1,
    'x_t': 0,
    'x_t+1': -1,
    'x_t+2': -2,
    'x_t+3': -3,
}
df_horizon = pd.DataFrame(
    {name: df['discharge'].shift(periods) for name, periods in horizon_shifts.items()}
)

df_horizon.head(6)

In [None]:
# Last 6 rows: the final three rows lack some or all future observations, so they contain NaN in the x_t+k columns
df_horizon.tail(6)

In [None]:
# Multivariate alignment: precipitation (x1_t), temperature (x2_t) and the
# current discharge (y_t) are placed next to the next-step discharge (y_t+1).
# Columns are selected in their final order, so y_t is kept and no separate
# reordering step is needed.
df_multi = (
    df[['precipitation', 'temperature', 'discharge']]
    .rename(columns={'precipitation': 'x1_t', 'temperature': 'x2_t', 'discharge': 'y_t'})
    .assign(**{'y_t+1': lambda frame: frame['y_t'].shift(-1)})
)

df_multi.head(6)

In [None]:
# Last 6 rows: the final row has no following observation, so its y_t+1 is NaN
df_multi.tail(6)