# Recurrent Neural Networks

We'll introduce the concept of recurrent neural networks, which explicitly model the time-dependency of their inputs to facilitate the analysis of sequential data. We'll demonstrate this functionality using a weather forecasting dataset in a classic time-series analysis approach.


# Implement a Simple Recurrent Neural Network

We start with an example implementing the forward pass of a recurrent neural network in NumPy. To keep things simple, we assume that the input is a scalar signal with a single feature, in this case a binary signal.


We can use a recurrent relation to implement a feedback "comb" filter.

In [None]:
# Binary test signal: a single pulse of eight ones surrounded by zeros.
input_series = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
output_series = []

# Filter weights: U scales the current input, V the fed-back previous output.
U = 1 / 2
V = 1 - U

state = 0  # Previous output; the filter starts at rest.
for input_data in input_series:
    # There is no nonlinearity, so the new value is at once the hidden value,
    # the output, and the state carried over to the next time step.
    state = U * input_data + V * state
    output_series.append(state)

formatted_inputs = [f"{value:.2f}" for value in input_series]
formatted_outputs = [f"{value:.2f}" for value in output_series]
print(f"Input Series: {formatted_inputs}")
print(f"Output Series: {formatted_outputs}")

In [None]:
from matplotlib import pyplot as plt

# Draw the input first and the output second, each with its own legend entry.
for series, label in (
    (input_series, "input signal"),
    (output_series, "output signal"),
):
    plt.plot(series, label=label)
plt.legend()
plt.show()

We modify the previous code to implement a more general recurrent neuron.

In [None]:
import numpy as np


def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of `x`.

    `x` can be a scalar or a NumPy array; the function is applied elementwise.

    The value is computed as exp(-log(1 + exp(-x))) using `np.logaddexp`,
    which is mathematically identical to the textbook formula but does not
    overflow for large negative `x` (where exp(-x) would exceed the floating
    point range).
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), evaluated without overflow.
    return np.exp(-np.logaddexp(0, -x))


# Binary test signal: two pulses of four ones separated by zeros.
input_series = [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]

# Random parameters: input weight U, recurrent weight V, output weight W,
# and bias b.
U, V, W, b = np.random.normal(size=4)

output_series = []
state = 0  # Previous output fed back into the neuron; starts at zero.
for input_data in input_series:
    pre_activation = U * input_data + V * state + b
    hidden = sigmoid(pre_activation)
    # The output of this step is also the state for the next step.
    state = sigmoid(hidden * W)
    output_series.append(state)

formatted_inputs = [f"{value:.2f}" for value in input_series]
formatted_outputs = [f"{value:.2f}" for value in output_series]
print(f"Input Series: {formatted_inputs}")
print(f"Output Series: {formatted_outputs}")

## Load Weather Dataset

We download the Jena Climate Dataset as a zip archive from https://s3.amazonaws.com/keras-datasets/jena_climate_2009_2016.csv.zip and extract it into a local folder.

More information on this dataset can be found at https://www.bgc-jena.mpg.de/wetter/.

In [None]:
import os
from torchvision.datasets.utils import download_url, _extract_zip

import zipfile

# Folder that holds the extracted dataset; its presence is used as the marker
# that the download has already been done.
dataset_path = os.path.join(".", "weather_dataset")
if not os.path.exists(dataset_path):
    archive_name = "jena_climate_2009_2016.csv.zip"
    url = "https://s3.amazonaws.com/keras-datasets/" + archive_name
    download_url(url, ".")  # Saves the archive in the current directory.

    # Extract with the standard library instead of torchvision's private
    # `_extract_zip` helper, whose signature is not part of the public API.
    with zipfile.ZipFile(archive_name) as archive:
        archive.extractall(dataset_path)

    # The archive is no longer needed once its content has been extracted.
    os.remove(archive_name)

This dataset contains 14 weather measurements in a `.csv` file. Their meaning is shown in the header.

In [None]:
import pandas as pd

# Read the extracted CSV file, using its first column as the row index.
filename = os.path.join(dataset_path, "jena_climate_2009_2016.csv")
dataframe = pd.read_csv(filename, index_col=0)

# Keep the column names and the raw values as separate objects.
header = list(dataframe.columns)
data = dataframe.to_numpy()

# Last expression of the cell: displays the first rows in the notebook.
dataframe.head()

## Visualize Data

We now visualize the 14 measured features of the dataset. For this, we use the `plot_data()` function.

```python
import matplotlib.pyplot as plt
import numpy as np

def plot_data(data, header, start=0, samples_per_cycle=144, cycles=14):
    """Plot every feature of `data`, highlighting periodic cycles.

    One panel is drawn per column of `data`, laid out in two columns of
    panels. For 14 features this is a 7 x 2 grid on a 16 x 12 figure; for
    other feature counts the number of rows and the figure height are scaled
    accordingly.

    Arguments:
        data: 2D array of shape (number of samples, number of features).
        header: Sequence with one legend label per column of `data`.
        start: Index of the first sample to plot.
        samples_per_cycle: Number of samples between consecutive cycle
            markers.
        cycles: Number of cycles to show.
    """

    n_features = data.shape[1]
    n_rows = max(1, -(-n_features // 2))  # Ceiling division: 2 panels per row.

    stop = start + samples_per_cycle * cycles
    # Clip the sample indices to the available data so that the x and y
    # arrays always have the same length, even when the requested window
    # extends past the end of `data`.
    x = np.arange(start, min(stop, data.shape[0]))

    fig, axes = plt.subplots(
        n_rows, 2, figsize=(16, 12 * n_rows / 7), sharex=True, squeeze=False
    )

    for i, ax in enumerate(axes.flatten()):
        if i >= n_features:
            # Odd number of features: hide the unused last panel.
            ax.set_visible(False)
            continue

        ax.plot(x, data[start:stop, i], label=header[i])
        ax.legend()
        ax.set_xlim(start, stop)

        # Dashed vertical lines mark the boundaries between cycles.
        for cycle in range(1, cycles):
            ax.axvline(x=start + cycle * samples_per_cycle,
                       color="gray", linestyle="--", linewidth=0.5)

    plt.tight_layout()
    plt.show()
```

In [None]:
from for_rnn import plot_data

# Number of samples per cycle passed to `plot_data()`; presumably one day of
# measurements taken every 10 minutes (confirm against the dataset).
daily_samples = 144
# Number of cycles (days) to display.
n_days = 14

plot_data(data, header, samples_per_cycle=daily_samples, cycles=n_days)

## Preprocess Data

We now preprocess the data into time series that can be fed to the neural network model, that is, into arrays of shape (batch size, number of samples, number of features).

In [None]:
n_samples, n_features = data.shape
past_seq = 2 * daily_samples  # Length of the sequences to be fed to the RNN.
lag = 71  # How many time_steps ahead in time the RNN should predict temperature.
temp_idx = 1  # Temperature (Celsius) index.

# Offset of the target sample with respect to the start of its input window.
target_offset = past_seq + lag

# One window starts every `daily_samples` samples; the start positions are
# shuffled once and the same order is used for the inputs and the targets.
window_starts = np.random.permutation(
    range(0, n_samples - target_offset, daily_samples)
)

# Each input is a block of `past_seq` consecutive rows with all the features.
inputs = np.asarray(
    [data[first : first + past_seq, :] for first in window_starts]
)
# Each target is the single temperature value at the target offset, kept as
# a length-1 array so that the targets have shape (number of windows, 1).
targets = np.asarray(
    [
        data[first + target_offset : first + target_offset + 1, temp_idx]
        for first in window_starts
    ]
)

print(inputs.shape)
print(targets.shape)

We use DeepTrack to handle the data and to split it into a training and a validation set.

In [None]:
import deeptrack as dt

# Wrap the input windows and their targets in a single DeepTrack source.
sources = dt.sources.Source(inputs=inputs, targets=targets)
# Random split with fractions 0.8 / 0.2 (training / validation).
train_sources, val_sources = dt.sources.random_split(sources, [0.8, 0.2])

print(f"Number of training inputs = {len(train_sources)} / {len(sources)}")

We start by removing the mean from the data and normalizing them by their standard deviation.

In [None]:
import torch

# Stack the training inputs once and compute per-feature statistics over the
# window and time axes. Only the training split is used for the statistics.
train_inputs = np.asarray([source["inputs"] for source in train_sources])
train_mean = train_inputs.mean(axis=(0, 1))
train_std = train_inputs.std(axis=(0, 1))

# Inputs: standardize every feature, then convert to a float tensor.
standardized_inputs = dt.Value(sources.inputs - train_mean) / train_std
inputs_pl = standardized_inputs >> dt.pytorch.ToTensor(dtype=torch.float)

# Targets: standardize with the statistics of the temperature feature.
temp_mean, temp_std = train_mean[temp_idx], train_std[temp_idx]
targets_pl = dt.Value(sources.targets - temp_mean) / temp_std

We then create the training and validation datasets and their respective data loaders.

In [None]:
from torch.utils.data import DataLoader

# Both datasets use the same (inputs & targets) pipeline and differ only in
# the sources they draw from.
train_dataset = dt.pytorch.Dataset(inputs_pl & targets_pl, inputs=train_sources)
val_dataset = dt.pytorch.Dataset(inputs_pl & targets_pl, inputs=val_sources)

# Batches of 32; only the training batches are reshuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

## Creating a Common-Sense Benchmark

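We then create a common-sense baseline to compare the models against: we predict that the temperature is equal to the temperature measured at the same time on the previous day, i.e., $T_t = T_{t - 1\,\mathrm{day}}$, and compute the mean absolute error of this prediction.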

In [None]:
temperature = data[:, temp_idx]

# Temperature sampled once per day, always at the same offset `lag` within
# the day.
same_time_each_day = temperature[lag::daily_samples]

# Common-sense prediction: today's value equals yesterday's value at the same
# time. `np.diff` pairs every sample with the one from the previous day, so
# the two series being subtracted always have the same length, whatever the
# number of rows in the dataset.
benchmark = np.mean(np.abs(np.diff(same_time_each_day)))
print(benchmark)

# The models are trained on standardized temperatures, so the benchmark is
# expressed in the same units.
std_bm = benchmark / train_std[temp_idx]
print(std_bm)

## Implement PyTorch RNN Model

This example introduces the `RNN` module in PyTorch.

In [None]:
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size of the recurrent state, shared by the RNN output and the Linear input.
hidden_size = 2

# Define the RNN and Linear layers separately.
# The input size is taken from `n_features` rather than from `inputs`, so the
# cell does not depend on what the name `inputs` is currently bound to.
rnn = nn.RNN(input_size=n_features, hidden_size=hidden_size, batch_first=True)
fc = nn.Linear(in_features=hidden_size, out_features=1)
rnn.to(device)
fc.to(device)

We now train the model.

In [None]:
criterion = nn.L1Loss()  # MAE loss.
optimizer = torch.optim.Adam(
    list(rnn.parameters()) + list(fc.parameters()), lr=0.001
)  # Optimizer.
epochs = 100

# Average loss per epoch, for the learning curves.
train_losses, val_losses = [], []
for epoch in range(epochs):

    # Training.
    rnn.train()
    fc.train()
    running_loss = 0
    # The batches get their own names so that they do not overwrite the
    # `inputs` and `targets` arrays holding the full dataset.
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        rnn_out, _ = rnn(batch_inputs)  # RNN layer.
        rnn_out = rnn_out[:, -1, :]  # Select the last output for each sequence.
        outputs = fc(rnn_out)  # Linear layer.

        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    train_losses.append(running_loss / len(train_loader))
    print(f"Epoch {epoch} Training Loss: {train_losses[-1]:.4f}")

    # Validation (no gradients, no parameter updates).
    rnn.eval()
    fc.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            rnn_out, _ = rnn(batch_inputs)  # RNN layer.
            rnn_out = rnn_out[:, -1, :]  # Select the last output for each sequence.
            outputs = fc(rnn_out)  # Linear layer.

            loss = criterion(outputs, batch_targets)
            val_loss += loss.item()

    val_losses.append(val_loss / len(val_loader))
    print(f"Epoch {epoch} Validation Loss: {val_losses[-1]:.4f}")

We then compare the training and validation losses with the benchmark. For this, we write the `plot_training()` function.

```python
def plot_training(epochs, train_losses, val_losses, benchmark):
    """Plot the learning curves together with a constant benchmark level."""

    epoch_axis = range(epochs)
    last_epoch = epochs - 1

    # Training curve first, validation curve second.
    curves = (
        (train_losses, "Training Loss"),
        (val_losses, "Validation Loss"),
    )
    for losses, label in curves:
        plt.plot(epoch_axis, losses, label=label)

    # Horizontal dashed line at the benchmark value, spanning all the epochs.
    plt.plot([0, last_epoch], [benchmark, benchmark],
             linestyle="--", color="k", label="Benchmark")

    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.xlim([0, last_epoch])
    plt.legend()
    plt.show()
```

In [None]:
from for_rnn import plot_training

# Compare the learning curves with the benchmark expressed in standardized
# temperature units.
plot_training(epochs, train_losses, val_losses, std_bm)

## Implement the RNN in Deeplay

We now implement the RNN in deeplay.

In [None]:
import deeplay as dl

# Same architecture as the PyTorch model above: one recurrent layer with two
# hidden features, followed by a single output.
rnn_dl = dl.RecurrentModel(
    in_features=n_features,  # Taken from the data instead of hard-coding 14.
    hidden_features=[2],
    out_features=1,
    rnn_type="RNN",
)
rnn_simple = dl.Regressor(rnn_dl, optimizer=dl.Adam(lr=0.001)).create()

print(rnn_simple)

We define a trainer that saves training and validation losses at the end of each epoch. The learning curves are plotted together with the common-sense benchmark.

In [None]:
trainer = dl.Trainer(max_epochs=epochs, accelerator="auto")
trainer.fit(rnn_simple, train_loader, val_loader)

# Per-epoch losses recorded by the trainer.
train_losses = trainer.history.history["train_loss_epoch"]["value"]
# The first recorded validation value is dropped; presumably it comes from a
# validation pass run before training starts (confirm).
val_losses = trainer.history.history["val_loss_epoch"]["value"][1:]

plot_training(epochs, train_losses, val_losses, std_bm)

### Stacked RNN

In [None]:
# Three stacked recurrent layers with 16 hidden features each.
rnn_dl = dl.RecurrentModel(
    in_features=n_features,
    hidden_features=[16, 16, 16],
    out_features=1,
    rnn_type="RNN",
)
# Note the learning rate of 0.0001, smaller than the 0.001 used for the
# single-layer model.
rnn_stacked = dl.Regressor(rnn_dl, optimizer=dl.Adam(lr=0.0001)).create()

In [None]:
trainer = dl.Trainer(max_epochs=epochs)
trainer.fit(rnn_stacked, train_loader, val_loader)

# Per-epoch losses recorded by the trainer.
train_losses = trainer.history.history["train_loss_epoch"]["value"]
# The first recorded validation value is dropped; presumably it comes from a
# validation pass run before training starts (confirm).
val_losses = trainer.history.history["val_loss_epoch"]["value"][1:]

plot_training(epochs, train_losses, val_losses, std_bm)

### Stacked GRU

In [None]:
# Three stacked GRU layers with 8 hidden features each and dropout 0.2.
gru_dl = dl.RecurrentModel(
    in_features=n_features,
    hidden_features=[8, 8, 8],
    out_features=1,
    rnn_type="GRU",
    dropout=0.2,
)
gru_stacked = dl.Regressor(gru_dl, optimizer=dl.Adam(lr=0.001)).create()

print(gru_stacked)

In [None]:
trainer = dl.Trainer(max_epochs=epochs)
trainer.fit(gru_stacked, train_loader, val_dataloaders=val_loader)

# Per-epoch losses recorded by the trainer.
train_losses = trainer.history.history["train_loss_epoch"]["value"]
# The first recorded validation value is dropped; presumably it comes from a
# validation pass run before training starts (confirm).
val_losses = trainer.history.history["val_loss_epoch"]["value"][1:]

plot_training(epochs, train_losses, val_losses, std_bm)

### Stacked LSTM

In [None]:
# Three stacked LSTM layers with 8 hidden features each and dropout 0.3.
lstm_dl = dl.RecurrentModel(
    in_features=n_features,
    hidden_features=[8, 8, 8],
    out_features=1,
    rnn_type="LSTM",
    dropout=0.3,
)
lstm_stacked = dl.Regressor(lstm_dl, optimizer=dl.Adam(lr=0.001)).create()

print(lstm_stacked)

In [None]:
trainer = dl.Trainer(max_epochs=epochs)
trainer.fit(lstm_stacked, train_loader, val_dataloaders=val_loader)

# Per-epoch losses recorded by the trainer.
train_losses = trainer.history.history["train_loss_epoch"]["value"]
# The first recorded validation value is dropped; presumably it comes from a
# validation pass run before training starts (confirm).
val_losses = trainer.history.history["val_loss_epoch"]["value"][1:]

plot_training(epochs, train_losses, val_losses, std_bm)

## Implementing a Preprocessor

We now introduce a dense preprocessor in feature space.

In [None]:
# Dense network applied in feature space: one hidden layer of 32 units and
# 8 output features.
# `in_features=None`: presumably the input size is inferred on first use
# (confirm against the Deeplay documentation).
preprocessor = dl.MultiLayerPerceptron(
    in_features=None,
    hidden_features=[32],
    out_features=8,
)
preprocessor.dropout.configure(p=0.2)

# Recurrent model fed with the 8 features produced by the preprocessor.
# NOTE(review): the variable is called `lstm_dl` (and rebinds the name used
# for the stacked LSTM), but `rnn_type` is "GRU"; confirm which one is
# intended.
lstm_dl = dl.RecurrentModel(
    in_features=8,
    hidden_features=[8, 8, 8],
    out_features=1,
    dropout=0.2,
    rnn_type="GRU",
)

from deeplay import DeeplayModule


class Reshape(DeeplayModule):
    """Layer that returns its input viewed with a fixed target shape."""

    def __init__(self, new_shape):
        """Store the target shape.

        `new_shape` is passed unchanged to the `view()` method of the input
        in `forward()`.
        """
        super().__init__()

        self.new_shape = new_shape

    def forward(self, x):
        """Return `x` viewed with the shape `self.new_shape`."""
        # Assumes `x` is a torch tensor compatible with the target shape.
        return x.view(self.new_shape)


lstm_pre_dl = dl.Sequential(
    # Merge the batch and time axes, so that the dense preprocessor is
    # applied to the features of each time step independently.
    # TODO(review): consider providing this flattening step as a Deeplay
    # layer.
    torch.nn.Flatten(start_dim=0, end_dim=1),
    preprocessor,
    # Restore the time axis: every input sequence contains `past_seq` time
    # steps, each now described by the 8 features from the preprocessor.
    Reshape((-1, past_seq, 8)),
    lstm_dl,
)
lstm_pre = dl.Regressor(lstm_pre_dl).create()

print(lstm_pre)

trainer_pre = dl.Trainer(
    max_epochs=epochs,
    accelerator="cpu",
)
trainer_pre.fit(lstm_pre, train_loader, val_loader)

# Per-epoch losses recorded by the trainer.
train_losses = trainer_pre.history.history["train_loss_epoch"]["value"]
# The first recorded validation value is dropped, as in the previous training
# cells.
val_losses = trainer_pre.history.history["val_loss_epoch"]["value"][1:]

# `std_bm` is the benchmark divided by the training standard deviation of the
# temperature, the same reference level used for all the other models.
plot_training(epochs, train_losses, val_losses, std_bm)

In [None]:
# Plot the most recent learning curves against the standardized benchmark.
plot_training(epochs, train_losses, val_losses, std_bm)