<a href="https://colab.research.google.com/github/DJCordhose/ml-workshop/blob/master/notebooks/intro/2021/intro-rnn-basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequences and RNNs

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (20, 8)

In [3]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [4]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [5]:
# https://github.com/AndreasMadsen/python-lrcurve
!pip install -q lrcurve

In [6]:
from lrcurve import KerasLearningCurve

In [7]:
# https://keras.io/metrics/#custom-metrics
# https://www.tensorflow.org/tutorials/customization/performance
  
# ported to TF 2 from 
# * https://stackoverflow.com/a/42351397/1756489 and
# * https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34019 (for use of epsilon to avoid strange inf or -inf)

# only works properly on tensors

@tf.function
def r2_metric(y_true, y_pred):
  """Coefficient of determination (R^2) of `y_pred` with respect to `y_true`.

  Computed as 1 - SS_res / (SS_tot + epsilon). The Keras backend epsilon in
  the denominator avoids inf / -inf when the targets have no spread.
  """
  # residual sum of squares: the part of the targets the prediction misses
  ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
  # total sum of squares: spread of the targets around their own mean
  ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))

  return 1.0 - ss_res / (ss_tot + tf.keras.backend.epsilon())

# Univariate Sequences

just one variable per time step

### Challenge

We have a known series of events, possibly ordered in time, and we want to know what the next event will be. For example:

[10, 20, 30, 40, 50, 60, 70, 80, 90]

In [8]:
import numpy as np

# derived from here: https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
# something like this is also possible, but timeseries_dataset_from_array is a bit weird
# dataset = tf.keras.preprocessing.timeseries_dataset_from_array(data=data[:-n_steps], targets=data[n_steps:], sequence_length=n_steps)
# for batch in dataset:
#     inputs, targets = batch
#     print(inputs)
#     print(targets)
#     break

# split sequence into samples
def split_sequence(sequence, n_steps_in, n_steps_out=1):
    """Cut a sequence into overlapping (input window, target window) samples.

    Each sample uses `n_steps_in` consecutive values as input and the
    `n_steps_out` values directly following them as target. The window is
    moved forward one position at a time.

    Returns:
        Tuple `(X, y)` of numpy arrays, one row per sample.
    """
    total = len(sequence)
    span = n_steps_in + n_steps_out
    # only keep start positions whose target window still fits into the sequence
    starts = [start for start in range(total) if start + span <= total]
    inputs = [sequence[start:start + n_steps_in] for start in starts]
    targets = [sequence[start + n_steps_in:start + span] for start in starts]
    return np.array(inputs), np.array(targets)
 

In [9]:
raw_seq = np.arange(10, 100, 10)
raw_seq

array([10, 20, 30, 40, 50, 60, 70, 80, 90])

In [10]:
#@title Prediction from n past steps { run: "auto", display-mode: "both" }

# https://colab.research.google.com/notebooks/forms.ipynb

n_steps = 3 #@param {type:"slider", min:1, max:10, step:1}

# split into samples
X, y = split_sequence(raw_seq, n_steps)

# summarize the data
list(zip(X, y))

[(array([10, 20, 30]), array([40])),
 (array([20, 30, 40]), array([50])),
 (array([30, 40, 50]), array([60])),
 (array([40, 50, 60]), array([70])),
 (array([50, 60, 70]), array([80])),
 (array([60, 70, 80]), array([90]))]

### Converting shapes

* one of the most frequent, yet most tedious steps
* match between what you have and what an interface needs
* expected input of RNN: 3D tensor with shape (samples, timesteps, input_dim)
* we have: (samples, timesteps)
* reshape on np arrays can do all that

In [11]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
X

array([[[10],
        [20],
        [30]],

       [[20],
        [30],
        [40]],

       [[30],
        [40],
        [50]],

       [[40],
        [50],
        [60]],

       [[50],
        [60],
        [70]],

       [[60],
        [70],
        [80]]])

In [12]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN, Bidirectional
from tensorflow.keras.models import Sequential, Model

# Minimal setup: a single recurrent layer followed by a one-unit read-out
model = Sequential()
# takes (n_steps, n_features) per sample; return_sequences is not set, so only
# the output after the final timestep is passed on: one value per unit
model.add(SimpleRNN(units=50, activation='relu', input_shape=(n_steps, n_features), name="RNN_Input"))
# a single unit without activation maps the RNN outputs to the predicted next value
model.add(Dense(units=1, name="Linear_Output"))
# regression setup: mean squared error as loss, R^2 reported as additional metric
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RNN_Input (SimpleRNN)        (None, 50)                2600      
_________________________________________________________________
Linear_Output (Dense)        (None, 1)                 51        
Total params: 2,651
Trainable params: 2,651
Non-trainable params: 0
_________________________________________________________________


In [13]:
EPOCHS = 1000

%time history = model.fit(X, y, epochs=EPOCHS, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 8.53 s, sys: 463 ms, total: 9 s
Wall time: 11.6 s


In [14]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2

(2.580937461971189e-06, 1.0)

### Let's try this on a few examples

In [15]:
# this does not look too bad
X_sample = np.array([[10, 20, 30], [70, 80, 90]]).astype(np.float32)
X_sample = X_sample.reshape((X_sample.shape[0], X_sample.shape[1], n_features))
X_sample

array([[[10.],
        [20.],
        [30.]],

       [[70.],
        [80.],
        [90.]]], dtype=float32)

In [16]:
y_pred = model.predict(X_sample)
y_pred

array([[ 39.999992],
       [100.35217 ]], dtype=float32)

In [17]:
def predict(model, samples, n_features=1):
  """Predict with `model` for samples given as nested lists of timesteps.

  `samples` is converted to a float32 array and reshaped to
  (samples, timesteps, n_features) before it is handed to `model.predict`,
  whose result is returned unchanged.
  """
  batch = np.array(samples).astype(np.float32)
  n_samples, n_timesteps = batch.shape[0], batch.shape[1]
  # RNN layers expect a 3D tensor: (samples, timesteps, features)
  return model.predict(batch.reshape((n_samples, n_timesteps, n_features)))

In [18]:
# do not look too close, though
predict(model, [[100, 110, 120], [200, 210, 220], [200, 300, 400]])

array([[131.47188],
       [236.65259],
       [491.84326]], dtype=float32)

# Hands-On: Training your first RNN

* go through the notebook as it is
* Try to improve the model
  * Change the number of values used as input
  * Change activation function
  * More nodes? less nodes?
  * What else might help improving the results?


# Input and output of an RNN layer

In [19]:
# https://keras.io/layers/recurrent/
# input: (samples, timesteps, input_dim)
# output: (samples, units)

# let's have a look at the actual output for an example
# fetch the recurrent layer by the name it was given when the model was built
rnn_layer = model.get_layer("RNN_Input")
# stub model: same input as the full model, but it ends at the RNN layer,
# so predicting with it returns the RNN output instead of the final prediction
model_stub = Model(inputs = model.input, outputs = rnn_layer.output)
hidden = predict(model_stub, [[10, 20, 30]])
# shape is (samples, units): one row per sample, one column per RNN unit
hidden.shape, hidden

((1, 50), array([[0.0000000e+00, 7.8041782e+00, 1.6119865e+01, 1.2455173e+01,
         9.8151913e+00, 1.2958351e+01, 1.3758382e+01, 0.0000000e+00,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         2.0090609e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         3.5585618e+00, 3.1492071e+00, 0.0000000e+00, 1.4230196e+01,
         1.4767935e+01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         1.1289589e+01, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         2.8133392e-05, 3.1005487e+00, 3.2934904e-01, 4.7683716e-07,
         1.2071203e+01, 0.0000000e+00, 4.2187605e+00, 0.0000000e+00,
         0.0000000e+00, 3.5156360e+00, 0.0000000e+00, 0.0000000e+00,
         9.1665459e+00, 0.0000000e+00, 1.3669498e+01, 0.0000000e+00,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.4096699e+00,
         1.4458612e+01, 0.0000000e+00]], dtype=float32))

#### What do we see?
* each unit (50) has a single output
* as a sidenote you nicely see the RELU nature of the output
* so the timesteps of the input are lost
* we are only looking at the final output
* still with each timestep, the layer does produce a unique output we could potentially use

### We need to look into RNNs a bit more deeply now

#### RNNs - Networks with Loops
<img src='https://djcordhose.github.io/ai/img/nlp/colah/RNN-rolled.png' height=200>

http://colah.github.io/posts/2015-08-Understanding-LSTMs/
#### Unrolling the loop
<img src='https://djcordhose.github.io/ai/img/nlp/colah/RNN-unrolled.png'>

http://colah.github.io/posts/2015-08-Understanding-LSTMs/
#### Simple RNN internals

<img src='https://djcordhose.github.io/ai/img/nlp/fchollet_rnn.png'>

##  $\mathrm{output}_t = \tanh(W \cdot \mathrm{input}_t + U \cdot \mathrm{output}_{t-1} + b)$

From Deep Learning with Python, Chapter 6, François Chollet, Manning: https://livebook.manning.com/#!/book/deep-learning-with-python/chapter-6/129

#### Activation functions

<img src='https://djcordhose.github.io/ai/img/sigmoid-activation.png' height=200>

Sigmoid compressing between 0 and 1

<img src='https://djcordhose.github.io/ai/img/tanh-activation.png' height=200>

Hyperbolic tangent, like sigmoid, but compressing between -1 and 1, thus allowing for negative values as well

# Multi Layer RNNs

In [20]:
# one output for each input timestep
# ideal for feeding into something that *expects* timesteps
rnn_units = 1

from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN, Bidirectional
from tensorflow.keras.models import Sequential, Model

model = Sequential([
    SimpleRNN(units=rnn_units, activation='relu', return_sequences=True),
    # SimpleRNN(units=rnn_units, activation='relu')
])

# https://keras.io/layers/recurrent/
# input: (samples, timesteps, input_dim)
# output with return_sequences: (samples, timesteps, units)

predict(model, [[10, 20, 30]])

array([[[0.],
        [0.],
        [0.]]], dtype=float32)

In [21]:
# number of units used by both recurrent layers
rnn_units = 50

# Two stacked recurrent layers followed by a one-unit read-out
model = Sequential([
    # return_sequences=True: emit an output for every timestep, (samples, timesteps, units),
    # which is the 3D input the next recurrent layer expects
    SimpleRNN(units=rnn_units, activation='relu', input_shape=(n_steps, n_features), return_sequences=True, name="RNN_Input"),
    # no return_sequences here: only the output of the final timestep is passed on, (samples, units)
    SimpleRNN(units=rnn_units, activation='relu', name="RNN_Latent"),
    # a single unit without activation produces the predicted value
    Dense(units=1, name="Linear_Output")
])
# regression setup: mean squared error as loss, R^2 reported as additional metric
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RNN_Input (SimpleRNN)        (None, 3, 50)             2600      
_________________________________________________________________
RNN_Latent (SimpleRNN)       (None, 50)                5050      
_________________________________________________________________
Linear_Output (Dense)        (None, 1)                 51        
Total params: 7,701
Trainable params: 7,701
Non-trainable params: 0
_________________________________________________________________


In [22]:
%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 19.2 s, sys: 1.09 s, total: 20.3 s
Wall time: 25 s


In [23]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2

(6.885214247631666e-07, 1.0)

In [24]:
predict(model, [[10, 20, 30], [70, 80, 90], [100, 110, 120], [200, 210, 220], [200, 300, 400]])

array([[ 39.998753],
       [100.49813 ],
       [131.99095 ],
       [239.36157 ],
       [488.8227  ]], dtype=float32)

### LSTMs / GRUs

* mainly beneficial for long sequences
* but also 3-4 times more expensive
* might not have better results for short sequences like these

### LSTM

In [25]:
rnn_units = 50

model = Sequential([
    LSTM(units=rnn_units, activation='relu', input_shape=(n_steps, n_features), name="RNN_Input"),
    Dense(units=1, name="Linear_Output")
])
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RNN_Input (LSTM)             (None, 50)                10400     
_________________________________________________________________
Linear_Output (Dense)        (None, 1)                 51        
Total params: 10,451
Trainable params: 10,451
Non-trainable params: 0
_________________________________________________________________


In [26]:
%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 19 s, sys: 1.03 s, total: 20.1 s
Wall time: 24.8 s


In [27]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2

(4.3247739085927606e-05, 0.9999998807907104)

In [28]:
predict(model, [[10, 20, 30], [70, 80, 90], [100, 110, 120], [200, 210, 220], [200, 300, 400]])



array([[ 39.999866],
       [100.18643 ],
       [132.04584 ],
       [244.41617 ],
       [355.5434  ]], dtype=float32)

### GRU

In [29]:
rnn_units = 50

model = Sequential([
    GRU(units=rnn_units, activation='relu', input_shape=(n_steps, n_features), name="RNN_Input"),
    Dense(units=1, name="Linear_Output")
])
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RNN_Input (GRU)              (None, 50)                7950      
_________________________________________________________________
Linear_Output (Dense)        (None, 1)                 51        
Total params: 8,001
Trainable params: 8,001
Non-trainable params: 0
_________________________________________________________________


In [30]:
%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 20.3 s, sys: 1.14 s, total: 21.5 s
Wall time: 26.2 s


In [31]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2

(0.0003745203430298716, 0.9999986886978149)

In [32]:
predict(model, [[10, 20, 30], [70, 80, 90], [100, 110, 120], [200, 210, 220], [200, 300, 400]])



array([[ 39.99813],
       [100.34613],
       [134.40718],
       [251.59517],
       [414.1259 ]], dtype=float32)

# Multivariate LSTM Models

## Multiple Input Series

In [33]:
in_seq1 = [10, 20, 30, 40, 50, 60, 70, 80, 90]
in_seq2 = [15, 25, 35, 45, 55, 65, 75, 85, 95]
out_seq = [in1 + in2 for in1, in2 in zip(in_seq1, in_seq2)]
out_seq

[25, 45, 65, 85, 105, 125, 145, 165, 185]

In [34]:
# convert to [rows, columns] structure
in_seq1 = np.array(in_seq1).reshape((len(in_seq1), 1))
in_seq2 = np.array(in_seq2).reshape((len(in_seq2), 1))
out_seq = np.array(out_seq).reshape((len(out_seq), 1))
out_seq

array([[ 25],
       [ 45],
       [ 65],
       [ 85],
       [105],
       [125],
       [145],
       [165],
       [185]])

In [35]:
# horizontally stack columns
dataset = np.hstack((in_seq1, in_seq2, out_seq))
dataset

array([[ 10,  15,  25],
       [ 20,  25,  45],
       [ 30,  35,  65],
       [ 40,  45,  85],
       [ 50,  55, 105],
       [ 60,  65, 125],
       [ 70,  75, 145],
       [ 80,  85, 165],
       [ 90,  95, 185]])

In [36]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    """Cut a 2D dataset into (input window, target) samples.

    All columns except the last are input features, the last column is the
    target. Each sample consists of `n_steps` consecutive rows of the feature
    columns and the target value taken from the last row of that window.

    Returns:
        Tuple `(X, y)` of numpy arrays, one entry per sample.
    """
    n_rows = len(sequences)
    # only keep start rows whose window still fits into the dataset
    starts = [start for start in range(n_rows) if start + n_steps <= n_rows]
    windows = [sequences[start:start + n_steps, :-1] for start in starts]
    labels = [sequences[start + n_steps - 1, -1] for start in starts]
    return np.array(windows), np.array(labels)

In [37]:
#@title Prediction from n past steps { run: "auto", display-mode: "both" }

# https://colab.research.google.com/notebooks/forms.ipynb

n_steps = 3 #@param {type:"slider", min:1, max:10, step:1}

# convert into input/output
X, y = split_sequences(dataset, n_steps)

# summarize the data
list(zip(X, y))

[(array([[10, 15],
         [20, 25],
         [30, 35]]), 65), (array([[20, 25],
         [30, 35],
         [40, 45]]), 85), (array([[30, 35],
         [40, 45],
         [50, 55]]), 105), (array([[40, 45],
         [50, 55],
         [60, 65]]), 125), (array([[50, 55],
         [60, 65],
         [70, 75]]), 145), (array([[60, 65],
         [70, 75],
         [80, 85]]), 165), (array([[70, 75],
         [80, 85],
         [90, 95]]), 185)]

In [38]:
# the dataset knows the number of features, e.g. 2
# (X is (samples, timesteps, features), so the last axis holds the parallel input series)
n_features = X.shape[2]

# define model
model = Sequential()
# GRU layer reading n_steps timesteps with n_features values each;
# return_sequences is not set, so only the final output per unit is passed on
model.add(GRU(units=50, activation='relu', input_shape=(n_steps, n_features), name="RNN_Input"))
# a single unit without activation produces the predicted value
model.add(Dense(units=1, name="Linear_Output"))
# regression setup: mean squared error as loss, R^2 reported as additional metric
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])

In [39]:
%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 21 s, sys: 1.12 s, total: 22.2 s
Wall time: 26.8 s


In [40]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2



(0.0061339414678514, 0.9999961853027344)

In [41]:
def predict_multi(model, samples):
  """Predict with `model` for one multivariate sample.

  `samples` is a single sample given as nested list (timesteps x features).
  It is converted to float32 and gets a leading batch axis of size one before
  it is handed to `model.predict`, whose result is returned unchanged.
  """
  sample = np.array(samples).astype(np.float32)
  n_timesteps, n_features = sample.shape[0], sample.shape[1]
  # the model expects a batch: (1, timesteps, features)
  return model.predict(sample.reshape(1, n_timesteps, n_features))

In [42]:
predict_multi(model, [[80, 85], [90, 95], [100, 105]])



array([[205.25598]], dtype=float32)

In [43]:
predict_multi(model, [[10, 15], [20, 25], [30, 35]])

array([[65.03133]], dtype=float32)

In [44]:
predict_multi(model, [[180, 185], [190, 195], [200, 205]])

array([[414.45667]], dtype=float32)

#### Let's make this a little bit harder

* output y can be inferred from final timestep
* now we try to infer the output of the following timestep

In [45]:
# Re-derive the samples from `dataset` instead of using `y += 20`: the in-place
# add would stack another +20 onto `y` every time this cell is run again.
X, y = split_sequences(dataset, n_steps)
# out_seq grows by 20 per timestep, so adding 20 turns each target into the
# output of the timestep following the input window
y = y + 20
list(zip(X, y))

[(array([[10, 15],
         [20, 25],
         [30, 35]]), 85), (array([[20, 25],
         [30, 35],
         [40, 45]]), 105), (array([[30, 35],
         [40, 45],
         [50, 55]]), 125), (array([[40, 45],
         [50, 55],
         [60, 65]]), 145), (array([[50, 55],
         [60, 65],
         [70, 75]]), 165), (array([[60, 65],
         [70, 75],
         [80, 85]]), 185), (array([[70, 75],
         [80, 85],
         [90, 95]]), 205)]

In [46]:
model = Sequential()
model.add(GRU(units=50, activation='relu', input_shape=(n_steps, n_features), name="RNN_Input"))
model.add(Dense(units=1, name="Linear_Output"))
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])

%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 21.1 s, sys: 1.17 s, total: 22.2 s
Wall time: 27.1 s


In [47]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2



(0.0019750401843339205, 0.9999987483024597)

In [48]:
predict_multi(model, [[80, 85], [90, 95], [100, 105]])



array([[225.65814]], dtype=float32)

In [49]:
predict_multi(model, [[10, 15], [20, 25], [30, 35]])

array([[84.99657]], dtype=float32)

In [50]:
predict_multi(model, [[180, 185], [190, 195], [200, 205]])

array([[453.07364]], dtype=float32)

## Multi-Step LSTM Models
* this might just as well be an encoder / decoder approach

In [51]:
raw_seq = np.arange(10, 100, 10)
raw_seq

array([10, 20, 30, 40, 50, 60, 70, 80, 90])

In [52]:

#@title Prediction from n past steps { run: "auto", display-mode: "both" }

# https://colab.research.google.com/notebooks/forms.ipynb

n_steps_in = 3 #@param {type:"slider", min:1, max:10, step:1}
n_steps_out = 2 #@param {type:"slider", min:1, max:10, step:1}

# n_steps_in values as input, the n_steps_out values following them as target
X, y = split_sequence(raw_seq, n_steps_in, n_steps_out)

# loop variables are not called `input` / `output`: a module-level `input`
# would shadow the builtin for the rest of the kernel session
for seq_in, seq_out in zip(X, y):
  print (seq_in, seq_out)

[10 20 30] [40 50]
[20 30 40] [50 60]
[30 40 50] [60 70]
[40 50 60] [70 80]
[50 60 70] [80 90]


In [53]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
# univariate series: a single value per timestep
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
# define model
model = Sequential()
# one recurrent layer; return_sequences is not set, so only its final output is passed on
model.add(GRU(100, activation='relu', input_shape=(n_steps_in, n_features)))
# alternative: two stacked GRU layers, the first one returning an output for each timestep
# model.add(GRU(100, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
# model.add(GRU(100, activation='relu'))
# one output unit per step to forecast: all n_steps_out values are predicted at once
model.add(Dense(n_steps_out))
# regression setup: mean squared error as loss, R^2 reported as additional metric
model.compile(optimizer='adam', loss='mse', metrics=[r2_metric])

%time history = model.fit(X, y, epochs=2000, callbacks=[KerasLearningCurve()], verbose=0)

CPU times: user 22.2 s, sys: 1.19 s, total: 23.4 s
Wall time: 28.3 s


In [54]:
loss, r2 = model.evaluate(X, y, verbose=0)
loss, r2



(3.2752868150964787e-07, 1.0)

In [55]:
X_sample = np.array([70, 80, 90]).reshape((1, n_steps_in, n_features)).astype(np.float32)
y_pred = model.predict(X_sample)
print(y_pred)

[[101.23799 111.71567]]


In [56]:
X_sample = np.array([10, 20, 30]).reshape((1, n_steps_in, n_features)).astype(np.float32)
y_pred = model.predict(X_sample)
print(y_pred)

[[40.00001  49.999958]]
