In [None]:
# As usual, a bit of setup
import time
import numpy as np
import torch
import matplotlib.pyplot as plt


%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2
%autosave 180

def rel_error(x, y):
    """Return the maximum element-wise relative error between ``x`` and ``y``.

    For every element pair the error is ``|x - y| / max(1e-8, |x| + |y|)``.
    The 1e-8 floor on the denominator avoids a division by zero when both
    entries are zero.

    Args:
        x: array-like (NumPy array, nested list, or scalar).
        y: array-like whose shape is broadcast-compatible with ``x``.

    Returns:
        The largest relative error over all elements, as a NumPy scalar.

    Raises:
        ValueError: if the inputs are empty (``np.max`` of an empty array).
    """
    # Coerce up front so plain Python lists work too; ndarrays pass through
    # unchanged, so results for array inputs are identical to before.
    x = np.asarray(x)
    y = np.asarray(y)

    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 180 seconds


In [None]:
import torch.nn as nn

# set sizes
time_steps = 12
batch_size = 4
input_size = 3
hidden_size = 2

# Seed BOTH generators. The data below is drawn with torch.randn, which is
# not affected by np.random.seed; without torch.manual_seed the inputs (and
# every number printed later) change on each run. The NumPy seed is kept for
# any NumPy-based randomness elsewhere.
np.random.seed(137)
torch.manual_seed(137)

# create input data with shape [batch_size, time_steps, num_features]
input_data = torch.randn(batch_size, time_steps, input_size, dtype=torch.float32)

# initial hidden state with shape [1, batch_size, hidden_size]; the leading
# axis added by unsqueeze(0) is the num_layers axis that nn.RNN / nn.GRU expect
initial_state = torch.randn(batch_size, hidden_size, dtype=torch.float32).unsqueeze(0)

In [None]:
# Reference single-layer recurrent modules from PyTorch. batch_first=True makes
# them consume input laid out as [batch, time, features].
# The RNN is constructed before the GRU so the random weight initialisation
# happens in the same order as before.
t_rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
               num_layers=1, batch_first=True)
t_gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
               num_layers=1, batch_first=True)

# Forward passes only: no autograd graph is needed, and the results can be
# converted with .numpy() directly.
with torch.no_grad():
    t_rnn_outputs, t_rnn_final_state = t_rnn(input_data, hx=initial_state)
    t_gru_outputs, t_gru_final_state = t_gru(input_data, hx=initial_state)

In [None]:
from rnn_param_helper import get_rnn_params, get_gru_params

# Extract the parameters of the torch RNN created earlier so they can be fed
# to the hand-written implementation.
# NOTE(review): the names suggest (recurrent weight, input weight, bias);
# the exact shapes/layout are defined in rnn_param_helper -- confirm there.
wt_h, wt_x, bias = get_rnn_params(t_rnn)



# Same for the torch GRU: one parameter group per gate.
# NOTE(review): presumably r = reset, z = update, n = candidate state, matching
# the gate names in the torch.nn.GRU documentation -- confirm in rnn_param_helper.
linear_trans_r, linear_trans_z, linear_trans_n = get_gru_params(t_gru)

In [None]:
from implementation import rnn,gru

# Run the hand-written RNN on the same parameters, initial state and inputs as
# the torch RNN, then compare both the per-step outputs and the final state.
nprnn_outputs, nprnn_final_state = rnn(wt_h, wt_x, bias, initial_state.numpy(), input_data.numpy())

rnn_error = (rel_error(t_rnn_outputs.numpy(), nprnn_outputs)
             + rel_error(t_rnn_final_state.numpy(), nprnn_final_state))
# The reference is torch.nn.RNN, so the message names torch (it used to say "tf").
print("Difference between your RNN implementation and torch RNN", rnn_error)

# Same check for the GRU.
npgru_outputs, npgru_final_state = gru(linear_trans_r, linear_trans_z, linear_trans_n, initial_state.numpy(), input_data.numpy())

gru_error = (rel_error(t_gru_outputs.numpy(), npgru_outputs)
             + rel_error(t_gru_final_state.numpy(), npgru_final_state))
print("Difference between your GRU implementation and torch GRU", gru_error)


Difference between your RNN implementation and tf RNN 3.4710562192909463e-06
Difference between your GRU implementation and tf GRU 5.0752533014423674e-06


In [None]:

from implementation import init_gru_with_rnn

# Build GRU gate parameters from the RNN parameters; the check at the end of
# this cell expects the resulting GRU to reproduce the RNN outputs.
linear_trans_r, linear_trans_z, linear_trans_n = init_gru_with_rnn(wt_h, wt_x, bias)

# concatenate these parameters to initialize GRU kernels
# Gates are stacked in the order (r, z, n), which is the row order torch.nn.GRU
# uses for weight_ih_l0 / weight_hh_l0 and the matching biases.
# As mapped below, entries [0] and [1] of each gate feed the input-side weight
# and bias, and entries [2] and [3] the hidden-side weight and bias.
kernel_init = np.concatenate([linear_trans_r[0], linear_trans_z[0], linear_trans_n[0]], axis=1).T
rec_kernel_init = np.concatenate([linear_trans_r[2], linear_trans_z[2], linear_trans_n[2]], axis=1).T
bias_init0 = np.concatenate([linear_trans_r[1], linear_trans_z[1], linear_trans_n[1]], axis=0)
bias_init1 = np.concatenate([linear_trans_r[3], linear_trans_z[3], linear_trans_n[3]])

grurnn = nn.GRU(input_size, hidden_size, num_layers = 1, batch_first = True)

# Address the parameters through their public, documented names instead of
# unpacking the private `_flat_weights` list, whose ordering is an internal
# detail of torch.
wt_x1, wt_h1 = grurnn.weight_ih_l0, grurnn.weight_hh_l0
bias_ih1, bias_hh1 = grurnn.bias_ih_l0, grurnn.bias_hh_l0

# In-place copy under no_grad rather than assigning `.data`: copy_ raises if
# the source cannot be broadcast to the parameter's shape, so a wrongly shaped
# initialiser fails here instead of silently replacing the tensor.
with torch.no_grad():
    wt_x1.copy_(torch.tensor(kernel_init, dtype=torch.float32))
    wt_h1.copy_(torch.tensor(rec_kernel_init, dtype=torch.float32))
    bias_ih1.copy_(torch.tensor(bias_init0, dtype=torch.float32))
    bias_hh1.copy_(torch.tensor(bias_init1, dtype=torch.float32))

# Run both models on identical inputs and compare the per-step outputs.
with torch.no_grad():
    t_rnn_outputs, t_rnn_final_state = t_rnn(input_data, initial_state)
    grurnn_outputs, grurnn_final_state = grurnn(input_data, initial_state)


print("Difference between RNN and a special GRU", rel_error(t_rnn_outputs.numpy(), grurnn_outputs.numpy()))

Difference between RNN and a special GRU 3.2550126e-06


In [None]:
from implementation import init_gru_with_long_term_memory



# Build GRU gate parameters intended to make the GRU keep its initial hidden
# state over time; the check at the end of this cell measures exactly that.
linear_trans_r, linear_trans_z, linear_trans_n = init_gru_with_long_term_memory(input_size, hidden_size)

# Stack the gates in (r, z, n) order, the row order torch.nn.GRU uses for its
# weights and biases. Entries [0] and [1] of each gate feed the input-side
# weight and bias, entries [2] and [3] the hidden-side weight and bias.
kernel_init = np.concatenate([linear_trans_r[0], linear_trans_z[0], linear_trans_n[0]], axis=1).T
rec_kernel_init = np.concatenate([linear_trans_r[2], linear_trans_z[2], linear_trans_n[2]], axis=1).T
bias_init0 = np.concatenate([linear_trans_r[1], linear_trans_z[1], linear_trans_n[1]], axis=0)
bias_init1 = np.concatenate([linear_trans_r[3], linear_trans_z[3], linear_trans_n[3]])

gru2 = nn.GRU(input_size, hidden_size, num_layers = 1, batch_first = True)

# Address the parameters through their public, documented names instead of
# unpacking the private `_flat_weights` list, whose ordering is an internal
# detail of torch.
wt_xg, wt_hg = gru2.weight_ih_l0, gru2.weight_hh_l0
bias_ihg, bias_hhg = gru2.bias_ih_l0, gru2.bias_hh_l0

# In-place copy under no_grad rather than assigning `.data`: copy_ raises if
# the source cannot be broadcast to the parameter's shape, so a wrongly shaped
# initialiser fails here instead of silently replacing the tensor.
with torch.no_grad():
    wt_xg.copy_(torch.tensor(kernel_init, dtype=torch.float32))
    wt_hg.copy_(torch.tensor(rec_kernel_init, dtype=torch.float32))
    bias_ihg.copy_(torch.tensor(bias_init0, dtype=torch.float32))
    bias_hhg.copy_(torch.tensor(bias_init1, dtype=torch.float32))

with torch.no_grad():
    outputs, _ = gru2(input_data, initial_state)
    outputs = outputs.numpy()

# Compare the hidden state at time index 10 (the 11th of the time steps) with
# the initial state; initial_state[0] drops the leading num_layers axis.
print('Difference between a later hidden state and the initial state is', np.mean(np.abs(outputs[:, 10, :] - initial_state[0, :, :].numpy())))
    

Difference between a later hidden state and the initial state is 0.0


In [None]:
from rnn_param_helper import get_mha_params
from implementation import mha


# Sizes for the attention check (these rebind the names used by the RNN cells).
batch_size = 4
time_steps = 8
input_size = 10
num_heads = 5

# Fresh random input laid out as [batch_size, time_steps, input_size].
input_data = torch.randn(batch_size, time_steps, input_size, dtype = torch.float32)

with torch.no_grad():
    # Reference self-attention module from torch: no biases, no dropout.
    t_mha = nn.MultiheadAttention(
        embed_dim=input_size,
        num_heads=num_heads,
        dropout=0.0,
        bias=False,
        add_bias_kv=False,
        add_zero_attn=False,
        kdim=None,
        vdim=None,
        batch_first=True,
    )

    # Self-attention: the same tensor serves as query, key and value.
    t_output, _ = t_mha(query=input_data, key=input_data, value=input_data, need_weights=False)

# Feed the module's projection matrices to the hand-written implementation.
Wq, Wk, Wv, Wo = get_mha_params(t_mha)
output = mha(Wq, Wk, Wv, Wo, input_data)

mha_abs_diff = np.abs(output - t_output.numpy())
print('Difference between my output and torch output is ', np.mean(mha_abs_diff))
    


Difference between my output and torch output is  2.0640801e-08
