<a href="https://colab.research.google.com/github/DavoodSZ1993/Dive_into_Deep_Learning/blob/main/09_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install d2l==1.0.0-alpha1.post0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.0/93.0 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.0/121.0 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## 9.1 Working with Sequences

### 9.1.3 Training

In [None]:
%matplotlib inline
import torch
from torch import nn
from d2l import torch as d2l

In [None]:
class Data(d2l.DataModule):
  """Synthetic time series: a sine wave with additive Gaussian noise.

  Args:
    batch_size: Minibatch size (kept as a hyperparameter).
    T: Number of time steps to generate.
    num_train: Number of examples used for training (kept as a hyperparameter).
    tau: Number of past observations per input window (kept as a hyperparameter).
  """
  def __init__(self, batch_size=16, T=1000, num_train=600, tau=4):
    # Keep this call first. Later code reads self.T, self.tau and
    # self.num_train, which are never assigned explicitly, so this call is
    # what exposes the constructor arguments as attributes.
    self.save_hyperparameters()
    steps = torch.arange(1, T + 1, dtype=torch.float32)   # 1.0, 2.0, ..., T
    noise = torch.randn(T) * 0.2                          # N(0, 1) samples scaled by 0.2
    self.time = steps
    self.x = torch.sin(0.01 * steps) + noise              # x_t = sin(0.01 t) + noise_t

In [None]:
# Generate one realization of the noisy sine series and plot it against time.
data = Data()
d2l.plot(data.time, data.x, 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

In [None]:
@d2l.add_to_class(Data)
def get_dataloader(self, train):
  """Build (features, labels) pairs from the series and return a data loader.

  Each feature row holds `tau` consecutive observations and the label is the
  observation that immediately follows them. `self.features` and
  `self.labels` are (re)computed on every call.

  Args:
    train: If true, serve the first `num_train` examples; otherwise serve
      all remaining ones.
  """
  num_examples = self.T - self.tau
  # Column `lag` is the series shifted by `lag` steps, truncated to a common length.
  lagged = [self.x[lag:lag + num_examples] for lag in range(self.tau)]
  self.features = torch.stack(lagged, 1)
  self.labels = self.x[self.tau:].reshape((-1, 1))
  if train:
    i = slice(0, self.num_train)
  else:
    i = slice(self.num_train, None)
  return self.get_tensorloader([self.features, self.labels], train, i)


In [None]:
# Fit a linear regression that maps the tau previous observations to the next one.
model = d2l.LinearRegression(lr=0.01)
data = Data()   # fresh instance: the noise is re-sampled, so this is a new realization of the series
trainer = d2l.Trainer(max_epochs=5)
trainer.fit(model, data)

### 9.1.4 Prediction

In [None]:
# One-step-ahead prediction: every input window consists of observed values only.
# data.features / data.labels are set by Data.get_dataloader (presumably called during trainer.fit).
onestep_preds = model(data.features).detach().numpy()
d2l.plot(data.time[data.tau:], [data.labels, onestep_preds], 'time', 'x',
         legend=['labels', '1-step preds'], figsize=(6, 3))

In [None]:
# Multi-step-ahead rollout: beyond the training range, each prediction is
# written back into the series and becomes an input for later predictions.
multistep_preds = data.x.clone()   # start from the observations; leave data.x untouched

# Inference only: no_grad keeps the in-place writes from growing an autograd
# graph across the whole rollout.
with torch.no_grad():
  for i in range(data.num_train + data.tau, data.T):
    multistep_preds[i] = model(multistep_preds[i - data.tau:i].reshape((1, -1)))

multistep_preds = multistep_preds.detach().numpy()

In [None]:
# Overlay the one-step predictions (whole series) with the multi-step rollout
# (only the part after the training range, where predictions feed on predictions).
d2l.plot([data.time[data.tau:], data.time[data.num_train + data.tau:]],
         [onestep_preds, multistep_preds[data.num_train + data.tau:]],
         'time', 'x', legend=['1-step preds', 'multi-step preds'], figsize=(6, 3))

In [None]:
def k_step_pred(k):
  """Return the 1-step to k-step ahead predictions for every valid start position.

  Reads the notebook-level `data` and `model`. The first `data.tau` columns
  are observed values; each further column is predicted from the `data.tau`
  columns before it, so later columns rely increasingly on predictions.

  Returns:
    A list of k 1-D tensors; element j-1 holds the j-step-ahead predictions.
  """
  span = data.T - data.tau - k + 1
  columns = [data.x[start:start + span] for start in range(data.tau)]

  for step in range(k):
    window = torch.stack(columns[step:step + data.tau], 1)
    columns.append(model(window).reshape(-1))

  # Drop the observed columns and keep only the predictions.
  return columns[data.tau:]

In [None]:
# Compare k-step-ahead predictions for several horizons k.
steps = (1, 4, 16, 64)
preds = k_step_pred(steps[-1])   # preds[k-1] holds the k-step-ahead predictions

d2l.plot(data.time[data.tau+steps[-1]-1:],
         [preds[k-1].detach().numpy() for k in steps], 'time', 'x',
         legend=[f'{k}-step preds' for k in steps], figsize=(6, 3))

## 9.2 Converting Raw Text into Sequence Data

In [None]:
import collections
import random
import re
import torch
from d2l import torch as d2l

### 9.2.1 Reading the Dataset

In [None]:
class TimeMachine(d2l.DataModule):
  """Data module for the text of H. G. Wells' "The Time Machine"."""

  def _download(self):
    """Obtain timemachine.txt through d2l.download and return its text as one string."""
    # The third argument is presumably the file's expected checksum - confirm
    # against d2l.download.
    path = d2l.download(d2l.DATA_URL + 'timemachine.txt', self.root,
                        '090b5e7e70c295757f55df93cb0a180b9691891a')
    with open(path) as corpus_file:
      contents = corpus_file.read()
    return contents

In [None]:
# Load the raw corpus and peek at its first 60 characters.
data = TimeMachine()
raw_text = data._download()
raw_text[:60]

Downloading ../data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...


'The Time Machine, by H. G. Wells [1898]\n\n\n\n\nI\n\n\nThe Time Tra'

In [None]:
@d2l.add_to_class(TimeMachine)
def _preprocess(self, text):
  """Replace every run of non-ASCII-letter characters with one space, then lowercase."""
  letters_only = re.sub('[^A-Za-z]+', ' ', text)
  return letters_only.lower()

In [None]:
# Clean the raw text (letters and single spaces only, lowercase) and show the start.
text = data._preprocess(raw_text)
text[0:60]

'the time machine by h g wells i the time traveller for so it'

### 9.2.2 Tokenization

In [None]:
@d2l.add_to_class(TimeMachine)
def _tokenize(self, text):
  """Split the text into character-level tokens (one list element per character)."""
  return [char for char in text]

In [None]:
# Character-level tokenization of the cleaned text.
tokens = data._tokenize(text)
','.join(tokens[:30]), tokens[0]   # tokens is a flat (1-D) list of single characters

('t,h,e, ,t,i,m,e, ,m,a,c,h,i,n,e, ,b,y, ,h, ,g, ,w,e,l,l,s, ', 't')

### 9.2.3 Vocabulary

In [None]:
class Vocab:
  """Bidirectional mapping between tokens and integer indices.

  Args:
    tokens: Flat list of tokens, or a list of token lists (flattened first).
    min_freq: Tokens that occur fewer than `min_freq` times are left out.
    reserved_tokens: Extra tokens that are always part of the vocabulary.

  Attributes:
    token_freqs: (token, count) pairs sorted by descending count.
    idx_to_token: Sorted list of unique tokens, always including '<unk>'.
    token_to_idx: Inverse mapping of `idx_to_token`.
  """
  def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
    # The list defaults are only read, never mutated, so sharing them is safe.
    # Flatten a 2D list if needed.
    if tokens and isinstance(tokens[0], list):
      tokens = [token for line in tokens for token in line]

    # Count token frequencies
    counter = collections.Counter(tokens)
    self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                              reverse=True)

    # The list of unique tokens; the set removes duplicates between '<unk>',
    # the reserved tokens and the corpus tokens.
    self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
        token for token, freq in self.token_freqs if freq >= min_freq])))
    self.token_to_idx = {token: idx
                         for idx, token in enumerate(self.idx_to_token)}

  def __len__(self):
    """Number of distinct tokens, including '<unk>' and the reserved tokens."""
    return len(self.idx_to_token)

  def __getitem__(self, tokens):
    """Map a token, or a (nested) list/tuple of tokens, to indices.

    Unknown tokens map to the index of '<unk>'.
    """
    if not isinstance(tokens, (list, tuple)):
      return self.token_to_idx.get(tokens, self.unk)
    return [self.__getitem__(token) for token in tokens]

  def to_tokens(self, indices):
    """Map an index, or a sequence of indices, back to tokens.

    A list or tuple always yields a list, including the empty and the
    one-element case, which used to raise TypeError. Other sized containers
    (e.g. tensors) keep the earlier rule: more than one element yields a
    list, anything else is used directly as a single index.
    """
    if isinstance(indices, (list, tuple)):
      return [self.idx_to_token[int(index)] for index in indices]
    if hasattr(indices, '__len__') and len(indices) > 1:
      return [self.idx_to_token[int(index)] for index in indices]
    return self.idx_to_token[indices]

  @property
  def unk(self):
    """Index of the unknown-token placeholder '<unk>'."""
    return self.token_to_idx['<unk>']

In [None]:
# Round trip: tokens -> indices -> tokens for the first ten characters.
vocab = Vocab(tokens)
indices = vocab[tokens[:10]] # list lookup is handled by Vocab.__getitem__
print('indices: ', indices)
print('words: ', vocab.to_tokens(indices))

indices:  [21, 9, 6, 0, 21, 10, 14, 6, 0, 14]
words:  ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm']


### 9.2.4 Putting It All Together

In [None]:
@d2l.add_to_class(TimeMachine)
def build(self, raw_text, vocab=None):
  """Turn raw text into a list of token indices.

  Args:
    raw_text: Unprocessed corpus text.
    vocab: Vocabulary to index with; a new `Vocab` is built from the tokens
      when omitted.

  Returns:
    A `(corpus, vocab)` pair, where `corpus` is the list of token indices.
  """
  cleaned = self._preprocess(raw_text)
  tokens = self._tokenize(cleaned)
  if vocab is None:
    vocab = Vocab(tokens)
  corpus = [vocab[tok] for tok in tokens]
  return corpus, vocab

In [None]:
# Full pipeline: raw text -> index corpus plus character-level vocabulary.
corpus, vocab = data.build(raw_text)
len(corpus), len(vocab) # number of tokens in the corpus, number of distinct tokens (Vocab.__len__)

(173428, 28)

### 9.2.5 Exploratory Language Statistics

In [None]:
# Word-level statistics: split the cleaned text on whitespace and count the words.
words = text.split()
vocab = Vocab(words)   # NOTE: rebinds `vocab`, which held the character-level vocabulary until now
vocab.token_freqs[:10]

[('the', 2261),
 ('i', 1267),
 ('and', 1245),
 ('of', 1155),
 ('a', 816),
 ('to', 695),
 ('was', 552),
 ('in', 541),
 ('that', 443),
 ('my', 440)]

In [None]:
# Word counts in descending order, drawn on log-log axes.
freqs = [freq for token, freq in vocab.token_freqs]
d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',
         xscale='log', yscale='log')

In [None]:
# Bigrams: pair every word with its successor and count the joined pairs.
bigram_tokens = ['--'.join(pair) for pair in zip(words[:-1], words[1:])]
bigram_vocab = Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]

[('of--the', 309),
 ('in--the', 169),
 ('i--had', 130),
 ('i--was', 112),
 ('and--the', 109),
 ('the--time', 102),
 ('it--was', 99),
 ('to--the', 85),
 ('as--i', 78),
 ('of--a', 73)]

In [None]:
# Trigrams: the same construction with three consecutive words.
trigram_tokens = ['--'.join(triple) for triple in zip(
    words[:-2], words[1:-1], words[2:])]
trigram_vocab = Vocab(trigram_tokens)
trigram_vocab.token_freqs[:10]

[('the--time--traveller', 59),
 ('the--time--machine', 30),
 ('the--medical--man', 24),
 ('it--seemed--to', 16),
 ('it--was--a', 15),
 ('here--and--there', 15),
 ('seemed--to--me', 14),
 ('i--did--not', 14),
 ('i--saw--the', 13),
 ('i--began--to', 13)]

In [None]:
# Compare the descending count curves of unigrams, bigrams and trigrams
# on shared log-log axes.
bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]
trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]

d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',
         ylabel='frequency: n(x)', xscale='log', yscale='log',
         legend=['unigram', 'bigram', 'trigram'])

## 9.3 Language Models

### 9.3.3 Partitioning Sequences

In [None]:
import torch
from d2l import torch as d2l

In [None]:
@d2l.add_to_class(d2l.TimeMachine)
def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
  """Download the corpus and cut it into overlapping (input, target) windows.

  Note that this patches `d2l.TimeMachine`, the library class, not the
  `TimeMachine` class defined earlier in this notebook.

  Args:
    batch_size: Minibatch size (kept as a hyperparameter).
    num_steps: Number of tokens per input sequence.
    num_train: Number of windows used for training.
    num_val: Number of windows used for validation.
  """
  super(d2l.TimeMachine, self).__init__()
  self.save_hyperparameters()

  raw_text = self._download()
  corpus, self.vocab = self.build(raw_text)
  # Every window has num_steps + 1 tokens, so inputs and targets can be
  # taken from the same row, shifted by one position.
  window = num_steps + 1
  starts = range(len(corpus) - num_steps)
  array = torch.tensor([corpus[start:start + window] for start in starts])
  self.X = array[:, :-1]   # all tokens but the last
  self.Y = array[:, 1:]    # the same tokens shifted one step ahead

In [None]:
@d2l.add_to_class(d2l.TimeMachine)
def get_dataloader(self, train):
  """Return a loader over the training or the validation windows.

  The first `num_train` windows are the training set; the `num_val` windows
  that follow them are the validation set.
  """
  if train:
    first, last = 0, self.num_train
  else:
    first, last = self.num_train, self.num_train + self.num_val
  return self.get_tensorloader([self.X, self.Y], train, slice(first, last))

In [None]:
# Print the first training minibatch only: Y is X shifted by one token.
data = d2l.TimeMachine(batch_size=2, num_steps=10)
for X, Y in data.train_dataloader():
  print('X: ', X,'\nY: ', Y)
  break   # one batch is enough for inspection

## 9.4 Recurrent Neural Networks

### 9.4.2 Recurrent Neural Networks with Hidden State

In [None]:
import torch
from d2l import torch as d2l

In [None]:
# Hidden-state pre-activation computed as the sum of two separate products.
X, W_xh = torch.randn(3, 1), torch.randn(1, 4)
H, W_hh = torch.randn(3, 4), torch.randn(4, 4)
torch.matmul(X, W_xh) + torch.matmul(H, W_hh)  # XW_xh + HW_hh, shape (3, 4)

tensor([[ 3.4947, -2.2974,  0.3234, -1.8619],
        [-3.5675, -0.9275,  0.3090,  0.7030],
        [ 4.4709, -2.4468,  1.0109, -2.5766]])

In [None]:
# Same result from a single product: concatenate X and H along the columns
# (shape (3, 5)) and W_xh and W_hh along the rows (shape (5, 4)).
torch.matmul(torch.cat((X, H), 1), torch.cat((W_xh, W_hh), 0))

tensor([[ 3.4947, -2.2974,  0.3234, -1.8619],
        [-3.5675, -0.9275,  0.3090,  0.7030],
        [ 4.4709, -2.4468,  1.0109, -2.5766]])

## 9.5 Recurrent Neural Network Implementation from Scratch

In [None]:
%matplotlib inline
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

### 9.5.1 RNN Model

In [None]:
class RNNScratch(d2l.Module):
  """Single-layer RNN with a tanh hidden state, implemented from scratch.

  Args:
    num_inputs: Size of each input vector.
    num_hiddens: Size of the hidden state.
    sigma: Scale applied to the standard-normal weight initialization.
  """
  def __init__(self, num_inputs, num_hiddens, sigma=0.01):
    super().__init__()
    self.save_hyperparameters()

    # Draw W_xh before W_hh so the random-number stream is consumed in the
    # same order as the parameters are registered.
    input_weights = torch.randn(num_inputs, num_hiddens) * sigma
    recurrent_weights = torch.randn(num_hiddens, num_hiddens) * sigma
    self.W_xh = nn.Parameter(input_weights)        # input  -> hidden
    self.W_hh = nn.Parameter(recurrent_weights)    # hidden -> hidden
    self.b_h = nn.Parameter(torch.zeros(num_hiddens))

In [None]:
@d2l.add_to_class(RNNScratch)
def forward(self, inputs, state=None):
  """Run the RNN over a time-major input sequence.

  Args:
    inputs: Tensor iterated along its first axis, one slice per time step;
      `inputs.shape[1]` is taken as the batch size.
    state: Initial hidden state. Either None (zeros are used), a 2-D tensor
      of shape (batch_size, num_hiddens) as returned by this method, or a
      one-element container wrapping the state.

  Returns:
    `(outputs, state)`: the list of hidden states, one per time step, and
    the hidden state after the last step.
  """
  if state is None:
    state = torch.zeros((inputs.shape[1], self.num_hiddens),
                        device=inputs.device)
  elif not (isinstance(state, torch.Tensor) and state.dim() == 2):
    # One-element container: unwrap it. A 2-D tensor is used directly, so
    # the state returned by a previous call can be fed back for any batch
    # size (unpacking it only worked when the batch size was 1).
    state, = state
  outputs = []
  for X in inputs:
    state = torch.tanh(torch.matmul(X, self.W_xh) +
                       torch.matmul(state, self.W_hh) + self.b_h)
    outputs.append(state)
  return outputs, state

In [None]:
# Smoke run: feed a dummy all-ones sequence through the scratch RNN.
batch_size, num_inputs, num_hiddens, num_steps = 2, 16, 32, 100
rnn = RNNScratch(num_inputs, num_hiddens)

X = torch.ones((num_steps, batch_size, num_inputs))   # time-major: (num_steps, batch_size, num_inputs)
outputs, state = rnn(X)

In [None]:
def check_len(a, n):
  """Assert that `a` contains exactly `n` elements."""
  actual = len(a)
  assert actual == n, f'list\'s len {actual} != expected length {n}'

def check_shape(a, shape):
  """Assert that `a.shape` equals the expected `shape`."""
  actual = a.shape
  assert actual == shape, f'tensor\'s shape {actual} != expected shape {shape}'

In [None]:
check_len(outputs, num_steps)                        # one hidden state per time step
check_shape(outputs[0], (batch_size, num_hiddens))  # hidden state after the first time step
check_shape(state, (batch_size, num_hiddens))       # hidden state returned after the last time step

### 9.5.2 RNN-based Language Model

In [None]:
class RNNLMScratch(d2l.Classifier):
  """Language model: an RNN followed by a linear output layer over the vocabulary.

  Args:
    rnn: Recurrent module; must expose `num_hiddens` and `sigma`.
    vocab_size: Number of distinct tokens, i.e. the width of the output layer.
    lr: Learning rate (kept as a hyperparameter).
  """
  def __init__(self, rnn, vocab_size, lr=0.01):
    super().__init__()
    self.save_hyperparameters()
    self.init_params()

  def init_params(self):
    """Create the output-layer parameters of O_t = H_t W_hq + b_q."""
    num_hiddens, scale = self.rnn.num_hiddens, self.rnn.sigma
    self.W_hq = nn.Parameter(torch.randn(num_hiddens, self.vocab_size) * scale)
    self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

  def training_step(self, batch):
    """Compute the loss on one batch, plot the training perplexity, return the loss."""
    inputs, targets = batch[:-1], batch[-1]
    loss = self.loss(self(*inputs), targets)
    self.plot('ppl', torch.exp(loss), train=True)   # perplexity = exp(loss)
    return loss

  def validation_step(self, batch):
    """Compute the loss on one batch and plot the validation perplexity."""
    inputs, targets = batch[:-1], batch[-1]
    loss = self.loss(self(*inputs), targets)
    self.plot('ppl', torch.exp(loss), train=False)

#### One-Hot Encoding

In [None]:
# Indices 0 and 2 encoded as one-hot vectors of length 5.
F.one_hot(torch.tensor([0, 2]), 5)

tensor([[1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0]])

In [None]:
@d2l.add_to_class(RNNLMScratch)
def one_hot(self, X):
  """One-hot encode a batch of token indices in time-major layout.

  `X` is transposed before encoding, and the integer result of `F.one_hot`
  is cast to float32 so it can be multiplied with the weight matrices.
  """
  time_major = X.T
  encoded = F.one_hot(time_major, self.vocab_size)
  return encoded.type(torch.float32)

#### Transforming RNN Outputs

In [None]:
@d2l.add_to_class(RNNLMScratch)
def output_layer(self, rnn_outputs):
  """Apply O_t = H_t W_hq + b_q to every hidden state and stack along axis 1."""
  per_step = []
  for hidden in rnn_outputs:
    per_step.append(torch.matmul(hidden, self.W_hq) + self.b_q)
  # Stacking on axis 1 places the time-step axis second.
  return torch.stack(per_step, 1)

@d2l.add_to_class(RNNLMScratch)
def forward(self, X, state=None):
  """Map token indices to per-step outputs over the vocabulary.

  The indices are one-hot encoded, run through the RNN, and the hidden
  states are passed to the output layer. The final RNN state is discarded.
  """
  rnn_outputs, _ = self.rnn(self.one_hot(X), state)
  return self.output_layer(rnn_outputs)

In [None]:
model = RNNLMScratch(rnn, num_inputs)                                           # here num_inputs doubles as vocab_size
outputs = model(torch.ones((batch_size, num_steps), dtype=torch.int64))         # token indices must be an integer tensor: F.one_hot does not accept floats
check_shape(outputs, (batch_size, num_steps, num_inputs))

### 9.5.3 Gradient Clipping

In [None]:
@d2l.add_to_class(d2l.Trainer)
def clip_gradients(self, grad_clip_val, model):
  """Rescale all gradients in place so their global L2 norm is at most `grad_clip_val`.

  Args:
    grad_clip_val: Upper bound for the L2 norm over all gradients together.
    model: Module whose parameter gradients are clipped.

  Trainable parameters without a gradient (`grad is None`, e.g. parameters
  not used in the last forward pass) are skipped instead of raising. If no
  gradient exists at all, the call is a no-op.
  """
  grads = [p.grad for p in model.parameters()
           if p.requires_grad and p.grad is not None]
  if not grads:
    return
  norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
  if norm > grad_clip_val:
    for g in grads:
      # In-place scaling keeps the tensor object the optimizer reads.
      g[:] *= grad_clip_val / norm

### 9.5.4 Training

In [None]:
# Train the from-scratch character-level language model on The Time Machine.
# The one-hot inputs have length len(data.vocab), hence num_inputs == vocab_size.
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
rnn = RNNScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(rnn, vocab_size=len(data.vocab), lr=1)
trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1, num_gpus=1)
trainer.fit(model, data)

### 9.5.5 Decoding

In [None]:
@d2l.add_to_class(RNNLMScratch)
def predict(self, prefix, num_preds, vocab, device=None):
  """Greedily continue `prefix` by `num_preds` tokens.

  Args:
    prefix: Non-empty sequence of tokens used to warm up the hidden state.
    num_preds: Number of tokens to generate after the prefix.
    vocab: Vocabulary used to map tokens to indices and back.
    device: Device on which the one-token input tensors are created.

  Returns:
    The prefix followed by the generated tokens, joined into one string.
  """
  state, outputs = None, [vocab[prefix[0]]]
  # Decoding is inference only. Without no_grad, the hidden state carried
  # from step to step would drag a growing autograd graph along.
  with torch.no_grad():
    for i in range(len(prefix) + num_preds - 1):   # the first token is already in `outputs`
      X = torch.tensor([[outputs[-1]]], device=device)
      embs = self.one_hot(X)
      rnn_outputs, state = self.rnn(embs, state)
      if i < len(prefix) - 1:  # warm-up period: feed the known prefix token
        outputs.append(vocab[prefix[i + 1]])
      else:                    # generation: take the highest-scoring token
        Y = self.output_layer(rnn_outputs)
        outputs.append(int(Y.argmax(axis=2).reshape(1)))
  return ''.join([vocab.idx_to_token[i] for i in outputs])

In [None]:
# Generate 20 characters after the prefix with the trained scratch model.
model.predict('it has', 20, data.vocab, d2l.try_gpu())

'it has of the mere the the'

## 9.6 Concise Implementation of Recurrent Neural Networks

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

### 9.6.1 Defining the Model

In [None]:
class RNN(d2l.Module):
  """Thin wrapper around `nn.RNN`.

  Args:
    num_inputs: Size of each input vector.
    num_hiddens: Size of the hidden state.
  """
  def __init__(self, num_inputs, num_hiddens):
    super().__init__()
    self.save_hyperparameters()
    self.rnn = nn.RNN(input_size=num_inputs, hidden_size=num_hiddens)

  def forward(self, inputs, H=None):
    """Run `nn.RNN` on `inputs`, starting from the initial hidden state `H`."""
    outputs, final_state = self.rnn(inputs, H)
    return outputs, final_state

In [None]:
class RNNLM(d2l.RNNLMScratch):
  """Language model whose output layer is a lazily-initialized linear layer.

  Note that it extends the library's `d2l.RNNLMScratch`.
  """
  def init_params(self):
    """Create the output layer; its input size is inferred on first use."""
    self.linear = nn.LazyLinear(self.vocab_size)

  def output_layer(self, hiddens):
    """Apply the linear layer, then swap the first two axes of the result."""
    logits = self.linear(hiddens)
    return logits.swapaxes(0, 1)   # swapaxes is an alias of torch.transpose

### 9.6.2 Training and Predicting

In [None]:
# Build the concise model and sample from it before any training:
# the weights are still random, so the continuation is gibberish.
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
rnn = RNN(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(rnn, vocab_size=len(data.vocab), lr=1)
model.predict('it has ', 20, data.vocab)



'it has mmmacmmlmmmacmmlmmma'

In [None]:
# Train the concise model with the same settings as the from-scratch version.
trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1, num_gpus=1)
trainer.fit(model, data)