<a href="https://colab.research.google.com/github/DmitriySechkin/ds-learning-sb/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


In [3]:
# Mount Google Drive at /content/drive (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# Location of the training data: DatasetSeq opens data_dir + train_lang + '.train',
# so data_dir must end with a path separator.
# NOTE(review): the path is relative — presumably resolved against the Colab
# working directory that contains the mounted `drive` folder; confirm.
data_dir = 'drive/My Drive/'
train_lang = 'en2'

In [5]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class DatasetSeq(Dataset):
    """Sequence-tagging dataset read from ``data_dir + train_lang + '.train'``.

    The file is split into sentences on blank lines; every non-empty line of a
    sentence must be ``"<word> <label>"``. Words, labels and characters are
    mapped to integer ids in order of first appearance, starting at 1, so that
    id 0 stays free for padding.

    Attributes:
        dataset: raw text of the sentences that were kept.
        word_vocab, target_vocab, char_vocab: ``{str: int}`` id mappings.
        encoded_sequences: per sentence, the list of word ids.
        encoded_targets: per sentence, the list of label ids.
        encoded_char_sequences: per sentence, one list of character ids per word.
    """

    def __init__(self, data_dir, train_lang='en'):
        """Read the training file and encode every sentence.

        Args:
            data_dir: path prefix, concatenated as-is with the file name
                (it must already end with a path separator).
            train_lang: file stem; the file opened is
                ``data_dir + train_lang + '.train'``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a non-empty line is not exactly two
                space-separated fields.
        """
        # Explicit encoding so decoding does not depend on the platform default.
        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            chunks = f.read().split('\n\n')

        # Drop sentences carrying extra "_ " tag markup, and blank chunks
        # (e.g. the one produced by a trailing empty line): a blank chunk would
        # otherwise become a zero-length sample.
        self.dataset = [x for x in chunks if '_ ' not in x and x.strip()]

        # Token -> id vocabularies, e.g. {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}.
        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        # Encoded (processed) data, aligned index-by-index with self.dataset.
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []

        for line in self.dataset:
            sequence, target, chars = [], [], []
            for item in line.split('\n'):
                if item == '':
                    continue
                word, label = item.split(' ')
                sequence.append(self._token_id(self.word_vocab, word))
                target.append(self._token_id(self.target_vocab, label))
                chars.append([self._token_id(self.char_vocab, char) for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    @staticmethod
    def _token_id(vocab, token):
        """Return the id of ``token``, registering it with the next free id (from 1)."""
        return vocab.setdefault(token, len(vocab) + 1)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        """Return the encoded sentence at ``index``.

        Returns:
            dict with ``'data'`` (word ids), ``'char'`` (one list of character
            ids per word) and ``'target'`` (label ids); all three have one
            entry per word.
        """
        return {
            'data': self.encoded_sequences[index],
            'char': self.encoded_char_sequences[index],
            'target': self.encoded_targets[index],
        }

In [6]:
# Read and encode the training file; the vocabularies built here define the
# model sizes used further down.
train_dataset = DatasetSeq(data_dir, train_lang)

In [None]:
# Padding: sequences in one batch must have the same length.
# seq1 = [1, 2, 3, 4]
# seq2 = [9, 7, 6, 4, 3, 7, 5]
# pad seq1 with zeros up to the length of seq2:
# seq1 = [1, 2, 3, 4, 0, 0, 0]
# stack(seq1, seq2) -> [[1, 2, 3, 4, 0, 0, 0],
#                       [9, 7, 6, 4, 3, 7, 5]]

In [7]:
def collate_fn(batch):
    """Turn a list of dataset items into zero-padded batch tensors.

    Args:
        batch: sequence of dicts holding integer-id sequences under
            ``'data'`` and ``'target'``.

    Returns:
        dict with ``'data'`` and ``'target'`` int64 tensors of shape
        ``(len(batch), longest sequence in the batch)``, right-padded with 0.
    """
    # The dtype is forced because an empty list would otherwise become a
    # float32 tensor, and pad_sequence takes its output dtype from the first
    # sequence — a batch led by an empty sentence would come out as float.
    data = [torch.as_tensor(item['data'], dtype=torch.long) for item in batch]
    target = [torch.as_tensor(item['target'], dtype=torch.long) for item in batch]

    return {
        'data': pad_sequence(data, batch_first=True, padding_value=0),
        'target': pad_sequence(target, batch_first=True, padding_value=0),
    }

In [54]:
class GruPredictor(nn.Module):
    """Per-token classifier: word embedding -> GRU -> dropout -> linear layer."""

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, bidirectional=False):
        """
        Args:
            vocab_size: number of rows in the word-embedding table.
            emb_dim: size of one word embedding.
            hidden_dim: GRU hidden size per direction.
            n_classes: size of the output (logit) dimension.
            bidirectional: if True, the GRU runs in both directions.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # A bidirectional GRU concatenates both directions, doubling the
        # feature width that the classifier receives.
        self.mult_indx = 1 if not bidirectional else 2

        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(
            emb_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.classifier = nn.Linear(self.mult_indx * hidden_dim, n_classes)
        self.do = nn.Dropout(0.2)

    def forward(self, x):
        """Return per-token class logits for a batch-first tensor of word ids."""
        embedded = self.word_emb(x)
        outputs, _last_state = self.gru(embedded)
        return self.classifier(self.do(outputs))

In [55]:
class RNNPredictor(nn.Module):
    """Per-token classifier: word embedding -> vanilla RNN -> dropout -> linear layer."""

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, bidirectional=False):
        """
        Args:
            vocab_size: number of rows in the word-embedding table.
            emb_dim: size of one word embedding.
            hidden_dim: RNN hidden size per direction.
            n_classes: size of the output (logit) dimension.
            bidirectional: if True, the RNN runs in both directions.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Both directions are concatenated when bidirectional, so the
        # classifier input is twice as wide.
        self.mult_indx = 1 if not bidirectional else 2

        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # The attribute is called `gru` although it holds an nn.RNN; the name
        # is part of the state_dict keys.
        self.gru = nn.RNN(
            emb_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.classifier = nn.Linear(self.mult_indx * hidden_dim, n_classes)
        self.do = nn.Dropout(0.2)

    def forward(self, x):
        """Return per-token class logits for a batch-first tensor of word ids."""
        embedded = self.word_emb(x)
        outputs, _last_state = self.gru(embedded)
        return self.classifier(self.do(outputs))

In [56]:
class LstmPredictor(nn.Module):
    """Per-token classifier: word embedding -> LSTM -> dropout -> linear layer."""

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, bidirectional=False):
        """
        Args:
            vocab_size: number of rows in the word-embedding table.
            emb_dim: size of one word embedding.
            hidden_dim: LSTM hidden size per direction.
            n_classes: size of the output (logit) dimension.
            bidirectional: if True, the LSTM runs in both directions.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Both directions are concatenated when bidirectional, so the
        # classifier input is twice as wide.
        self.mult_indx = 1 if not bidirectional else 2

        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # The attribute is called `gru` although it holds an nn.LSTM; the name
        # is part of the state_dict keys.
        self.gru = nn.LSTM(
            emb_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.classifier = nn.Linear(self.mult_indx * hidden_dim, n_classes)
        self.do = nn.Dropout(0.2)

    def forward(self, x):
        """Return per-token class logits for a batch-first tensor of word ids."""
        embedded = self.word_emb(x)
        # The second element (final hidden and cell states) is not used.
        outputs, _last_state = self.gru(embedded)
        return self.classifier(self.do(outputs))

In [49]:
# T x B (time-major layout)
# len([[first words], [second words], .. [last words]]) - sentence length T
# len([n-th words]) - batch size B

# B x T (batch-first layout; the models in this notebook use batch_first=True)
# len([[first sentence], [second sentence] .. ]) - batch size B
# len([first sentence]) - sentence length T

In [65]:
# Hyperparameters and run configuration.
# Sizes are len(vocab) + 1 because ids start at 1 and id 0 is the padding id.
vocab_size = len(train_dataset.word_vocab) + 1
n_classes = len(train_dataset.target_vocab) + 1
n_chars = len(train_dataset.char_vocab) + 1

emb_dim = 256
hidden = 256
n_epochs = 7
batch_size = 64

# Fixed seed so weight initialisation, shuffling and dropout repeat on a
# fresh Restart & Run All.
seed = 42
torch.manual_seed(seed)

# GPU index to use; -1 forces the CPU. Falls back to the CPU when CUDA is
# not available instead of failing later on `.to(device)`.
cuda_device = 0
device = f'cuda:{cuda_device}' if cuda_device != -1 and torch.cuda.is_available() else 'cpu'

In [66]:
class PredictorTrainer:
    """Trains one tagging model with Adam + cross-entropy and runs inference with it.

    Attributes:
        name: label used in checkpoint file names.
        model: the wrapped ``nn.Module``.
        optim: Adam optimizer over the model parameters (lr=0.001).
        loss_func: ``nn.CrossEntropyLoss`` with default settings.
    """

    def __init__(self, model, name, device=None):
        """
        Args:
            model: module mapping a batch-first tensor of word ids to
                per-token class logits.
            name: label used in checkpoint file names.
            device: optional device for the input batches; by default the
                device the model parameters currently live on is used.
        """
        self.name = name
        self.model = model
        self.model.train()
        self.optim = torch.optim.Adam(model.parameters(), lr=0.001)
        # NOTE(review): positions padded by collate_fn (target id 0) take part
        # in the loss; consider ignore_index=0 — confirm the intent first,
        # since it changes the reported loss values.
        self.loss_func = nn.CrossEntropyLoss()
        self._device = device

    @property
    def device(self):
        """Device batches are moved to: the explicit override or the model's own device."""
        if self._device is not None:
            return self._device
        return next(self.model.parameters()).device

    def train(self, dataset=None, epochs=None, batch=None, collate=None):
        """Train the model and save a checkpoint after every epoch.

        Every argument is optional and falls back to the notebook-level
        ``train_dataset``, ``n_epochs``, ``batch_size`` and ``collate_fn``.

        Returns:
            Mean of the per-epoch average training losses.
        """
        dataset = train_dataset if dataset is None else dataset
        epochs = n_epochs if epochs is None else epochs
        batch = batch_size if batch is None else batch
        collate = collate_fn if collate is None else collate

        # One loader serves all epochs; shuffle=True reshuffles on every pass.
        train_dataloader = DataLoader(dataset,
                                      batch,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=True,
                                      )
        train_losses = []

        for epoch in range(epochs):
            train_loss = self._train_model(train_dataloader, self.model)
            train_losses.append(train_loss / len(train_dataloader))

            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {train_losses[-1]:.4f}, ')

            torch.save(self.model.state_dict(), f'./{self.name}_chkpt_{epoch}.pth')

        return sum(train_losses) / epochs

    def _batch_loss(self, model, batch):
        """Cross-entropy of ``model`` on one collated batch, flattened over all tokens."""
        device = self.device
        predict = model(batch['data'].to(device))
        target = batch['target'].to(device).long()
        # The class count is read from the logits instead of a global.
        return self.loss_func(predict.reshape(-1, predict.size(-1)), target.reshape(-1))

    def _train_model(self, train_dataloader, model):
        """Run one optimisation pass over the loader; return the summed batch losses."""
        train_loss = 0

        model.train()

        for batch in train_dataloader:
            self.optim.zero_grad()
            loss = self._batch_loss(model, batch)
            loss.backward()
            self.optim.step()

            train_loss += loss.item()

        return train_loss

    def _validate_model(self, test_dataloader, model):
        """Evaluate without gradients; return the summed batch losses."""
        test_loss = 0

        model.eval()

        with torch.no_grad():
            for batch in test_dataloader:
                test_loss += self._batch_loss(model, batch).item()

        return test_loss

    def inference(self, dataset, phrase):
        """Tag a space-separated phrase, print the labels and return them.

        Args:
            dataset: object exposing ``word_vocab`` and ``target_vocab``
                ({str: int}, ids starting at 1 in insertion order).
            phrase: words separated by single spaces.

        Returns:
            list of label strings, one per word; the padding class (id 0) is
            reported as ``'<pad>'``.

        Raises:
            KeyError: if a word is not in ``dataset.word_vocab``.
        """
        words = phrase.split(' ')
        tokens = [dataset.word_vocab[w] for w in words]

        self.model.eval()
        with torch.no_grad():
            predict = self.model(torch.tensor(tokens).unsqueeze(0).to(self.device))  # 1 x T x N_classes
            # squeeze(0) drops only the batch axis, so a one-word phrase still
            # gives a list rather than a bare int.
            labels = torch.argmax(predict, dim=-1).squeeze(0).cpu().tolist()

        target_labels = list(dataset.target_vocab.keys())
        # Id 0 is padding; indexing with l-1 == -1 would wrap to the last label.
        decoded = [target_labels[l - 1] if l > 0 else '<pad>' for l in labels]
        print(decoded)
        return decoded



In [67]:
def _make_trainer(model_cls, name, bidirectional=False):
    """Build one model on `device` and wrap it in a PredictorTrainer."""
    model = model_cls(vocab_size, emb_dim, hidden, n_classes, bidirectional).to(device)
    return model, PredictorTrainer(model, name)


# Unidirectional variants.
model_rnn, trainer_rnn = _make_trainer(RNNPredictor, 'RNN')
model_lstm, trainer_lstm = _make_trainer(LstmPredictor, 'LSTM')
model_gru, trainer_gru = _make_trainer(GruPredictor, 'GRU')

# Bidirectional variants of the same three architectures.
model_birnn, trainer_birnn = _make_trainer(RNNPredictor, 'BiRNN', True)
model_bilstm, trainer_bilstm = _make_trainer(LstmPredictor, 'BiLSTM', True)
model_bigru, trainer_bigru = _make_trainer(GruPredictor, 'BiGRU', True)

In [68]:
# Column name -> list of values; each list gets one entry per benchmarked model.
results = {
    column: []
    for column in ('модель', 'время обучения', 'loss на обучении', 'время инференса')
}

# Sentence used for the single inference call of every model.
phrase = 'He ran quickly after the red bus and caught it .'

In [69]:
# Train every model in turn and record, per model: its name, the wall-clock
# training time, the value returned by train() and the wall-clock time of one
# inference call on `phrase`.
# NOTE: each run of this cell appends to `results`, so re-running it without
# re-creating `results` adds duplicate rows.
trainers = [trainer_rnn, trainer_lstm, trainer_gru, trainer_birnn, trainer_bilstm, trainer_bigru]

for model_tr in trainers:
  results['модель'].append(model_tr.name)
  print(model_tr.name)

  # Training time is measured with datetime, so the table stores timedeltas.
  start = datetime.datetime.now()
  loss = model_tr.train()
  results['loss на обучении'].append(loss)

  end = datetime.datetime.now() - start
  results['время обучения'].append(end)

  # The inference timing covers the whole call, including the print of the
  # predicted labels done inside inference().
  start = datetime.datetime.now()
  model_tr.inference(train_dataset, phrase)
  end = datetime.datetime.now() - start
  results['время инференса'].append(end)

  print('-' * 200)

RNN
Epoch [1/7], Train Loss: 0.3516, 
Epoch [2/7], Train Loss: 0.1604, 
Epoch [3/7], Train Loss: 0.1206, 
Epoch [4/7], Train Loss: 0.0959, 
Epoch [5/7], Train Loss: 0.0791, 
Epoch [6/7], Train Loss: 0.0677, 
Epoch [7/7], Train Loss: 0.0593, 
['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON', 'PUNCT']
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
LSTM
Epoch [1/7], Train Loss: 0.4039, 
Epoch [2/7], Train Loss: 0.1601, 
Epoch [3/7], Train Loss: 0.1162, 
Epoch [4/7], Train Loss: 0.0899, 
Epoch [5/7], Train Loss: 0.0741, 
Epoch [6/7], Train Loss: 0.0612, 
Epoch [7/7], Train Loss: 0.0526, 
['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON', 'PUNCT']
--------------------------------------------------------------------------------------------------------------------------------------------

In [42]:
import pandas as pd

In [71]:
# Show the collected measurements as a table, one row per model.
pd.DataFrame(results)

Unnamed: 0,модель,время обучения,loss на обучении,время инференса
0,RNN,0 days 00:00:14.705274,0.133505,0 days 00:00:00.002221
1,LSTM,0 days 00:00:22.074058,0.136864,0 days 00:00:00.001586
2,GRU,0 days 00:00:20.314892,0.127346,0 days 00:00:00.002140
3,BiRNN,0 days 00:00:18.286657,0.105393,0 days 00:00:00.002161
4,BiLSTM,0 days 00:00:35.735210,0.100034,0 days 00:00:00.001994
5,BiGRU,0 days 00:00:30.755490,0.093083,0 days 00:00:00.003159


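*Вывод: однонаправленная RNN обучается быстрее всех (≈15 с); её средний loss на обучении (0.134) чуть выше, чем у GRU (0.127), и чуть ниже, чем у LSTM (0.137). LSTM обучается дольше остальных однонаправленных моделей (≈22 с), а самый низкий loss среди них — у GRU. Bidirectional-варианты обучаются дольше своих однонаправленных аналогов, но loss у них ниже. На инференсе не ошиблась только GRU.*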