In [1]:
import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [19]:
# Mount Google Drive at ./drive so the training file can be read from it.
# This cell only works inside Google Colab (it imports google.colab).
# force_remount=True is passed so the mount is redone when the cell is re-run.
from google.colab import drive
drive.mount('drive', force_remount=True)

Mounted at drive


In [25]:
# Corpus location. DatasetSeq opens data_dir + train_lang + '.train' using plain
# string concatenation, so data_dir has to keep its trailing slash.
data_dir = "./drive/MyDrive/"
# Language prefix of the training file (here: en.train).
train_lang = "en"

In [5]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class DatasetSeq(Dataset):
    """Sequence-tagging dataset read from ``data_dir + train_lang + '.train'``.

    The file is split into sentences on blank lines (``'\\n\\n'``); every
    non-empty line of a sentence must be exactly ``<word> <label>`` separated
    by a single space. Words, labels and characters are mapped to integer ids
    in order of first appearance, starting at 1 (0 is left free for padding).

    Attributes:
        word_vocab: ``{word: id}`` mapping.
        target_vocab: ``{label: id}`` mapping.
        char_vocab: ``{character: id}`` mapping.
        encoded_sequences: one list of word ids per sentence.
        encoded_targets: one list of label ids per sentence.
        encoded_char_sequences: per sentence, one list of char ids per word.

    Args:
        data_dir: directory prefix; it is concatenated as-is with the file
            name, so it must end with a path separator.
        train_lang: language prefix of the ``.train`` file.

    Raises:
        ValueError: if a non-empty line does not split into exactly two
            space-separated fields.
    """

    def __init__(self, data_dir, train_lang='en'):
        # Explicit encoding so reading does not depend on the platform default.
        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # Drop every sentence that contains '_ ' (extra tag markup).
        train = [x for x in train if '_ ' not in x]

        # Vocabularies used for encoding: {<str> token: <int> id}
        self.target_vocab = {}  # e.g. {p: 1, a: 2, r: 3, pu: 4}
        self.word_vocab = {}    # e.g. {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        self.char_vocab = {}    # e.g. {c: 1, a: 2, t: 3, s: 4}

        # Encoded (processed) data, one entry per sentence.
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []

        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item == '':
                    continue
                word, label = item.split(' ')
                # New entries get id len(vocab) + 1, i.e. ids start at 1
                # because 0 is reserved for padding.
                sequence.append(
                    self.word_vocab.setdefault(word, len(self.word_vocab) + 1))
                target.append(
                    self.target_vocab.setdefault(label, len(self.target_vocab) + 1))
                chars.append([
                    self.char_vocab.setdefault(char, len(self.char_vocab) + 1)
                    for char in word
                ])

            # A trailing newline in the file produces an empty chunk; skip it so
            # the dataset never yields a zero-length sample.
            if not sequence:
                continue

            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict of id lists."""
        return {
            'data': self.encoded_sequences[index],         # word ids, len = n_words
            'char': self.encoded_char_sequences[index],    # char ids per word, len = n_words
            'target': self.encoded_targets[index],         # label ids, len = n_words
        }

In [21]:
# Sanity check: list the mounted Drive folder so the cell output shows
# whether the training file is present.
import os 
os.listdir('drive/MyDrive')

['Colab Notebooks', 'en.train']

In [26]:
# Build the dataset from the configured directory and language; passing
# train_lang explicitly keeps the config cell authoritative instead of
# silently relying on the constructor default.
dataset = DatasetSeq(data_dir, train_lang)

In [27]:
def collate_fn(batch):
    """Pad a list of dataset items into batch-first id tensors.

    Args:
        batch: iterable of dicts with ``'data'`` and ``'target'`` id sequences.

    Returns:
        ``{'data': LongTensor, 'target': LongTensor}``, both of shape
        (batch, longest sequence in the batch), right-padded with 0.
    """
    # dtype is forced to long: torch.as_tensor([]) would otherwise be float32,
    # which is not a valid index dtype for nn.Embedding / CrossEntropyLoss.
    data = [torch.as_tensor(item['data'], dtype=torch.long) for item in batch]
    target = [torch.as_tensor(item['target'], dtype=torch.long) for item in batch]

    return {
        'data': pad_sequence(data, batch_first=True, padding_value=0),
        'target': pad_sequence(target, batch_first=True, padding_value=0),
    }

In [122]:
#LSTM predictor

class LSTMPredictor(nn.Module):
    """Per-token classifier: word embedding -> LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table (ids 0..vocab_size-1).
        emb_dim: embedding size.
        hidden_dim: LSTM hidden size.
        n_classes: number of output classes per token.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        # Id 0 is the padding id, so its embedding is pinned to zeros and
        # excluded from gradient updates.
        self.word_emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, n_classes)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        """Return per-token class scores.

        Args:
            x: integer id tensor, batch-first (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, n_classes) with unnormalised scores.
        """
        emb = self.word_emb(x)
        # Only the per-step outputs are used; the final state is discarded.
        hidden, _ = self.lstm(emb)
        return self.classifier(hidden)

In [123]:
#RNN predictor

class RNNPredictor(nn.Module):
    """Per-token classifier: word embedding -> vanilla RNN -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table (ids 0..vocab_size-1).
        emb_dim: embedding size.
        hidden_dim: RNN hidden size.
        n_classes: number of output classes per token.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        # Id 0 is the padding id, so its embedding is pinned to zeros and
        # excluded from gradient updates.
        self.word_emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, n_classes)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        """Return per-token class scores.

        Args:
            x: integer id tensor, batch-first (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, n_classes) with unnormalised scores.
        """
        emb = self.word_emb(x)
        # Only the per-step outputs are used; the final state is discarded.
        hidden, _ = self.rnn(emb)
        return self.classifier(hidden)

In [124]:
#GRU predictor

class GRUPredictor(nn.Module):
    """Per-token classifier: word embedding -> GRU -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table (ids 0..vocab_size-1).
        emb_dim: embedding size.
        hidden_dim: GRU hidden size.
        n_classes: number of output classes per token.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        # Id 0 is the padding id, so its embedding is pinned to zeros and
        # excluded from gradient updates.
        self.word_emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, n_classes)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        """Return per-token class scores.

        Args:
            x: integer id tensor, batch-first (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, n_classes) with unnormalised scores.
        """
        emb = self.word_emb(x)
        # Only the per-step outputs are used; the final state is discarded.
        hidden, _ = self.gru(emb)
        return self.classifier(hidden)

In [126]:
# Hyper-parameters shared by the three models.

# Sizes derived from the dataset vocabularies; the extra 1 accounts for id 0,
# which the vocabularies never assign (they start counting at 1).
vocab_size = 1 + len(dataset.word_vocab)
n_classes = 1 + len(dataset.target_vocab)
n_chars = 1 + len(dataset.char_vocab)

# TODO try to use other model parameters
emb_dim = 64
hidden = 128
n_epochs = 1
batch_size = 100

# cuda_device is a GPU index; -1 selects the CPU.
cuda_device = -1
if cuda_device == -1:
    device = 'cpu'
else:
    device = f'cuda:{cuda_device}'

# LSTM

In [146]:
# Fresh LSTM tagger, its Adam optimizer and the token-level loss.
lstm_model = LSTMPredictor(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden,
    n_classes=n_classes,
)
lstm_model = lstm_model.to(device)
lstm_model.train()
lstm_optim = torch.optim.Adam(params=lstm_model.parameters(), lr=1e-3)
# NOTE(review): collate_fn pads targets with 0 and this loss does not ignore
# that id, so padded positions contribute to the loss - confirm this is intended.
lstm_loss_func = nn.CrossEntropyLoss()

In [147]:
# Train the LSTM tagger. lstm_end holds the wall-clock duration of the most
# recently finished epoch.
# The loader is built once: with shuffle=True it draws a new order every time
# it is iterated, so it does not need to be recreated per epoch.
dataloader = DataLoader(dataset,
                        batch_size,
                        shuffle=True,
                        collate_fn=collate_fn,
                        drop_last=True,
                        )
for epoch in range(n_epochs):
    start = datetime.datetime.now()
    for i, batch in enumerate(dataloader):
        lstm_optim.zero_grad()
        predict = lstm_model(batch['data'].to(device))
        # Flatten (batch, seq, classes) -> (batch*seq, classes) for the loss.
        loss = lstm_loss_func(predict.view(-1, n_classes),
                              batch['target'].to(device).view(-1))
        loss.backward()
        lstm_optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
    lstm_end = datetime.datetime.now() - start

    # Model-specific file name: the RNN and GRU loops also write checkpoints,
    # and a shared 'rnn_chkpt_*' name made them overwrite each other.
    torch.save(lstm_model.state_dict(), f'./lstm_chkpt_{epoch}.pth')

epoch: 0, step: 0, loss: 2.931715965270996
epoch: 0, step: 100, loss: 0.38616907596588135
epoch: 0, step: 200, loss: 0.2701267600059509


# RNN

In [143]:
# Fresh vanilla-RNN tagger, its Adam optimizer and the token-level loss.
rnn_model = RNNPredictor(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden,
    n_classes=n_classes,
)
rnn_model = rnn_model.to(device)
rnn_model.train()
rnn_optim = torch.optim.Adam(params=rnn_model.parameters(), lr=1e-3)
# NOTE(review): collate_fn pads targets with 0 and this loss does not ignore
# that id, so padded positions contribute to the loss - confirm this is intended.
rnn_loss_func = nn.CrossEntropyLoss()

In [144]:
# Train the vanilla-RNN tagger; rnn_end ends up holding the wall-clock
# duration of the most recently finished epoch.
for epoch in range(n_epochs):
    dataloader = DataLoader(
        dataset,
        batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=True,
    )
    start = datetime.datetime.now()
    for step, batch in enumerate(dataloader):
        inputs = batch['data'].to(device)
        gold = batch['target'].to(device)

        rnn_optim.zero_grad()
        logits = rnn_model(inputs)
        # Flatten (batch, seq, classes) -> (batch*seq, classes) for the loss.
        loss = rnn_loss_func(logits.view(-1, n_classes), gold.view(-1))
        loss.backward()
        rnn_optim.step()

        if not step % 100:
            print(f'epoch: {epoch}, step: {step}, loss: {loss.item()}')
    rnn_end = datetime.datetime.now() - start

    # One checkpoint per epoch.
    torch.save(rnn_model.state_dict(), f'./rnn_chkpt_{epoch}.pth')

epoch: 0, step: 0, loss: 2.892315626144409
epoch: 0, step: 100, loss: 0.30005085468292236
epoch: 0, step: 200, loss: 0.10036728531122208


# GRU

In [135]:
# Fresh GRU tagger, its Adam optimizer and the token-level loss.
gru_model = GRUPredictor(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden,
    n_classes=n_classes,
)
gru_model = gru_model.to(device)
gru_model.train()
gru_optim = torch.optim.Adam(params=gru_model.parameters(), lr=1e-3)
# NOTE(review): collate_fn pads targets with 0 and this loss does not ignore
# that id, so padded positions contribute to the loss - confirm this is intended.
gru_loss_func = nn.CrossEntropyLoss()

In [136]:
# Train the GRU tagger. gru_end holds the wall-clock duration of the most
# recently finished epoch.
# The loader is built once: with shuffle=True it draws a new order every time
# it is iterated, so it does not need to be recreated per epoch.
dataloader = DataLoader(dataset,
                        batch_size,
                        shuffle=True,
                        collate_fn=collate_fn,
                        drop_last=True,
                        )
for epoch in range(n_epochs):
    start = datetime.datetime.now()
    for i, batch in enumerate(dataloader):
        gru_optim.zero_grad()

        predict = gru_model(batch['data'].to(device))
        # Flatten (batch, seq, classes) -> (batch*seq, classes) for the loss.
        loss = gru_loss_func(predict.view(-1, n_classes),
                             batch['target'].to(device).view(-1),
                             )
        loss.backward()
        gru_optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
    gru_end = datetime.datetime.now() - start

    # Model-specific file name: the LSTM and RNN loops also write checkpoints,
    # and a shared 'rnn_chkpt_*' name made them overwrite each other.
    torch.save(gru_model.state_dict(), f'./gru_chkpt_{epoch}.pth')

epoch: 0, step: 0, loss: 2.9341320991516113
epoch: 0, step: 100, loss: 0.27032533288002014
epoch: 0, step: 200, loss: 0.16760151088237762


# Sample

In [207]:
# Keep the raw training file as a list of lines (line endings included);
# it is used below to look up reference tags for the sample phrase.
with open('./drive/MyDrive/en.train') as f:
    file = list(f)

In [210]:
# Reference tags for the sample phrase: every word gets the tag from the FIRST
# line of the training file whose word field equals it (context is ignored).

# Defined here so the cell runs on a fresh kernel; the inference cells below
# assign the same sentence again.
phrase = 'More over you can also find 10 lines on the selected far topic in English for the speeches in school programs These Ten lines in English will assist students and teachers at the time of school speeches on special events Hence students can refer to the below provided massive list of essays in English and participate in any kind of events conducted by school'

# One pass over the file: word -> first tag seen for it.
first_tag = {}
for line in file:
    fields = line.split(' ')
    if len(fields) > 1:
        # setdefault keeps the earliest occurrence.
        first_tag.setdefault(fields[0], fields[1].strip())

# A word missing from the file yields None instead of being skipped, so that
# targets stays aligned position-by-position with the phrase.
targets = [first_tag.get(word) for word in phrase.split(' ')]

In [219]:
def acc(predict, targets):
    """Fraction of positions where ``predict[i] == targets[i]``.

    Args:
        predict: sequence of predicted labels; its length is the denominator.
        targets: sequence of reference labels, indexed by position in
            ``predict``; any extra trailing entries are ignored.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``predict`` is empty.

    Raises:
        IndexError: if ``targets`` is shorter than ``predict``.
    """
    total = len(predict)
    if total == 0:
        # Nothing to score; avoids dividing by zero.
        return 0.0
    correct = sum(1 for i, label in enumerate(predict) if label == targets[i])
    return correct / total
    

In [231]:
#example
phrase = 'More over you can also find 10 lines on the selected far topic in English for the speeches in school programs These Ten lines in English will assist students and teachers at the time of school speeches on special events Hence students can refer to the below provided massive list of essays in English and participate in any kind of events conducted by school'
words = phrase.split(' ')
# Raises KeyError for a word that is not in the training vocabulary.
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    lstm_model.eval()
    predict = lstm_model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    # Class 0 is the padding id and has no tag, so it is excluded from the
    # argmax; +1 maps the result back to real label ids.
    # squeeze(0) drops only the batch axis, so a one-word phrase still gives a list.
    labels = (torch.argmax(predict[..., 1:], dim=-1) + 1).squeeze(0).cpu().tolist()
    lstm_iend = datetime.datetime.now() - start

# Label ids were assigned 1..N in insertion order, so id k is the k-th key.
target_labels = list(dataset.target_vocab.keys())
lstm_targets = [target_labels[label_id - 1] for label_id in labels]
print(acc(lstm_targets,targets))
print(f'lstm speed: {lstm_iend}')

0.75
lstm speed: 0:00:00.012415


In [232]:
#example rnn
phrase = 'More over you can also find 10 lines on the selected far topic in English for the speeches in school programs These Ten lines in English will assist students and teachers at the time of school speeches on special events Hence students can refer to the below provided massive list of essays in English and participate in any kind of events conducted by school'
words = phrase.split(' ')
# Raises KeyError for a word that is not in the training vocabulary.
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    rnn_model.eval()
    predict = rnn_model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    # Class 0 is the padding id and has no tag, so it is excluded from the
    # argmax; +1 maps the result back to real label ids.
    # squeeze(0) drops only the batch axis, so a one-word phrase still gives a list.
    labels = (torch.argmax(predict[..., 1:], dim=-1) + 1).squeeze(0).cpu().tolist()
    rnn_iend = datetime.datetime.now() - start

# Label ids were assigned 1..N in insertion order, so id k is the k-th key.
target_labels = list(dataset.target_vocab.keys())
rnn_targets = [target_labels[label_id - 1] for label_id in labels]
print(acc(rnn_targets,targets))
print(f'rnn inference: {rnn_iend}')

0.75
rnn inference: 0:00:00.003450


In [233]:
#example gru
phrase = 'More over you can also find 10 lines on the selected far topic in English for the speeches in school programs These Ten lines in English will assist students and teachers at the time of school speeches on special events Hence students can refer to the below provided massive list of essays in English and participate in any kind of events conducted by school'
words = phrase.split(' ')
# Raises KeyError for a word that is not in the training vocabulary.
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    gru_model.eval()
    predict = gru_model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    # Class 0 is the padding id and has no tag, so it is excluded from the
    # argmax; +1 maps the result back to real label ids.
    # squeeze(0) drops only the batch axis, so a one-word phrase still gives a list.
    labels = (torch.argmax(predict[..., 1:], dim=-1) + 1).squeeze(0).cpu().tolist()
    gru_iend = datetime.datetime.now() - start

# Label ids were assigned 1..N in insertion order, so id k is the k-th key.
target_labels = list(dataset.target_vocab.keys())
gru_targets = [target_labels[label_id - 1] for label_id in labels]
print(acc(gru_targets,targets))
print(f'gru inference: {gru_iend}')

0.828125
gru inference: 0:00:00.010734


In [235]:
# Side-by-side summary of the three models: training time of the last epoch,
# inference time on the sample phrase, and accuracy against the reference tags.
for tag, elapsed in (('lstm', lstm_end), ('rnn', rnn_end), ('gru', gru_end)):
    print(f'{tag} training for 1 epoch: {elapsed}')

for tag, elapsed in (('lstm', lstm_iend), ('rnn', rnn_iend), ('gru', gru_iend)):
    print(f'{tag} inference: {elapsed}')

for tag, predicted in (('lstm', lstm_targets), ('rnn', rnn_targets), ('gru', gru_targets)):
    print(f'{tag} acc: {acc(predicted,targets)}')


lstm training for 1 epoch: 0:04:10.413886
rnn training for 1 epoch: 0:01:13.282854
gru training for 1 epoch: 0:03:16.439727
lstm inference: 0:00:00.012415
rnn inference: 0:00:00.003450
gru inference: 0:00:00.010734
lstm acc: 0.75
rnn acc: 0.75
gru acc: 0.828125
