In [None]:

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import datetime
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd

In [None]:
class DatasetSeq(Dataset):
    """Sequence-labelling dataset read from a ``<train_lang>.train`` text file.

    The file is split into sentence blocks on blank lines. Every non-empty
    line of a block is split on a single space into exactly ``(word, label)``.
    Words, labels and characters receive integer ids in order of first
    appearance; ids start at 1, which leaves 0 free for padding.

    Attributes:
        word_vocab: word -> id.
        target_vocab: label -> id.
        char_vocab: character -> id.
        encoded_sequences: one list of word ids per sentence.
        encoded_targets: one list of label ids per sentence.
        encoded_char_sequences: per sentence, one list of char ids per word.
    """

    def __init__(self, data_dir, train_lang='en'):
        """Read ``data_dir + train_lang + '.train'`` and encode its sentences.

        Args:
            data_dir: prefix of the file path. It is concatenated as-is, so it
                must be empty or end with a path separator.
            train_lang: language code used as the file's base name.

        Raises:
            ValueError: if a non-empty line does not split into exactly two
                space-separated fields.
        """
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup: drop every sentence block containing '_ '
        train = [x for x in train if '_ ' not in x]

        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item == '':
                    continue
                word, label = item.split(' ')

                # setdefault assigns the next free id (1, 2, 3, ...) only the
                # first time a key is seen and returns the stored id otherwise.
                sequence.append(
                    self.word_vocab.setdefault(word, len(self.word_vocab) + 1))
                target.append(
                    self.target_vocab.setdefault(label, len(self.target_vocab) + 1))
                chars.append([
                    self.char_vocab.setdefault(char, len(self.char_vocab) + 1)
                    for char in word
                ])

            # A block without any token (e.g. produced by a trailing blank
            # line) would become a zero-length sample; skip it.
            if not sequence:
                continue
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict of plain lists."""
        return {
            'data': self.encoded_sequences[index],  # e.g. [1, 2, 3, 4, 6], len=5
            'char': self.encoded_char_sequences[index],  # one list of char ids per word
            'target': self.encoded_targets[index],  # one label id per word
        }
# seq1 = [1, 2, 3] -> [1, 2, 3, 0]
# seq2 = [7, 5, 4, 2]

def collate_fn(batch):
    """Pad the word-id and label-id lists of a batch to a common length.

    Args:
        batch: iterable of dicts with 'data' and 'target' entries holding
            integer sequences.

    Returns:
        dict with 'data' and 'target' tensors, batch first, padded with 0.
    """
    padded = {}
    for key in ('data', 'target'):
        tensors = [torch.as_tensor(sample[key]) for sample in batch]
        padded[key] = pad_sequence(tensors, batch_first=True, padding_value=0)
    return padded


def collate_fn_char(input_data):
    """Collate word ids, label ids and per-word character ids into a batch.

    Args:
        input_data: list of dicts with 'data', 'char' and 'target' entries.

    Returns:
        dict with
          'data':   word ids, batch first, padded with 0;
          'chars':  a list with one tensor per word position; each tensor
                    stacks that position's character ids for every sample,
                    padded with 0. Samples with no word at that position
                    contribute the single id 0;
          'target': label ids, batch first, padded with 0.
    """
    data = [torch.as_tensor(sample['data']) for sample in input_data]
    targets = [torch.as_tensor(sample['target']) for sample in input_data]
    chars = [sample['char'] for sample in input_data]
    # Longest sentence in the batch, measured in words.
    max_len = max((len(sample['data']) for sample in input_data), default=0)

    chars_seq = []
    for position in range(max_len):
        words_here = []
        for sample_chars in chars:
            if position < len(sample_chars):
                words_here.append(torch.as_tensor(sample_chars[position]))
            else:
                # placeholder for sentences shorter than `position + 1` words
                words_here.append(torch.as_tensor([0]))
        chars_seq.append(pad_sequence(words_here, batch_first=True, padding_value=0))

    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}

In [None]:
# DatasetSeq opens the file at data_dir + train_lang + '.train'.
data_dir = ''
train_lang = 'en'
# Pass train_lang explicitly so that editing it above actually takes effect
# instead of silently falling back to the constructor default.
dataset = DatasetSeq(data_dir, train_lang=train_lang)

In [None]:
# Vocabulary ids start at 1, so every size gets +1 to make room for the
# padding id 0.
vocab_len = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
cuda_device = -1  # -1 selects the CPU, any other value selects that CUDA device
batch_size = 100
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

ds_item = dataset[500]
# Word ids are 1-based and follow the vocabulary's insertion order, so slot 0
# of the lookup table must be the padding placeholder. Without it every
# decoded word would be shifted by one position.
decode_words = ['<pad>'] + list(dataset.word_vocab)
print([decode_words[i] for i in ds_item['data']])

           

['Judge', 'come', 'preacher', '1983', 'town', 'preacher', 'ruling', 'allowed', 'testify', 'they', 'respected', 'dismisses', 'CIA', 'detonated', 'then', 'pseudonym', 'cluster', 'respected', 'opening', 'the', 'closely', 'cross-examination', 'Mosul', 'come', 'Wilson', '2', 'prosecution', 'the', 'preacher', 'convince', 'busted', 'respected', 'mollifying', '[']


In [None]:
 class POS_predictor(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru_cell = nn.GRUCell(emb_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)
        self.hidden_dim = hidden_dim

    def forward(self, x):  
        b, t = x.size()
        emb_x = self.emb(x) # B x T x V
        hidden = torch.zeros((b, self.hidden_dim))
        gru_out = []
        for i in range(t):
            hidden = self.gru_cell(emb_x[:, i, :], hidden) # B x Hid
            gru_out.append(hidden.unsqueeze(1)) # B x 1 x Hid
        gru_out = torch.cat(gru_out, dim=1) # B x T x Hid
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

In [None]:
%%time
model = POS_predictor(vocab_len, 200, 512, n_classes)
model.train()
model = model.to(device)

# optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# loss: collate_fn pads targets with 0 and real label ids start at 1, so
# class 0 marks padding and is excluded from the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=0)

# dataloader: built once; with shuffle=True every new iteration over it
# reshuffles the data.
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
for epoch in range(10):
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        target = batch['target'].to(device)  # B x T
        pred = model(data)  # B x T x n_classes
        loss = loss_func(pred.reshape(-1, n_classes), target.reshape(-1))
        loss.backward()
        optim.step()

        # Log every 50th step (the previous `if step % 50:` logged every
        # step except those).
        if step % 50 == 0:
            print(loss.item())
    print(epoch)

0
1
2
3
4
5
6
7
8
9
CPU times: user 33min 28s, sys: 53.3 s, total: 34min 22s
Wall time: 34min 12s


In [None]:
class POS_predictorV2(nn.Module):
    """Per-token tagger: embedding -> bidirectional GRU -> linear layer.

    Args:
        vocab_size: number of rows in the word embedding table.
        emb_dim: word embedding size.
        hidden_dim: requested width of the concatenated forward/backward GRU
            output. Each direction gets ``hidden_dim // 2`` units, so for an
            odd value the effective width is ``hidden_dim - 1``.
        n_classes: number of output classes per token.

    Raises:
        ValueError: if ``hidden_dim`` is smaller than 2.
    """

    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        if hidden_dim < 2:
            raise ValueError('hidden_dim must be at least 2 for a bidirectional GRU')
        per_direction = hidden_dim // 2
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, per_direction, batch_first=True, bidirectional=True)
        # Size the classifier from what the GRU actually emits so that an odd
        # hidden_dim no longer produces a shape mismatch in forward().
        self.classifier = nn.Linear(2 * per_direction, n_classes, bias=True)

    def forward(self, x):  # B x T
        """Return per-token class scores of shape B x T x n_classes."""
        emb_x = self.emb(x)  # B x T x Emb
        gru_out, _ = self.gru(emb_x)  # B x T x (2 * per-direction hidden)
        # Dropout is only active while the module is in training mode.
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

In [None]:
%%time
model = POS_predictorV2(vocab_len, 200, 512, n_classes)
model.train()
model = model.to(device)

# optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# loss: collate_fn pads targets with 0 and real label ids start at 1, so
# class 0 marks padding and is excluded from the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=0)

# dataloader: built once; with shuffle=True every new iteration over it
# reshuffles the data.
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
for epoch in range(10):
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        target = batch['target'].to(device)  # B x T
        pred = model(data)  # B x T x n_classes
        loss = loss_func(pred.reshape(-1, n_classes), target.reshape(-1))
        loss.backward()
        optim.step()

        # Log every 1000th step (the previous `if step % 1000:` logged every
        # step except those).
        if step % 1000 == 0:
            print(loss.item())
    print(epoch)
    # Checkpointing is disabled. To enable it, save the weights to a
    # configurable location here, e.g.
    # torch.save({'model': model.state_dict()}, f'epoch_{epoch}.pth.tar')

tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0101, grad_fn=<NllLossBackward0>)
tensor(0.0094, grad_fn=<NllLossBackward0>)
tensor(0.0132, grad_fn=<NllLossBackward0>)
9
CPU times: user 18min 8s, sys: 21.6 s, total: 18min 30s
Wall time: 18min 17s

In [None]:
# inference on a hand-written sequence of word ids
sequence = [23, 10, 15, 14, 4, 19]
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(sequence).unsqueeze(0).to(device))  # 1 x T x N_classes
    # Use this cell's own output; `pred` is a leftover from the training loop.
    labels = predict.argmax(-1)


# example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
# The vocabulary has no dedicated unknown-word id. Words not seen in training
# are reported and mapped to the padding id 0 instead of raising KeyError.
unknown_words = [w for w in words if w not in dataset.word_vocab]
if unknown_words:
    print('Out-of-vocabulary words mapped to id 0:', unknown_words)
tokens = [dataset.word_vocab.get(w, 0) for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device))  # 1 x T x N_classes
    # squeeze(0) removes only the batch axis, so a one-word phrase still
    # yields a list rather than a bare int.
    labels = torch.argmax(predict, dim=-1).squeeze(0).cpu().tolist()
    end = datetime.datetime.now() - start  # elapsed inference time (timedelta)

# Label ids are 1-based and follow the vocabulary's insertion order; slot 0 is
# the padding class. Without the placeholder every label would be shifted.
target_labels = ['<pad>'] + list(dataset.target_vocab)
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'NUM', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']


In [None]:
#model
class CharModel(nn.Module):
    """Character-level encoder: embedding followed by a unidirectional GRU.

    Args:
        vocab_size: number of rows in the character embedding table.
        emb_dim: character embedding size.
        hidden_dim: GRU hidden state size.
    """

    def __init__(self, vocab_size: int, emb_dim: int, hidden_dim: int):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        """Encode ``x`` (B x T character ids) and return the GRU's final
        hidden state, shaped 1 x B x Hid."""
        embedded = self.emb(x)  # B x T x Emb
        final_hidden = self.rnn(embedded)[1]  # second output: 1 x B x Hid
        return final_hidden


class POS_predictorV2Chars(nn.Module):
    """Per-token tagger that joins word embeddings with a character-level
    encoding of every word before a unidirectional GRU and a linear layer.

    Args:
        vocab_size: number of rows in the word embedding table.
        emb_dim: word embedding size.
        hidden_dim: hidden size of the word-level GRU.
        n_classes: number of output classes per token.
        n_chars: number of rows in the character embedding table.
        char_emb_dim: character embedding size.
        char_hidden_dim: hidden size of the character-level GRU.
    """

    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim + char_hidden_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)
        self.char_rnn = CharModel(n_chars, char_emb_dim, char_hidden_dim)

    def forward(self, x, x_chars):  # B x T
        """Return per-token class scores.

        Args:
            x: word ids, B x T.
            x_chars: sequence of T tensors, one per word position, each
                holding that position's character ids for the whole batch.

        Returns:
            Tensor of shape B x T x n_classes (unnormalised scores).
        """
        emb_x = self.emb(x)  # B x T x Emb
        # char_rnn returns 1 x B x CharHid. Remove ONLY that leading axis: a
        # bare squeeze() would also drop the batch axis when B == 1 (or the
        # feature axis when char_hidden_dim == 1) and break the concatenation.
        chars = [
            self.char_rnn(word.to(emb_x.device)).squeeze(0).unsqueeze(1)  # B x 1 x CharHid
            for word in x_chars
        ]
        chars = torch.cat(chars, dim=1)  # B x T x CharHid
        gru_out, _ = self.gru(torch.cat((emb_x, chars), dim=-1))
        # Dropout is only active while the module is in training mode.
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred


In [None]:
model = POS_predictorV2Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 64)
model.train()
model = model.to(device)

# optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# loss: collate_fn_char pads targets with 0 and real label ids start at 1, so
# class 0 marks padding and is excluded from the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=0)

# dataloader: built once; with shuffle=True every new iteration over it
# reshuffles the data.
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn_char,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
for epoch in range(10):
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        target = batch['target'].to(device)  # B x T
        # The model moves the per-position character tensors to the right
        # device itself.
        pred = model(data, batch['chars'])  # B x T x n_classes
        loss = loss_func(pred.reshape(-1, n_classes), target.reshape(-1))
        loss.backward()
        optim.step()

        # Log every 50th step (the previous `if step % 50:` logged every
        # step except those).
        if step % 50 == 0:
            print(loss.item())
    print(epoch)

tensor(0.0062, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0085, grad_fn=<NllLossBackward0>)
19