In [0]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence, PackedSequence
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
import numpy as np
from time import time
from datetime import datetime

from pathlib import Path
from collections import Counter

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [38]:
# Download the gzipped fastText vector file cc.ru.300.vec.gz (a ~1.2 GB download).
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz

--2020-03-27 08:29:50--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 2606:4700:10::6816:4b8e, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1306357571 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ru.300.vec.gz’


2020-03-27 08:31:29 (12.7 MB/s) - ‘cc.ru.300.vec.gz’ saved [1306357571/1306357571]



In [39]:
!gunzip cc.ru.300.vec.gz
!mkdir vector_cache/
!mv cc.ru.300.vec vector_cache/

mkdir: cannot create directory ‘vector_cache/’: File exists


In [40]:
# Fetch the UD_Russian-Taiga treebank repository into the working directory.
!git clone https://github.com/UniversalDependencies/UD_Russian-Taiga.git

Cloning into 'UD_Russian-Taiga'...
remote: Enumerating objects: 76, done.[K
remote: Total 76 (delta 0), reused 0 (delta 0), pack-reused 76[K
Unpacking objects: 100% (76/76), done.


In [41]:
!mkdir data/
!mv UD_Russian-Taiga/ data/

mkdir: cannot create directory ‘data/’: File exists
mv: cannot move 'UD_Russian-Taiga/' to 'data/UD_Russian-Taiga': Directory not empty


In [0]:
# Special vocabulary symbols. Their ids match their positions in VOCAB_PREFIX,
# which is prepended to every vocabulary built in this notebook.
PAD = '<PAD>'
PAD_ID = 0
UNK = '<UNK>'
UNK_ID = 1
VOCAB_PREFIX = [PAD, UNK]

# Locations of the pretrained vectors and of the treebank.
VEC_PATH = Path('vector_cache') / 'cc.ru.300.vec'
DATA_PATH = Path('data') / 'UD_Russian-Taiga'
# Passed to Pretrain as the cap on the pretrained vocabulary size.
MAX_VOCAB = 25000

# Data loading settings.
# NOTE(review): validation_split and random_seed are not referenced by any other
# cell in the visible part of the notebook.
batch_size = 64
validation_split = .3
shuffle_dataset = True
random_seed = 42

In [0]:
class BaseVocab:
    """Base class for a two-way mapping between units (words, tags, ...) and integer ids.

    Subclasses must override `build_vocab` and populate `self._id2unit` (a sequence
    indexed by id) and `self._unit2id` (a dict from unit to id).
    """

    def __init__(self, data, idx=0, lower=False):
        """Store the settings and build the vocabulary immediately.

        Args:
            data: raw material the subclass builds the vocabulary from.
            idx: index of the field of interest inside each data item (used by subclasses).
            lower: when True, units are lowercased before lookup.
        """
        self.data = data
        self.lower = lower
        self.idx = idx
        self.build_vocab()

    def normalize_unit(self, unit):
        """Return `unit` lowercased when `self.lower` is set, unchanged otherwise."""
        return unit.lower() if self.lower else unit

    def unit2id(self, unit):
        """Return the id of `unit` after normalization, or the id of UNK when it is unknown."""
        unit = self.normalize_unit(unit)
        if unit in self._unit2id:
            return self._unit2id[unit]
        return self._unit2id[UNK]

    def id2unit(self, id):
        """Return the unit stored under `id`."""
        return self._id2unit[id]

    def map(self, units):
        """Convert a sequence of units into a list of ids."""
        return [self.unit2id(unit) for unit in units]

    def unmap(self, ids):
        """Convert a sequence of ids into a list of units."""
        return [self.id2unit(idx) for idx in ids]

    def build_vocab(self):
        """Populate `_id2unit` and `_unit2id`; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in the base class.
        """
        # The exception has to be raised (not merely instantiated); otherwise a
        # subclass that forgets to override this method ends up with no mappings.
        raise NotImplementedError("Subclasses of BaseVocab must implement build_vocab().")

    def __len__(self):
        """Return the number of distinct units in the vocabulary."""
        return len(self._unit2id)

In [0]:
class PretrainedWordVocab(BaseVocab):
    """Vocabulary over an already ordered word list, with the special symbols in front."""

    def build_vocab(self):
        """Prepend VOCAB_PREFIX to `self.data` and index every unit by its position."""
        self._id2unit = VOCAB_PREFIX + self.data
        positions = range(len(self._id2unit))
        # For a repeated unit the later position wins, as with a dict comprehension.
        self._unit2id = dict(zip(self._id2unit, positions))

In [0]:
class WordVocab(BaseVocab):
    """Vocabulary built from the data itself; ids follow descending frequency."""

    def build_vocab(self):
        """Count field `self.idx` of every token and index units from most to least frequent."""
        counter = Counter()
        for sent in self.data:
            for w in sent:
                unit = w[self.idx]
                counter[unit.lower() if self.lower else unit] += 1

        # most_common() orders by descending count and keeps first-seen order among ties.
        by_frequency = [unit for unit, _ in counter.most_common()]
        self._id2unit = VOCAB_PREFIX + by_frequency
        self._unit2id = dict(zip(self._id2unit, range(len(self._id2unit))))

In [0]:
class Pretrain:
    """Pretrained word vectors read lazily from a text vector file.

    The file is only read on first access to `vocab` or `emb`.
    """

    def __init__(self, vec_filename, max_vocab=-1):
        """Remember where to read from.

        Args:
            vec_filename: path of the vector file.
            max_vocab: maximum number of rows to keep, counting the VOCAB_PREFIX
                symbols; values not greater than len(VOCAB_PREFIX) disable the cap.
        """
        self._vec_filename = vec_filename
        self._max_vocab = max_vocab

    @property
    def vocab(self):
        """PretrainedWordVocab over the loaded words (triggers reading on first use)."""
        if not hasattr(self, '_vocab'):
            self._vocab, self._emb = self.read()
        return self._vocab

    @property
    def emb(self):
        """Embedding matrix aligned with `vocab` (triggers reading on first use)."""
        if not hasattr(self, '_emb'):
            self._vocab, self._emb = self.read()
        return self._emb

    def read(self):
        """Read the vector file and return `(vocab, emb)`.

        Raises:
            Exception: when no file name was given, or when the number of loaded
                vectors does not match the number of loaded words.
        """
        if self._vec_filename is None:
            raise Exception("Vector file is not provided.")
        print(f"Reading pretrained vectors from {self._vec_filename}...")

        words, emb, failed = self.read_from_file(self._vec_filename, open_func=open)

        if failed > 0: # recover failure
            # Lines that could not be decoded were skipped, which leaves that many
            # unused rows at the end of the preallocated matrix.
            emb = emb[:-failed]
        if len(emb) - len(VOCAB_PREFIX) != len(words):
            raise Exception("Loaded number of vectors does not match number of words.")

        # Use a fixed vocab size. The cap counts the VOCAB_PREFIX rows, so it has to be
        # compared with the full size (words + prefix); comparing with len(words) alone
        # let the result exceed max_vocab by up to len(VOCAB_PREFIX) rows.
        full_size = len(words) + len(VOCAB_PREFIX)
        if len(VOCAB_PREFIX) < self._max_vocab < full_size:
            words = words[:self._max_vocab - len(VOCAB_PREFIX)]
            emb = emb[:self._max_vocab]

        vocab = PretrainedWordVocab(words, lower=True)

        return vocab, emb

    def read_from_file(self, filename, open_func=open):
        """
        Open a vector file using the provided function and read from it.

        The first decodable line is parsed as "<rows> <cols>". Every following line
        holds a word followed by `cols` space-separated floats.

        Returns:
            (words, emb, failed): the list of words, a float32 matrix with
            `rows + len(VOCAB_PREFIX)` rows whose first len(VOCAB_PREFIX) rows are
            left at zero, and the number of lines that could not be decoded.
        """
        first = True
        words = []
        failed = 0
        with open_func(filename, 'rb') as f:
            for i, line in enumerate(f):
                try:
                    line = line.decode()
                except UnicodeDecodeError:
                    failed += 1
                    continue
                if first:
                    # the first line contains the number of word vectors and the dimensionality
                    first = False
                    line = line.strip().split(' ')
                    rows, cols = [int(x) for x in line]
                    emb = np.zeros((rows + len(VOCAB_PREFIX), cols), dtype=np.float32)
                    continue

                line = line.rstrip().split(' ')
                # Row index: skip the prefix rows, discount the header line (-1) and
                # every line skipped so far because it failed to decode.
                emb[i+len(VOCAB_PREFIX)-1-failed, :] = [float(x) for x in line[-cols:]]
                # Everything before the last `cols` fields is the word (it may contain spaces).
                words.append(' '.join(line[:-cols]))
        return words, emb, failed

In [0]:
FIELD_NUM = 10


def _readonly(attr):
    """Build a read-only property that returns the private attribute named `attr`."""
    return property(lambda self: getattr(self, attr))


class Word:
    """Read-only view over the first ten fields of one token line."""

    # Attribute names, in the order of the fields they are read from.
    _FIELDS = ('id', 'text', 'lemma', 'upos', 'xpos',
               'feats', 'head', 'deprel', 'deps', 'misc')

    def __init__(self, word):
        """Copy `word[0]` .. `word[9]` into the private attributes `_id` .. `_misc`."""
        for column, field in enumerate(self._FIELDS):
            setattr(self, '_' + field, word[column])

    id = _readonly('_id')
    text = _readonly('_text')
    lemma = _readonly('_lemma')
    upos = _readonly('_upos')
    xpos = _readonly('_xpos')
    feats = _readonly('_feats')
    head = _readonly('_head')
    deprel = _readonly('_deprel')
    deps = _readonly('_deps')
    misc = _readonly('_misc')


class Sentence:
    """A sentence represented as an ordered list of `Word` objects."""

    def __init__(self, words):
        """Wrap every raw token in `words` in a `Word`."""
        self._words = list(map(Word, words))

    @property
    def words(self):
        """The list of `Word` objects of this sentence."""
        return self._words

class Document:
    """A CoNLL-U file loaded into a list of `Sentence` objects."""

    def __init__(self, file_path):
        """Read and parse the UTF-8 encoded CoNLL-U file at `file_path`."""
        self._sentences = []
        # Use a context manager so the file handle is closed as soon as parsing ends.
        with open(file_path, encoding='utf-8') as f:
            self.load_conll(f)

    def load_conll(self, f, ignore_gapping=True):
        """Parse CoNLL-U data from `f` and store the result in `self._sentences`.

        Args:
            f: an iterable of text lines (e.g. an open file or io.StringIO).
            ignore_gapping: when True, skip token lines whose first field contains '.'.

        Blank lines end a sentence and lines starting with '#' are skipped.
        Nothing is returned; the parsed sentences are available through `sentences`.

        Raises:
            AssertionError: when a token line does not have FIELD_NUM tab-separated fields.
        """
        # f is open() or io.StringIO()
        doc, sent = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0:
                if len(sent) > 0:
                    doc.append(Sentence(sent))
                    sent = []
            else:
                if line.startswith('#'): # skip comment line
                    continue
                array = line.split('\t')
                if ignore_gapping and '.' in array[0]:
                    continue
                assert len(array) == FIELD_NUM, \
                        f"Cannot parse CoNLL line: expecting {FIELD_NUM} fields, {len(array)} found."
                sent += [array]
        # The last sentence may not be followed by a blank line.
        if len(sent) > 0:
            doc.append(Sentence(sent))
        self._sentences = doc

    @property
    def sentences(self):
        """The list of parsed sentences."""
        return self._sentences

    def get(self, fields, as_sentences=False):
        """Collect the requested attributes of every word in the document.

        Args:
            fields: non-empty list of word attribute names.
            as_sentences: when True, return one list per sentence; otherwise return
                a single flat list over all words.

        Returns:
            With a single field, each word contributes the attribute value itself;
            with several fields, each word contributes a list of values in the
            order of `fields`.
        """
        assert isinstance(fields, list), "Must provide field names as a list."
        assert len(fields) >= 1, "Must have at least one field."

        single_field = len(fields) == 1
        results = []
        for sentence in self.sentences:
            if single_field:
                cursent = [getattr(unit, fields[0]) for unit in sentence.words]
            else:
                cursent = [[getattr(unit, field) for field in fields] for unit in sentence.words]

            # decide whether append the results as a sentence or a whole list
            if as_sentences:
                results.append(cursent)
            else:
                results.extend(cursent)
        return results

In [0]:
class CONLLUDataset(Dataset):
    """Torch dataset of id-encoded sentences taken from a parsed document."""

    def __init__(self, doc, pretrain, vocab=None, test=False):
        """Load the sentences of `doc` and encode them as id lists.

        Args:
            doc: object whose `get(fields, as_sentences=True)` yields the sentences.
            pretrain: object exposing the pretrained vocabulary as `.vocab`.
            vocab: existing dict with 'word' and 'upos' vocabularies; built from
                `doc` when None.
            test: stored on the instance as `self.test`.
        """
        self.pretrain_vocab = pretrain.vocab
        self.test = test
        sentences = self.load_doc(doc)
        self.vocab = self.init_vocab(sentences) if vocab is None else vocab
        self.data = self.preprocess(sentences, self.vocab, self.pretrain_vocab)

    def init_vocab(self, data):
        """Build word and UPOS vocabularies from fields 0 and 1 of every token."""
        return {
            'word': WordVocab(data, idx=0),
            'upos': WordVocab(data, idx=1),
        }

    def preprocess(self, data, vocab, pretrain_vocab):
        """Encode each sentence as [word ids, upos ids, pretrained-vocab ids]."""
        encoded = []
        for sent in data:
            forms = [w[0] for w in sent]
            tags = [w[1] for w in sent]
            encoded.append([
                vocab['word'].map(forms),
                vocab['upos'].map(tags),
                # The pretrained vocabulary is queried with lowercased forms.
                pretrain_vocab.map([form.lower() for form in forms]),
            ])
        return encoded

    def load_doc(self, doc):
        """Return the [text, upos] pairs of `doc`, grouped by sentence."""
        return doc.get(['text', 'upos'], as_sentences=True)

    def __len__(self):
        """Number of sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sentence at position `idx`."""
        return self.data[idx]

In [0]:
# Pretrained vectors capped at MAX_VOCAB rows; the file is read on first access to .vocab or .emb.
pretrain = Pretrain(VEC_PATH, MAX_VOCAB)

In [0]:
# Parse the training file; with no vocab passed, the dataset builds its vocabularies from it.
train_doc = Document(DATA_PATH / 'ru_taiga-ud-train.conllu')
train_dataset = CONLLUDataset(train_doc, pretrain)

In [0]:
# Encode the dev file with the vocabularies built from the training data.
vocab = train_dataset.vocab
dev_doc = Document(DATA_PATH / 'ru_taiga-ud-dev.conllu')
dev_dataset = CONLLUDataset(dev_doc, pretrain, vocab=vocab, test=True)

In [51]:
# Show one encoded sentence: [word ids, upos ids, pretrained-vocab ids].
train_dataset[0]

[[1507, 1508, 695, 2], [9, 4, 2, 3], [719, 13335, 16854, 2]]

In [0]:
def pad_collate(batch):
    """Collate dataset items into padded id tensors plus the original lengths.

    Args:
        batch: list of items, each a triple (word ids, upos ids, pretrained ids).

    Returns:
        (word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens):
        three LongTensors of shape (batch, longest sequence), padded with PAD_ID and
        placed on `device`, followed by three lists with the unpadded lengths.
    """
    words, upos, pretrained = zip(*batch)

    def _pad(sequences):
        # Pad on the host and transfer the batch once, instead of moving every
        # sample to the device separately and then moving the padded result again.
        tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        return pad_sequence(tensors, batch_first=True, padding_value=PAD_ID).to(device)

    word_lens = [len(w) for w in words]
    upos_lens = [len(u) for u in upos]
    pretrained_lens = [len(p) for p in pretrained]

    return _pad(words), _pad(upos), _pad(pretrained), word_lens, upos_lens, pretrained_lens

In [0]:
# Training batches; pad_collate pads each batch to its longest sentence.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_dataset, collate_fn=pad_collate)

In [0]:
# Evaluation batches. The dev set is deliberately not shuffled, so predictions
# collected batch by batch stay in the order of the sentences in dev_dataset.
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

In [0]:
class Tagger(nn.Module):
    """Bidirectional LSTM tagger over trainable and frozen pretrained word embeddings."""

    def __init__(self, vocab, word_emb_dim, transformed_dim, emb_matrix, hidden_dim, upos_clf_hidden_dim, num_layers, dropout):
        """Create the layers.

        Args:
            vocab: dict with 'word' and 'upos' entries; only their lengths are used here.
            word_emb_dim: size of the trainable word embeddings.
            transformed_dim: size the pretrained embeddings are projected to.
            emb_matrix: numpy matrix holding the pretrained embeddings (kept frozen).
            hidden_dim: LSTM hidden size per direction.
            upos_clf_hidden_dim: size of the hidden layer of the tag classifier.
            num_layers: number of LSTM layers.
            dropout: dropout probability used between LSTM layers and by `self.drop`.
        """
        super().__init__()

        self.vocab = vocab
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        input_size = 0

        self.word_emb = nn.Embedding(len(vocab['word']), word_emb_dim, padding_idx=0)
        input_size += word_emb_dim

        self.pretrained_emb = nn.Embedding.from_pretrained(torch.from_numpy(emb_matrix), freeze=True)
        self.trans_pretrained = nn.Linear(emb_matrix.shape[1], transformed_dim, bias=False)
        input_size += transformed_dim

        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        # Learned initial states, shared by every sequence in a batch.
        self.lstm_h_init = nn.Parameter(torch.zeros(2 * num_layers, 1, hidden_dim))
        self.lstm_c_init = nn.Parameter(torch.zeros(2 * num_layers, 1, hidden_dim))

        self.upos_hid = nn.Linear(2* hidden_dim, upos_clf_hidden_dim)
        self.upos_clf = nn.Linear(upos_clf_hidden_dim, len(vocab['upos']))

        # Targets equal to 0 do not contribute to the loss.
        self.crit = nn.CrossEntropyLoss(ignore_index=0)

        self.drop = nn.Dropout(dropout)

    def forward(self, word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens):
        """Compute the tagging loss and the predicted tag ids for one batch.

        Args:
            word_pad, upos_pad, pretrained_pad: padded id tensors, batch first.
            word_lens, upos_lens, pretrained_lens: unpadded length of every sequence.

        Returns:
            (loss, pred): the cross-entropy loss over all real tokens, and a
            (batch, longest sequence) tensor of predicted tag ids whose rows follow
            the order of the input batch.
        """
        inputs = []

        word_emb = self.word_emb(word_pad)
        word_emb = pack_padded_sequence(word_emb, word_lens, batch_first=True, enforce_sorted=False)
        inputs += [word_emb]

        pretrained_emb = self.pretrained_emb(pretrained_pad)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack_padded_sequence(pretrained_emb, pretrained_lens, batch_first=True, enforce_sorted=False)
        inputs += [pretrained_emb]

        # Concatenate the two embeddings token by token on the packed data.
        lstm_inputs = torch.cat([x.data for x in inputs], 1)
        lstm_inputs = self.drop(lstm_inputs)
        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        batch_size = word_pad.size(0)
        lstm_outputs, _ = self.lstm(
            lstm_inputs,
            (self.lstm_h_init.expand(2 * self.num_layers, batch_size, self.hidden_dim).contiguous(),
             self.lstm_c_init.expand(2 * self.num_layers, batch_size, self.hidden_dim).contiguous())
        )
        lstm_outputs = lstm_outputs.data

        upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
        upos_pred = self.upos_clf(self.drop(upos_hid))

        # pack_padded_sequence(enforce_sorted=False) reorders the batch by length.
        # Carry its index tensors over so that unpacking restores the input order;
        # without them the rows of `pred` come back sorted by sequence length.
        packed_pred = PackedSequence(upos_pred, word_emb.batch_sizes,
                                     word_emb.sorted_indices, word_emb.unsorted_indices)
        pred = pad_packed_sequence(packed_pred, batch_first=True)[0]
        pred = pred.max(2)[1]

        # Gold tags packed the same way line up with `upos_pred` token by token.
        upos = pack_padded_sequence(upos_pad, upos_lens, batch_first=True, enforce_sorted=False).data
        loss = self.crit(upos_pred, upos)

        return loss, pred

In [0]:
class Trainer:
    """Owns a `Tagger` and its optimizer; runs training steps, evaluation and prediction."""

    def __init__(self, vocab, word_emb_dim, transformed_dim, emb_matrix, hidden_dim, upos_clf_hidden_dim, num_layers, dropout, use_cuda=False):
        """Build the model, move it to `device` and create an Adam optimizer.

        All arguments except `use_cuda` are forwarded to `Tagger`. `use_cuda` is only
        stored on the instance; the model is placed on the module-level `device`.
        """
        self.use_cuda = use_cuda

        self.vocab = vocab
        self.model = Tagger(vocab, word_emb_dim, transformed_dim, emb_matrix, hidden_dim, upos_clf_hidden_dim, num_layers, dropout)
        # Only trainable parameters are given to the optimizer.
        self.parameters = [p for p in self.model.parameters() if p.requires_grad]

        self.model.to(device)

        self.optimizer = torch.optim.Adam(self.parameters)

    def update(self, batch, eval=False):
        """Run one batch through the model and return the loss as a Python float.

        Args:
            batch: the six-element tuple produced by the collate function.
            eval: when True, only compute the loss in evaluation mode; otherwise
                perform one optimization step.
        """
        word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens = batch

        if eval:
            self.model.eval()
            # No gradients are needed for evaluation; skip building the autograd graph.
            with torch.no_grad():
                loss, _ = self.model(word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens)
            return loss.item()

        self.model.train()
        self.optimizer.zero_grad()

        loss, _ = self.model(word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens)
        loss_val = loss.item()

        loss.backward()
        self.optimizer.step()

        return loss_val

    def predict(self, batch):
        """Return the predicted tags for `batch` as one list of tag strings per sentence."""
        word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens = batch

        self.model.eval()
        # Inference only: disable gradient tracking to save memory and time.
        with torch.no_grad():
            _, pred = self.model(word_pad, upos_pad, pretrained_pad, word_lens, upos_lens, pretrained_lens)
        pred = [self.vocab['upos'].unmap(sent) for sent in pred.tolist()]

        return pred

In [0]:
# Model hyperparameters, passed to Trainer (and from there to Tagger).
word_emb_dim = 75            # size of the trainable word embeddings
transformed_dim = 125        # size the pretrained embeddings are projected to
emb_matrix = pretrain.emb    # pretrained embedding matrix (reads the vector file on first access)
hidden_dim = 200             # LSTM hidden size per direction
upos_clf_hidden_dim = 400    # hidden layer size of the tag classifier
num_layers = 2               # number of LSTM layers
dropout = 0.5
# `device` is a torch.device, so comparing it with the string 'cuda' is never true;
# compare its type instead.
use_cuda = device.type == 'cuda'

In [0]:
# Build the tagger and its optimizer from the hyperparameters above.
trainer = Trainer(vocab, word_emb_dim, transformed_dim, emb_matrix, hidden_dim, upos_clf_hidden_dim, num_layers, dropout, use_cuda)

In [0]:
# Training loop state and settings.
global_step = 0          # number of parameter updates performed so far
max_steps = 50000        # stop after this many updates
# NOTE(review): dev_score_history and last_best_step are not used by the training
# loop in the visible part of the notebook.
dev_score_history = []
format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch)'
last_best_step = 0

log_step = 20            # print the current loss every `log_step` updates
eval_interval = 100      # run the dev set and report the mean train loss every `eval_interval` updates

In [235]:
train_loss = 0
# Cycle over the training data until `max_steps` updates have been performed.
# The stop condition lives on the outer loop: the previous `do_break` check sat
# inside the `for` body after the `break`, so it was never reached and the outer
# `while True` kept training past `max_steps`.
while global_step < max_steps:
    for batch in train_loader:
        start_time = time()
        global_step += 1
        loss = trainer.update(batch, eval=False)
        train_loss += loss

        if global_step % log_step == 0:
            duration = time() - start_time
            print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                                    max_steps, loss, duration))

        if global_step % eval_interval == 0:
            print("Evaluating on dev set...")
            dev_preds = []
            # A separate loop variable keeps the training `batch` from being shadowed.
            for dev_batch in dev_loader:
                dev_preds += trainer.predict(dev_batch)

            # Mean training loss over the updates since the previous report.
            train_loss = train_loss / eval_interval
            print("step {}: train_loss = {:.6f}".format(global_step, train_loss))
            train_loss = 0

        if global_step >= max_steps:
            break

2020-03-27 10:04:12: step 20/50000, loss = 2.312118 (0.026 sec/batch)
2020-03-27 10:04:13: step 40/50000, loss = 1.894985 (0.026 sec/batch)
2020-03-27 10:04:13: step 60/50000, loss = 1.607767 (0.035 sec/batch)
2020-03-27 10:04:14: step 80/50000, loss = 1.212688 (0.037 sec/batch)
2020-03-27 10:04:14: step 100/50000, loss = 1.016654 (0.034 sec/batch)
Evaluating on dev set...
step 100: train_loss = 1.760188
2020-03-27 10:04:15: step 120/50000, loss = 0.937383 (0.028 sec/batch)
2020-03-27 10:04:16: step 140/50000, loss = 0.836219 (0.034 sec/batch)
2020-03-27 10:04:17: step 160/50000, loss = 0.796540 (0.025 sec/batch)
2020-03-27 10:04:17: step 180/50000, loss = 0.762427 (0.031 sec/batch)
2020-03-27 10:04:18: step 200/50000, loss = 0.807348 (0.032 sec/batch)
Evaluating on dev set...
step 200: train_loss = 0.844437
2020-03-27 10:04:19: step 220/50000, loss = 0.643031 (0.026 sec/batch)
2020-03-27 10:04:19: step 240/50000, loss = 0.705730 (0.026 sec/batch)
2020-03-27 10:04:20: step 260/50000, l

KeyboardInterrupt: ignored