## Посимвольная языковая модель.

В первом задании Вам нужно написать и обучить посимвольную нейронную языковую модель для вычисления вероятностей буквенных последовательностей (то есть слов). Такие модели используются в задачах словоизменения и распознавания/порождения звучащей речи. Для обучения модели используйте данные для русского языка из [репозитория](https://github.com/sigmorphon/conll2018/tree/master/task1/surprise).

**В процессе написания Вам нужно решить следующие проблемы:**
    
* как будет выглядеть обучающая выборка; что будет являться признаками, и что - метками классов.
* как сделать так, чтобы модель при предсказании символа учитывала все предыдущие символы слова.
* какие специальные символы нужно использовать.
* как передавать в модель текущее состояние рекуррентной сети

**Результаты:**

* предобработчик данных,
* генератор обучающих данных (батчей),
* обученная модель
* перплексия модели на настроечной выборке
* посимвольные вероятности слов в контрольной выборке

**Дополнительно:**

* дополнительный вход модели (часть речи слова, другие морфологические признаки), влияет ли его добавление на перплексию
* сравнение различных архитектур нейронной сети (FC, RNN, LSTM, QRNN, ...)

In [1]:
# it is better to do all imports at the first cell
import torch
from torch.nn import Module
from torch.nn import Embedding, RNN, GRU, LSTM, Linear
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np

import matplotlib.pyplot as plt
from re import findall

In [2]:
# Uncomment to download data
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-train-high
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-dev
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-covered-test

In [3]:
def read_dataset(infile):
    words, tags = [], []
    with open(infile, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            splitted = line.split("\t")
            if len(splitted) == 3:
                words.append(findall(r'[а-яё]+', splitted[0].lower())[0])
                tags.append(splitted[2].split('<')[0].split(';')[:])
            if len(splitted) == 2:
                words.append(findall(r'[а-яё]+', splitted[0].lower())[0])
                tags.append(splitted[1].split('<')[0].split(';')[:])
    return words, tags

train_words, train_tags = read_dataset("russian-train-high")
dev_words, dev_tags = read_dataset("russian-dev")
test_words, test_tags = read_dataset("russian-covered-test")

Examination of the data

In [4]:
len(train_words), len(train_tags), len(dev_words), len(dev_tags), len(test_words), len(test_tags)

(10000, 10000, 1000, 1000, 1000, 1000)

In [5]:
for i in range(10):
    print('word:', dev_words[i], '; tags:', train_tags[i])

word: насылать ; tags: ['ADJ', 'DAT', 'NEUT', 'SG']
word: разостлать ; tags: ['ADJ', 'INS', 'NEUT', 'SG']
word: жэк ; tags: ['V.CVB', 'PST']
word: гастрольный ; tags: ['ADJ', 'ANIM', 'ACC', 'MASC', 'SG']
word: литьё ; tags: ['N', 'DAT', 'PL']
word: иноплеменный ; tags: ['ADJ', 'INS', 'NEUT', 'SG']
word: блестеть ; tags: ['N', 'NOM', 'PL']
word: копаться ; tags: ['V', 'PST', 'SG', 'FEM']
word: подъезд ; tags: ['V', 'FUT', '2', 'PL']
word: кувырнуться ; tags: ['ADJ', 'FEM', 'SG', 'LGSPEC1']


In [6]:
train_words[:5]

['валлонский', 'незаконченный', 'истрёпывать', 'личный', 'серьга']

Подумайте, какие вспомогательные токены могут быть вам полезны. Выдайте им индексы от `0` до `len(AUXILIARY) - 1`

I will add tokens BEG and END. They will help model to better understand beginnings and endings. Also I will add UNK token in the case if a symbol was missed in training data and PAD.

In [7]:
AUXILIARY = ['_', '@', '<', '>'] # padding, unknown, beggining, ending

In [8]:
class Vocabulary:
    def fit(self, data):
        """Extract unique symbols from the data, make itos (item to string) and stoi (string to index) objects"""
        symbols = set(x for elem in data for x in elem)
        self._symbols = AUXILIARY + sorted(symbols)
        # Запомните следующую строчку кода - она нужна примерно всегда
        self._s2i = {s: i for i, s in enumerate(self._symbols)}
        return self

    def __len__(self):
        return len(self._symbols)

    def transform(self, data):
        """Transform data to indices
        Input:
            - data, list of strings
        Output:
            - list of list of char indices

        >>> self.transform(['word1', 'token2'])
        >>> [[24, 2, 19, 13, 3], [8, 2, 9, 1, 7, 4]]
        """
        transformed_words = []
        for word in data:
            tr_word = []
            for s in word:
                tr_word.append(self._s2i[s] if s in self._s2i else self._s2i['@'])
            transformed_words.append([self._s2i['<']] + tr_word + [self._s2i['>']])
        return transformed_words
    
    def s2i(self, symbol):
        assert(symbol in self._s2i)
        return self._s2i[symbol]

In [9]:
char_voc = Vocabulary()
char_voc.fit(train_words)
char_voc.transform(['машина', 'самолёт'])

[[2, 16, 4, 28, 12, 17, 4, 3], [2, 21, 4, 16, 18, 15, 36, 22, 3]]

### Make dataset class

In [10]:
class CharDataset(Dataset):
    
    def __init__(self, data):
        self.data = data
        self.char_vocab = Vocabulary()
        self.char_vocab.fit(data)
        self.proc_data = self.char_vocab.transform(data)
        self.proc_data.sort(key = lambda w: len(w), reverse=True)
        self.proc_data = self._pad(self.proc_data)
        
    def __len__(self):
        return len(self.proc_data)
    
    def __getitem__(self, index):
        x = torch.tensor(self.proc_data[index][:-1], dtype=torch.long)
        y = torch.tensor(self.proc_data[index][1:], dtype=torch.long)
        return x, y
    
    def _pad(self, data, maxlen=None):
        if maxlen is None:
            maxlen = max([len(w) for w in data])
        for i in range(len(data)):
            data[i] = data[i] + [0] * (maxlen - len(data[i]))
        return data
    
    def get_word(self, index):
        return self.data[index]

In [11]:
train_dataset = CharDataset(train_words)
val_dataset = CharDataset(dev_words)
test_dataset = CharDataset(test_words)

In [12]:
print(len(train_dataset.char_vocab), len(val_dataset.char_vocab), len(test_dataset.char_vocab))

37 37 37


In [13]:
train_loader = DataLoader(train_dataset, 5)

In [14]:
for x, y in train_loader:
    print(x.shape, y.shape)
    print(x,'\n', y)
    break

torch.Size([5, 24]) torch.Size([5, 24])
tensor([[ 2,  6, 31, 21, 18, 14, 18, 14,  6,  4, 15, 12, 24, 12, 26, 12, 20, 18,
          6,  4, 17, 17, 31, 13],
        [ 2,  7, 12,  8, 20, 18, 16,  9, 22,  9, 18, 20, 18, 15, 18,  7, 12, 27,
          9, 21, 14, 12, 13,  3],
        [ 2, 20,  4,  8, 12, 18, 14, 18, 16, 16, 23, 17, 12, 14,  4, 26, 12, 18,
         17, 17, 31, 13,  3,  0],
        [ 2, 14, 18, 17, 22, 20, 22,  9, 20, 20, 18, 20, 12, 21, 22, 12, 27,  9,
         21, 14, 12, 13,  3,  0],
        [ 2,  6, 31, 21, 18, 14, 18, 25, 23,  8, 18, 10,  9, 21, 22,  6,  9, 17,
         17, 31, 13,  3,  0,  0]]) 
 tensor([[ 6, 31, 21, 18, 14, 18, 14,  6,  4, 15, 12, 24, 12, 26, 12, 20, 18,  6,
          4, 17, 17, 31, 13,  3],
        [ 7, 12,  8, 20, 18, 16,  9, 22,  9, 18, 20, 18, 15, 18,  7, 12, 27,  9,
         21, 14, 12, 13,  3,  0],
        [20,  4,  8, 12, 18, 14, 18, 16, 16, 23, 17, 12, 14,  4, 26, 12, 18, 17,
         17, 31, 13,  3,  0,  0],
        [14, 18, 17, 22, 20, 22,  9, 

In [15]:
class CharRnnModel(Module):
    
    def __init__(self, hidden_size, emb_dim, vocab_len, num_layers=1, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embed = Embedding(vocab_len, emb_dim)
        self.gru = GRU(emb_dim, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = Linear(hidden_size, vocab_len)
        
    def forward(self, inputs, h0=None):
        if len(inputs.shape) == 0:
            inputs = inputs.unsqueeze(0).unsqueeze(0)
        if len(inputs.shape) == 1:
            inputs = inputs.unsqueeze(1)
        print(inputs.shape)
        emb = self.embed(inputs)
        print(emb.shape)
        if h0 is None:
            gru_out, gru_hidden = self.gru(emb)
        else:
            gru_out, gru_hidden = self.gru(emb, h0)
        print(gru_out.shape, gru_hidden.shape)
        out = self.fc(gru_out)
        print(out.shape)
        return out, gru_hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

In [16]:
model = CharRnnModel(100, 105, len(char_voc), 2)
criterion = torch.nn.CrossEntropyLoss()

In [18]:
for x, y in train_loader:
    h0 = model.init_hidden(train_loader.batch_size)
    print(x.shape, y.shape)
    out, h0 = model(x, h0)
    print(out.shape, y.shape)
    loss = criterion(out.permute(0,2,1), y)
#     print(loss)
#     print(h0)
    break

torch.Size([5, 24]) torch.Size([5, 24])
torch.Size([5, 24])
torch.Size([5, 24, 105])
torch.Size([5, 24, 100]) torch.Size([2, 5, 100])
torch.Size([5, 24, 37])
torch.Size([5, 24, 37]) torch.Size([5, 24])


In [None]:
torch.Size([10, 29]) 10
torch.Size([2, 10, 100])
torch.Size([10]) torch.Size([2, 10, 100])
torch.Size([10, 29, 300])

In [18]:
torch.log(torch.tensor(2, dtype=torch.float32))

tensor(0.6931)

In [19]:
def accuracy(pred, y):
    
    pred_lbl = F.softmax(pred, dim=2).argmax(dim=2)
    correct = (pred_lbl == y).float()
    acc = torch.mean(correct.sum(dim=1) / correct.shape[1])
    return acc

def perplexity(pred):
    
    pred_probs = torch.max(F.softmax(pred, dim=2), dim=2)[0]
    perplexity = torch.exp(-torch.mean(torch.log(pred_probs)))
    return perplexity

In [20]:
def train(model, loader, criterion, optimizer, device):
    model.train()
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_perplexity = 0
    
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        
        model.zero_grad()
        
        pred, _ = model(x)
        loss = criterion(pred.permute(0,2,1), y)
        acc = accuracy(pred, y)
        perplx = perplexity(pred)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_perplexity += perplx.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader), epoch_perplexity / len(loader)

In [21]:
def evaluate(model, loader, criterion, device):
    model.eval()
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_perplexity = 0
    
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)

            pred, _ = model(x)
            loss = criterion(pred.permute(0,2,1), y)
            acc = accuracy(pred, y)
            perplx = perplexity(pred)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_perplexity += perplx.item()

    return epoch_loss / len(loader), epoch_acc / len(loader), epoch_perplexity / len(loader)

In [22]:
BATCH_SIZE = 256
N_HIDDEN = 256
EMB_DIM = 200
EPOCHS = 60
N_LAYERS = 2

model = CharRnnModel(N_HIDDEN, EMB_DIM, len(char_voc), num_layers=N_LAYERS)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
device = torch.device('cuda:1')

train_loader = DataLoader(train_dataset, BATCH_SIZE, num_workers=4)
val_loader = DataLoader(val_dataset, BATCH_SIZE, num_workers=4)

model.to(device)
for epoch in range(EPOCHS):
    
    train_loss, train_acc, train_perplexity = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc, val_perplexity = evaluate(model, val_loader, criterion, device)
    
    print('{} | train: loss {:.4f}, acc {:.3f}, perplexity: {:.4f} | val: loss {:.4f}, acc {:.3f}, perplexity {:.4f}'
          .format(epoch + 1, train_loss, train_acc, train_perplexity, val_loss, val_acc, val_perplexity))

1 | train: loss 1.3684, acc 0.637, perplexity: 4.0650 | val: loss 1.9079, acc 0.642, perplexity 1.3865
2 | train: loss 1.2171, acc 0.688, perplexity: 1.8636 | val: loss 0.9785, acc 0.716, perplexity 1.8028
3 | train: loss 0.9749, acc 0.716, perplexity: 1.8008 | val: loss 0.9518, acc 0.721, perplexity 1.6678
4 | train: loss 0.9454, acc 0.723, perplexity: 1.7406 | val: loss 0.9182, acc 0.731, perplexity 1.6325
5 | train: loss 0.9204, acc 0.728, perplexity: 1.7078 | val: loss 0.8974, acc 0.737, perplexity 1.6208
6 | train: loss 0.9022, acc 0.732, perplexity: 1.6857 | val: loss 0.8792, acc 0.741, perplexity 1.6107
7 | train: loss 0.8866, acc 0.736, perplexity: 1.6663 | val: loss 0.8652, acc 0.744, perplexity 1.6031
8 | train: loss 0.8730, acc 0.739, perplexity: 1.6513 | val: loss 0.8518, acc 0.746, perplexity 1.5943
9 | train: loss 0.8605, acc 0.741, perplexity: 1.6391 | val: loss 0.8420, acc 0.749, perplexity 1.5832
10 | train: loss 0.8494, acc 0.744, perplexity: 1.6275 | val: loss 0.8329

Get probabilities for symbols in each word from validation data.

In [23]:
loader = DataLoader(test_dataset, 1)
model.eval()
symbol_probs = []
for i, (x, y) in enumerate(loader):
    x, y = x.to(device), y.to(device)
    
    with torch.no_grad():
        pred, _ = model(x)
        probs = torch.softmax(pred, dim=2)
    
    word = test_dataset.get_word(i)
    symbol_probs.append([(word[j], probs[0][j][test_dataset.char_vocab.s2i(word[j])].item()) for j in range(len(word))])

for word in symbol_probs[:50]:
    for symbol in word:
        print('{}({:.2f}), '.format(symbol[0], symbol[1]), end='')
    print()

м(0.04), а(0.00), л(0.00), ь(0.00), т(0.03), и(0.04), й(0.02), с(0.01), к(0.00), и(0.01), й(0.00), 
р(0.04), а(0.05), с(0.21), ч(0.03), л(0.03), е(0.00), н(0.01), и(0.06), т(0.06), ь(0.00), 
л(0.03), о(0.16), п(0.00), а(0.01), т(0.01), ь(0.00), с(0.00), я(0.01), 
и(0.03), н(0.02), д(0.00), е(0.00), к(0.01), с(0.00), и(0.01), р(0.00), о(0.00), в(0.00), а(0.10), т(0.01), ь(0.00), 
с(0.10), в(0.02), о(0.00), е(0.18), в(0.00), р(0.00), е(0.00), м(0.00), е(0.00), н(0.00), н(0.00), ы(0.00), й(0.00), 
р(0.04), а(0.00), с(0.02), п(0.00), р(0.01), а(0.00), в(0.00), и(0.01), т(0.00), ь(0.00), 
з(0.04), а(0.10), т(0.00), о(0.00), р(0.01), о(0.00), п(0.00), и(0.00), т(0.00), ь(0.00), с(0.01), я(0.00), 
р(0.04), а(0.10), с(0.00), с(0.20), в(0.03), е(0.00), т(0.00), 
к(0.07), р(0.00), а(0.00), с(0.00), и(0.00), т(0.10), ь(0.19), 
п(0.12), е(0.11), р(0.00), е(0.00), у(0.29), с(0.12), т(0.00), р(0.01), о(0.64), и(0.00), т(0.00), ь(0.00), 
а(0.04), т(0.18), 
р(0.04), а(0.14), с(0.00), ч(0.01), л(0.00),

In [27]:
torch.save({
    'model_state_dict': model.state_dict(),
    'model_params': {
        'n_hidden': N_HIDDEN,
        'emb_dims': EMB_DIM,
        'n_layers': N_LAYERS,
        'dropout': 0.5
    },
    'vocab_info': {
        'vocab': char_voc._symbols,
        's2i': char_voc._s2i,
        'i2s': {i:s for s, i in char_voc._s2i.items()},
        'spec_tokens': AUXILIARY,
    }
}, 'charRNNmodel.pt')

In [25]:
model

CharRnnModel(
  (embed): Embedding(37, 200)
  (gru): GRU(200, 256, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=37, bias=True)
)