In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
import gensim
from gensim.models import KeyedVectors
from IPython.display import clear_output
from collections import Counter
from tqdm import tqdm
import seaborn as sns
import nltk
import json

%matplotlib inline

In [2]:
from pymystem3 import Mystem
from string import punctuation

In [31]:
def read_queries_with_lemmatization(path):
    f = open(path)
    queries = []
    tags = []
    mystem = Mystem()
    tokenizer = WordPunctTokenizer()
    
    for line in f:
        tmp = []
        tmp_ = []
        text = tokenizer.tokenize(line.lower())
        text_tagged = nltk.pos_tag(text, lang='rus')
        
        for i, q in enumerate(text):
            if not np.all(np.any(np.array(list(q)).reshape(-1, 1) == np.array(list(punctuation)).reshape(1, -1), axis=1)):
                q_ = mystem.lemmatize(q)
                tmp.append("".join(q_).split()[0])
                tmp_.append(text_tagged[i][1])
        queries.append(tmp)
        tags.append(tmp_)
    f.close()
    return (queries, tags)

In [35]:
path = 'data/requests.uniq.train'
train_lem = read_queries_with_lemmatization(path)
path = 'data/requests.uniq.test'
test_lem = read_queries_with_lemmatization(path)
train_lem[0][:5], train_lem[1][:5], test_lem[0][:5], test_lem[1][:5], len(train_lem[0]), len(test_lem[0])

([['сибирский', 'сеть', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карта', 'россия'],
  ['ноофен', 'для', 'какой', 'болезнь'],
  ['маус', 'хаус', 'спб']],
 [['A=pl', 'S', 'A=m', 'S', 'S'],
  ['NUM=ciph', 'V', 'S', 'NUM=ciph'],
  ['S', 'S', 'PR', 'S', 'S'],
  ['V', 'PR', 'A-PRO=pl', 'A=f'],
  ['NONLEX', 'NONLEX', 'NONLEX']],
 [['сбербанк', 'в', 'кунцево', 'плаза'],
  ['торт', 'дикий', 'вишня'],
  ['тася', 'кривун', 'танец', 'на', 'тнт'],
  ['рбт', 'ру'],
  ['toplü', 'vay', 'sexx']],
 [['V', 'PR', 'S', 'S'],
  ['S', 'A=f', 'S'],
  ['S', 'S', 'S', 'PR', 'S'],
  ['V', 'S'],
  ['NONLEX', 'NONLEX', 'NONLEX']],
 51353,
 21174)

In [33]:
count_words = Counter()

for d in [train_lem[0]]:
    for q in d:
        for word in q:
            count_words[word] += 1
        
freq, counts = np.unique(np.array(list(count_words.values())), return_counts=True) 
p = counts * freq 
p = p / p.sum()
p = np.cumsum(p)
freq[:10], p[:10]

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0.12185658, 0.16966756, 0.20326518, 0.23018925, 0.2515358 ,
        0.27062838, 0.28792218, 0.30358516, 0.31885922, 0.32964299]))

In [50]:
tmp = np.unique(np.hstack(train_lem[1]))
tmp1 = np.unique(np.hstack(test_lem[1]))
tmp = np.hstack([tmp, tmp1])
tmp = np.unique(tmp)
tmp, len(tmp)

(array(['A', 'A-PRO', 'A-PRO=f', 'A-PRO=m', 'A-PRO=n', 'A-PRO=pl',
        'A-PRO=sg', 'A=brev', 'A=comp', 'A=comp2', 'A=f', 'A=m', 'A=n',
        'A=pl', 'A=sg', 'ADV', 'ADV-PRO', 'ADV-PRO=abbr', 'ADV-PRO=comp',
        'ADV-PRO=distort', 'ADV=abbr', 'ADV=comp', 'ADV=comp2',
        'ANUM=ciph', 'ANUM=f', 'ANUM=m', 'ANUM=n', 'ANUM=pl', 'ANUM=sg',
        'CONJ', 'INIT=abbr', 'INTJ', 'INTJ=distort', 'NONLEX',
        'NONLEX=abbr', 'NUM', 'NUM=acc', 'NUM=ciph', 'NUM=comp', 'NUM=dat',
        'NUM=f', 'NUM=gen', 'NUM=ins', 'NUM=loc', 'NUM=m', 'NUM=n',
        'NUM=nom', 'PARENTH', 'PART', 'PR', 'PRAEDIC', 'PRAEDIC-PRO',
        'PRAEDIC=comp', 'S', 'S-PRO', 'S-PRO=acc', 'S-PRO=dat',
        'S-PRO=gen', 'S-PRO=ins', 'S-PRO=loc', 'S-PRO=n=sg', 'S-PRO=pl',
        'S=m', 'V'], dtype='<U32'), 64)

In [51]:
tags = tmp
tags_to_ind = {}
ind = 0
for t in tags:
    tags_to_ind[t] = ind
    ind += 1

## Сетки

* Просто добавляем тег как фичу

In [34]:
emb_2 = KeyedVectors.load_word2vec_format("ft_native_300_ru_wiki_lenta_lower_case.vec")

In [53]:
import torch, torch.nn as nn
import torch.nn.functional as F

In [54]:
def calculate_n_tokens(emb):
    n_tokens = 0
    for word in count_words.keys():
        if word in emb.vocab and count_words[word] >= 3:
            n_tokens += 1
    return n_tokens + 1

In [73]:
def transform_to_features(emb, emb_size, ind_to_word, batch_x, batch_x_tags):
    to_emb = np.zeros((len(batch_x), len(batch_x[0])+1, emb_size + len(tags)))
    for i in range(len(batch_x)):
        to_emb[i][0] = np.ones(emb_size + len(tags))
        for j in range(len(batch_x[i])):
            if batch_x[i][j] != pad_id:
                to_emb[i][j+1][:emb_size] = emb[ind_to_word[batch_x[i][j]]]
                if batch_x_tags[i][j] >= 0:
                    to_emb[i][j+1][emb_size + batch_x_tags[i][j]] = 1
    return to_emb

In [74]:
class Net(nn.Module):
    def __init__(self, emb, ind_to_word, emb_size=300, lstm_units=256):
        super(self.__class__, self).__init__()
        n_tokens = calculate_n_tokens(emb)
        self.lstm = nn.LSTM(emb_size + len(tags), lstm_units, batch_first=True)
        self.logits = nn.Linear(lstm_units, n_tokens) 
        self.emb = emb
        self.emb_size = emb_size
        self.ind_to_word = ind_to_word
        
    def forward(self, batch_x, batch_x_tags):
        input_emb = transform_to_features(self.emb, self.emb_size, self.ind_to_word, batch_x, batch_x_tags)
        input_emb = torch.tensor(input_emb, dtype=torch.float32)
        lstm_out = self.lstm(input_emb)
        logits = self.logits(lstm_out[0])
        
        return logits

In [75]:
pad = '#PAD#'
pad_id = 0

def construct_vocab(emb, count_words):
    word_to_ind = dict()
    word_to_ind['#PAD#'] = 0
    ind_to_word = ['#PAD#', ]
    
    count = 1
    for word in count_words.keys():
        if count_words[word] >= 3 and word in emb.vocab:
            ind_to_word.append(word)
            word_to_ind[word] = count
            count += 1
    return ind_to_word, word_to_ind


def as_matrix(sequences, tags, word_to_ind, max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((2, len(sequences), max_len), dtype=int)
    for i, seq in enumerate(sequences):
        for j, word in enumerate(seq[:max_len]):
            if word in word_to_ind.keys():
                matrix[0][i][j] = word_to_ind[word]
                matrix[1][i][j] = tags_to_ind[tags[i][j]]
            else:
                matrix[0][i][j] = pad_id
                matrix[1][i][j] = -1
        for j in range(max_len, len(seq)):
            matrix[0][i][j] = pad_id
            matrix[1][i][j] = -1
    
    return matrix

In [81]:
def compute_loss(network, batch):
    """
    use scalar crossentropy loss (neg llh) loss 
    """
    batch_x = batch[0]
    batch_tags = batch[1]
    batch_x = np.array(batch_x)
    batch_tags = np.array(batch_tags)

    batch_x_inp = batch_x[:, :-1]
    batch_x_next = batch_x[:, 1:]
    batch_tags_inp = batch_tags[:, :-1]
    batch_tags_next = batch_tags[:, 1:]
    
    logits_for_next = network.forward(batch_x_inp, batch_tags_inp)
    logits_for_next = logits_for_next[:, 1:]
    
    answers = torch.argmax(logits_for_next, dim=-1).numpy()
    logits_for_next = logits_for_next.contiguous()
    logits_for_next = logits_for_next.view(-1, logits_for_next.shape[-1])
    
    accr = np.array([answers == batch_x_next]) * np.array([answers != pad_id])
    accr = accr.sum()
    to_div = np.sum(np.array([batch_x_next != pad_id]))
    batch_x_next = torch.tensor(batch_x_next, dtype=torch.int64)
    batch_x_next = batch_x_next.view(-1)
    
    loss = F.cross_entropy(logits_for_next, batch_x_next, ignore_index=pad_id, reduction='mean')
    
    
    return loss, accr, to_div

In [82]:
from random import choice

def generate_batch(train, batch_size, word_to_ind, max_len=None):
    random_x = np.random.randint(0, len(train[0]), size=batch_size)
    batch_x = []
    batch_tags = []
    for x in random_x:
        batch_x.append(train[0][x])
        batch_tags.append(train[1][x])
    return as_matrix(batch_x, batch_tags, word_to_ind, max_len)

In [83]:
batch_size = 64 
n_epochs = 20 
n_batches_per_epoch = 400  
n_validation_batches = 160

In [85]:
from torch.optim import Adam
from tqdm import tqdm

ind_to_word, word_to_ind = construct_vocab(emb_2, count_words)
network = Net(emb_2, ind_to_word)
opt = Adam(network.parameters())

train_loss, val_loss, train_accr, val_accr = [], [], [], []

for epoch in range(n_epochs):
    train_loss_=0
    train_accr_=0
    to_div = 0
    network.train(True)
    for _ in tqdm(range(n_batches_per_epoch)):
        
        loss_t, accr_t, to_div_t = compute_loss(network, generate_batch(train_lem, batch_size, word_to_ind))
        
        loss_t.backward()
        opt.step()
        opt.zero_grad()
        
        train_loss_ += loss_t.item()
        train_accr_ += accr_t.item()
        to_div += to_div_t
        
    train_loss_ /= n_batches_per_epoch
    #train_accr_ /= n_batches_per_epoch
    train_accr_ /= to_div
    
    val_loss_=0
    val_accr_=0
    to_div = 0
    network.train(False)
    for _ in range(n_validation_batches):
        loss_t, accr_t, to_div_t = compute_loss(network, generate_batch(test_lem, batch_size, word_to_ind))
        
        val_loss_ += loss_t.item()
        val_accr_ += accr_t.item()
        to_div += to_div_t
        
    val_loss_ /= n_validation_batches
    #val_accr_ /= n_validation_batches
    val_accr_ /= to_div
    
    train_loss.append(train_loss_)
    val_loss.append(val_loss_)
    train_accr.append(train_accr_)
    val_accr.append(val_accr_)
    
    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss_, val_loss_))
    print('\nEpoch: {}, train accr: {}, val accr: {}'.format(epoch, train_accr_, val_accr_))

print("Finished!")

100%|██████████| 400/400 [03:03<00:00,  2.61it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 0, train loss: 7.415038217306137, val loss: 7.029473960399628

Epoch: 0, train accr: 0.06012615392945126, val accr: 0.08210409128861676


100%|██████████| 400/400 [02:40<00:00,  2.90it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 1, train loss: 6.588179706335068, val loss: 6.555705967545509

Epoch: 1, train accr: 0.10989722424536699, val accr: 0.10720432751413818


100%|██████████| 400/400 [02:43<00:00,  1.74it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 2, train loss: 6.104913908243179, val loss: 6.365627136826515

Epoch: 2, train accr: 0.13640310414066315, val accr: 0.12206235872412184


100%|██████████| 400/400 [02:43<00:00,  1.97it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 3, train loss: 5.751491576433182, val loss: 6.201145070791244

Epoch: 3, train accr: 0.1518869828456105, val accr: 0.12976097595574154


100%|██████████| 400/400 [02:45<00:00,  2.75it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 4, train loss: 5.463841874599456, val loss: 6.125228527188301

Epoch: 4, train accr: 0.165665875974246, val accr: 0.13747347980869504


100%|██████████| 400/400 [02:42<00:00,  2.35it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 5, train loss: 5.223688576221466, val loss: 6.060811606049538

Epoch: 5, train accr: 0.18190424668650257, val accr: 0.1438562857244293


100%|██████████| 400/400 [02:45<00:00,  2.44it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 6, train loss: 5.02456667304039, val loss: 6.020030668377876

Epoch: 6, train accr: 0.1990015235607792, val accr: 0.1467916159954217


100%|██████████| 400/400 [02:43<00:00,  2.47it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 7, train loss: 4.8251614201068875, val loss: 5.995430633425713

Epoch: 7, train accr: 0.2121998000918497, val accr: 0.15825107793179632


100%|██████████| 400/400 [02:43<00:00,  2.48it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 8, train loss: 4.649763660430908, val loss: 5.9933482021093365

Epoch: 8, train accr: 0.22680231774693438, val accr: 0.1587067299244421


100%|██████████| 400/400 [02:47<00:00,  2.68it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 9, train loss: 4.489338699579239, val loss: 6.045060223340988

Epoch: 9, train accr: 0.24403225806451612, val accr: 0.15648024700021052


100%|██████████| 400/400 [02:42<00:00,  2.60it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 10, train loss: 4.325403891801834, val loss: 6.019878289103508

Epoch: 10, train accr: 0.26187012844439567, val accr: 0.16132177681473456


100%|██████████| 400/400 [02:47<00:00,  2.01it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 11, train loss: 4.19681582570076, val loss: 6.081585231423378

Epoch: 11, train accr: 0.27377579927155, val accr: 0.15985796653072704


100%|██████████| 400/400 [02:42<00:00,  2.76it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 12, train loss: 4.03578318297863, val loss: 6.066061696410179

Epoch: 12, train accr: 0.29419314742758323, val accr: 0.160538179768949


100%|██████████| 400/400 [02:41<00:00,  2.31it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 13, train loss: 3.9093258064985275, val loss: 6.077114847302437

Epoch: 13, train accr: 0.3083000203265804, val accr: 0.1633900956308436


100%|██████████| 400/400 [02:40<00:00,  2.30it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 14, train loss: 3.779133513569832, val loss: 6.045979431271553

Epoch: 14, train accr: 0.32664359861591696, val accr: 0.16685357818742552


100%|██████████| 400/400 [02:46<00:00,  2.33it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 15, train loss: 3.6739465874433517, val loss: 6.1165186882019045

Epoch: 15, train accr: 0.33969682230869, val accr: 0.16361121946030283


100%|██████████| 400/400 [02:47<00:00,  2.36it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 16, train loss: 3.5566707360744476, val loss: 6.144429913163185

Epoch: 16, train accr: 0.3544253050206465, val accr: 0.16442361894024804


100%|██████████| 400/400 [02:46<00:00,  2.56it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 17, train loss: 3.459779422879219, val loss: 6.2179121434688565

Epoch: 17, train accr: 0.3693340342080789, val accr: 0.16126997476871321


100%|██████████| 400/400 [02:42<00:00,  2.48it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 18, train loss: 3.383422926068306, val loss: 6.212906065583229

Epoch: 18, train accr: 0.38028302014056575, val accr: 0.16483516483516483


100%|██████████| 400/400 [02:51<00:00,  2.35it/s]



Epoch: 19, train loss: 3.2591949808597565, val loss: 6.271716690063476

Epoch: 19, train accr: 0.3971612212104981, val accr: 0.15462953340953964
Finished!


Нужно попробовать предсказывать тэг и ввести это в лосс, может качество поменяется

In [86]:
torch.save(network.state_dict(), 'add_pos_tagging_as_feature.pwf')

In [97]:
def compute_accr(network, batch):
    batch_x = batch[0]
    batch_tag = batch[1]
    
    batch_x = np.array(batch_x)
    batch_tag = np.array(batch_tag)
    batch_x_inp = batch_x[:, :-1]
    batch_x_next = batch_x[:, 1:]
    batch_tag_inp = batch_tag[:, :-1]
    batch_tag_next = batch_tag[:, 1:]
    
    logits_for_next = network.forward(batch_x_inp, batch_tag_inp)
    logits_for_next = logits_for_next[:, 1:]
    
    answers = torch.argmax(logits_for_next, dim=-1).numpy()
    
    accr = np.array([answers == batch_x_next]) * np.array([answers != pad_id])
    accr = accr[0]
    accr = accr.sum(axis=0)
    to_divide = np.array([batch_x_next != pad_id])[0].sum(axis=0)
    
    return accr, to_divide

def get_batch(data, left, right, batch_size, word_to_ind, max_len=None):
    slice_x = np.arange(left, right, 1)
    batch_x = []
    batch_tag = []
    for x in slice_x:
        batch_x.append(data[0][x])
        batch_tag.append(data[1][x])
    return as_matrix(batch_x, batch_tag, word_to_ind, max_len)

def try_lengthes(data):
    accr = np.zeros(np.max(list(map(len, data[0]))))
    to_div = np.zeros(np.max(list(map(len, data[0]))))
    
    for _ in tqdm_notebook(range(0, len(data[0])-batch_size, batch_size)):
        accr_t, div_t = compute_accr(network, get_batch(data, _, _+batch_size, batch_size, word_to_ind))
        accr[:len(accr_t)] += accr_t
        to_div[:len(div_t)] += div_t
    eps = 1
    return accr / (to_div + eps), accr.sum() / to_div.sum()

In [98]:
def approximate_pad(data):
    to_pad = 0
    all_ = 0
    for x in data:
        for word in x:
            all_ += 1
            if word not in word_to_ind.keys():
                to_pad += 1
    return to_pad / all_

In [99]:
from tqdm import tqdm_notebook

on_train, all_accr = try_lengthes(train_lem)
on_train[:10], all_accr

HBox(children=(IntProgress(value=0, max=802), HTML(value='')))




(array([0.22534229, 0.3419334 , 0.46927353, 0.51727253, 0.55219961,
        0.5572743 , 0.57848837, 0.58447489, 0.56936226, 0.54239257]),
 0.4007140359841863)

In [100]:
on_test, all_accr = try_lengthes(test_lem)
on_test[:10], all_accr

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))




(array([0.13774834, 0.14425538, 0.16985902, 0.1910342 , 0.18946509,
        0.19265442, 0.19807281, 0.18886861, 0.16057234, 0.11239193]),
 0.1608981155306269)

* попробуем учить предсказание следующего тега