In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
import gensim
from gensim.models import KeyedVectors
from IPython.display import clear_output
from collections import Counter
from tqdm import tqdm
import seaborn as sns

%matplotlib inline

In [2]:
from pymystem3 import Mystem
from string import punctuation

In [3]:
def read_queries(path):
    f = open(path)
    queries = []
    tokenizer = WordPunctTokenizer()
    
    for line in f:
        tmp = []
        for q in tokenizer.tokenize(line.lower()):
            if not np.all(np.any(np.array(list(q)).reshape(-1, 1) == np.array(list(punctuation)).reshape(1, -1), axis=1)):
                tmp.append(q)
        queries.append(tmp)
    f.close()
    return queries

def read_queries_with_lemmatization(path):
    f = open(path)
    queries = []
    mystem = Mystem()
    tokenizer = WordPunctTokenizer()
    
    for line in f:
        tmp = []
        for q in tokenizer.tokenize(line.lower()):
            if not np.all(np.any(np.array(list(q)).reshape(-1, 1) == np.array(list(punctuation)).reshape(1, -1), axis=1)):
                q_ = mystem.lemmatize(q)
                tmp.append("".join(q_).split()[0])
        queries.append(tmp)
    f.close()
    return queries

In [4]:
path = 'data/requests.uniq.train'
train = read_queries(path)
train_lem = read_queries_with_lemmatization(path)
path = 'data/requests.uniq.test'
test = read_queries(path)
test_lem = read_queries_with_lemmatization(path)
train[:5], train_lem[:5], test[:5], test_lem[:5], len(train), len(test)

([['сибирские', 'сети', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карте', 'россии'],
  ['ноофен', 'для', 'каких', 'болезней'],
  ['маус', 'хаус', 'спб']],
 [['сибирский', 'сеть', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карта', 'россия'],
  ['ноофен', 'для', 'какой', 'болезнь'],
  ['маус', 'хаус', 'спб']],
 [['сбербанк', 'в', 'кунцево', 'плаза'],
  ['торт', 'дикая', 'вишня'],
  ['тася', 'кривун', 'танцы', 'на', 'тнт'],
  ['рбт', 'ру'],
  ['toplü', 'vay', 'sexx']],
 [['сбербанк', 'в', 'кунцево', 'плаза'],
  ['торт', 'дикий', 'вишня'],
  ['тася', 'кривун', 'танец', 'на', 'тнт'],
  ['рбт', 'ру'],
  ['toplü', 'vay', 'sexx']],
 51353,
 21174)

In [5]:
were_changed = 0
all_ = 0
for i in range(len(train)):
    for j in range(len(train[i])):
        all_ += 1
        if train[i][j] != train_lem[i][j]:
            were_changed += 1
were_changed / all_

0.3257316609654124

In [6]:
count_words = Counter()

for d in [train]:
    for q in d:
        for word in q:
            count_words[word] += 1
        
freq, counts = np.unique(np.array(list(count_words.values())), return_counts=True) 
p = counts * freq 
p = p / p.sum()
p = np.cumsum(p)
freq[:10], p[:10]

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0.17574006, 0.24519813, 0.29055625, 0.32405664, 0.35185578,
        0.37601319, 0.39457541, 0.41182944, 0.42762059, 0.4410561 ]))

In [7]:
np.sum(counts), np.sum(counts[2:])

(59343, 11721)

In [100]:
count_words = Counter()

for d in [train_lem]:
    for q in d:
        for word in q:
            count_words[word] += 1
        
freq, counts = np.unique(np.array(list(count_words.values())), return_counts=True) 
p = counts * freq 
p = p / p.sum()
p = np.cumsum(p)
freq[:10], p[:10]

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0.12185658, 0.16966756, 0.20326518, 0.23018925, 0.2515358 ,
        0.27062838, 0.28792218, 0.30358516, 0.31885922, 0.32964299]))

In [101]:
np.sum(counts), np.sum(counts[2:])

(43111, 10130)

## LSTM с различными эмбеддингами

### с лемматизацией и без

In [8]:
emb = KeyedVectors.load_word2vec_format("wiki.ru.vec")

In [None]:
emb_1 = KeyedVectors.load_word2vec_format("cc.ru.300.vec")

In [None]:
emb_2 = KeyedVectors.load_word2vec_format("ft_native_300_ru_wiki_lenta_lower_case.vec")

In [9]:
import torch, torch.nn as nn
import torch.nn.functional as F

In [19]:
def calculate_n_tokens(emb):
    n_tokens = 0
    for word in count_words.keys():
        if word in emb.vocab and count_words[word] >= 3:
            n_tokens += 1
    return n_tokens + 1

In [125]:
calculate_n_tokens(emb), calculate_n_tokens(emb_1), calculate_n_tokens(emb_2) 

(11110, 10627, 11453)

In [20]:
def transform_to_features(emb, emb_size, ind_to_word, batch_x):
    to_emb = np.zeros((len(batch_x), len(batch_x[0])+1, emb_size))
    for i in range(len(batch_x)):
        to_emb[i][0] = np.ones(emb_size)
        for j in range(len(batch_x[i])):
            if batch_x[i][j] != pad_id:
                to_emb[i][j+1] = emb[ind_to_word[batch_x[i][j]]]
    return to_emb

In [21]:
class Net(nn.Module):
    def __init__(self, emb, ind_to_word, emb_size=300, lstm_units=256):
        super(self.__class__, self).__init__()
        n_tokens = calculate_n_tokens(emb)
        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)
        self.logits = nn.Linear(lstm_units, n_tokens) 
        self.emb = emb
        self.emb_size = emb_size
        self.ind_to_word = ind_to_word
        
    def forward(self, batch_x):
        input_emb = transform_to_features(self.emb, self.emb_size, self.ind_to_word, batch_x)
        input_emb = torch.tensor(input_emb, dtype=torch.float32)
        lstm_out = self.lstm(input_emb)
        logits = self.logits(lstm_out[0])
        
        return logits        

In [22]:
pad = '#PAD#'
pad_id = 0

def construct_vocab(emb, count_words):
    word_to_ind = dict()
    word_to_ind['#PAD#'] = 0
    ind_to_word = ['#PAD#', ]
    
    count = 1
    for word in count_words.keys():
        if count_words[word] >= 3 and word in emb.vocab:
            ind_to_word.append(word)
            word_to_ind[word] = count
            count += 1
    return ind_to_word, word_to_ind

def as_matrix(sequences, word_to_ind, max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((len(sequences), max_len), dtype=int)
    for i, seq in enumerate(sequences):
        for j, word in enumerate(seq[:max_len]):
            if word in word_to_ind.keys():
                matrix[i][j] = word_to_ind[word]
            else:
                matrix[i][j] = pad_id
        for j in range(max_len, len(seq)):
            matrix[i][j] = pad_id
    
    return matrix

In [23]:
ind_to_word, word_to_ind = construct_vocab(emb, count_words)
train[:5], as_matrix(train[:5], word_to_ind)

([['сибирские', 'сети', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карте', 'россии'],
  ['ноофен', 'для', 'каких', 'болезней'],
  ['маус', 'хаус', 'спб']],
 array([[ 0,  1,  2,  3,  0],
        [ 0,  0,  4,  0,  0],
        [ 5,  0,  6,  7,  8],
        [ 0,  9, 10, 11,  0],
        [12, 13, 14,  0,  0]]))

In [25]:
ind_to_word, word_to_ind = construct_vocab(emb, count_words)
network = Net(emb, ind_to_word)

In [26]:
dummy_batch_x = as_matrix(train[:5], word_to_ind)

dummy_logits = network.forward(dummy_batch_x)

print('shape:', dummy_logits.shape)

shape: torch.Size([5, 6, 11111])


In [27]:
def compute_loss(network, batch_x):
    """
    use scalar crossentropy loss (neg llh) loss 
    """
    batch_x = np.array(batch_x)
    batch_x_inp = batch_x[:, :-1]
    batch_x_next = batch_x[:, 1:]
    
    logits_for_next = network.forward(batch_x_inp)
    logits_for_next = logits_for_next[:, 1:]
    
    answers = torch.argmax(logits_for_next, dim=-1).numpy()
    logits_for_next = logits_for_next.contiguous()
    logits_for_next = logits_for_next.view(-1, logits_for_next.shape[-1])
    
    accr = np.array([answers == batch_x_next]) * np.array([answers != pad_id])
    accr = accr.mean()
    batch_x_next = torch.tensor(batch_x_next, dtype=torch.int64)
    batch_x_next = batch_x_next.view(-1)
    
    loss = F.cross_entropy(logits_for_next, batch_x_next, ignore_index=pad_id, reduction='mean')
    
    
    return loss, accr

In [28]:
dummy_loss, dummy_accr = compute_loss(network, dummy_batch_x)

assert dummy_loss.shape == torch.Size([]), 'loss must be scalar'
assert dummy_loss.data.numpy() > 0, "did you forget the 'negative' part of negative log-likelihood"

dummy_loss.backward()

assert all(param.grad is not None for param in network.parameters()), \
        'loss should depend differentiably on all neural network weights'

In [29]:
from torch.optim import Adam

opt = Adam(network.parameters())

In [30]:
from random import choice

def generate_batch(train, batch_size, word_to_ind, max_len=None):
    random_x = np.random.randint(0, len(train), size=batch_size)
    batch_x = []
    for x in random_x:
        batch_x.append(train[x])
    return as_matrix(batch_x, word_to_ind, max_len)

In [31]:
batch_size = 64 
n_epochs = 100  
n_batches_per_epoch = 400  
n_validation_batches = 160

In [32]:
len(train) / batch_size, len(test) / batch_size

(802.390625, 330.84375)

### emb и без лемматизации

In [35]:
from tqdm import tqdm

ind_to_word, word_to_ind = construct_vocab(emb, count_words)
network = Net(emb, ind_to_word)
opt = Adam(network.parameters())

train_loss, val_loss, train_accr, val_accr = [], [], [], []

for epoch in range(n_epochs):
    train_loss_=0
    train_accr_=0
    network.train(True)
    for _ in tqdm(range(n_batches_per_epoch)):
        
        loss_t, accr_t = compute_loss(network, generate_batch(train, batch_size, word_to_ind))
        
        loss_t.backward()
        opt.step()
        opt.zero_grad()
        
        train_loss_ += loss_t.item()
        train_accr_ += accr_t.item()
        
    train_loss_ /= n_batches_per_epoch
    train_accr_ /= n_batches_per_epoch
    
    val_loss_=0
    val_accr_=0
    network.train(False)
    for _ in range(n_validation_batches):
        loss_t, accr_t = compute_loss(network, generate_batch(test, batch_size, word_to_ind))
        
        val_loss_ += loss_t.item()
        val_accr_ += accr_t.item()
    val_loss_ /= n_validation_batches
    val_accr_ /= n_validation_batches
    
    train_loss.append(train_loss_)
    val_loss.append(val_loss_)
    train_accr.append(train_accr_)
    val_accr.append(val_accr_)
    
    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss_, val_loss_))
    print('\nEpoch: {}, train accr: {}, val accr: {}'.format(epoch, train_accr_, val_accr_))

print("Finished!")

100%|██████████| 400/400 [03:09<00:00,  2.11it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 0, train loss: 7.622671223878861, val loss: 7.1394746661186215

Epoch: 0, train accr: 0.01339046165771652, val accr: 0.018097800048641612


100%|██████████| 400/400 [03:15<00:00,  1.96it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 1, train loss: 6.71602601647377, val loss: 6.575458806753159

Epoch: 1, train accr: 0.026087059246560535, val accr: 0.025448395451815924


100%|██████████| 400/400 [03:08<00:00,  1.55it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 2, train loss: 6.160353838205338, val loss: 6.264864519238472

Epoch: 2, train accr: 0.03143006894256096, val accr: 0.02903436289460306


100%|██████████| 400/400 [03:03<00:00,  2.42it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 3, train loss: 5.731523013114929, val loss: 6.107632464170456

Epoch: 3, train accr: 0.03726837527910878, val accr: 0.03154029908142869


100%|██████████| 400/400 [03:03<00:00,  2.32it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 4, train loss: 5.418812232017517, val loss: 6.057292622327805

Epoch: 4, train accr: 0.04167745368437054, val accr: 0.033277360663551046


100%|██████████| 400/400 [02:58<00:00,  2.07it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 5, train loss: 5.114300377368927, val loss: 5.950650975108147

Epoch: 5, train accr: 0.04387270713642712, val accr: 0.0360907957266926


100%|██████████| 400/400 [02:57<00:00,  2.11it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 6, train loss: 4.897996553182602, val loss: 5.935343831777573

Epoch: 6, train accr: 0.04868802166212914, val accr: 0.03443580243595918


100%|██████████| 400/400 [02:56<00:00,  2.10it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 7, train loss: 4.682642257809639, val loss: 5.898927417397499

Epoch: 7, train accr: 0.05201731820089961, val accr: 0.03742238480804301


100%|██████████| 400/400 [02:58<00:00,  2.17it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 8, train loss: 4.481962509155274, val loss: 5.922384345531464

Epoch: 8, train accr: 0.05687295924233221, val accr: 0.03637323670610014


100%|██████████| 400/400 [03:03<00:00,  2.39it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 9, train loss: 4.2960645943880085, val loss: 5.947059541940689

Epoch: 9, train accr: 0.05878392365857763, val accr: 0.0351608558363137


100%|██████████| 400/400 [03:06<00:00,  2.31it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 10, train loss: 4.114418806433678, val loss: 5.938294425606728

Epoch: 10, train accr: 0.06544354355130344, val accr: 0.03687505863359641


100%|██████████| 400/400 [03:02<00:00,  2.31it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 11, train loss: 3.9051584994792936, val loss: 5.996386176347732

Epoch: 11, train accr: 0.0710163598793967, val accr: 0.037152650490249024


100%|██████████| 400/400 [03:00<00:00,  2.44it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 12, train loss: 3.780373965501785, val loss: 5.971921709179878

Epoch: 12, train accr: 0.07353332700154942, val accr: 0.03703464060149644


100%|██████████| 400/400 [03:06<00:00,  2.49it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 13, train loss: 3.646620386838913, val loss: 6.007755082845688

Epoch: 13, train accr: 0.07838597734581547, val accr: 0.036371846066940085


100%|██████████| 400/400 [02:56<00:00,  2.13it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 14, train loss: 3.530631050467491, val loss: 6.024186983704567

Epoch: 14, train accr: 0.08195720765325726, val accr: 0.03820906911922527


100%|██████████| 400/400 [02:56<00:00,  2.40it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 15, train loss: 3.4070708465576174, val loss: 6.0635532438755035

Epoch: 15, train accr: 0.08526162803328177, val accr: 0.03573004715734735


100%|██████████| 400/400 [03:01<00:00,  1.63it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 16, train loss: 3.3217184180021286, val loss: 6.086744508147239

Epoch: 16, train accr: 0.087338901427904, val accr: 0.03626057822515387


100%|██████████| 400/400 [03:00<00:00,  2.23it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 17, train loss: 3.1840603721141814, val loss: 6.183137547969818

Epoch: 17, train accr: 0.09215102069543843, val accr: 0.03503935518796831


100%|██████████| 400/400 [03:12<00:00,  2.03it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 18, train loss: 3.1010964900255202, val loss: 6.121600332856178

Epoch: 18, train accr: 0.09503738744379858, val accr: 0.03606126724890567


100%|██████████| 400/400 [03:03<00:00,  2.49it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 19, train loss: 3.017573115229607, val loss: 6.150854456424713

Epoch: 19, train accr: 0.1000758219572616, val accr: 0.037481509818435045


100%|██████████| 400/400 [03:04<00:00,  2.07it/s]
  0%|          | 0/400 [00:00<?, ?it/s]


Epoch: 20, train loss: 2.9091727930307387, val loss: 6.201415035128593

Epoch: 20, train accr: 0.1013911625769927, val accr: 0.03894795439755198


 26%|██▌       | 103/400 [00:48<02:20,  2.12it/s]

KeyboardInterrupt: 

In [36]:
torch.save(network.state_dict(), 'emb_no_lemmatization.pwf')

In [96]:
def compute_accr(network, batch_x):
    """
    use scalar crossentropy loss (neg llh) loss 
    """
    batch_x = np.array(batch_x)
    batch_x_inp = batch_x[:, :-1]
    batch_x_next = batch_x[:, 1:]
    
    logits_for_next = network.forward(batch_x_inp)
    logits_for_next = logits_for_next[:, 1:]
    
    answers = torch.argmax(logits_for_next, dim=-1).numpy()
    accr = np.array([answers == batch_x_next]) * np.array([answers != pad_id])
    accr = accr[0]
    accr = accr.sum(axis=0)
    to_divide = np.array([batch_x_next != pad_id])[0].sum(axis=0)
    
    return accr, to_divide

In [98]:
accr, to_divide = compute_accr(network, generate_batch(test, batch_size, word_to_ind))
accr, to_divide

(array([ 5,  5, 10,  3,  3,  2,  0,  0,  0]),
 array([34, 32, 30,  9, 12,  6,  2,  1,  0]))

In [101]:
def get_batch(data, left, right, batch_size, word_to_ind, max_len=None):
    slice_x = np.arange(left, right, 1)
    batch_x = []
    for x in slice_x:
        batch_x.append(data[x])
    return as_matrix(batch_x, word_to_ind, max_len)

def try_lengthes(data):
    accr = np.zeros(np.max(list(map(len, data))))
    to_div = np.zeros(np.max(list(map(len, data))))
    
    for _ in tqdm_notebook(range(0, len(data)-batch_size, batch_size)):
        accr_t, div_t = compute_accr(network, get_batch(data, _, _+batch_size, batch_size, word_to_ind))
        accr[:len(accr_t)] += accr_t
        to_div[:len(div_t)] += div_t
    eps = 1
    return accr / (to_div + eps), accr.sum() / to_div.sum()

In [104]:
from tqdm import tqdm_notebook

on_train, all_accr = try_lengthes(train)
on_train

HBox(children=(IntProgress(value=0, max=802), HTML(value='')))

array([0.238299  , 0.39762369, 0.55320703, 0.61953927, 0.6672619 ,
       0.66792206, 0.68522664, 0.70788108, 0.70760234, 0.67776097,
       0.62015504, 0.63247863, 0.67142857, 0.66292135, 0.55737705,
       0.48214286, 0.53191489, 0.59375   , 0.42857143, 0.65384615,
       0.5625    , 0.5       , 0.4       , 0.54545455, 0.63636364,
       0.4       , 0.54545455, 0.5       , 0.75      , 0.2       ,
       0.57142857, 0.33333333, 0.5       , 0.33333333, 0.6       ,
       0.2       , 0.25      , 0.        , 0.5       , 0.4       ,
       0.6       , 0.75      , 0.5       , 0.5       , 0.66666667,
       0.5       , 0.66666667, 0.5       , 0.        , 0.66666667,
       0.33333333, 0.5       , 0.66666667, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.5       , 0.5       ,
       0.        , 0.5       , 0.5       , 0.5       , 0.        ,
       0.        , 0.        , 0.        ])

In [105]:
on_train[:10], all_accr

(array([0.238299  , 0.39762369, 0.55320703, 0.61953927, 0.6672619 ,
        0.66792206, 0.68522664, 0.70788108, 0.70760234, 0.67776097]),
 0.4645881199890744)

In [102]:
on_test, all_accr = try_lengthes(test)
on_test

HBox(children=(IntProgress(value=0, max=330), HTML(value='')))

array([0.14579439, 0.15652655, 0.18843735, 0.20594262, 0.20559548,
       0.21215933, 0.20164609, 0.19335706, 0.15778689, 0.104     ,
       0.12878788, 0.06024096, 0.02040816, 0.05555556, 0.12      ,
       0.05882353, 0.06666667, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.5       , 0.33333333, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [103]:
on_test[:10], all_accr

(array([0.14579439, 0.15652655, 0.18843735, 0.20594262, 0.20559548,
        0.21215933, 0.20164609, 0.19335706, 0.15778689, 0.104     ]),
 0.1729730865390676)

In [106]:
f = open('emb_no_lemmatization.txt', 'w')
for x in [train_loss, val_loss, train_accr, val_accr, on_train, on_test]:
    print(len(x), file=f)
    for y in x:
        print(y, file=f)
f.close()

In [109]:
def approximate_pad(data):
    to_pad = 0
    all_ = 0
    for x in data:
        for word in x:
            all_ += 1
            if word not in word_to_ind.keys():
                to_pad += 1
    return to_pad / all_

pad_train = approximate_pad(train)
pad_test = approximate_pad(test)
pad_train, pad_test

(0.3014991205041853, 0.34950137458897096)

In [111]:
on_train[:10] * (1 - pad_train), on_test[:10] * (1 - pad_test)

(array([0.16645206, 0.2777405 , 0.3864156 , 0.43274872, 0.46608303,
        0.46654414, 0.47863141, 0.49445555, 0.49426086, 0.47341663]),
 array([0.09483905, 0.1018203 , 0.12257824, 0.13396539, 0.13373958,
        0.13800935, 0.1311705 , 0.1257785 , 0.10264015, 0.06765186]))

# to try

добавить метрику в топ-5/10

попробовать для других эмбеддингов и с лемматизацией

In [None]:
the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))