In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
import gensim
from gensim.models import KeyedVectors
from IPython.display import clear_output
from collections import Counter
from tqdm import tqdm
import seaborn as sns

%matplotlib inline

In [50]:
from pymystem3 import Mystem
from string import punctuation

In [83]:
def read_queries(path):
    f = open(path)
    queries = []
    tokenizer = WordPunctTokenizer()
    
    for line in f:
        tmp = []
        for q in tokenizer.tokenize(line.lower()):
            if not np.all(np.any(np.array(list(q)).reshape(-1, 1) == np.array(list(punctuation)).reshape(1, -1), axis=1)):
                tmp.append(q)
        queries.append(tmp)
    f.close()
    return queries

def read_queries_with_lemmatization(path):
    f = open(path)
    queries = []
    mystem = Mystem()
    tokenizer = WordPunctTokenizer()
    
    for line in f:
        tmp = []
        for q in tokenizer.tokenize(line.lower()):
            if not np.all(np.any(np.array(list(q)).reshape(-1, 1) == np.array(list(punctuation)).reshape(1, -1), axis=1)):
                q_ = mystem.lemmatize(q)
                tmp.append("".join(q_).split()[0])
        queries.append(tmp)
    f.close()
    return queries

In [306]:
path = 'data/requests.uniq.train'
train = read_queries(path)
train_lem = read_queries_with_lemmatization(path)
path = 'data/requests.uniq.test'
test = read_queries(path)
test_lem = read_queries_with_lemmatization(path)
train[:5], train_lem[:5], test[:5], test_lem[:5], len(train), len(test)

([['сибирские', 'сети', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карте', 'россии'],
  ['ноофен', 'для', 'каких', 'болезней'],
  ['маус', 'хаус', 'спб']],
 [['сибирский', 'сеть', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карта', 'россия'],
  ['ноофен', 'для', 'какой', 'болезнь'],
  ['маус', 'хаус', 'спб']],
 [['сбербанк', 'в', 'кунцево', 'плаза'],
  ['торт', 'дикая', 'вишня'],
  ['тася', 'кривун', 'танцы', 'на', 'тнт'],
  ['рбт', 'ру'],
  ['toplü', 'vay', 'sexx']],
 [['сбербанк', 'в', 'кунцево', 'плаза'],
  ['торт', 'дикий', 'вишня'],
  ['тася', 'кривун', 'танец', 'на', 'тнт'],
  ['рбт', 'ру'],
  ['toplü', 'vay', 'sexx']],
 51353,
 21174)

In [87]:
were_changed = 0
all_ = 0
for i in range(len(train)):
    for j in range(len(train[i])):
        all_ += 1
        if train[i][j] != train_lem[i][j]:
            were_changed += 1
were_changed / all_

0.3257316609654124

In [301]:
count_words = Counter()

for d in [train]:
    for q in d:
        for word in q:
            count_words[word] += 1
        
freq, counts = np.unique(np.array(list(count_words.values())), return_counts=True) 
p = counts * freq 
p = p / p.sum()
p = np.cumsum(p)
freq[:10], p[:10]

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0.17574006, 0.24519813, 0.29055625, 0.32405664, 0.35185578,
        0.37601319, 0.39457541, 0.41182944, 0.42762059, 0.4410561 ]))

In [121]:
np.sum(counts), np.sum(counts[2:])

(59343, 11721)

In [100]:
count_words = Counter()

for d in [train_lem]:
    for q in d:
        for word in q:
            count_words[word] += 1
        
freq, counts = np.unique(np.array(list(count_words.values())), return_counts=True) 
p = counts * freq 
p = p / p.sum()
p = np.cumsum(p)
freq[:10], p[:10]

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0.12185658, 0.16966756, 0.20326518, 0.23018925, 0.2515358 ,
        0.27062838, 0.28792218, 0.30358516, 0.31885922, 0.32964299]))

In [101]:
np.sum(counts), np.sum(counts[2:])

(43111, 10130)

## LSTM с различными эмбеддингами

### с лемматизацией и без

In [102]:
emb = KeyedVectors.load_word2vec_format("wiki.ru.vec")
emb_1 = KeyedVectors.load_word2vec_format("cc.ru.300.vec")
emb_2 = KeyedVectors.load_word2vec_format("ft_native_300_ru_wiki_lenta_lower_case.vec")

In [107]:
import torch, torch.nn as nn
import torch.nn.functional as F

In [124]:
def calculate_n_tokens(emb):
    n_tokens = 0
    for word in count_words.keys():
        if word in emb.vocab and count_words[word] >= 3:
            n_tokens += 1
    return n_tokens

In [125]:
calculate_n_tokens(emb), calculate_n_tokens(emb_1), calculate_n_tokens(emb_2) 

(11110, 10627, 11453)

In [258]:
def transform_to_features(emb, emb_size, ind_to_word, batch_x):
    to_emb = np.zeros((len(batch_x), len(batch_x[0])+1, emb_size))
    for i in range(len(batch_x)):
        to_emb[i][0] = np.ones(emb_size)
        for j in range(len(batch_x[i])):
            if batch_x[i][j] != pad_id:
                to_emb[i][j+1] = emb[ind_to_word[batch_x[i][j]]]
    return to_emb

In [259]:
class Net(nn.Module):
    def __init__(self, emb, ind_to_word, emb_size=300, lstm_units=256):
        super(self.__class__, self).__init__()
        n_tokens = calculate_n_tokens(emb)
        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)
        self.logits = nn.Linear(lstm_units, n_tokens) 
        self.emb = emb
        self.emb_size = emb_size
        self.ind_to_word = ind_to_word
        
    def forward(self, batch_x):
        input_emb = transform_to_features(self.emb, self.emb_size, ind_to_word, batch_x)
        input_emb = torch.tensor(input_emb, dtype=torch.float32)
        lstm_out = self.lstm(input_emb)
        logits = self.logits(lstm_out[0])
        
        return logits        

In [333]:
pad = '#PAD#'
pad_id = 0

def construct_vocab(emb):
    word_to_ind = dict()
    word_to_ind['#PAD#'] = 0
    ind_to_word = ['#PAD#', ]
    
    count = 1
    for word in count_words.keys():
        if count_words[word] >= 3 and word in emb.vocab:
            ind_to_word.append(word)
            word_to_ind[word] = count
            count += 1
    return ind_to_word, word_to_ind

def as_matrix(sequences, word_to_ind, max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((len(sequences), max_len), dtype=int)
    for i, seq in enumerate(sequences):
        for j, word in enumerate(seq[:max_len]):
            if word in word_to_ind.keys():
                matrix[i][j] = word_to_ind[word]
            else:
                matrix[i][j] = pad_id
        for j in range(max_len, len(seq)):
            matrix[i][j] = pad_id
    
    return matrix

In [334]:
ind_to_word, word_to_ind = construct_vocab(emb)
train[:5], as_matrix(train[:5], word_to_ind)

([['сибирские', 'сети', 'личный', 'кабинет', 'бердск'],
  ['1', 'сантим', 'алжир', '1964'],
  ['река', 'колыма', 'на', 'карте', 'россии'],
  ['ноофен', 'для', 'каких', 'болезней'],
  ['маус', 'хаус', 'спб']],
 array([[ 0,  1,  2,  3,  0],
        [ 0,  0,  4,  0,  0],
        [ 5,  0,  6,  7,  8],
        [ 0,  9, 10, 11,  0],
        [12, 13, 14,  0,  0]]))

In [335]:
ind_to_word, word_to_ind = construct_vocab(emb)
network = Net(emb, ind_to_word)

In [336]:
dummy_batch_x = as_matrix(train[:5], word_to_ind)

dummy_logits = network.forward(dummy_batch_x)

print('shape:', dummy_logits.shape)

shape: torch.Size([5, 6, 11110])


In [386]:
def compute_loss(network, batch_x):
    """
    use scalar crossentropy loss (neg llh) loss 
    """
    batch_x = np.array(batch_x)
    batch_x_inp = batch_x[:, :-1]
    batch_x_next = batch_x[:, 1:]
    
    logits_for_next = network.forward(batch_x_inp)
    logits_for_next = logits_for_next[:, 1:]
    
    answers = torch.argmax(logits_for_next, dim=-1).numpy()
    logits_for_next = logits_for_next.contiguous()
    logits_for_next = logits_for_next.view(-1, logits_for_next.shape[-1])
    
    accr = np.array([answers == batch_x_next]) * np.array([answers != pad_id])
    accr = accr.mean()
    batch_x_next = torch.tensor(batch_x_next, dtype=torch.int64)
    batch_x_next = batch_x_next.view(-1)
    
    loss = F.cross_entropy(logits_for_next, batch_x_next, ignore_index=pad_id, reduction='mean')
    
    
    return loss, accr

In [387]:
dummy_loss, dummy_accr = compute_loss(network, dummy_batch_x)

assert dummy_loss.shape == torch.Size([]), 'loss must be scalar'
assert dummy_loss.data.numpy() > 0, "did you forget the 'negative' part of negative log-likelihood"

dummy_loss.backward()

assert all(param.grad is not None for param in network.parameters()), \
        'loss should depend differentiably on all neural network weights'

In [339]:
from torch.optim import Adam

opt = Adam(network.parameters())

In [343]:
from random import choice

def generate_batch(train, batch_size, word_to_ind, max_len=None):
    random_x = np.random.randint(0, len(train), size=batch_size)
    batch_x = []
    for x in random_x:
        batch_x.append(train[x])
    return as_matrix(batch_x, word_to_ind, max_len)

In [391]:
batch_size = 64 
n_epochs = 100  
n_batches_per_epoch = 300  
n_validation_batches = 100

In [392]:
len(train) / batch_size, len(test) / batch_size

(802.390625, 330.84375)

### emb и без лемматизации

In [393]:
from tqdm import tqdm

ind_to_word, word_to_ind = construct_vocab(emb)
network = Net(emb, ind_to_word)
opt = Adam(network.parameters())

for epoch in range(n_epochs):
    train_loss=0
    train_accr=0
    network.train(True)
    for _ in tqdm(range(n_batches_per_epoch)):
        
        loss_t, accr_t = compute_loss(network, generate_batch(train, batch_size, word_to_ind))
        
        loss_t.backward()
        opt.step()
        opt.zero_grad()
        
        train_loss += loss_t.item()
        train_accr += accr_t.item()
        
    train_loss /= n_batches_per_epoch
    train_accr /= n_batches_per_epoch
    
    val_loss=0
    val_accr=0
    network.train(False)
    for _ in range(n_validation_batches):
        loss_t, accr_t = compute_loss(network, generate_batch(test, batch_size, word_to_ind))
        
        val_loss += loss_t.item()
        val_accr += accr_t.item()
    val_loss /= n_validation_batches
    val_accr /= n_validation_batches
    
    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))
    print('\nEpoch: {}, train accr: {}, val accr: {}'.format(epoch, train_accr, val_accr))

print("Finished!")

100%|██████████| 300/300 [02:27<00:00,  2.51it/s]
  0%|          | 0/300 [00:00<?, ?it/s]


Epoch: 0, train loss: 7.759962277412415, val loss: 7.447972011566162

Epoch: 0, train accr: 0.011002072271073414, val accr: 0.013308434519531129


100%|██████████| 300/300 [02:20<00:00,  2.30it/s]
  0%|          | 0/300 [00:00<?, ?it/s]


Epoch: 1, train loss: 7.055293418566386, val loss: 6.7997199106216435

Epoch: 1, train accr: 0.02167520221429876, val accr: 0.022811159839641902


100%|██████████| 300/300 [02:13<00:00,  2.34it/s]
  0%|          | 0/300 [00:00<?, ?it/s]


Epoch: 2, train loss: 6.493653011322022, val loss: 6.434842958450317

Epoch: 2, train accr: 0.028725509839979277, val accr: 0.027242298317545023


 20%|██        | 61/300 [00:27<01:49,  2.19it/s]

RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.  at /Users/soumith/b101_2/2019_02_08/wheel_build_dirs/wheel_3.7/pytorch/aten/src/THNN/generic/ClassNLLCriterion.c:93