In [205]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
import gensim
from gensim.models import KeyedVectors
from IPython.display import clear_output
from collections import Counter
from tqdm import tqdm

%matplotlib inline

In [69]:
def read_queries(path):
    """Read a tab-separated query file and return the queries as token lists.

    For every line, the second tab-separated field is taken, its first five
    characters are dropped, and the remainder is lowercased and split on
    whitespace.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    list of list of str
        One list of lowercase tokens per input line, in file order.

    Raises
    ------
    IndexError
        If a line contains no tab character (there is no second field).
    """
    # NOTE(review): the five skipped characters are presumably a fixed-width
    # prefix in the data files — confirm against the dataset description.
    # NOTE(review): the file is decoded with the platform default encoding.
    # The context manager guarantees the handle is closed even if a line is malformed.
    with open(path, 'r') as f:
        return [line.split('\t')[1][5:].lower().split() for line in f]

In [70]:
# Load the training and test query files; read_queries turns each file into a
# list of lowercase token lists (one list per line).
path = 'data/train.crash'
train = read_queries(path)
path = 'data/test_queries.crash'
test = read_queries(path)
# Preview the first five tokenised queries of each split.
train[:5], test[:5]

([['за', 'рулем'],
  ['решу', 'о'],
  ['марципан'],
  ['календарь', 'с', 'девушками', 'на', '2019'],
  ['можно', 'ли', 'в', '2020', 'году', 'сдавать', 'фзк', 'огэ']],
 [['вк'],
  ['учебник', 'по', 'обществознанию', '9', 'класс', 'боголюбова'],
  ['жена', 'и', 'муж', 'куколд'],
  ['самое', 'главное', 'грибы'],
  ['почта', 'дчл', 'в', 'москве', 'стоимость']])

### LSTM

In [58]:
# Load pretrained word vectors stored in word2vec text format.
# NOTE(review): the file name suggests 300-dimensional Russian vectors — confirm the source.
ru_emb = KeyedVectors.load_word2vec_format("cc.ru.300.vec")

In [123]:
# Shared 300-dimensional marker vectors.
# `unk` (all zeros) serves both as the leading start-of-sequence vector and as
# the stand-in for out-of-vocabulary words in to_matrix.
# `pad` (all ones) is defined here but is not used by to_matrix.
unk = np.zeros(300)
pad = np.ones(300)


def to_matrix(data, emb=None):
    """Convert tokenised sentences into lists of embedding vectors.

    Every output sequence starts with the `unk` vector, followed by one vector
    per token: the token's embedding when it is known, `unk` otherwise.

    Parameters
    ----------
    data : iterable of iterable of str
        Tokenised sentences.
    emb : mapping-like, optional
        Anything that supports ``word in emb`` and ``emb[word]``.
        Defaults to the notebook-level ``ru_emb`` vectors.

    Returns
    -------
    list of list of numpy.ndarray
        For each sentence, a list of ``len(sentence) + 1`` vectors.
    """
    if emb is None:
        emb = ru_emb

    matrix = []
    for sent in data:
        rows = [unk]  # leading start-of-sequence marker
        # Membership is tested on the embedding object itself rather than on
        # its `.vocab` attribute, which no longer exists in gensim 4.
        rows.extend(emb[word] if word in emb else unk for word in sent)
        matrix.append(rows)
    return matrix

In [124]:
# Map every tokenised query to its list of embedding vectors
# (to_matrix also prepends one leading `unk` vector to each sequence).
train_matrix = to_matrix(train)
test_matrix = to_matrix(test)

In [126]:
from copy import deepcopy

In [196]:
def transform_to_pairs(data, add_label=False, text=None):
    """Expand sequences into (prefix, next element) training pairs.

    For a sequence ``s`` and every position ``i >= 1``, one pair is produced:
    the input is an independent deep copy of ``s[:i]`` and the target is
    ``s[i]``. Sequences with fewer than two elements yield no pairs.

    Parameters
    ----------
    data : sequence of sequences
        Input sequences (e.g. lists of embedding vectors).
    add_label : bool, default False
        When True, also return a label for every pair taken from `text`.
    text : sequence of sequences, optional
        Required when `add_label` is True. The label for target ``s[i]`` of
        the ``j``-th sequence is ``text[j][i - 1]``; the offset of one matches
        sequences that carry one extra leading element, as produced by
        to_matrix.

    Returns
    -------
    (X, y) or (X, y, labels)
        ``X`` is a list of prefixes (lists), ``y`` the list of targets and
        ``labels`` the list of labels (only when `add_label` is True).

    Raises
    ------
    ValueError
        If `add_label` is True but `text` is not given.
    """
    if add_label and text is None:
        raise ValueError("text must be provided when add_label=True")

    X, y, labels = [], [], []
    for j, sent in enumerate(data):
        # range(1, len(sent)) is empty for sequences of length 0 or 1, so such
        # sequences are skipped instead of raising.
        for i in range(1, len(sent)):
            # Deep copy so each prefix owns its elements and later edits to one
            # prefix cannot leak into another or into `data`.
            X.append(deepcopy(list(sent[:i])))
            y.append(sent[i])
            if add_label:
                labels.append(text[j][i - 1])

    if add_label:
        return X, y, labels
    return X, y

In [187]:
# Expand every training sequence into (prefix, next-vector) pairs.
X, y = transform_to_pairs(train_matrix)

In [188]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (prefix, next-vector) pairs for validation.
# random_state is fixed so that the split, and every number derived from it,
# is reproducible after a kernel restart.
# NOTE(review): the split is made over individual pairs, so pairs that come
# from the same query can end up on both sides of the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train size = ", len(X_train))
print("Validation size = ", len(X_val))

Train size =  15811
Validation size =  3953


In [189]:
def _ragged_to_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one sequence per element."""
    packed = np.empty(len(sequences), dtype=object)
    # Element-wise assignment keeps each sequence as a single array item;
    # letting NumPy infer the shape from ragged input is an error in recent releases.
    for pos, seq in enumerate(sequences):
        packed[pos] = seq
    return packed


# Prefixes differ in length, so they are stored as 1-D object arrays (which
# still support fancy indexing); targets all have the same size and form a
# regular 2-D array.
X_train = _ragged_to_object_array(X_train)
X_val = _ragged_to_object_array(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [105]:
import keras
import keras.layers as L

Using TensorFlow backend.


In [190]:
def build_model(emb_size=300, hid_size=32):
    """Create and compile the next-vector regression network.

    The network reads a variable-length sequence of `emb_size`-dimensional
    vectors (input layer named 'input'), runs it through one LSTM layer with
    `hid_size` units and projects the LSTM output back to `emb_size` values
    with a dense layer.

    Parameters
    ----------
    emb_size : int, default 300
        Size of each input vector and of the predicted vector.
    hid_size : int, default 32
        Number of LSTM units.

    Returns
    -------
    keras.models.Model
        Model compiled with the 'adam' optimizer, mean squared error loss and
        mean absolute error as an extra metric.
    """
    sequence_in = L.Input(shape=[None, emb_size], name='input')
    encoded = L.LSTM(units=hid_size)(sequence_in)
    predicted = L.Dense(emb_size)(encoded)

    net = keras.models.Model(inputs=[sequence_in], outputs=[predicted])
    net.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return net
    
# Build the network with its default sizes and print the layer / parameter summary.
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, None, 300)         0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 32)                42624     
_________________________________________________________________
dense_7 (Dense)              (None, 300)               9900      
Total params: 52,524
Trainable params: 52,524
Non-trainable params: 0
_________________________________________________________________


In [208]:
emb_size = 300


def as_matrix(data, max_len=None, pad_value=None):
    """Stack variable-length vector sequences into one left-padded 3-D array.

    Each sequence is written into the *last* rows of its slot, so padding sits
    in front of the sequence. Sequences longer than the time dimension keep
    only their last elements.

    Parameters
    ----------
    data : sequence of sequences
        Each item is a sequence of `emb_size`-dimensional vectors.
    max_len : int, optional
        Upper bound on the time dimension. ``None`` (or 0) means "length of
        the longest sequence in `data`".
    pad_value : array-like, optional
        Vector (or scalar) used to fill padded positions. Defaults to the
        notebook-level ``unk`` vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), T, emb_size)`` where ``T`` is the smaller
        of the longest sequence length and `max_len`.
    """
    if pad_value is None:
        pad_value = unk

    # default=0 keeps an empty `data` from raising; the result then has shape (0, 0, emb_size).
    longest = max(map(len, data), default=0)
    max_len = min(longest, max_len or float('inf'))

    matrix = np.full((len(data), max_len, emb_size), pad_value)
    for i, seq in enumerate(data):
        # Keep only the most recent `max_len` elements so the assignment below
        # always fits; `-0:` would select the whole sequence, hence the guard.
        tail = seq[-max_len:] if max_len else seq[:0]
        if len(tail):
            matrix[i, -len(tail):] = tail

    return matrix

def make_batch(X_data, max_len=None):
    """Return a dict with one entry, "input", holding the padded matrix built by as_matrix.

    The key matches the name given to the input layer in build_model.
    """
    return {"input": as_matrix(X_data, max_len)}

In [209]:
# Sanity check: batch the first three training prefixes and inspect the
# shape of the padded input array.
batch = make_batch(X_train[:3])
batch['input'].shape

(3, 3, 300)

In [210]:
def iterate_minibatches(X_data, y_data, batch_size=256, shuffle=True, cycle=False, **kwargs):
    """Yield (batch, target) minibatches from parallel arrays.

    Parameters
    ----------
    X_data, y_data : array-like supporting integer-array indexing
        Inputs and their targets, aligned by position.
    batch_size : int, default 256
        Maximum number of samples per batch; the last batch of a pass may be smaller.
    shuffle : bool, default True
        Draw a new random sample order for every pass over the data.
    cycle : bool, default False
        When True, start another pass after each one finishes (never stops);
        otherwise stop after a single pass.
    **kwargs
        Forwarded to make_batch.

    Yields
    ------
    (dict, array)
        The batch produced by make_batch and the matching slice of `y_data`.
    """
    while True:
        order = np.arange(len(X_data))
        if shuffle:
            order = np.random.permutation(order)

        for lo in range(0, len(order), batch_size):
            chosen = order[lo : lo + batch_size]
            yield make_batch(X_data[chosen], **kwargs), y_data[chosen]

        if not cycle:
            return

In [211]:
# Training settings: samples per batch, number of epochs, and how many
# batches Keras counts as one epoch.
batch_size = 256
epochs = 10            
steps_per_epoch = 100  

# Rebuild the model so training starts from a new, untrained network.
model = build_model()

# Both generators are created with cycle=True, so they never run out of data;
# the explicit step counts tell Keras where an epoch / validation pass ends.
# NOTE(review): fit_generator is deprecated in recent Keras releases — check
# the installed version before upgrading.
model.fit_generator(iterate_minibatches(X_train, y_train, batch_size, cycle=True), 
                    epochs=epochs, steps_per_epoch=steps_per_epoch,
                    
                    # NOTE(review): the floor division leaves the final partial batch out of
                    # each validation pass, and the validation generator shuffles (default
                    # shuffle=True), so each pass may cover a different subset.
                    validation_data=iterate_minibatches(X_val, y_val, batch_size, cycle=True),
                    validation_steps=X_val.shape[0] // batch_size
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1726f7ef0>

In [212]:
# Expand the test sequences into (prefix, next-vector) pairs, keeping the
# target word of every pair as its label.
test_prefixes, y_test, label_test = transform_to_pairs(test_matrix, add_label=True, text=test)

# Prefixes differ in length, so they are packed element by element into a 1-D
# object array; letting NumPy infer a shape from ragged input is an error in
# recent releases.
X_test = np.empty(len(test_prefixes), dtype=object)
for pos, prefix in enumerate(test_prefixes):
    X_test[pos] = prefix

y_test = np.array(y_test)
X_test.shape, y_test.shape, len(label_test)

((19819,), (19819, 300), 19819)

In [213]:
def calculate_accuracy(model, X_test, y_test, label_test, batch_size=256):
    """Measure how often the nearest word to the predicted vector is the true next word.

    The data is processed in its original order (no shuffling), so the running
    sample counter can be used to index `label_test`. After every batch the
    cell output is replaced by a progress line.

    Parameters
    ----------
    model : object with a ``predict(batch)`` method
        Trained model; it receives the dict batches built by make_batch.
    X_test : array-like
        Input prefixes.
    y_test : array-like
        Targets aligned with `X_test`; only passed through to the batch
        iterator and not used for scoring.
    label_test : sequence of str
        Expected word for every sample, aligned with `X_test`.
    batch_size : int, default 256
        Number of samples sent to ``model.predict`` at once.

    Returns
    -------
    float
        Fraction of samples whose top-1 nearest word equals the label;
        0.0 when there are no samples.
    """
    n_correct = 0
    n_seen = 0
    batches = iterate_minibatches(X_test, y_test, batch_size=batch_size, shuffle=False)
    for batch_x, _ in batches:
        batch_pred = model.predict(batch_x)
        for vec in batch_pred:
            # NOTE(review): one nearest-neighbour lookup per sample; this is
            # the likely bottleneck of the evaluation.
            nearest_word = ru_emb.most_similar([vec], topn=1)[0][0]
            if label_test[n_seen] == nearest_word:
                n_correct += 1
            n_seen += 1
        # wait=True delays clearing until the next line is printed, so the
        # progress line does not flicker.
        clear_output(wait=True)
        print('In progress {} / {}, accuracy = {}'.format(n_seen, len(X_test), n_correct / n_seen), flush=True)

    # Guard against an empty test set, which would otherwise divide by zero.
    return n_correct / n_seen if n_seen else 0.0
    
# Evaluate top-1 next-word accuracy on the test pairs.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt after
# 768 of 19819 samples, so no final accuracy was recorded for this run.
calculate_accuracy(model, X_test, y_test, label_test)

In progress 768 / 19819, accuracy = 0.044270833333333336


KeyboardInterrupt: 