In [1]:
import sys, os
from pathlib import Path
sys.path.append(str(Path(os.getcwd()).parent))

import numpy as np
from scipy.spatial.distance import cdist

import nmslib

In [2]:
import tensorflow as tf
import keras.backend.tensorflow_backend as kbt

# Cap this process's GPU memory fraction at 0.4 and register the resulting
# session as the one the Keras TensorFlow backend uses.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
)
kbt.set_session(tf.Session(config=config))

Using TensorFlow backend.


In [3]:
from neural_LM.UD_preparation.extract_tags_from_UD import read_tags_infile, make_UD_pos_and_tag
from neural_tagging.neural_tagging_1 import load_tagger
from neural_LM.common import *

In [4]:
# Variables: input corpora and the tagger description file to load.
# NOTE(review): `dev_file` points at the SynTagRus *train* split
# (ru_syntagrus-ud-train.conllu), not a dev split — confirm this is intended.
# NOTE(review): the corpus paths are absolute and machine-specific; the notebook
# cannot run elsewhere without editing them.
dev_file = "/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
test_file = "/home/alexeysorokin/data/Data/UD2.3/UD_Belarusian-HSE/be_hse-ud-test.conllu"
load_file = "../neural_tagging/models/work/beruuk-large-5.json"

In [5]:
# Read the test set and the dev set. Each call is unpacked into two values:
# the data object and (because return_source_words=True is passed, presumably)
# the source-word sentences — TODO confirm against read_tags_infile.
test_data, test_source = read_tags_infile(test_file, read_words=True, return_source_words=True)
dev_data, dev_source = read_tags_infile(dev_file, read_words=True, return_source_words=True)

/home/alexeysorokin/data/Data/UD2.3/UD_Belarusian-HSE/be_hse-ud-test.conllu
/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu


In [6]:
# Load the tagger described by `load_file`; the rest of the notebook accesses it
# through the global name `cls` (transform, predict, _embedder_, model_).
cls = load_tagger(load_file)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 32, 91) 0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 32, 91) 0           input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 32, 32) 2912        lambda_1[0][0]                   
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, None, 32, 50) 1650        dense_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (

In [7]:
def get_embeddings(data, dataset_index=0, bucket_size=16):
    """Run every sentence of ``data`` through the tagger's embedder.

    The sentences are transformed and split into buckets by the global tagger
    ``cls``; each bucket is converted to a batch with ``make_batch`` and passed
    to ``cls._embedder_``. The two outputs of the embedder are cut back to the
    length of each sentence and stored at the sentence's original position.

    Args:
        data: sequence of sentences; ``len(sentence)`` is the number of leading
            positions kept for that sentence.
        dataset_index: dataset code attached to every sentence in the call to
            ``cls.transform``.
        bucket_size: bucket size forwarded to ``cls.transform``.

    Returns:
        A two-element list ``[first, second]`` where ``first[i]`` and
        ``second[i]`` hold the first ``len(data[i])`` rows of the embedder's
        first and second output for sentence ``i``.
    """
    dataset_codes = [dataset_index] * len(data)
    # Forward the caller's bucket_size; previously the argument was ignored and
    # a hard-coded value of 64 was always used.
    transformed_data, indexes, _ = cls.transform(
        data, bucket_size=bucket_size, join_buckets=False, dataset_codes=dataset_codes)
    answer = [[None] * len(data), [None] * len(data)]
    for curr_indexes in indexes:
        curr_batch = make_batch([transformed_data[i] for i in curr_indexes], {0: cls.symbols_number_})
        # The trailing 0 is presumably the Keras learning-phase flag
        # (0 = inference) — TODO confirm against how _embedder_ is built.
        curr_embeddings = cls._embedder_(curr_batch + [0])
        for i, index in enumerate(curr_indexes):
            # Batches are padded, so keep only the real positions of the sentence.
            length = len(data[index])
            answer[0][index] = curr_embeddings[0][i, :length]
            answer[1][index] = curr_embeddings[1][i, :length]
    print("")
    return answer

In [8]:
# Take the first field of every data item as the sentence and embed both corpora.
# NOTE(review): both calls use the default dataset_index=0, while the prediction
# cell passes dataset_codes=[1] for dev_sents — confirm which dataset code the
# dev corpus should be embedded with.
test_sents = [elem[0] for elem in test_data]
test_embeddings = get_embeddings(test_sents, bucket_size=64)
dev_sents = [elem[0] for elem in dev_data]
dev_embeddings = get_embeddings(dev_sents, bucket_size=64)





In [12]:
# Tag both corpora. The test call passes no dataset_codes; every dev sentence
# is predicted with dataset code 1.
test_predictions = cls.predict(test_sents)
dev_predictions = cls.predict(dev_sents, dataset_codes=[1] * len(dev_sents))

[['SCONJ',
  'ADV,Degree=Cmp',
  'VERB,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
  'NOUN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
  'PUNCT',
  'VERB,Aspect=Imp|VerbForm=Inf|Voice=Act',
  'ADJ,Case=Acc|Degree=Pos|Gender=Fem|Number=Sing',
  'PRON,Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs',
  'VERB,Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin|Voice=Act',
  'ADP',
  'NOUN,Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing',
  'PUNCT',
  'VERB,Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
  'ADJ,Case=Gen|Degree=Pos|Gender=Neut|Number=Sing',
  'NOUN,Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing',
  'NOUN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
  'ADJ,Case=Gen|Degree=Pos|Gender=Fem|Number=Sing',
  'ADJ,Case=Gen|Degree=Pos|Gender=Fem|Number=Sing',
  'NOUN,Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing',
  'PUNCT',
  'NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing',
  'PUNCT',
  

In [None]:
# nmslib smoke test on random data; it does not use the tagger embeddings.
# NOTE(review): no random seed is set, so `vectors` differs on every run.
# NOTE(review): the names `index` and `distances` assigned here are reassigned
# by later cells.

# create a random matrix to index
vectors = np.random.randn(10000, 100).astype(np.float32)

# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vectors)
index.createIndex({'post': 2}, print_progress=True)

# query for the nearest neighbours of the first datapoint
ids, distances = index.knnQuery(vectors[0], k=10)

In [None]:
def make_vectors_data(data, sents):
    """Flatten per-sentence embeddings into one matrix plus back-references.

    Args:
        data: iterable of per-sentence embedding sequences.
        sents: iterable of sentences, paired with ``data`` element by element
            (pairing stops at the shorter of the two).

    Returns:
        ``(vectors, refs)``: ``vectors`` is a float32 array stacking every
        embedding in order; ``refs[k]`` is ``(position, sentence)`` for the
        ``k``-th stacked embedding, where ``position`` is its index inside its
        sentence's embedding sequence.
    """
    paired = [
        (embedding, (position, sentence))
        for sentence_embeddings, sentence in zip(data, sents)
        for position, embedding in enumerate(sentence_embeddings)
    ]
    refs = [ref for _, ref in paired]
    vectors = np.array([embedding for embedding, _ in paired], dtype="float32")
    return vectors, refs

In [None]:
# Flatten the second embedding list returned by get_embeddings into one row per
# sentence position, with references into the source-word sentences.
test_vectors, test_refs = make_vectors_data(test_embeddings[1], test_source)
dev_vectors, dev_refs = make_vectors_data(dev_embeddings[1], dev_source)

In [None]:
# Number of rows collected for each corpus.
len(test_vectors), len(dev_vectors)

In [None]:
# Pairwise distances: rows are the first 1000 test vectors, columns the first
# 10000 dev vectors. No metric is passed, so cdist's default metric applies.
distances = cdist(test_vectors[:1000], dev_vectors[:10000])

In [None]:
# For every row, the column indices of the 10 smallest distances, ordered from
# smallest to largest.
indexes = np.argsort(distances)[:,:10]

In [None]:
# Peek at the 3 nearest dev-vector indices for the first 100 test vectors.
indexes[:100,:3]

In [None]:
# Inspect dev sentence 334 in the form that is passed to the tagger.
dev_sents[334]

In [None]:
# Inspect entry 334 of the source-word sentences read alongside the dev data.
dev_source[334]

In [None]:
# For each of the first 10 test vectors, print the test word inside a window of
# up to 5 words on either side, followed by its nearest dev vector (column 0 of
# `indexes`) with the distance and the same kind of window.
for i in range(10):
    pos, sent = test_refs[i]
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    print("_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    # A dedicated name keeps the nmslib `index` object from being overwritten.
    dev_index = indexes[i, 0]
    # Rows of `distances` are test vectors, so the row is `i`; the in-sentence
    # position `pos` used previously selected an unrelated row.
    dist = distances[i, dev_index]
    pos, sent = dev_refs[dev_index]
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    print(dev_index, "{:.3f}".format(dist), "_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    print("")

In [None]:
# Names of all layers of the tagger's model.
[layer.name for layer in cls.model_.layers]

In [None]:
# Distances of the pairs ranked 1,000,000-1,000,009 when all (test, dev) pairs
# are ordered from closest to farthest.
# `flat_distances` is not defined by any cell above and `indexes` holds the
# per-row top-10 columns, so the flat view and the flat sort order are built
# here from `distances`.
flat_distances = distances.ravel()
flat_order = np.argsort(flat_distances)
flat_distances[flat_order[1000000:1000010]]

In [None]:
from collections import defaultdict

# Walk the (test, dev) pairs ranked 100,000-999,999 by distance and print, for
# each test word, at most four dev words that differ from it.
# The flat view and flat sort order are rebuilt from `distances` so that the
# cell does not rely on variables that no cell above defines.
flat_distances = distances.ravel()
flat_order = np.argsort(flat_distances)

row_counts = defaultdict(int)
for flat_index in flat_order[100000:1000000]:
    # Flat position -> (row, column) = (test vector, dev vector).
    i, j = divmod(flat_index, distances.shape[1])
    # Rows of `distances` are test vectors, so the row word must be looked up
    # in test_refs (it was previously looked up in dev_refs).
    test_word = test_refs[i][1][test_refs[i][0]]
    dev_word = dev_refs[j][1][dev_refs[j][0]]
    if test_word == dev_word:
        continue
    row_counts[i] += 1
    if not test_word.isdigit() and row_counts[i] < 5:
        print(test_word, dev_word, "{:.3f}".format(flat_distances[flat_index]))

In [None]:
# Build an HNSW index over all dev vectors in the 'l2' space.
# NOTE(review): the random-data cell above uses space='cosinesimil' instead.
search_index = nmslib.init(method='hnsw', space='l2')
search_index.addDataPointBatch(dev_vectors)
search_index.createIndex({'post': 2}, print_progress=True)

In [None]:
# Inspect the (position, sentence) reference stored for test vector 19.
test_refs[19]

In [None]:
# Query the index built over dev_vectors for the 10 nearest neighbours of test
# vector 23. The name `index` is the random-data demo index (or a loop variable
# left over from an earlier cell), so `search_index` must be used here.
ids, distances = search_index.knnQuery(test_vectors[23], k=10)

In [None]:
# Show the neighbour ids and distances returned by the last knnQuery call.
ids, distances

In [None]:
# For each of the first 100 test vectors, print the test word in its context
# window and then its three nearest dev words according to `search_index`.
# NOTE(review): the loop rebinds the notebook-level names `indexes`,
# `distances`, `index`, `pos` and `sent`.
for i in range(100):
    pos, sent = test_refs[i]
    # Window of up to 5 words before and 5 words after the target position.
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    print("_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    # 10 neighbours are requested, but only the first 3 are printed below.
    indexes, distances = search_index.knnQuery(test_vectors[i], k=10)
    for index, dist in zip(indexes[:3], distances[:3]):
        pos, sent = dev_refs[index]
        start, end = max(pos-5, 0), min(pos+6, len(sent))
        print("{:.3f}".format(dist), "_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    print("")