In [1]:
import sys, os
from pathlib import Path
sys.path.append(str(Path(os.getcwd()).parent))

import numpy as np
from scipy.spatial.distance import cdist

import nmslib

In [2]:
import tensorflow as tf
import keras.backend.tensorflow_backend as kbt

# Limit this process to 40% of the GPU memory and register the resulting
# TensorFlow session with the Keras TensorFlow backend.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
kbt.set_session(tf.Session(config=config))

Using TensorFlow backend.


In [3]:
from neural_LM.UD_preparation.extract_tags_from_UD import read_tags_infile, make_UD_pos_and_tag
from neural_tagging.neural_tagging_1 import load_tagger
from neural_LM.common import *
from neural_tagging.misc import TagNormalizer, load_tag_normalizer

In [45]:
# Variables: input corpora, the tagger description and the tag normalizer.
# NOTE(review): `dev_file` points at the SynTagRus *train* split
# (ru_syntagrus-ud-train.conllu) — confirm the name is intentional.
# NOTE(review): the first three paths are absolute and machine-specific.
dev_file = "/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
test_file = "/home/alexeysorokin/data/Data/UD2.3/UD_Belarusian-HSE/be_hse-ud-test.conllu"
load_file = "/home/alexeysorokin/data/neural_tagging/neural_tagging/models/best/beruuk-large-decay-10-rus.json"
normalizer_file = "../neural_tagging/models/tag_normalizers/beruuk-2"

normalizer = load_tag_normalizer(normalizer_file)

In [46]:
# Read the test and dev sets.
# Element 0 of every entry is stored as the sentence and element 1 as its tags;
# the second return value is kept as the "source" words.
test_data, test_source = read_tags_infile(test_file, read_words=True, return_source_words=True)
dev_data, dev_source = read_tags_infile(dev_file, read_words=True, return_source_words=True)
test_sents, test_tags = [elem[0] for elem in test_data], [elem[1] for elem in test_data]
dev_sents, dev_tags = [elem[0] for elem in dev_data], [elem[1] for elem in dev_data]
# NOTE(review): only dev_tags is passed through the normalizer here; test_tags
# is left as read from the file — confirm this asymmetry is intended, since
# both are later compared against kNN predictions built from dev_tags.
dev_tags = [[normalizer.transform(tag, mode="UD") for tag in elem] for elem in dev_tags]

/home/alexeysorokin/data/Data/UD2.3/UD_Belarusian-HSE/be_hse-ud-test.conllu
/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu


In [49]:
# Show which file was loaded as the dev set and how many entries it produced.
dev_file, len(dev_data)

('/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu',
 48814)

In [29]:
# Load a third data set (the SynTagRus dev split) the same way as the dev set
# above, including tag normalization with mode="UD".
# NOTE(review): absolute, machine-specific path.
other_file = "/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
other_data, other_source = read_tags_infile(other_file, read_words=True, return_source_words=True)
other_sents, other_tags = [elem[0] for elem in other_data], [elem[1] for elem in other_data]
other_tags = [[normalizer.transform(tag, mode="UD") for tag in elem] for elem in other_tags]

/home/alexeysorokin/data/Data/UD2.3/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu


In [9]:
# Load the tagger described by `load_file`.
# NOTE(review): the global name `cls` is read inside get_embeddings and is
# rebound to a RefClassifier by the k-sweep cell further down, so re-running
# get_embeddings after that cell would use the wrong object.
cls = load_tagger(load_file)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 32, 91) 0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 32, 91) 0           input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 32, 32) 2912        lambda_1[0][0]                   
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, None, 32, 50) 1650        dense_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (

In [10]:
def get_embeddings(data, dataset_index=0, bucket_size=16):
    """Compute per-token outputs of the tagger's embedder for every sentence.

    Relies on the notebook-level tagger ``cls`` and on ``make_batch``.

    Parameters
    ----------
    data : sequence of sentences
        ``len(data[i])`` is used as the number of tokens of sentence ``i``.
    dataset_index : int
        Dataset code attached to every sentence when calling ``cls.transform``.
    bucket_size : int
        Bucket size forwarded to ``cls.transform``. Previously this argument
        was silently ignored and 64 was always used.

    Returns
    -------
    list of two lists
        ``answer[0][i]`` and ``answer[1][i]`` hold the first and second
        embedder outputs for sentence ``i``, cut to the sentence length.
    """
    dataset_codes = [dataset_index] * len(data)
    transformed_data, indexes, _ = cls.transform(
        data, bucket_size=bucket_size, join_buckets=False, dataset_codes=dataset_codes)
    answer = [[None] * len(data), [None] * len(data)]
    for curr_indexes in indexes:
        curr_batch = make_batch([transformed_data[i] for i in curr_indexes], {0: cls.symbols_number_})
        # NOTE(review): the trailing 0 is presumably the Keras learning-phase
        # flag (inference mode) — confirm against the embedder definition.
        curr_embeddings = cls._embedder_(curr_batch + [0])
        for i, index in enumerate(curr_indexes):
            sent_length = len(data[index])
            # Drop the positions beyond the real sentence length.
            answer[0][index] = curr_embeddings[0][i, :sent_length]
            answer[1][index] = curr_embeddings[1][i, :sent_length]
    print("")
    return answer

In [11]:
# Embed all three data sets. The test set uses the default dataset index (0),
# the dev and `other` sets use dataset index 1.
test_embeddings = get_embeddings(test_sents, bucket_size=64)
dev_embeddings = get_embeddings(dev_sents, bucket_size=64, dataset_index=1)
other_embeddings = get_embeddings(other_sents, bucket_size=64, dataset_index=1)






In [12]:
# Tagger predictions for all three data sets. No dataset codes are passed for
# the test set; the dev and `other` sets are predicted with dataset code 1.
test_predictions = cls.predict(test_sents)
dev_predictions = cls.predict(dev_sents, dataset_codes=[1] * len(dev_sents))
other_predictions = cls.predict(other_sents, dataset_codes=[1] * len(other_sents))

In [18]:
def make_vectors_data(data, sents):
    """Flatten per-sentence token vectors into one float32 array.

    Returns ``(vectors, refs)`` where ``refs[n]`` is the triple
    ``(token position, sentence index, sentence)`` describing where row ``n``
    of ``vectors`` came from. ``data`` and ``sents`` are walked in parallel.
    """
    pairs = [
        (embedding, (token_pos, sent_idx, sent))
        for sent_idx, (sent_vectors, sent) in enumerate(zip(data, sents))
        for token_pos, embedding in enumerate(sent_vectors)
    ]
    refs = [ref for _, ref in pairs]
    vectors = np.array([embedding for embedding, _ in pairs], dtype="float32")
    return vectors, refs

In [19]:
# Flatten the second embedder output (index 1) of every data set into one
# matrix of token vectors plus (position, sentence index, sentence) references.
test_vectors, test_refs = make_vectors_data(test_embeddings[1], test_source)
dev_vectors, dev_refs = make_vectors_data(dev_embeddings[1], dev_source)
other_vectors, other_refs = make_vectors_data(other_embeddings[1], other_source)

In [15]:
# Number of token vectors in the test, dev and `other` sets.
len(test_vectors), len(dev_vectors), len(other_vectors)

(1382, 870474, 118487)

In [16]:
# Build an HNSW nearest-neighbour index with L2 distance over the dev vectors.
# Row n of dev_vectors gets id n, so returned ids index dev_refs and the
# flattened dev tags.
search_index = nmslib.init(method='hnsw', space='l2')
search_index.addDataPointBatch(dev_vectors)
# 'post': 2 is passed as an index-time parameter of the HNSW method.
search_index.createIndex({'post': 2}, print_progress=True)

In [17]:
# NOTE(review): not referenced by any visible cell; the classifiers below take
# their own batch_size argument (default 64).
BATCH_SIZE = 64

### Классы для kNN-классификации

In [71]:
from collections import defaultdict

class RefClassifier:
    """k-nearest-neighbour classifier on top of a prebuilt search index.

    Parameters
    ----------
    search_index : object with ``knnQueryBatch(queries, k=...)``
        Must return, for every query, a pair ``(ids, distances)``.
    tags : sequence
        ``tags[i]`` is the label of the indexed point with id ``i``.
    k : int
        Number of neighbours requested per query.
    batch_size : int
        Number of queries sent to the index at once.
    """

    def __init__(self, search_index, tags, k=1, batch_size=64):
        self.search_index = search_index
        self.tags = tags
        self.k = k
        self.batch_size = batch_size

    @staticmethod
    def _vote(labels, distances):
        """Return the label with the largest total weight and its share.

        Every neighbour votes with weight ``exp(-distance)``. Distances are
        shifted by their minimum first: the normalized share is unchanged,
        but the weights can no longer all underflow to zero, which used to
        produce a 0/0 = nan score and an arbitrary winner for far neighbours.
        """
        distances = np.asarray(distances, dtype="float64")
        weights = np.exp(-(distances - distances.min()))
        class_weights = defaultdict(float)
        for label, weight in zip(labels, weights):
            class_weights[label] += weight
        best_label, best_weight = max(class_weights.items(), key=(lambda x: x[1]))
        return best_label, float(best_weight / sum(class_weights.values()))

    def predict(self, data, return_probs=False):
        """Predict a label for every row of ``data``.

        Returns the list of labels, or ``(labels, probs)`` when
        ``return_probs`` is true; ``probs[i]`` is the normalized weight of the
        winning label among the retrieved neighbours.
        """
        answer, probs = [None] * len(data), [None] * len(data)
        for start in range(0, len(data), self.batch_size):
            end = min(len(data), start + self.batch_size)
            batch_answers = self.search_index.knnQueryBatch(data[start:end], k=self.k)
            for i, (ids, distances) in enumerate(batch_answers):
                neighbour_labels = [self.tags[index] for index in ids]
                answer[start+i], probs[start+i] = self._vote(neighbour_labels, distances)
        return (answer, probs) if return_probs else answer

In [130]:
from neural_tagging.algorithm import BisectingKMeans

class CentroidClassifier:
    """Nearest-centroid classifier with several centroids per class.

    Every class is represented by its mean vector or, for classes with more
    than ``min_class_size`` members, by up to ``max_class_centroids`` cluster
    centres found with ``BisectingKMeans``. Queries are answered through an
    nmslib HNSW index built over the centroids.

    Attributes set by ``train`` / ``_make_centroids``:
        centroids_ : float32 array, one row per centroid
        labels_    : list with the class label of every centroid
        support_   : list with the training row indexes behind every centroid
    """

    def __init__(self, k=1, batch_size=64, max_class_centroids=5,
                 min_class_size=20, refinement_iterations=5, random_state=187):
        self.k = k
        self.batch_size = batch_size
        self.max_class_centroids = max_class_centroids
        self.min_class_size = min_class_size
        self.refinement_iterations = refinement_iterations
        self.random_state = random_state

    def train(self, X, y):
        """Compute the centroids for ``(X, y)`` and index them; returns self."""
        self._make_centroids(X, y)
        self.search_index = nmslib.init(method='hnsw', space='l2')
        self.search_index.addDataPointBatch(self.centroids_)
        self.search_index.createIndex({'post': 2}, print_progress=True)
        return self

    @property
    def labels_number_(self):
        """Length of ``labels_`` (number of centroids after training)."""
        return len(self.labels_)

    def _initialize_centroids(self, X, y):
        """Return one mean vector per distinct label of ``y``.

        Overwrites ``self.labels_`` with the sorted unique labels and sets
        ``self.dim``. Not called by ``train`` in this class.
        """
        X = np.asarray(X)
        self.dim = X.shape[1]
        self.labels_, y = np.unique(y, return_inverse=True)
        centroids = np.zeros(shape=(self.labels_number_, X.shape[1]))
        counts = np.zeros(shape=(self.labels_number_), dtype=int)
        for x, label in zip(X, y):
            centroids[label] += x
            counts[label] += 1
        centroids /= counts[:, None]
        return centroids

    def _make_centroids(self, X, y):
        """Fill ``centroids_``, ``labels_`` and ``support_``; returns self."""
        # Fancy indexing with a list of row numbers needs an ndarray.
        X = np.asarray(X)
        self.labels_, y = np.unique(y, return_inverse=True)
        indexes_by_labels = [[] for _ in self.labels_]
        for i, label in enumerate(y):
            indexes_by_labels[label].append(i)
        self.centroids_, new_labels = [], []
        self.support_ = []
        for i, curr_indexes in enumerate(indexes_by_labels):
            if self.max_class_centroids > 1 and len(curr_indexes) > self.min_class_size:
                # BisectingKMeans is a project class; after fit() it is used
                # here through its centers_ and class_indexes_ attributes.
                clusterizer = BisectingKMeans(
                    max_clusters=self.max_class_centroids, refinement_iter=self.refinement_iterations,
                    use_relative_tolerance=True, tol=0.01, largest_cluster_size=self.min_class_size,
                    random_state=self.random_state).fit(X[curr_indexes])
                curr_centers = clusterizer.centers_
                # Map cluster-local row numbers back to rows of X.
                curr_support = [[curr_indexes[j] for j in elem] for elem in clusterizer.class_indexes_]
            else:
                curr_centers = [np.mean(X[curr_indexes], axis=0)]
                curr_support = [curr_indexes]
            self.centroids_.extend(curr_centers)
            new_labels.extend([self.labels_[i]] * len(curr_centers))
            self.support_.extend(curr_support)
        # From here on labels_ is aligned with centroids_, not with classes.
        self.labels_ = new_labels
        self.centroids_ = np.array(self.centroids_, dtype="float32")
        return self

    @staticmethod
    def _vote(labels, distances):
        """Return the label with the largest total weight and its share.

        Every neighbour votes with weight ``exp(-distance)``. Distances are
        shifted by their minimum first: the normalized share is unchanged,
        but the weights can no longer all underflow to zero (0/0 = nan).
        """
        distances = np.asarray(distances, dtype="float64")
        weights = np.exp(-(distances - distances.min()))
        class_weights = defaultdict(float)
        for label, weight in zip(labels, weights):
            class_weights[label] += weight
        best_label, best_weight = max(class_weights.items(), key=(lambda x: x[1]))
        return best_label, float(best_weight / sum(class_weights.values()))

    def predict(self, data, return_probs=False):
        """Predict a label for every row of ``data`` from its k nearest centroids.

        Returns the list of labels, or ``(labels, probs)`` when
        ``return_probs`` is true.
        """
        answer, probs = [None] * len(data), [None] * len(data)
        for start in range(0, len(data), self.batch_size):
            end = min(len(data), start + self.batch_size)
            batch_answers = self.search_index.knnQueryBatch(data[start:end], k=self.k)
            for i, (ids, distances) in enumerate(batch_answers):
                neighbour_labels = [self.labels_[index] for index in ids]
                answer[start+i], probs[start+i] = self._vote(neighbour_labels, distances)
        return (answer, probs) if return_probs else answer

### Проверка качества kNN

In [77]:
# One tag per dev token, in the same order as the rows of dev_vectors.
# NOTE(review): `itertools` is not imported explicitly in any visible cell;
# presumably it arrives through `from neural_LM.common import *` — confirm.
flat_dev_tags = list(itertools.chain.from_iterable(dev_tags))

In [78]:
def evaluate(cls, vectors, corr_tags):
    """Score a classifier against gold tags.

    Parameters
    ----------
    cls : object with ``predict(vectors)``
        Classifier under evaluation (shadows the notebook-level ``cls``).
    vectors : sequence
        Inputs passed to ``cls.predict`` unchanged.
    corr_tags : sequence
        Gold tags, either flat or as one list/tuple of tags per sentence
        (nested input is flattened first).

    Returns
    -------
    (accuracy, stats)
        ``accuracy`` is the share of matching tags (0.0 when there are no
        gold tags); ``stats[gold][predicted]`` counts every tag pair.
    """
    # Guard the emptiness check so that empty input no longer raises.
    if len(corr_tags) > 0 and isinstance(corr_tags[0], (list, tuple)):
        corr_tags = list(itertools.chain.from_iterable(corr_tags))
    pred_tags = cls.predict(vectors)
    corr, stats = 0, defaultdict(lambda: defaultdict(int))
    for pred_tag, corr_tag in zip(pred_tags, corr_tags):
        if corr_tag == pred_tag:
            corr += 1
        stats[corr_tag][pred_tag] += 1
    accuracy = corr / len(corr_tags) if len(corr_tags) > 0 else 0.0
    return accuracy, stats

In [82]:
# Sweep the number of neighbours k and print the accuracy (in percent) on the
# `other` vectors and on the test vectors.
# NOTE(review): this rebinds the global `cls`, which previously held the
# tagger returned by load_tagger and is read inside get_embeddings; after this
# cell `cls` is the RefClassifier with k=9.
for k in range(2, 10):
    cls = RefClassifier(search_index, flat_dev_tags, k=k)
    acc, stats = evaluate(cls, other_vectors, other_tags)
    test_acc, test_stats = evaluate(cls, test_vectors, test_tags)
    print(k, "{:.2f}".format(100 * acc), "{:.2f}".format(100 * test_acc))

2 91.67 61.51
3 92.24 62.81
4 92.31 62.81
5 92.44 63.24
6 92.48 63.24
7 92.52 63.53
8 92.52 63.68
9 92.52 63.60


In [91]:
# Peek at the first ten flattened dev tags.
flat_dev_tags[:10]

['NOUN,Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing',
 'PUNCT',
 'NOUN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
 'ADJ,Case=Gen|Degree=Pos|Gender=Neut|Number=Sing',
 'NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing',
 'NOUN,Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing',
 'PROPN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
 'PROPN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
 'AUX,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
 'NOUN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing']

In [133]:
# For several limits on the number of centroids per class, train a
# CentroidClassifier on the dev vectors and dump the cluster sizes per label.
# Output line format: label <TAB> total size <TAB> number of clusters <TAB>
# space-separated cluster sizes.
# for c, k in itertools.product([5, 10, 15], [1, 3, 5]):
for c in [5, 10, 15]:
    centroid_classifier = CentroidClassifier(max_class_centroids=c, k=1).train(dev_vectors, flat_dev_tags)
    with open("../neural_tagging/results/centroid_sizes-{}.out".format(c), "w", encoding="utf8") as fout:
        start = 0
        while start < centroid_classifier.labels_number_:
            label = centroid_classifier.labels_[start]
            end = start
            # Centroids of one label are stored contiguously; find the end of the run.
            while end < centroid_classifier.labels_number_ and centroid_classifier.labels_[end] == label:
                end += 1
            cluster_lengths = [len(x) for x in centroid_classifier.support_[start:end]]
            fout.write('{}\t{}\t{}\t{}\n'.format(
                label, sum(cluster_lengths), end-start, " ".join(map(str, cluster_lengths))))
            start = end
# Disabled evaluation of the centroid classifier (per-label accuracy dump):
#     acc, stats = evaluate(centroid_classifier, dev_vectors, flat_dev_tags)
#     print(c, k, "{:.2f}".format(100 * acc))
#     with open("../neural_tagging/results/centroid_stats-{}-{}.out".format(c, k), "w", encoding="utf8") as fout:
#         for label, label_stats in sorted(stats.items(), key=(lambda x: -sum(x[1].values()))):
#             corr, total = label_stats[label], sum(label_stats.values())
#             fout.write('{}\t{}\t{}\t{:.2f}\n'.format(label, total, corr, 100 * corr / total))

In [70]:
# Inspect the neighbour votes for the first 100 tokens of the `other` set.
# Requires `cls` to be a RefClassifier here (it reads cls.search_index,
# cls.k and cls.tags), i.e. the k-sweep cell must have been run before.
# NOTE(review): 1000 vectors are queried but only the first 100 are used.
data = cls.search_index.knnQueryBatch(other_vectors[:1000], k=cls.k)
for (pos, j, sent), (ids, distances) in zip(other_refs[:100], data[:100]):
    pred_tags = [cls.tags[i] for i in ids]
    # Plain vote counts per tag.
    counts = defaultdict(int)
    for pred_tag in pred_tags:
        counts[pred_tag] += 1
    # Unnormalized exp(-distance) weights per tag, as in RefClassifier.predict.
    class_probs = defaultdict(float)
    for label, score in zip(pred_tags, distances):
        prob = np.exp(-score)
        class_probs[label] += prob
    # Context window: up to 5 words on each side of the token.
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    # NOTE(review): predicted_tag is assigned but never printed.
    corr_tag, predicted_tag = other_tags[j][pos], other_predictions[j][pos]
    print("_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    print(corr_tag, counts, class_probs, sep="\n")

 Алгоритм ,_от_имени_учёного_аль
NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
defaultdict(<class 'int'>, {'NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing': 2, 'NOUN,Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': 1})
defaultdict(<class 'float'>, {'NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing': 0.0025496888265479356, 'NOUN,Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': 0.0007354518747888505})
Алгоритм , от_имени_учёного_аль_-
PUNCT
defaultdict(<class 'int'>, {'PUNCT': 3})
defaultdict(<class 'float'>, {'PUNCT': 2.646495044231415})
Алгоритм_, от имени_учёного_аль_-_Хорезми
ADP
defaultdict(<class 'int'>, {'ADP': 3})
defaultdict(<class 'float'>, {'ADP': 1.4448688626289368})
Алгоритм_,_от имени учёного_аль_-_Хорезми_,
NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing
defaultdict(<class 'int'>, {'NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing': 3})
defaultdict(<class 'float'>, {'NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing': 0.3954713046550751})
Алгоритм_,_от_имени

In [54]:
# One tag per token of the `other` set; show the first ten.
flat_other_tags = list(itertools.chain.from_iterable(other_tags))
flat_other_tags[:10]

['NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing',
 'PUNCT',
 'ADP',
 'NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing',
 'NOUN,Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing',
 'PART',
 'PUNCT',
 'PROPN,Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing',
 'PUNCT',
 'PUNCT']

In [30]:
# For the first 100 test tokens print the token in context, its nearest dev
# token in context with the distance, and then the gold tag, the tagger
# prediction and the tag of the nearest neighbour.
# The context, gold tag and prediction used to be taken from the `other` set
# while the query vector came from test_vectors, so token i of one corpus was
# shown next to the neighbours of token i of another. Everything now refers to
# the test set, which is also what the saved output of this cell shows.
for i in range(min(100, len(test_refs))):
    pos, j, sent = test_refs[i]
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    corr_tag, predicted_tag = test_tags[j][pos], test_predictions[j][pos]
    print("_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    indexes, distances = search_index.knnQuery(test_vectors[i], k=10)
    # Only the closest neighbour is displayed.
    index, dist = indexes[0], distances[0]
    pos, j, sent = dev_refs[index]
    start, end = max(pos-5, 0), min(pos+6, len(sent))
    nn_tag = dev_tags[j][pos]
    print("{:.3f}".format(dist), "_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]))
    print(corr_tag, predicted_tag, nn_tag, sep="\n", end="\n\n")

 Як раней_перадаваў_Тэлеграф_,_будаваць
9.764 Да яз ти_ту_сежу_.
SCONJ
ADP
X

Як раней перадаваў_Тэлеграф_,_будаваць_беларускую
14.935 подтвердили_,_что_мы_с Женей Плющенко_действительно_немного_похожи_:
ADV,Degree=Cmp
ADV,Degree=Cmp
PROPN,Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing

Як_раней перадаваў Тэлеграф_,_будаваць_беларускую_АЭС
13.451 ректором_Уральского_горного_института_и исследовал Камчатку_.
VERB,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
VERB,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
VERB,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act

Як_раней_перадаваў Тэлеграф ,_будаваць_беларускую_АЭС_пачнуць
10.434 Но_что_там граф ,_когда_подобным_же_образом
PROPN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
NOUN,Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
NOUN,Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing

Як_раней_перадаваў_Тэлеграф , будаваць_беларускую_АЭС_пачнуць_у
0.869 Пот

In [32]:
# Raw index output for the first ten test vectors: one (ids, distances) pair
# per query.
search_index.knnQueryBatch(test_vectors[:10], k=5)

[(array([157832, 252240, 239704, 746237, 637099], dtype=int32),
  array([ 9.764,  9.841, 10.138, 10.425, 10.469], dtype=float32)),
 (array([204246, 472950,  37999,  21127,  38139], dtype=int32),
  array([14.935, 16.006, 16.24 , 16.488, 16.555], dtype=float32)),
 (array([164910, 163356,   6493,   8954, 127185], dtype=int32),
  array([13.451, 13.874, 13.99 , 14.094, 14.107], dtype=float32)),
 (array([679361, 534962, 859328, 486947, 552180], dtype=int32),
  array([10.434, 10.985, 11.352, 11.568, 11.701], dtype=float32)),
 (array([277754, 799940, 818475, 534963, 268258], dtype=int32),
  array([0.869, 0.956, 0.977, 0.98 , 1.08 ], dtype=float32)),
 (array([ 76532, 490372, 465780, 730023, 273604], dtype=int32),
  array([8.303, 8.481, 8.768, 8.96 , 9.273], dtype=float32)),
 (array([200584,  86169,  92000, 147594, 787195], dtype=int32),
  array([7.039, 7.641, 7.981, 8.107, 8.315], dtype=float32)),
 (array([741073, 367748, 741040, 200585, 181473], dtype=int32),
  array([8.632, 8.947, 9.014, 9.1 

In [None]:
# Reference (position, sentence index, sentence) of test token 23.
test_refs[23]

In [None]:
# Ten nearest dev tokens of test token 23.
ids, distances = search_index.knnQuery(test_vectors[23], k=10)

In [None]:
# Show the neighbour ids and distances from the query above.
ids, distances

In [None]:
# Write, for each of the first 100 test tokens, the token in context followed
# by its three nearest dev tokens in context (with distances) to `outfile`.
# Fixes over the previous draft of this cell: the `ouftile` typo and the
# unfinished `with open(...)` statement (a syntax error), refs being
# (position, sentence index, sentence) triples rather than pairs, and the
# output going to stdout instead of the file.
outfile = "../neural_tagging/results/belarusian-nn.out"
with open(outfile, "w", encoding="utf8") as fout:
    for i in range(min(100, len(test_refs))):
        pos, j, sent = test_refs[i]
        start, end = max(pos-5, 0), min(pos+6, len(sent))
        print("_".join(sent[start:pos]), sent[pos], "_".join(sent[pos+1:end]), file=fout)
        indexes, distances = search_index.knnQuery(test_vectors[i], k=10)
        for index, dist in zip(indexes[:3], distances[:3]):
            pos, j, sent = dev_refs[index]
            start, end = max(pos-5, 0), min(pos+6, len(sent))
            print("{:.3f}".format(dist), "_".join(sent[start:pos]), sent[pos],
                  "_".join(sent[pos+1:end]), file=fout)
        print("", file=fout)