## ELMo

[Репозиторий с подсказками по ELMo от Андрея Кутузова](https://github.com/ltgoslo/simple_elmo)

In [3]:
%load_ext autoreload

import time
import numpy as np
import tensorflow as tf
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

tf.reset_default_graph()
elmo_path = 'elmo'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'elmo_helpers'

In [5]:
!pip install bilm

Collecting bilm
  Downloading https://files.pythonhosted.org/packages/22/a6/711e6ea5a05f7ce72f0a5c6c3bfbd1451aeb8810c9ec8074d5667e3ff433/bilm-0.1.post5-py3-none-any.whl
Installing collected packages: bilm
Successfully installed bilm-0.1.post5


In [6]:
import sys
import re
import os
import tensorflow as tf
from bilm import Batcher, BidirectionalLanguageModel, weight_layers

In [5]:
# Two example sentences containing different forms of the same word
# ('лука' / 'лук'); they are used below to compare contextual embeddings.
raw_sentences = [
    'хочу изучить технику стрельбы из лука',
    'можешь нарезать мелко лук, возьми для этого большой нож',
]

# Tokenise every raw sentence with the project helper.
sentences = list(map(tokenize, raw_sentences))

# Report the sentence count between two separator lines, then the tokens.
print(
    '=====',
    '%d sentences total' % len(sentences),
    '=====',
    sentences,
    sep='\n',
)

=====
2 sentences total
=====
[['хочу', 'изучить', 'технику', 'стрельбы', 'из', 'лука'], ['можешь', 'нарезать', 'мелко', 'лук', 'возьми', 'для', 'этого', 'большой', 'нож']]


In [6]:
# Load the pre-trained ELMo model found at `elmo_path`.
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(elmo_path)

# Run the model on our tokenised sentences inside a TF session.
with tf.Session() as sess:
    # Variables must be initialised once before inference can run.
    sess.run(tf.global_variables_initializer())

    start = time.time()
    elmo_vectors = get_elmo_vectors(
        sess,
        sentences,
        batcher,
        sentence_character_ids,
        elmo_sentence_input,
    )

    # Wall-clock seconds spent on inference.
    print(time.time() - start)
    print('ELMo embeddings for your input are ready')
    print('Tensor shape:', elmo_vectors.shape)

    # Batching gives every sentence as many token vectors as the longest
    # sentence has tokens (2nd dimension of `elmo_vectors`); positions past
    # the end of a shorter sentence are zero-filled. Keep only the rows that
    # correspond to real tokens of each sentence.
    cropped_vectors = [
        padded[:len(tokens), :]
        for padded, tokens in zip(elmo_vectors, sentences)
    ]
        

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


Sentences in this batch: 2


1.622628927230835
ELMo embeddings for your input are ready
Tensor shape: (2, 9, 1024)


In [12]:
# A quick test:
# pick one token of the first sentence as the query and, for every other
# sentence, rank that sentence's tokens by similarity to the query token.
# Similarity here is the raw dot product of the two ELMo vectors
# (not a length-normalised cosine).

query_nr = 5  # 0-based index of the query token within the first sentence
query_word = sentences[0][query_nr]
print('Query sentence:', sentences[0])
print('Query word:', query_word)

query_vec = cropped_vectors[0][query_nr]


for sent_nr, sent in enumerate(sentences):
    # The first sentence is the one the query comes from; skip it.
    if sent_nr == 0:
        continue

    print('======')
    print(sent)

    # Keep (word, score) pairs rather than a dict keyed by word, so that a
    # word occurring several times in a sentence keeps one score per occurrence
    # instead of the last occurrence overwriting the earlier ones.
    sims = [
        (word, np.dot(query_vec, w_vec))
        for word, w_vec in zip(sent, cropped_vectors[sent_nr])
    ]

    # Most similar tokens first.
    for word, sim in sorted(sims, key=lambda pair: pair[1], reverse=True):
        print(word, sim)

Query sentence: ['хочу', 'изучить', 'технику', 'стрельбы', 'из', 'лука']
Query word: лука
['можешь', 'нарезать', 'мелко', 'лук', 'возьми', 'для', 'этого', 'большой', 'нож']
лук 155.8144
нож 150.87787
этого 93.784744
мелко 88.479836
нарезать 85.494
большой 79.993164
возьми 70.24108
можешь 55.802162
для 50.650307
