Este é o exemplo simples de como gerar WordEmbeddings, segundo o tutorial do tensorflow

In [100]:
import tensorflow as tf
import errno
import os
import zipfile
import numpy as np

from six.moves import urllib

In [None]:
WORDS_PATH = "datasets/words"
WORDS_URL = "http://mattmahoney.net/dc/text8.zip"

In [None]:
def mkdir_path(path):
    os.makedirs(path, exist_ok=True)

In [None]:
def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):
    os.makedirs(words_path, exist_ok=True)
    zip_path = os.path.join(words_path, "words.zip")
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(words_url, zip_path)
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    return data.decode("ascii").split()

In [None]:
words = fetch_words_data()

In [None]:
words[:5]

Agora que temos as palavras, podemos criar um dicionário para elas

In [None]:
from collections import Counter
import numpy as np

In [None]:
import numpy as np

In [None]:
vocabulary_size = 50000

#seleciona apenas as top vocabulary_size
vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1)
vocabulary = np.array([word for word, _ in vocabulary])
dictionary = {word: code for code, word in enumerate(vocabulary)}

#cria um array com todo o texto encodado (para o id da palavra), sendo 0 para as que nao existem no dic.
data = np.array([dictionary.get(word, 0) for word in words])

In [None]:
" ".join(words[:9]), data[:9]

In [None]:
" ".join([vocabulary[word_index] for word_index in 
         [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])

In [None]:
words[24], data[24]

Para treinarmos nossa rede neural precisamos de uma função que gere nossos batchs

In [None]:
from collections import deque

In [None]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=[batch_size], dtype=np.int32) #vetor
    labels = np.ndarray(shape=[batch_size, 1], dtype=np.int32) #matrz de uma coluna
    span = 2*skip_window + 1
    buffer = deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = np.random.randint(0, span)
            targets_to_avoid.append(target)
            batch[i*num_skips+j] = buffer[skip_window]
            labels[i*num_skips+j, 0] = buffer[target]
        #print(buffer)
        #quando dou append e ultrapssa maxlen, os a esquerda sao deletados, ou seja, desloca a janela
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

In [None]:
np.random.seed(42)

In [None]:
data_index = 0
batch, labels = generate_batch(8, 2, 1)

In [None]:
batch, [vocabulary[word] for word in batch]

In [None]:
labels, [vocabulary[word] for word in labels[:, 0]]

In [None]:
words[:8]

Agora podemos treinar o modelo

In [None]:
batch_size = 128
skip_window = 1
num_skips = 2

Vamos selecionar algumas palavras aleatoriamente para criar nosso conjunto de avaliacao. Limitamos o conjunto a palavras ocm IDs baixos pois essas sao as que possuem maior frequencia

In [None]:
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64 #numero de negatuve_sampes

learning_rate = 0.01

In [None]:
#reincia o grafo sempre para o mesmo estado
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [None]:
reset_graph()

train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [None]:
vocabulary_size = 50000
embedding_size = 150

#inicia aleatoriamente
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)

In [None]:
train_inputs = tf.placeholder(tf.int32, shape=[None])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

NCE loss

In [101]:
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0/np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [102]:
loss = tf.reduce_mean(
    tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
                  num_sampled, vocabulary_size)
)

In [103]:
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

Similaridade de consseno entre os exemplos do minibatch e todos os embeddings

In [104]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings/norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [105]:
init = tf.global_variables_initializer()

Treinamento do modelo

In [106]:
num_steps = 10001

with tf.Session() as session:
    init.run()
    
    average_loss = 0
    for step in range(num_steps):
        print("\rIteration: {}".format(step), end="\t")
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)
        average_loss += loss_val
        
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step ", step, ":", average_loss)
            average_loss = 0
            
    if step % 10000 == 0:
        sim = similarity.eval()
        for i in range(valid_size):
            valid_word = vocabulary[valid_examples[i]]
            top_k = 8
            nearest = (-sim[i, :]).argsort()[1:top_k+1]
            log_str = "Nearest to %s:" % valid_word
            for k in range(top_k):
                close_word = vocabulary[nearest[k]]
                log_str = "%s %s, "% (log_str, close_word)
            print(log_str)

    
    final_embeddings = normalized_embeddings.eval()

Iteration: 0	Average loss at step  0 : 285.433898926
Iteration: 2000	Average loss at step  2000 : 130.987416891
Iteration: 4000	Average loss at step  4000 : 62.7637656999
Iteration: 6000	Average loss at step  6000 : 42.172603972
Iteration: 8000	Average loss at step  8000 : 31.7323915248
Iteration: 10000	Average loss at step  10000 : 25.7874812515
Nearest to over: ataxia,  diabetes,  athens,  satisfies,  with,  zero,  by,  roddenberry, 
Nearest to one: nine,  two,  six,  seven,  three,  five,  eight,  four, 
Nearest to were: cooperstown,  eponym,  illyrians,  accepting,  milne,  escherichia,  amoebae,  conglomerates, 
Nearest to may: nine,  deteriorate,  reeds,  wicket,  hostess,  sarris,  lubricants,  allies, 
Nearest to two: zero,  one,  five,  three,  nine,  six,  four,  seven, 
Nearest to its: the,  nosed,  ales,  ampere,  historiography,  korchnoi,  delegates,  exploits, 
Nearest to than: embraces,  altaic,  oak,  hamadan,  questioned,  romanus,  broad,  vented, 
Nearest to these: 