In [44]:
import sys
sys.path.append("/workspace/server")
import warnings

from thera.python.mxnet import mxnet as mx
warnings.filterwarnings('ignore')

import random
import numpy as np
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp
# https://gluon-nlp.mxnet.io/master/examples/word_embedding/word_embedding.html
import re

In [45]:

# Seed every random number generator the notebook relies on (MXNet, the
# standard library and NumPy) so that repeated runs start from the same state.
mx.random.seed(10000)
random.seed(100)
np.random.seed(100)

# Context object referring to GPU 0.
ctx = mx.gpu(0)


In [46]:
text = " hello world \n hello nice world \n hi world \n"


To start, let’s implement a simple tokenizer that separates the words, and then use it to count the frequency of each word in the data set.

In [47]:


def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
    """Split ``source_str`` on either delimiter and return its non-empty pieces.

    Args:
        source_str: Text to tokenize.
        token_delim: Separator between tokens. It is inserted into a regular
            expression as-is, so regex metacharacters keep their special meaning.
        seq_delim: Separator between sequences; treated the same way.

    Returns:
        A lazy ``filter`` iterator over the pieces, with empty pieces dropped.
    """
    # Either delimiter ends a token, so both become alternatives of one pattern.
    pattern = '|'.join((token_delim, seq_delim))
    pieces = re.split(pattern, source_str)
    # Adjacent or leading/trailing delimiters produce empty strings; skip them.
    return filter(bool, pieces)
counter = nlp.data.count_tokens(simple_tokenize(text))

counter

Counter({'hello': 2, 'hi': 1, 'nice': 1, 'world': 3})

The obtained counter behaves like a Python dictionary whose key-value pairs consist of words and their frequencies, respectively. We can then instantiate a Vocab object with a counter. Because counter tracks word frequencies, we are able to specify arguments such as max_size (maximum size) and min_freq (minimum frequency) to the Vocab constructor to restrict the size of the resulting vocabulary.

Suppose that we want to build indices for all the keys in counter. If we simply want to construct a Vocab containing every word, then we can supply counter as the only argument.

In [48]:
vocab = nlp.Vocab(counter)


In [49]:
for word in vocab.idx_to_token:
    print(word)



<unk>
<eos>
<pad>
<bos>
world
hello
hi
nice


The loop above lists the tokens in index order using vocab.idx_to_token. Conversely, we can also look up the index of a given token using vocab.token_to_idx.

In [50]:
print(vocab.token_to_idx["<unk>"])
print(vocab.token_to_idx["world"])

0
4


In Gluon NLP, each word has three representations: its index in the vocabulary (idx), its embedding (or vector/vec), and the token (the actual word). At any point, we may use any of the following to switch between the three representations: idx_to_vec, idx_to_token, token_to_idx.

Our next step will be to attach word embeddings to the words indexed by vocab. In this example, we’ll use fastText embeddings trained on the wiki.simple dataset. First, we’ll want to create a word embedding instance by calling nlp.embedding.create, specifying the embedding type fasttext (an unnamed argument) and the source source='wiki.simple' (the named argument).


In [51]:
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.simple')

To attach the newly loaded word embeddings fasttext_simple to indexed words in vocab, we can simply call vocab’s set_embedding method:

In [52]:
vocab.set_embedding(fasttext_simple)

By default, the vector of any token that is unknown to vocab is a zero vector. Its length is equal to the vector dimensions of the fastText word embeddings: (300,).

In [53]:
vocab.embedding['hello'].shape

(300,)

To demonstrate how to use pre-trained word embeddings in Gluon, let us first obtain the indices of the words ‘hello’ and ‘world’.

In [54]:
vocab['hello', 'world']

[5, 4]

We can obtain the vectors for the words ‘hello’ and ‘world’ by specifying their indices (5 and 4) and the weight or embedding matrix, which we get from vocab.embedding.idx_to_vec and load into a gluon.nn.Embedding layer. We initialize a new layer and set its weights using the layer.weight.set_data method. Subsequently, we look up the rows at indices 5 and 4 of the weight matrix and check their first five entries.

In [55]:
from mxnet import nd

In [56]:
# idx_to_vec holds one embedding row per vocabulary index, so its shape gives
# the (vocabulary size, embedding width) pair needed to size the layer.
input_dim, output_dim = vocab.embedding.idx_to_vec.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
# Allocate the layer's parameters, then overwrite them with the pre-trained vectors.
layer.initialize()
layer.weight.set_data(vocab.embedding.idx_to_vec)
# Look up rows 5 and 4 ('hello' and 'world') and show only their first five components.
layer(nd.array([5, 4]))[:, :5]



[[ 0.39567   0.21454  -0.035389 -0.24299  -0.095645]
 [ 0.10444  -0.10858   0.27212   0.13299  -0.33165 ]]
<NDArray 2x5 @cpu(0)>

In [57]:
input_dim, output_dim

(8, 300)

Creating Vocabulary from Pre-trained Word Embeddings

We can also create a vocabulary from the vocabulary of a pre-trained word embedding, such as GloVe. Below we load one of the pre-trained files available for the GloVe word embedding, glove.6B.50d.

In [58]:

glove_6b50d = nlp.embedding.create('glove', source='glove.6B.50d')



In [59]:

# NOTE: this rebinds `vocab`. From here on it is built from the GloVe token
# list rather than from the toy counter used in the earlier cells.
vocab = nlp.Vocab(nlp.data.Counter(glove_6b50d.idx_to_token))
# Attach the GloVe vectors so that vocab.embedding[...] returns them.
vocab.set_embedding(glove_6b50d)



To apply word embeddings, we need to define cosine similarity. Cosine similarity determines the similarity between two vectors.

In [60]:

from mxnet import nd
def cos_sim(x, y):
    """Return the cosine similarity between the vectors ``x`` and ``y``.

    The result is the dot product of the two vectors divided by the product of
    their norms. There is no guard against a zero-norm input, so such an input
    leads to a division by zero.
    """
    numerator = nd.dot(x, y)
    denominator = nd.norm(x) * nd.norm(y)
    return numerator / denominator



The range of cosine similarity between two vectors can be between -1 and 1. The larger the value, the larger the similarity between the two vectors.

In [61]:


# A reference vector, a positively scaled copy of it, and its negation.
x = nd.array([1, 2])
y = nd.array([10, 20])
z = nd.array([-1, -2])

# Compare x with the scaled copy, the negated copy and itself, in that order.
for other in (y, z, x):
    print(cos_sim(x, other))



[1.]
<NDArray 1 @cpu(0)>

[-1.]
<NDArray 1 @cpu(0)>

[1.]
<NDArray 1 @cpu(0)>


Given an input word, we can find the nearest $k$ words from the vocabulary (400,000 words excluding the unknown token) by similarity. The similarity between any given pair of words can be represented by the cosine similarity of their vectors.

We first must normalize each row, followed by taking the dot product of the entire vocabulary embedding matrix and the single word embedding (dot_prod). We can then find the indices for which the dot product is greatest (topk), which happens to be the indices of the most similar words.

In [62]:
def norm_vecs_by_row(x):
    """Divide every row of the 2-D array ``x`` by that row's L2 norm.

    A small constant (1e-10) is added to each squared norm before the square
    root is taken, so an all-zero row is divided by a tiny positive number
    instead of by zero.
    """
    squared_norms = nd.sum(x * x, axis=1)
    # Reshape to a column so the division broadcasts across each row.
    row_norms = nd.sqrt(squared_norms + 1E-10).reshape((-1, 1))
    return x / row_norms

def get_knn(vocab, k, word):
    """Return the ``k`` vocabulary tokens whose embeddings are closest to ``word``.

    Every vocabulary vector is normalised to unit length and compared with the
    (unnormalised) vector of ``word`` via a dot product. Scaling the query by a
    positive constant does not change the ranking, so the order matches a
    cosine-similarity ranking.

    Args:
        vocab: Vocabulary with an attached embedding (``vocab.embedding``).
        k: Number of neighbours to return.
        word: Query token.

    Returns:
        A list of at most ``k`` tokens, most similar first. The query word and
        the vocabulary's unknown token are never part of the result.
    """
    word_vec = vocab.embedding[word].reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs, word_vec)
    # Ask for two spare candidates (query word + unknown token) so that k
    # neighbours remain after filtering, without exceeding the vocabulary size.
    num_candidates = min(k + 2, len(vocab))
    indices = nd.topk(dot_prod.reshape((len(vocab), )), k=num_candidates, ret_typ='indices')
    candidates = vocab.to_tokens([int(i.asscalar()) for i in indices])
    # Filter by name instead of assuming the query is always the top hit: with
    # tied scores (duplicate or zero vectors) it is not guaranteed to rank first.
    excluded = {word, vocab.unknown_token}
    return [token for token in candidates if token not in excluded][:k]

In [63]:

get_knn(vocab, 5, 'baby')



['babies', 'boy', 'girl', 'newborn', 'pregnant']

In [64]:
cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])


[0.83871305]
<NDArray 1 @cpu(0)>

In [65]:
get_knn(vocab, 5, 'computers')

['computer', 'phones', 'pcs', 'machines', 'devices']

We can also apply pre-trained word embeddings to the word analogy problem. For example, “man : woman :: son : daughter” is an analogy. This sentence can also be read as “A man is to a woman as a son is to a daughter.”

The word analogy completion problem is defined concretely as: for analogy ‘a : b :: c : d’, given the first three words ‘a’, ‘b’, ‘c’, find ‘d’. The idea is to find the most similar word vector for vec(‘c’) + (vec(‘b’)-vec(‘a’)).

In this example, we will find words that are analogous from the 400,000 indexed words in vocab.

In [78]:
def get_top_k_by_analogy(vocab, k, word1, word2, word3, exclude_inputs=False):
    """Return the ``k`` best completions of the analogy word1 : word2 :: word3 : ?.

    The target vector is vec(word2) - vec(word1) + vec(word3). Every vocabulary
    vector is normalised to unit length and ranked by its dot product with the
    target.

    Args:
        vocab: Vocabulary with an attached embedding (``vocab.embedding``).
        k: Number of candidate tokens to return.
        word1: First word of the analogy ('a' in a : b :: c : d).
        word2: Second word of the analogy ('b').
        word3: Third word of the analogy ('c').
        exclude_inputs: When True, ``word1``, ``word2`` and ``word3`` are
            removed from the result. The target vector is often closest to the
            query words themselves, which would otherwise crowd out the real
            answers. Defaults to False, which keeps the original behaviour.

    Returns:
        A list of at most ``k`` tokens, best match first.
    """
    word_vecs = vocab.embedding[word1, word2, word3]
    target_vec = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs, target_vec)
    excluded = {word1, word2, word3} if exclude_inputs else set()
    # Fetch one spare candidate per excluded word so k results survive the
    # filter, but never request more entries than the vocabulary holds.
    num_candidates = min(k + len(excluded), len(vocab))
    indices = nd.topk(dot_prod.reshape((len(vocab), )), k=num_candidates, ret_typ='indices')
    candidates = vocab.to_tokens([int(i.asscalar()) for i in indices])
    return [token for token in candidates if token not in excluded][:k]

In [79]:
get_top_k_by_analogy(vocab, 5, 'pepito', 'chocolate', 'cake')

['chocolate', 'cake', 'cream', 'dessert', 'candy']

In [80]:
def cos_sim_word_analogy(vocab, word1, word2, word3, word4):
    """Score how well ``word4`` completes the analogy word1 : word2 :: word3 : ?.

    Returns the cosine similarity between vec(word2) - vec(word1) + vec(word3)
    and vec(word4), as computed by ``cos_sim``.
    """
    embedded = vocab.embedding[[word1, word2, word3, word4]]
    # Vector the analogy predicts for the fourth word.
    predicted = embedded[1] - embedded[0] + embedded[2]
    return cos_sim(predicted, embedded[3])

cos_sim_word_analogy(vocab, 'man', 'woman', 'son', 'daughter')


[0.9658341]
<NDArray 1 @cpu(0)>