In [1]:
# grade 1 reading example from: http://www.pearsonlongman.com/ae/marketing/sfesl/tests/grade1.html

sample_text1 = """Saturday is our day to clean, but Grandpa turns work into fun.

We like to sing when we dust. We like to dance when we mop. We clean the car together. We do the wash together. We both wear our caps.

When we are done, we hurry to the store. We shop for food. Grandpa lets me pick the fruits that I like best. We carry the food home. Grandpa tells stories while we walk.

At night, our work is done. Friends come over. We cook dinner. We sing and dance.

Saturday is our day to clean, but Grandpa turns work into fun!

"""

sample_text2 = """How does a butterfly grow? It starts out as a tiny egg. It becomes a caterpillar. It eats lots of leaves. It grows and grows. Then it goes inside a cocoon. At last, it comes out. It’s a butterfly!

How does a frog grow? It starts out as a tiny egg in the water. The egg grows into a tadpole. It keeps changing. It eats tiny plants. It grows and grows. At last, it hops out of the pond. It’s a frog!

How does a flower grow? It starts out as a seed. Sun and rain help the seed grow. Roots grow into the ground. The plant grows and grows. At last, a bud opens. It’s a flower!

Now you know how they grow!"""


In [2]:
# build training data...using skip-gram
import re
import numpy as np

def n_gram_build(text, full_window):
    """
    Inputs:
    *  text: sample text which skipgram will be applied
    *  window_size: skip gram parameter for window size.
    
    technically window_size is the full window size, but i've done it this way for easier programming...
    """
    # some check to see full window size is odd
    window_size = (full_window-1)/2
    # some basic text cleaning
    text = re.sub(r"[^0-9a-z]", " ", text.lower())
    text = re.sub(r"\s+", " ", text.strip())
    
    full_vocab = np.array(list(set(text.split())))
    
    padded = [" "] * (full_window-1)
    #tokens = padded+text.split()+padded
    tokens = text.split()
    
    zip_list = []
    for idx in range(full_window):
        if idx == full_window-1:
            zip_list.append(tokens[idx:])
        else:
            zip_list.append(tokens[idx:-(full_window-1-idx)])
            
    ngram_tokens = list(zip(*zip_list))
        
    # now create ngram
    return ngram_tokens, full_vocab


In [3]:
print(n_gram_build(sample_text1, 3)[0][:5])

[('saturday', 'is', 'our'), ('is', 'our', 'day'), ('our', 'day', 'to'), ('day', 'to', 'clean'), ('to', 'clean', 'but')]


In [4]:
def n_gram_build_split(ngram_tokens, cv_mod):
    """
    ngram tokens as produced in the ngram function
    
    we simply take the middle element to be the label, and the rest to be
    the context.
    """
    mid_point = int(((len(ngram_tokens[0])+1)/2)-1) # zero indexing
    def create_label_point(context, mid_point):
        mid_point = int(mid_point)
        context = list(context)
        target = context.pop(mid_point)
        return (np.array(cv_1.transform(context).todense()), np.array(cv_1.transform([target]).todense()).flatten())
    skip_gram_list = [create_label_point(tt, mid_point) for tt in ngram_tokens]
    return skip_gram_list

In [5]:
# before we train in keras, we need to know the input and output size, 
# in this case the input AND output will be the size of our vocabulary
# we will need to transform it into binary text blobs.

from sklearn.feature_extraction.text import CountVectorizer

ngrams_text1, vocab1 = n_gram_build(sample_text1,3)
vocab_size1 = len(vocab1)
cv_1 = CountVectorizer(vocabulary=vocab1)

In [6]:
print(n_gram_build_split(n_gram_build(sample_text1,1)[0], cv_1)[:2])

[(array([], shape=(0, 55), dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)), (array([], shape=(0, 55), dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))]


In [7]:
# window size = 
window_size = 3
train_set1 = n_gram_build_split(n_gram_build(sample_text1, window_size)[0], cv_1)

In [8]:
context1 = np.array([x[0] for x in train_set1])
target1  = np.array([x[1] for x in train_set1])

In [9]:
# now plug into keras

# CBOW

In [10]:
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Reshape, Merge

# create CBOW model
embedding_size = vocab_size1
embedding_size = 10

model = Sequential()
# our input and output is the same size
model.add(Dense(vocab_size1, input_shape=(window_size-1, vocab_size1,), name='input_layer')) 
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(embedding_size, name='embedding_layer'))
model.add(Dense(vocab_size1, name='output_layer'))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

Using TensorFlow backend.


In [11]:
%%timeit
# CBOW
model.fit(context1, target1, verbose=0, batch_size=5, nb_epoch=1000) # feel free to change verbose to 1

1 loop, best of 3: 21.8 s per loop


In [12]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_layer (Dense)              (None, 2, 55)         3080        dense_input_1[0][0]              
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 2, 55)         0           input_layer[0][0]                
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 110)           0           activation_1[0][0]               
____________________________________________________________________________________________________
embedding_layer (Dense)          (None, 10)            1110        flatten_1[0][0]                  
___________________________________________________________________________________________

In [13]:
loss, accuracy = model.evaluate(context1, target1, verbose=0)
print("Accuracy is {:.2f}".format(accuracy))

Accuracy is 0.92


In [14]:
embedding = Model(input=model.input, 
                  output=model.get_layer('embedding_layer').output)
embedding_vector = embedding.predict(np.expand_dims(context1[0,:,:], axis=0))
embedding_vector.flatten()

array([ 7.10413742, -2.65807986,  2.89546251, -0.52381462,  0.46597743,
        3.92889667, -2.4014895 , -6.09131289, -1.42299211, -2.1180861 ], dtype=float32)

# Skip Gram

In [15]:
# skip gram

model = Sequential()
# our input and output is the same size
model.add(Dense(vocab_size1, input_shape=(vocab_size1,), name='input_layer')) 
model.add(Activation('relu'))
model.add(Dense(embedding_size, name='embedding_layer'))
"""
model.add(Merge([Sequential([Dense((window_size-1)*vocab_size1, input_shape=(embedding_size, ))]), 
                 Sequential([Reshape((window_size-1, vocab_size1), input_shape=((window_size-1)*vocab_size1,))])], 
                name='output_layer', mode='concat'))
"""
model.add(Dense((window_size-1)*vocab_size1))
model.add(Reshape((window_size-1, vocab_size1), name='output_layer'))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [16]:
%%timeit
# skip-gram
model.fit(target1, context1, verbose=0, batch_size=5, nb_epoch=1000)

1 loop, best of 3: 25.3 s per loop


In [17]:
loss, accuracy = model.evaluate(target1, context1, verbose=0)
print("Accuracy is {:.2f}".format(accuracy))

Accuracy is 0.68


In [18]:
target1[0].shape

(55,)

In [19]:
embedding = Model(input=model.input, 
                  output=model.get_layer('embedding_layer').output)
embedding_vector = embedding.predict(np.expand_dims(target1[0], axis=0))
embedding_vector

array([[ 0.92829096, -5.07062626, -9.21794701, -1.31392336,  3.94425321,
        -0.80875969,  3.23186731, -3.33245945,  0.34641635,  2.22713947]], dtype=float32)

In [20]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_layer (Dense)              (None, 55)            3080        dense_input_2[0][0]              
____________________________________________________________________________________________________
activation_3 (Activation)        (None, 55)            0           input_layer[0][0]                
____________________________________________________________________________________________________
embedding_layer (Dense)          (None, 10)            560         activation_3[0][0]               
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 110)           1210        embedding_layer[0][0]            
___________________________________________________________________________________________

Final Considerations
--------------------

*  In the current context of word2vec we have only provided _positive_ examples. Positive examples seek to promote vectors so that they are more similar. Negative sampling can be used to perhaps overcome the bias within the model which favours more common words. In this case we will randomly generate dissimilar training data in order to improve the word2vec model. This is what the negative sampling parameter refers to. Skip-gram implementations seems to benefit greatly from negative sampling.


**When to use Skipgram? When to use CBOW?**

> Skip-gram: works well with small amount of the training data, represents well even rare words or phrases.
CBOW: several times faster to train than the skip-gram, slightly better accuracy for the frequent words
This can get even a bit more complicated if you consider that there are two different ways how to train the models: the normalized hierarchical softmax, and the un-normalized negative sampling. Both work quite differently. - Mikolov, author of Word2Vec Paper