In [1]:
from nltk.corpus import gutenberg
from string import punctuation

bible=gutenberg.sents('bible-kjv.txt')
remove_terms=punctuation + '0123456789'

norm_bible=[[word.lower() for word in sent if word not in remove_terms]for sent in bible]
norm_bible=[' '.join(tok_sent)for tok_sent in norm_bible]
norm_bible=[tok_sent for tok_sent in norm_bible if len(tok_sent.split())>2]

print('Total lines:',len(bible))
print('\nSample line:',bible[10])
print('\nProcessed line:',norm_bible[10])

Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: and god said let there be a firmament in the midst of the waters and let it divide the waters from the waters


In [4]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

### Build the corpus vocabulary

In [7]:
tokenizer=text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id=tokenizer.word_index

#build vocabulary of unique words
word2id['PAD']=0
id2word={v:k for k,v in word2id.items()}
wids=[[word2id[w] for w in text.text_to_word_sequence(doc)]for doc in norm_bible]

vocab_size=len(word2id)
embed_size=100
window_size=2 #Context window size

print('vocabulary size: ',vocab_size)
print('Vocabulary sample: ',list(word2id.items())[20:30])

vocabulary size:  12726
Vocabulary sample:  [('it', 21), ('with', 22), ('all', 23), ('thou', 24), ('thy', 25), ('was', 26), ('god', 27), ('which', 28), ('my', 29), ('me', 30)]


### Build a CBOW(context,target) generator

We need pairs which consist of a target centre word and surround context words.In implementation, a target word is of size 1 and surrounding context is of size `2 * window_size` where we take window_size words before and after the target word in our corpus.

For example, if the original text was ‘in the beginning god created heaven and earth’ which after pre-processing and removal of stopwords became ‘beginning god created heaven earth’ and for us, what we are trying to achieve is that. Given [beginning, god, heaven, earth] as the context, what the target center word is, which is ‘created’ in this case.

In [22]:
import numpy as np
def generate_context_word_pairs(corpus,window_size,vocab_size):
    context_length=window_size * 2
    for words in corpus:
        sentence_length=len(words)
        for index,word in enumerate(words):
            context_words=[]
            label_word=[]
            start=index-window_size
            end=index+window_size+1
            
            
            context_words.append([words[i]
                                 for i in range(start,end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)
            
            x=sequence.pad_sequences(context_words,maxlen=context_length)
            y=np_utils.to_categorical(label_word,vocab_size)
            yield(x,y)
            
#Test for some samples
i=0
for x,y in generate_context_word_pairs(corpus=wids,window_size=window_size,vocab_size=vocab_size):
    #print(x[0])
    #print([id2word[w] for w in x[0]])
    if 0 not in x[0]:#0 for PAD
        print('Context (x):',[id2word[w] for w in x[0]],' -> Target (Y):',id2word[np.argwhere(y[0])[0][0]])
                          
    if i==10:
        break
                    
    i += 1
                        

Context (x): ['the', 'old', 'of', 'the']  -> Target (Y): testament
Context (x): ['old', 'testament', 'the', 'king']  -> Target (Y): of
Context (x): ['testament', 'of', 'king', 'james']  -> Target (Y): the
Context (x): ['of', 'the', 'james', 'bible']  -> Target (Y): king


### Build the CBOW model architecture

We can leverage `keras` on top of the `tensorflow` to build our deep learning architecture for CBOW model.

- For this, inputs will be context words which are passed to an embedding layer ( initialized with random weights)
- The word embeddings are propogated to a lambda layer where we average out the word embeddings(hence called **CBOW** because we don't really consider the order or sequence in the context words when averaged)
- Then we pass this averaged context embeddings to a dense softmax layer which predicts target word.
- We match this with the actual target word,compute the loss by leveraging the `categorical_crossentropy` loss and perform backpropogation with each epoch to update the embedding layer in the process

In [26]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Embedding,Lambda


#Build CBOW architecture
cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size,output_dim=embed_size,
                  input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x,axis=1),output_shape=(embed_size,)))
cbow.add(Dense(vocab_size,activation='softmax'))
cbow.compile(loss='categorical_crossentropy',optimizer='rmsprop')

#View Model summary
print(cbow.summary())

'''

# visualize model structure
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot


SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, 
                 rankdir='TB').create(prog='dot', format='svg'))
                 '''

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 4, 100)            1272600   
_________________________________________________________________
lambda_9 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 12726)             1285326   
Total params: 2,557,926
Trainable params: 2,557,926
Non-trainable params: 0
_________________________________________________________________
None


"\n\n# visualize model structure\nimport pydot as pyd\nfrom IPython.display import SVG\nfrom keras.utils.vis_utils import model_to_dot\n\n\nSVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, \n                 rankdir='TB').create(prog='dot', format='svg'))\n                 "

### Train the model

In [28]:
for epoch in range(1,6):
    loss=0.
    i=0
    for x,y in generate_context_word_pairs(corpus=wids,window_size=window_size,vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 ==0:
            print('Processed {} (context,word) pairs'.format(i))
            
    print('Epoch: ',epoch, '\tLoss:',loss)
    print()

KeyboardInterrupt: 

### Get word embedding

In [None]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

Now let's find out contexually similar words

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

#Compute pairwise distance matrix
distance_matrix=euclidean_distances(weights)
print(distance_matrix.shape)

#View contexually similar words
similar_words={search_term:[id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
              for search_term in ['god','jesus','noah','egypt','john','gospel','moses','famine']}

similar_words