In [18]:
from keras.preprocessing import text
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [12]:
data = """Deep learning (also known as deep structured learning) is part of a␣ ↪broader family
of machine learning methods based on artificial neural␣ ↪networks with representation
learning. Learning can be supervised,␣ ↪semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks,␣ ↪deep
reinforcement learning, recurrent neural networks, convolutional neural␣ ↪networks and
Transformers have been applied to fields including computer␣ ↪vision, speech recognition,
natural language processing, machine␣ ↪translation, bioinformatics, drug design, medical
image analysis, climate␣ ↪science, material inspection and board game programs, where
they have␣ ↪produced results comparable to and in some cases surpassing human expert␣
↪performance.
"""
dl_data = data.split()

In [19]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]
vocab_size = len(word2id)
embed_size = 100
window_size = 2
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])


Vocabulary Size: 81
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('and', 4), ('as', 5), ('of', 6), ('neural␣', 7), ('↪networks', 8), ('supervised', 9), ('␣', 10)]


In [21]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            context_words.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])
            label_word.append(word)
            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size,vocab_size=vocab_size):
    if 0 not in x[0]:

        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):',id2word[np.argwhere(y[0])[0][0]])
    if i == 10:
        break
    i += 1


In [23]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size,input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            8100      
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 81)                8181      
                                                                 
Total params: 16281 (63.60 KB)
Trainable params: 16281 (63.60 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [28]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids,window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
    if i % 100000 == 0:
        print('Processed {} (context, word) pairs'.format(i))
    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 437.18522596359253

Epoch: 2 	Loss: 436.8022713661194

Epoch: 3 	Loss: 436.45318508148193

Epoch: 4 	Loss: 436.1224322319031

Epoch: 5 	Loss: 435.8004274368286



In [29]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()


(80, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.001995,-0.011436,0.075044,0.047425,-0.01194,-0.015333,0.029801,0.00683,0.043598,-0.076849,...,-0.053535,0.088919,0.053725,0.005008,0.006755,0.002106,0.0791,-0.020048,-0.003732,0.029792
networks,0.0343,-0.052747,0.066491,0.078028,-0.066471,0.023391,0.005626,0.061527,0.051237,-0.042125,...,-0.065887,-0.012509,0.094437,0.000709,0.021038,0.079572,0.082506,-0.001565,-0.02313,0.015624
and,-0.008408,-0.031874,0.044613,-0.077847,-0.017317,-0.072767,0.044673,-0.011651,-0.07185,-0.062053,...,-0.082288,-0.046847,-0.096221,0.083486,0.082004,0.006684,-0.068995,0.09364,-0.032247,0.015511
as,-0.000307,0.037113,-0.011164,-0.005272,0.018458,-0.01305,-0.034871,-0.030539,-0.046336,-0.02721,...,0.02932,-0.006135,0.006043,-0.027359,-0.019847,-0.035198,-0.013137,0.043853,0.03966,-0.012054
of,0.043565,0.009171,0.009804,-0.022381,-0.033475,-0.049087,0.008909,0.027836,-0.014609,0.024286,...,-0.016846,0.010598,0.026178,0.005373,-0.01223,0.032175,-0.0475,-0.029658,-0.012575,-0.018918


In [31]:
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] for search_term in ['deep']}
similar_words

(80, 80)


{'deep': ['applied', 'bioinformatics', 'learning', 'inspection', 'medical']}