# Import Necessary Packages

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd


In [58]:
data = """Deep learning (also known as deep structured learning) is part of a 
broader family of machine learning methods based on artificial neural networks 
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks, 
deep reinforcement learning, recurrent neural networks, convolutional neural networks and 
Transformers have been applied to fields including computer vision, speech recognition, 
natural language processing, machine translation, bioinformatics, drug design, 
medical image analysis, climate science, material inspection and board game programs, 
where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

# Data Preparation

In [59]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id=tokenizer.word_index
words2id['PAD']=0

id2words={v:k for k,v in words2id.items()}

wids = [[words2id[w] for w in text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(words2id)
embed_size=100
window_size=2

print("Vocabulary size: ", vocab_size)
print("Vocabulary Sample: ", list(words2id.items())[:10])

Vocabulary size:  75
Vocabulary Sample:  [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


# Generate Training Data

In [60]:
def generate_context_word_pair(corpus, window_size, vocab_size):
    context_length=window_size*2

    for words in corpus:
        sentence_length = len(words)

        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <=i <sentence_length
                                 and i!= index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen = context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x,y)



 # Building the CBOW Model

In [61]:
cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x:K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))


cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(cbow.summary())

None


 # Training the Model

In [62]:
for epoch in range(1,6):
    loss=0
    i=0

    for x,y in generate_context_word_pair(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:',epoch, '\tLoss:', loss)
    print()



Epoch: 1 	Loss: 431.72362422943115

Epoch: 2 	Loss: 431.0139813423157

Epoch: 3 	Loss: 429.25389766693115

Epoch: 4 	Loss: 427.4843759536743

Epoch: 5 	Loss: 425.9421081542969



In [63]:
weights=cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,0.019623,-0.015472,-0.030682,-0.008383,0.007776,0.063193,-0.044547,0.012944,-0.030188,-0.009665,...,-0.011485,-0.023949,-0.046942,-0.019864,-0.014411,-0.036033,-0.027973,-0.013447,-0.063946,-0.032568
networks,-0.050105,-0.036108,0.035422,0.040411,0.062042,-0.034405,0.027385,-0.021025,-0.026591,0.01717,...,0.013413,0.046476,0.05309,-0.051544,0.02344,0.030803,-0.025256,-0.003942,-0.024713,-0.052283
neural,0.037188,-0.034261,0.015794,-0.042897,0.015623,-0.013113,-0.038284,-0.037851,0.027928,0.027009,...,0.01071,0.006225,-0.013257,0.048861,0.022655,-0.019808,-0.048734,-0.037118,0.003858,-0.031201
and,-0.032364,0.030334,0.027834,-0.015151,-0.021081,0.007244,-0.03612,0.033495,-0.008872,0.047202,...,0.003559,-0.039277,0.038549,-0.035929,0.024735,0.013316,-0.044594,-0.049898,-0.045405,0.014855
as,-0.003054,-0.014807,0.022494,-0.04454,-0.026504,-0.037334,0.002657,0.005635,-0.003892,-0.009022,...,-0.043143,0.00661,-0.033644,-0.011659,0.037333,-0.027747,0.037625,-0.038943,-0.045849,0.014122


 # Output

In [64]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

inwords = input()

similar_words = {search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
                   for search_term in {inwords}}

similar_words

(74, 74)
machine


{'machine': ['of',
  'processing',
  'material',
  'architectures',
  'semi',
  'translation']}