In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the encoding step below consumes.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Echo the corpus so its contents are visible before it is encoded.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
## Vocabulary size.
# Passed to one_hot() as the size of the index range words are mapped into,
# and to the Embedding layer as its input dimension.
voc_size = 10_000


In [5]:
## Integer-encode every sentence of the corpus.
# Despite its name, one_hot() does not build one-hot vectors: for each sentence
# it returns a list with one integer index per word (see the output below).
# Per the Keras docs the indices come from hashing, so two different words
# may occasionally share an index.
one_hot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
one_hot_repr

[[6694, 2904, 5217, 5456],
 [6694, 2904, 5217, 8540],
 [6694, 7221, 5217, 1591],
 [5448, 1900, 7039, 8262, 7159],
 [5448, 1900, 7039, 8262, 3503],
 [9409, 6694, 2968, 5217, 5392],
 [340, 1073, 3513, 8262]]

In [6]:
## Word Embedding Representation
from tensorflow.keras.layers import Embedding
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [7]:
import numpy as np

In [8]:
## pad_sequences forces every encoded sentence to the same length (maxlen).
#  - Shorter sentences are padded with zeros. padding='pre' puts the zeros at
#    the start of the sentence; padding='post' would put them at the end.
#    Example with maxlen=8: a 5-word sentence gets 3 leading zeros.
#  - Longer sentences are truncated. truncating='pre' (the Keras default,
#    spelled out here) drops words from the START, so a 10-word sentence
#    loses its first 2 words; truncating='post' would drop the last 2 instead.
sentecelength = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sentecelength,
    padding='pre',
    truncating='pre',
)
print(embedded_docs)

[[   0    0    0    0 6694 2904 5217 5456]
 [   0    0    0    0 6694 2904 5217 8540]
 [   0    0    0    0 6694 7221 5217 1591]
 [   0    0    0 5448 1900 7039 8262 7159]
 [   0    0    0 5448 1900 7039 8262 3503]
 [   0    0    0 9409 6694 2968 5217 5392]
 [   0    0    0    0  340 1073 3513 8262]]


In [9]:
## Feature representation
# Number of dimensions of each word's embedding vector. The Google dataset
# seen earlier used 300 dimensions (presumably the pretrained word2vec
# vectors -- confirm); 10 is enough for this small demo.
dim = 10

In [10]:
## Build a one-layer model that maps every word index to a `dim`-dimensional vector.
# Fix the RNG seed so the randomly initialised embedding weights (and therefore
# the predictions further down) are identical on every Restart & Run All.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Declare the input shape with an explicit Input instead of Embedding's
# `input_length` argument, which Keras 3 deprecates. Giving the model a known
# input shape also builds it, so model.summary() can report output shapes and
# parameter counts.
model.add(tf.keras.Input(shape=(sentecelength,), dtype="int32"))
model.add(Embedding(voc_size, dim))
## Adam is the optimizer and the loss function is mse = mean squared error.
## No fit() call is visible in this part of the notebook, so these settings
## do not influence the predict() output shown below.
model.compile('adam', 'mse')



In [11]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Run the padded index matrix through the model to look up the embedding
# vector of every position of every sentence.
model.predict(embedded_docs)
## Every word below got represented in 10 dimensions.
## embedded_docs holds 7 sentences x 8 positions and dim is 10, so the result
## is a 7 x 8 x 10 array. The identical rows at the start of each sentence are
## the vector of the padding index 0.

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 292ms/step


array([[[ 0.0416958 ,  0.02233291, -0.04800744,  0.0087273 ,
          0.01782941, -0.0219224 ,  0.02830359,  0.04281769,
         -0.00219993, -0.01662966],
        [ 0.0416958 ,  0.02233291, -0.04800744,  0.0087273 ,
          0.01782941, -0.0219224 ,  0.02830359,  0.04281769,
         -0.00219993, -0.01662966],
        [ 0.0416958 ,  0.02233291, -0.04800744,  0.0087273 ,
          0.01782941, -0.0219224 ,  0.02830359,  0.04281769,
         -0.00219993, -0.01662966],
        [ 0.0416958 ,  0.02233291, -0.04800744,  0.0087273 ,
          0.01782941, -0.0219224 ,  0.02830359,  0.04281769,
         -0.00219993, -0.01662966],
        [-0.04991701,  0.00281075, -0.03792243, -0.00266367,
         -0.00681318,  0.01449782, -0.01147698, -0.02471963,
         -0.04333406,  0.01166294],
        [ 0.04389435,  0.03135283,  0.04166033, -0.01672582,
          0.04596123,  0.01700005, -0.03213011, -0.01265623,
         -0.00753119, -0.03311183],
        [ 0.0161002 ,  0.0327499 , -0.03339895, -0.0

In [64]:
## Compare the first padded sentence with its embedding.
# print() is required for the indices: a notebook only auto-displays the LAST
# expression of a cell, so a bare `embedded_docs[0]` on this line would be
# evaluated and silently discarded.
print(embedded_docs[0])
# 8 x 10 array: one 10-dimensional vector per position of the first sentence;
# the identical leading rows are the vector of the padding index 0.
model.predict(embedded_docs)[0]

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 49ms/step


array([[ 0.02671435, -0.03399097,  0.02016089, -0.01410593,  0.00584571,
         0.00208769, -0.0128706 , -0.04950658,  0.02407682, -0.00882763],
       [ 0.02671435, -0.03399097,  0.02016089, -0.01410593,  0.00584571,
         0.00208769, -0.0128706 , -0.04950658,  0.02407682, -0.00882763],
       [ 0.02671435, -0.03399097,  0.02016089, -0.01410593,  0.00584571,
         0.00208769, -0.0128706 , -0.04950658,  0.02407682, -0.00882763],
       [ 0.02671435, -0.03399097,  0.02016089, -0.01410593,  0.00584571,
         0.00208769, -0.0128706 , -0.04950658,  0.02407682, -0.00882763],
       [-0.01154772,  0.04928457,  0.0069805 , -0.01935471,  0.0386242 ,
         0.04278148, -0.02126749, -0.04814367,  0.03042353, -0.00166864],
       [ 0.02470003,  0.02382613, -0.02972004, -0.03233521, -0.04616944,
        -0.02307711,  0.04590144,  0.02033205,  0.04375106,  0.01047163],
       [-0.01855172,  0.01045214, -0.03014507, -0.00065754, -0.03097663,
        -0.02597007,  0.04852624,  0.02831651