In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
### Example corpus: seven short sentences used throughout the notebook
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]


In [4]:
# Display the corpus to confirm its contents before encoding.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [5]:
## Vocabulary size: passed to one_hot (index range) and to the Embedding layer
voc_size = 10_000

In [6]:
### Integer encoding of each sentence
# Each sentence becomes a list of integer word indices, one per word; a word
# that repeats across sentences gets the same index (see the output below).
# NOTE(review): despite the name, one_hot returns integer indices rather than
# one-hot vectors, and it presumably relies on a hash of the word — confirm
# whether the indices are stable across Python sessions.
one_hot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
one_hot_repr

[[9346, 8662, 2690, 6058],
 [9346, 8662, 2690, 8001],
 [9346, 3507, 2690, 332],
 [2676, 5157, 5501, 2027, 7875],
 [2676, 5157, 5501, 2027, 2627],
 [3528, 9346, 2807, 2690, 5789],
 [4232, 7895, 2332, 2027]]

In [7]:
from tensorflow.keras.layers import Embedding
#from tensorflow.keras.processing.sequence import pad_sequences

# The sentences contain different numbers of words, so we pad them all to the same length; this is done by pad_sequences
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [8]:
import numpy as np

In [9]:
# Bring every encoded sentence to the same length. pad_sequences fills the
# shorter ones with zeros, either at the start or at the end; padding='pre'
# puts the zeros at the start.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 9346 8662 2690 6058]
 [   0    0    0    0 9346 8662 2690 8001]
 [   0    0    0    0 9346 3507 2690  332]
 [   0    0    0 2676 5157 5501 2027 7875]
 [   0    0    0 2676 5157 5501 2027 2627]
 [   0    0    0 3528 9346 2807 2690 5789]
 [   0    0    0    0 4232 7895 2332 2027]]


In [10]:
## Number of features in each word's embedding vector
dim = 10

In [11]:
# Build a sequential model whose only layer is a word-embedding lookup.
model = Sequential()
# Declare the input shape explicitly rather than passing `input_length` to
# Embedding: that argument is deprecated in newer Keras releases, and an
# explicit Input keeps the model built so model.summary() can report shapes.
# Each sample is one padded sentence of `sent_length` integer word indices.
model.add(tf.keras.Input(shape=(sent_length,), dtype="int32"))
# The Embedding layer holds a (voc_size, dim) table: one dim-sized dense
# vector for each word index in the vocabulary.
model.add(Embedding(voc_size, dim))

# NOTE(review): no fit() call is visible in this section, so the optimizer
# and loss chosen here do not influence the predictions shown below.
model.compile('adam', 'mse')

In [12]:
# Print the layer table. The embedding layer maps each padded sentence to a
# (sent_length, dim) matrix and holds voc_size * dim trainable weights.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Look up the embedding of every token in every padded sentence: each word
# index is replaced by a vector of `dim` (10) features, so identical indices
# (e.g. the padding zeros) yield identical vectors.
# NOTE(review): no fit() call is visible above, so these values are presumably
# the layer's initial weights rather than learned embeddings — confirm.
# NOTE(review): this displays the whole array; consider showing only its shape
# or a small slice to keep the notebook output readable.
model.predict(embedded_docs)



array([[[-1.71848312e-02,  3.07143666e-02,  4.01177444e-02,
          9.99944285e-03, -3.45014222e-02, -4.91489880e-02,
          1.31797530e-02,  3.35771330e-02,  1.21164098e-02,
          1.54869594e-02],
        [-1.71848312e-02,  3.07143666e-02,  4.01177444e-02,
          9.99944285e-03, -3.45014222e-02, -4.91489880e-02,
          1.31797530e-02,  3.35771330e-02,  1.21164098e-02,
          1.54869594e-02],
        [-1.71848312e-02,  3.07143666e-02,  4.01177444e-02,
          9.99944285e-03, -3.45014222e-02, -4.91489880e-02,
          1.31797530e-02,  3.35771330e-02,  1.21164098e-02,
          1.54869594e-02],
        [-1.71848312e-02,  3.07143666e-02,  4.01177444e-02,
          9.99944285e-03, -3.45014222e-02, -4.91489880e-02,
          1.31797530e-02,  3.35771330e-02,  1.21164098e-02,
          1.54869594e-02],
        [ 3.49329002e-02,  1.83907263e-02,  3.16467136e-03,
         -1.77908540e-02,  9.66268778e-03,  3.52028944e-02,
          3.89840119e-02,  2.97502428e-03, -8.538626

In [16]:
# First padded sentence: leading zeros from the padding, then its word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 9346, 8662, 2690, 6058], dtype=int32)

In [17]:
# Embed only the first sentence. predict() treats the first axis as the batch
# axis, so passing the 1-D row embedded_docs[0] would feed each token as a
# separate sample and mismatch the model's (batch, sent_length) input.
# Slicing with [:1] keeps a proper one-sentence batch; [0] then drops the
# batch axis so the displayed result is the (sent_length, dim) matrix.
model.predict(embedded_docs[:1])[0]



array([[-0.01718483,  0.03071437,  0.04011774,  0.00999944, -0.03450142,
        -0.04914899,  0.01317975,  0.03357713,  0.01211641,  0.01548696],
       [-0.01718483,  0.03071437,  0.04011774,  0.00999944, -0.03450142,
        -0.04914899,  0.01317975,  0.03357713,  0.01211641,  0.01548696],
       [-0.01718483,  0.03071437,  0.04011774,  0.00999944, -0.03450142,
        -0.04914899,  0.01317975,  0.03357713,  0.01211641,  0.01548696],
       [-0.01718483,  0.03071437,  0.04011774,  0.00999944, -0.03450142,
        -0.04914899,  0.01317975,  0.03357713,  0.01211641,  0.01548696],
       [ 0.0349329 ,  0.01839073,  0.00316467, -0.01779085,  0.00966269,
         0.03520289,  0.03898401,  0.00297502, -0.00853863,  0.00493679],
       [-0.04702666, -0.03359659,  0.01449528, -0.00443111,  0.00516738,
        -0.01924445,  0.01657871,  0.00743727, -0.02164837, -0.02748278],
       [ 0.00680507, -0.02746353, -0.03649353, -0.03620485, -0.02575088,
        -0.04266874, -0.04933197, -0.0422968 