In [1]:
### Libraries used: TensorFlow (> 2.0) and Keras

In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
## Toy corpus: seven short sentences that the rest of the notebook encodes
## (note: the sixth entry starts with a space, kept exactly as given)
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    " understand the meaning of words",
    "your videos are good",
]

In [4]:
# Vocabulary size: the word indices produced below are drawn from this range.
voc_size = 10_000

In [5]:
# Encode every sentence as a list of integer word indices (one index per word).
# Despite the function name, the result is a list of indices, not one-hot vectors;
# the same word gets the same index in every sentence (see the printed output).
onehot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
print(onehot_repr)

[[1395, 6187, 3515, 1154], [1395, 6187, 3515, 7404], [1395, 7465, 3515, 3281], [8215, 7190, 7868, 8283, 6488], [8215, 7190, 7868, 8283, 3614], [566, 1395, 7577, 3515, 443], [4298, 1888, 9705, 8283]]


#### Word embedding representation

In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [7]:
import numpy as np

In [8]:
# Whatever one-hot (index) representation we have, we pass it through the Keras Embedding layer to form an embedding matrix.
# We have to specify how many dimensions (features) each word vector should have.

In [9]:
# Everything fed to the Embedding layer must have the same number of words per
# sentence, so each index list is brought to a fixed length of 8:
# shorter sentences get 0s added at the front ('pre' padding).
sent_length = 8
embedding_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)

In [10]:
# Intended number of features per word vector (the embedding dimension).
dim = 10  #for featurized representation

In [11]:
# Display the padded index matrix: one row per sentence, each row left-padded
# with 0 so that it has exactly sent_length entries.
embedding_docs

array([[   0,    0,    0,    0, 1395, 6187, 3515, 1154],
       [   0,    0,    0,    0, 1395, 6187, 3515, 7404],
       [   0,    0,    0,    0, 1395, 7465, 3515, 3281],
       [   0,    0,    0, 8215, 7190, 7868, 8283, 6488],
       [   0,    0,    0, 8215, 7190, 7868, 8283, 3614],
       [   0,    0,    0,  566, 1395, 7577, 3515,  443],
       [   0,    0,    0,    0, 4298, 1888, 9705, 8283]])

In [12]:
# Build a one-layer model: an Embedding that maps each of the voc_size possible
# word indices to a dense vector of `dim` values, for inputs of sent_length tokens.
model = Sequential()
# Use `dim` (defined in an earlier cell) instead of repeating the literal 10,
# so the embedding width is controlled from a single place.
model.add(Embedding(voc_size, dim, input_length=sent_length))
# Adam optimizer with mean-squared-error loss.
model.compile("adam", 'mse')

In [13]:
# Print the layer table (layer names, output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Run the whole padded batch through the model: every integer index is replaced
# by its vector from the Embedding layer, giving one vector per token position
# for each sentence. Positions holding the padding index 0 all map to the same
# vector (visible as the repeated leading rows in the output).
# NOTE(review): no fit() call appears in the cells shown, so presumably these
# are the layer's initial weights and the numbers will differ between runs.
print(model.predict(embedding_docs))

[[[-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909
   -0.01005045 -0.04344146 -0.023806    0.02904824 -0.00866426]
  [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909
   -0.01005045 -0.04344146 -0.023806    0.02904824 -0.00866426]
  [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909
   -0.01005045 -0.04344146 -0.023806    0.02904824 -0.00866426]
  [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909
   -0.01005045 -0.04344146 -0.023806    0.02904824 -0.00866426]
  [ 0.00273933  0.04234305  0.03527263 -0.00925684 -0.04364363
   -0.00801913  0.0009814  -0.00891588  0.04803124 -0.02196107]
  [-0.04260963  0.0247821  -0.01513176 -0.02422612 -0.04363854
   -0.04477165 -0.00795441  0.03376127  0.02213181 -0.0290828 ]
  [-0.03121476  0.0393716  -0.03302357  0.04804705  0.02047126
    0.02938889  0.01345886  0.01438488 -0.0476303   0.01695079]
  [ 0.02368537  0.00670565 -0.00297441  0.0228155  -0.04608826
    0.03962917 -0.04822322 -0.01776301  0.015970

In [15]:
# First padded sentence ('the glass of milk'): leading zeros from the padding,
# followed by the four word indices.
embedding_docs[0]

array([   0,    0,    0,    0, 1395, 6187, 3515, 1154])

In [16]:
# each word index gets converted into a 10-dimensional vector

In [17]:
# this is how we form the embedding matrix

In [18]:
# Embed only the first sentence. The model takes a batch of sentences shaped
# (batch, sent_length), so slice with [:1] to keep a batch axis of size 1
# instead of passing the bare 1-D vector (which would be read as 8 separate
# one-token samples rather than one 8-token sentence). Indexing the result
# with [0] drops the batch axis again, leaving one row per token position.
print(model.predict(embedding_docs[:1])[0])

[[-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909 -0.01005045
  -0.04344146 -0.023806    0.02904824 -0.00866426]
 [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909 -0.01005045
  -0.04344146 -0.023806    0.02904824 -0.00866426]
 [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909 -0.01005045
  -0.04344146 -0.023806    0.02904824 -0.00866426]
 [-0.04460268 -0.03839206 -0.01519613  0.00706003 -0.03431909 -0.01005045
  -0.04344146 -0.023806    0.02904824 -0.00866426]
 [ 0.00273933  0.04234305  0.03527263 -0.00925684 -0.04364363 -0.00801913
   0.0009814  -0.00891588  0.04803124 -0.02196107]
 [-0.04260963  0.0247821  -0.01513176 -0.02422612 -0.04363854 -0.04477165
  -0.00795441  0.03376127  0.02213181 -0.0290828 ]
 [-0.03121476  0.0393716  -0.03302357  0.04804705  0.02047126  0.02938889
   0.01345886  0.01438488 -0.0476303   0.01695079]
 [ 0.02368537  0.00670565 -0.00297441  0.0228155  -0.04608826  0.03962917
  -0.04822322 -0.01776301  0.0159702   0.0131462 ]]

In [19]:
# this is the word-vector representation of the sentence, produced by the embedding layer