# Word Embedding Techniques using Embedding Layer in Keras

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the following cells encode and embed.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Echo the list as the cell's output to confirm its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: the upper bound handed to `one_hot` and the number of rows
# in the Embedding layer's lookup table.
voc_size = 10_000

## One hot representation:

In [5]:
# Encode each sentence as a list of integer word indices below `voc_size`.
# NOTE(review): Keras `one_hot` appears to rely on Python's built-in `hash`,
# so the exact indices may differ between kernel sessions — confirm.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
print(onehot_repr)

[[5685, 1019, 6790, 459], [5685, 1019, 6790, 678], [5685, 3024, 6790, 2642], [2545, 1914, 2163, 2463, 2565], [2545, 1914, 2163, 2463, 8797], [974, 5685, 3593, 6790, 9082], [4172, 9092, 9784, 2463]]


##### As we can see, every sentence is now represented as a vector of integer word indices, with each index value being less than the vocabulary size of 10000.

## Word Embedding Representation:

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

In [10]:
# Left-pad ('pre') every encoded sentence with zeros so that all rows share
# the same fixed length of `sent_length` entries.
sent_length = 8
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')

In [11]:
# Show the padded matrix: one row per sentence, zeros filling the left side.
print(embedded_docs)

[[   0    0    0    0 5685 1019 6790  459]
 [   0    0    0    0 5685 1019 6790  678]
 [   0    0    0    0 5685 3024 6790 2642]
 [   0    0    0 2545 1914 2163 2463 2565]
 [   0    0    0 2545 1914 2163 2463 8797]
 [   0    0    0  974 5685 3593 6790 9082]
 [   0    0    0    0 4172 9092 9784 2463]]


In [12]:
# Intended size of each word's embedding vector.
dim = 10

In [14]:
# Build a model consisting of a single Embedding layer that maps each of the
# `sent_length` word indices in a padded sentence to a `dim`-dimensional vector.
model = Sequential()
# Pass `dim` instead of repeating the literal 10 so the embedding size is
# controlled from the one place where it is defined.
model.add(Embedding(voc_size, dim, input_length=sent_length))
# The model is only compiled here; no training step appears in the cells shown.
model.compile(optimizer='adam', loss='mse')

In [15]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Run the padded index sequences through the embedding layer and print the
# resulting vectors (one vector per position in each sentence). No training
# step is shown above, so these presumably reflect the initial weights.
print(model.predict(embedded_docs))

[[[-0.02644897 -0.02053416 -0.02930422  0.01099825 -0.02618381
    0.03773497 -0.02158752 -0.01275078  0.02579759  0.04763069]
  [-0.02644897 -0.02053416 -0.02930422  0.01099825 -0.02618381
    0.03773497 -0.02158752 -0.01275078  0.02579759  0.04763069]
  [-0.02644897 -0.02053416 -0.02930422  0.01099825 -0.02618381
    0.03773497 -0.02158752 -0.01275078  0.02579759  0.04763069]
  [-0.02644897 -0.02053416 -0.02930422  0.01099825 -0.02618381
    0.03773497 -0.02158752 -0.01275078  0.02579759  0.04763069]
  [-0.02912262 -0.00034293  0.03887353  0.02575464 -0.0042394
    0.00769556 -0.00589733 -0.01837502  0.0262208  -0.0152835 ]
  [ 0.00863483 -0.0084656   0.03457877 -0.04897263  0.0005412
    0.03594049  0.01388254 -0.00744691  0.0480862   0.02348939]
  [-0.03852201 -0.00013697 -0.02441045 -0.02836829 -0.02921116
   -0.0412895   0.04867649 -0.03033539  0.00106522  0.02487153]
  [-0.04711483  0.00965372  0.0440156   0.01541021 -0.02727323
    0.00643445 -0.0451892  -0.04868964  0.00426443