<a href="https://colab.research.google.com/github/Balajilp/DL-RNN-learning/blob/main/word_embedding_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Word Embedding Techniques Using Embedding Layers in Keras

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the cells below encode and embed
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'i am a good boy',
    'i am a good Datascienctist',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Echo the corpus so its contents are visible in the notebook output
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'i am a good boy',
 'i am a good Datascienctist',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: passed to one_hot and to the Embedding layer in later cells
voc_size = 10_000

### One-hot representation

In [7]:
# one_hot takes (text, vocabulary size) and returns one integer index per word,
# so each sentence becomes a list of indices rather than a sparse 0/1 vector.
# Per the Keras docs the indices come from hashing each word into [1, voc_size),
# not from a fitted dictionary, so distinct words can occasionally collide.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
onehot_repr

[[734, 9618, 9959, 8431],
 [734, 9618, 9959, 741],
 [734, 3071, 9959, 9973],
 [3680, 8697, 1049, 5093, 8504],
 [3680, 8697, 1049, 5093, 3143],
 [2243, 734, 3558, 9959, 5457],
 [4636, 5879, 3205, 5093]]

### Word embedding representation

In [10]:
from tensorflow.keras.layers import Embedding
# pad_sequences is important: every sentence passed to the Embedding layer must
# contain the same number of word indices, and it pads them to a common length.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

In [11]:
# Common length for every padded sentence; tune this for the problem at hand
sent_length = 8
# Left-pad ('pre') each index list with zeros up to sent_length entries
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
embedded_docs

array([[   0,    0,    0,    0,  734, 9618, 9959, 8431],
       [   0,    0,    0,    0,  734, 9618, 9959,  741],
       [   0,    0,    0,    0,  734, 3071, 9959, 9973],
       [   0,    0,    0, 3680, 8697, 1049, 5093, 8504],
       [   0,    0,    0, 3680, 8697, 1049, 5093, 3143],
       [   0,    0,    0, 2243,  734, 3558, 9959, 5457],
       [   0,    0,    0,    0, 4636, 5879, 3205, 5093]], dtype=int32)

In [12]:
# Intended size of each word's embedding vector (the Embedding layer below
# produces 10 features per word)
dim = 10

In [13]:
# Single-layer model: an Embedding lookup that maps each of the voc_size word
# indices to a dim-sized vector, for sentences of sent_length indices.
model = Sequential()
# Use the `dim` constant from the previous cell instead of repeating the
# literal 10, so the embedding size is configured in exactly one place.
model.add(Embedding(voc_size, dim, input_length=sent_length))
# Compile with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [14]:
# Print the architecture; the output below reports an output shape of
# (None, 8, 10) and 100,000 parameters for the embedding layer.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Look up the embedding vectors for every padded sentence at once.
# In the output below, positions holding the padding index 0 all show the
# same vector, since they share one row of the embedding table.
model.predict(embedded_docs)

array([[[-0.02686136, -0.01546522, -0.02546816, -0.00625422,
         -0.01392877,  0.00139062,  0.03722961, -0.01597352,
         -0.02980516, -0.02047476],
        [-0.02686136, -0.01546522, -0.02546816, -0.00625422,
         -0.01392877,  0.00139062,  0.03722961, -0.01597352,
         -0.02980516, -0.02047476],
        [-0.02686136, -0.01546522, -0.02546816, -0.00625422,
         -0.01392877,  0.00139062,  0.03722961, -0.01597352,
         -0.02980516, -0.02047476],
        [-0.02686136, -0.01546522, -0.02546816, -0.00625422,
         -0.01392877,  0.00139062,  0.03722961, -0.01597352,
         -0.02980516, -0.02047476],
        [ 0.02281551, -0.01088864, -0.0478619 , -0.0128717 ,
          0.03360898, -0.03853196,  0.0018452 , -0.03480992,
          0.03600944, -0.01414161],
        [-0.01712074, -0.02264278,  0.0157434 ,  0.00382141,
          0.01170456, -0.02201074, -0.0108602 ,  0.01404184,
         -0.01328126,  0.0356806 ],
        [ 0.00015704,  0.02468461,  0.02981417,  0.0

In [16]:
# Predict for the first sentence only.
# embedded_docs[0] is 1-D with sent_length entries, which Keras would read as
# sent_length separate one-word samples rather than one sentence. Slicing with
# [:1] keeps the batch axis so the model sees a single full-length sentence,
# and [0] drops that axis from the result, leaving one embedding vector per
# word position.
model.predict(embedded_docs[:1])[0]



array([[-0.02686136, -0.01546522, -0.02546816, -0.00625422, -0.01392877,
         0.00139062,  0.03722961, -0.01597352, -0.02980516, -0.02047476],
       [-0.02686136, -0.01546522, -0.02546816, -0.00625422, -0.01392877,
         0.00139062,  0.03722961, -0.01597352, -0.02980516, -0.02047476],
       [-0.02686136, -0.01546522, -0.02546816, -0.00625422, -0.01392877,
         0.00139062,  0.03722961, -0.01597352, -0.02980516, -0.02047476],
       [-0.02686136, -0.01546522, -0.02546816, -0.00625422, -0.01392877,
         0.00139062,  0.03722961, -0.01597352, -0.02980516, -0.02047476],
       [ 0.02281551, -0.01088864, -0.0478619 , -0.0128717 ,  0.03360898,
        -0.03853196,  0.0018452 , -0.03480992,  0.03600944, -0.01414161],
       [-0.01712074, -0.02264278,  0.0157434 ,  0.00382141,  0.01170456,
        -0.02201074, -0.0108602 ,  0.01404184, -0.01328126,  0.0356806 ],
       [ 0.00015704,  0.02468461,  0.02981417,  0.02126857,  0.048188  ,
         0.00668966,  0.00861318, -0.02519759