In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

In [2]:
# Toy corpus: ten short English sentences used throughout the notebook.
sent = [
    "Cats love chasing mice",
    "Dogs enjoy chasing balls",
    "Birds fly in the sky",
    "Fish swim in clear waters",
    "The sun sets beautifully",
    "Moonlight creates a magical atmosphere",
    "Flowers bloom in spring",
    "Trees provide shade and oxygen",
    "Raindrops fall on the roof",
    "Children play in parks",
]
# Display the number of sentences as the cell result.
len(sent)

10

In [3]:
# Vocabulary size: the bound passed to one_hot() for the integer word indices
# and the number of rows in the Embedding table. Any value that is at least
# the number of distinct words in the corpus is acceptable.
voc_size = 100

In [4]:
# Encode each sentence as a list of integer word indices (one index per word).
# Despite the function name, the result is not a one-hot vector, and distinct
# words can share an index (collisions are visible in the printed output).
# NOTE(review): the indices presumably come from hashing, so they may differ
# between interpreter sessions -- confirm against the Keras documentation.
onehot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
print(onehot_repr)

[[76, 14, 5, 90], [11, 14, 5, 85], [96, 95, 31, 82, 43], [34, 1, 31, 51, 76], [82, 80, 22, 29], [91, 97, 89, 26, 67], [34, 41, 31, 26], [66, 87, 50, 51, 62], [29, 47, 62, 82, 4], [86, 76, 31, 88]]


In [5]:
# Fixed number of positions every encoded sentence is brought to.
sent_length = 8

# Left-pad ('pre') each index list with zeros so that all rows end up with
# exactly sent_length entries; the result is a 2-D integer array.
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[ 0  0  0  0 76 14  5 90]
 [ 0  0  0  0 11 14  5 85]
 [ 0  0  0 96 95 31 82 43]
 [ 0  0  0 34  1 31 51 76]
 [ 0  0  0  0 82 80 22 29]
 [ 0  0  0 91 97 89 26 67]
 [ 0  0  0  0 34 41 31 26]
 [ 0  0  0 66 87 50 51 62]
 [ 0  0  0 29 47 62 82  4]
 [ 0  0  0  0 86 76 31 88]]


In [6]:
# Length of the dense vector the Embedding layer emits for each word index.
dim = 10

# Recap of the three hyper-parameters that define the embedding model.
print(voc_size, sent_length, dim)

100 8 10


In [7]:
# Single-layer model: an Embedding lookup table with voc_size rows and dim
# columns, applied to sequences of sent_length word indices.
model = Sequential([
    Embedding(voc_size, dim, input_length=sent_length),
])
# The model is never fitted in this notebook; compiling with Adam and MSE only
# finalises it so that predict() can be called.
model.compile(optimizer='adam', loss='mse')

In [8]:
# Show the layer table: the Embedding layer maps (batch, sent_length) indices
# to (batch, sent_length, dim) vectors and holds voc_size * dim weights.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             1000      
                                                                 
Total params: 1000 (3.91 KB)
Trainable params: 1000 (3.91 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# Inspect the first padded sentence: zeros on the left, then its word indices.
embedded_docs[0]

array([ 0,  0,  0,  0, 76, 14,  5, 90], dtype=int32)

In [10]:
# Keras models expect a leading batch axis. `embedded_docs[0]` has shape
# (sent_length,), which predict() interprets as sent_length separate samples
# instead of one sentence. Slicing with [:1] keeps the batch axis, giving an
# input of shape (1, sent_length) that matches the model's declared input.
first_doc_batch = embedded_docs[:1]

# The prediction has shape (1, sent_length, dim); take row 0 to display the
# (sent_length, dim) matrix of word vectors for the first sentence. Padding
# positions (index 0) all map to the same embedding row.
first_doc_vectors = model.predict(first_doc_batch)[0]
print(first_doc_vectors)

[[ 0.00362832 -0.03568153 -0.03198759 -0.01476466 -0.00770712  0.02070655
   0.00477032 -0.00237374  0.04972719 -0.01942462]
 [ 0.00362832 -0.03568153 -0.03198759 -0.01476466 -0.00770712  0.02070655
   0.00477032 -0.00237374  0.04972719 -0.01942462]
 [ 0.00362832 -0.03568153 -0.03198759 -0.01476466 -0.00770712  0.02070655
   0.00477032 -0.00237374  0.04972719 -0.01942462]
 [ 0.00362832 -0.03568153 -0.03198759 -0.01476466 -0.00770712  0.02070655
   0.00477032 -0.00237374  0.04972719 -0.01942462]
 [ 0.04054158  0.00317365 -0.01596113 -0.04694506 -0.03707706 -0.04392582
  -0.04030523  0.04150787 -0.01581163  0.02550243]
 [-0.03235699  0.03670971  0.00273252  0.03508884  0.02654843  0.00942119
   0.018214    0.01516372  0.04989288 -0.04306885]
 [-0.04347601  0.00911499  0.03308667 -0.03963071 -0.02530123 -0.01795372
   0.01523474 -0.02648896 -0.02859916 -0.02267624]
 [ 0.00624987 -0.00218208  0.03745656 -0.01519524  0.04138892 -0.00310284
  -0.04673404  0.04585054 -0.03787084  0.02866526]]