In [1]:
import pandas as pd

In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
# Toy corpus: seven short sentences used throughout the notebook.
sent = [
    'this is a glass of milk',
    'this is the glass of juice',
    'this is a cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [4]:
# Display the corpus to confirm its contents.
sent

['this is a glass of milk',
 'this is the glass of juice',
 'this is a cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [5]:
# Vocabulary size: passed to one_hot as the index range and to the
# Embedding layer as the number of rows in its lookup table.
vocab_size = 10_000

### One Hot Representation

In [6]:
# Encode every sentence as a list of integer word indices (one list per sentence).
# NOTE(review): one_hot derives indices by hashing, so distinct words may collide
# and the exact values may differ between interpreter sessions -- confirm against
# the Keras documentation before relying on specific index values.
onehot = [
    one_hot(sentence, vocab_size)
    for sentence in sent
]
print(onehot)

[[912, 1475, 5869, 6552, 866, 3143], [912, 1475, 3803, 6552, 866, 2264], [912, 1475, 5869, 2451, 866, 8762], [6872, 1573, 5869, 6712, 991], [6872, 1573, 5869, 6712, 7433], [9174, 3803, 1750, 866, 1542], [2288, 2996, 1305, 6712]]


### Word Embedding Representation


In [7]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [8]:
# Left-pad each encoded sentence with zeros so that every row has the same length.
# Despite its name, `embedding` holds the padded index sequences, not embedding
# vectors; it is the input that is later fed to the Embedding layer.
word_length = 8  # number of positions per sentence after padding
embedding = pad_sequences(sequences=onehot, maxlen=word_length, padding='pre')
print(embedding)

[[   0    0  912 1475 5869 6552  866 3143]
 [   0    0  912 1475 3803 6552  866 2264]
 [   0    0  912 1475 5869 2451  866 8762]
 [   0    0    0 6872 1573 5869 6712  991]
 [   0    0    0 6872 1573 5869 6712 7433]
 [   0    0    0 9174 3803 1750  866 1542]
 [   0    0    0    0 2288 2996 1305 6712]]


In [9]:
# Number of features (dimensions) in each word's embedding vector.
dim = 10

In [10]:
# Single-layer model: the Embedding layer maps each of the `word_length` indices
# in a padded sentence to a `dim`-dimensional vector taken from a table with
# `vocab_size` rows.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=dim, input_length=word_length),
])

# Compile with the Adam optimizer and mean squared error (MSE) loss.
# MSE measures how close predictions are to their targets: it takes the
# differences between them (the "errors"), squares them, and averages the result.
# NOTE(review): no fit() call is visible in this part of the notebook, so these
# settings presumably do not affect the predictions shown below -- confirm.
model.compile(optimizer='adam', loss='mse')

In [11]:
# Print the layer table; the Embedding layer holds vocab_size * dim weights.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Look up the embedding vector for every index of every padded sentence and
# print the whole result (one word_length x dim matrix per sentence).
print(model.predict(embedding))

[[[-3.80615480e-02  8.46632570e-03 -2.84459833e-02  2.53973044e-02
    2.51607411e-02 -9.43381712e-03 -2.51630675e-02  2.27710269e-02
    2.29523294e-02 -2.88811568e-02]
  [-3.80615480e-02  8.46632570e-03 -2.84459833e-02  2.53973044e-02
    2.51607411e-02 -9.43381712e-03 -2.51630675e-02  2.27710269e-02
    2.29523294e-02 -2.88811568e-02]
  [-1.36571042e-02 -1.95696112e-02 -8.11327249e-04 -4.55108993e-02
   -1.67518146e-02 -3.94065157e-02  2.75935568e-02  1.90367140e-02
   -5.15934080e-03  3.04973610e-02]
  [-5.13628870e-03  2.14411281e-02 -4.55908887e-02 -1.11822374e-02
   -2.45620254e-02  3.24725546e-02 -1.36482120e-02 -7.34122843e-03
    3.48598696e-02 -4.09760252e-02]
  [-2.05880534e-02 -4.13604379e-02 -4.65408452e-02  2.05311514e-02
   -1.02624670e-02 -2.20318791e-02 -2.23189723e-02  3.09206583e-02
    2.57676840e-03  2.95643248e-02]
  [-4.79652546e-02 -3.54089364e-02 -1.59786120e-02 -9.37118381e-03
    2.34469287e-02  6.03641197e-03  1.32652558e-02  3.50268967e-02
   -4.08065096e-

In [13]:
# Padded index sequence of the first sentence (leading zeros are padding).
embedding[0]

array([   0,    0,  912, 1475, 5869, 6552,  866, 3143])

In [14]:
# Embedding vectors for the first padded sentence only.
print(model.predict(embedding)[0])

# Each of the 8 positions in the padded sentence is represented by a 10-dimensional vector.
# The first two rows are identical because both positions hold the padding index 0.

[[-0.03806155  0.00846633 -0.02844598  0.0253973   0.02516074 -0.00943382
  -0.02516307  0.02277103  0.02295233 -0.02888116]
 [-0.03806155  0.00846633 -0.02844598  0.0253973   0.02516074 -0.00943382
  -0.02516307  0.02277103  0.02295233 -0.02888116]
 [-0.0136571  -0.01956961 -0.00081133 -0.0455109  -0.01675181 -0.03940652
   0.02759356  0.01903671 -0.00515934  0.03049736]
 [-0.00513629  0.02144113 -0.04559089 -0.01118224 -0.02456203  0.03247255
  -0.01364821 -0.00734123  0.03485987 -0.04097603]
 [-0.02058805 -0.04136044 -0.04654085  0.02053115 -0.01026247 -0.02203188
  -0.02231897  0.03092066  0.00257677  0.02956432]
 [-0.04796525 -0.03540894 -0.01597861 -0.00937118  0.02344693  0.00603641
   0.01326526  0.0350269  -0.04080651 -0.00183482]
 [-0.01799475 -0.02759967 -0.04359691  0.02191963  0.03874857 -0.00596504
  -0.01726647 -0.01576076 -0.02397652  0.01419989]
 [-0.01343412  0.00641962  0.03151956 -0.04359195 -0.0260438   0.01494409
   0.00855828 -0.01783214  0.02086646 -0.0131501 ]]