# How to do word embeddings manually:
1. Convert the text into a one_hot representation (with Keras `one_hot`, each word becomes an integer index rather than a full one-hot vector).
2. Apply padding so that all the sentences in the corpus have the same length.
3. Then apply the word embedding to the padded sequences.

In [3]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [8]:
# Toy corpus to begin with: four short sentences of different lengths.
data = [
    "hello good morning",
    "I am chandu",
    "I am a data scientist",
    "I love to have espresso while coding",
]

In [11]:
# Encode each sentence of the corpus as a list of integer word indices.
vocab_size = 5000  # size of the index space handed to one_hot

onehot_representation = [
    one_hot(sentence, vocab_size)
    for sentence in data
]

In [12]:
# Show the encoded corpus: one list of integer word indices per sentence.
onehot_representation

[[1786, 1709, 789],
 [3867, 2619, 2708],
 [3867, 2619, 4265, 517, 4917],
 [3867, 4504, 4404, 1975, 4020, 2031, 2077]]

In [14]:
# Pad every encoded sentence so that all of them share one common length.

# The longest sentence (the last one in the corpus) has 7 words;
# 8 is an arbitrary choice just above that.
sent_length = 8

# 'pre' padding places the filler zeros in front of the word indices.
embedded_doc = pad_sequences(onehot_representation, maxlen=sent_length, padding='pre')

embedded_doc


array([[   0,    0,    0,    0,    0, 1786, 1709,  789],
       [   0,    0,    0,    0,    0, 3867, 2619, 2708],
       [   0,    0,    0, 3867, 2619, 4265,  517, 4917],
       [   0, 3867, 4504, 4404, 1975, 4020, 2031, 2077]])

In [15]:
# Intended embedding dimensionality (number of features per word vector).
dim = 10

In [17]:
# Single-layer model: maps every integer word index of a padded sentence
# to a trainable vector with `dim` features.
model = Sequential()

# Use the `dim` constant defined in the previous cell instead of repeating the
# literal 10, so the embedding size is configured in exactly one place.
model.add(Embedding(input_dim=vocab_size, output_dim=dim, input_length=sent_length))

In [18]:
# 'rmse' is not a loss identifier Keras provides; 'mse' (mean squared error) is.
# The loss only comes into play if the model is trained later on.
model.compile(optimizer='adam', loss='mse')

In [19]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             50000     
                                                                 
Total params: 50,000
Trainable params: 50,000
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Run the padded index sequences through the embedding layer to look up
# one vector per position of every sentence.
embedded_vect= model.predict(embedded_doc)



In [21]:
# Show the looked-up vectors; positions holding the padding index 0 all map to the same vector.
embedded_vect

array([[[ 0.00718312, -0.04502806,  0.0444039 ,  0.00205811,
         -0.00998566, -0.04020165,  0.00405882, -0.04338538,
         -0.02190809, -0.04815148],
        [ 0.00718312, -0.04502806,  0.0444039 ,  0.00205811,
         -0.00998566, -0.04020165,  0.00405882, -0.04338538,
         -0.02190809, -0.04815148],
        [ 0.00718312, -0.04502806,  0.0444039 ,  0.00205811,
         -0.00998566, -0.04020165,  0.00405882, -0.04338538,
         -0.02190809, -0.04815148],
        [ 0.00718312, -0.04502806,  0.0444039 ,  0.00205811,
         -0.00998566, -0.04020165,  0.00405882, -0.04338538,
         -0.02190809, -0.04815148],
        [ 0.00718312, -0.04502806,  0.0444039 ,  0.00205811,
         -0.00998566, -0.04020165,  0.00405882, -0.04338538,
         -0.02190809, -0.04815148],
        [-0.03956305, -0.04336307,  0.008479  ,  0.03146017,
         -0.02979231, -0.00709634,  0.04044194, -0.02009124,
         -0.00249628, -0.00305982],
        [-0.02908537,  0.03806306,  0.01685245, -0.0