# NLP : Convert words to numbers and get Embedded Vectors 

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Dense,Flatten
from keras.models import Sequential

In [2]:
# data
reviews = [
    
    'nice food',
    'very good food',
    'delicious cusine',
    'nice hotel',
    'poor test',
    'bad service',
    'bad staff',
    'worst food'
]

labels = np.array([1,1,1,1,0,0,0,0])

### Word to vector conversion using one_hot encoding

In [3]:
# now lets convert reviews in to number using one hot encoding
vocabulary_size = 40
one_hot('nice food',vocabulary_size)
# this method assign unique values between 0,40 i.e. size of vocabulary

[18, 30]

In [4]:
encoded_reviews = [one_hot(i,vocabulary_size) for i in reviews ]
encoded_reviews

[[18, 30],
 [29, 10, 30],
 [23, 20],
 [18, 33],
 [22, 13],
 [26, 22],
 [26, 15],
 [8, 30]]

### Pad zeros to achive same length of vectors

In [5]:
# We pad 0 to sequence to get fix size vector
max_seq_len = 3
padding_reviews = pad_sequences(encoded_reviews,max_seq_len,padding='post')
padding_reviews

array([[18, 30,  0],
       [29, 10, 30],
       [23, 20,  0],
       [18, 33,  0],
       [22, 13,  0],
       [26, 22,  0],
       [26, 15,  0],
       [ 8, 30,  0]])

In [6]:
X = padding_reviews
y = labels

In [7]:
# What is Embedding  vector?
# When we train the model on data , as a side effect embedding vectors gets generated
# Embedding vectors are nothing buts the weightes used in embedding layer on NN

### Train models and get weightes

In [8]:
# define our embeded vector size
embeded_vector_size = 4
model = Sequential()
model.add(Embedding(vocabulary_size,embeded_vector_size,input_length = max_seq_len, name = 'embedding_layer'))
model.add(Flatten())
model.add(Dense(1,activation = 'sigmoid'))

In [9]:
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [10]:
model.fit(X,y,epochs=50,verbose=0)

<keras.callbacks.History at 0x2164bc17460>

In [11]:
model.evaluate(X,y)



[0.6039316654205322, 1.0]

### Weights of Embedding layer Nothing but our Embedded Vectors

In [12]:
# This are the embedding vectors genrated as by product of model training
embedded_vectors = model.get_layer('embedding_layer').get_weights()
embedded_vectors

[array([[ 0.03384051,  0.00351452, -0.0665306 , -0.0770149 ],
        [-0.01352127,  0.04585799, -0.04243093, -0.04240325],
        [-0.01363826,  0.04615723, -0.03486953, -0.0316488 ],
        [ 0.03666026,  0.0230563 ,  0.04383364, -0.02502699],
        [-0.00818635,  0.01206272, -0.02288911, -0.03219646],
        [ 0.03890054,  0.00865523, -0.02347723, -0.02095464],
        [-0.02427973,  0.0224246 ,  0.01259789,  0.00755932],
        [ 0.04455408,  0.03521914, -0.04374627, -0.02639772],
        [-0.00245443, -0.05883659,  0.06196056,  0.04493309],
        [ 0.04143638, -0.02915286,  0.0222996 , -0.01149797],
        [ 0.00954473,  0.08181886,  0.05412078, -0.01149709],
        [-0.03174105, -0.02863827,  0.01150708,  0.03001643],
        [-0.04593758, -0.03296578,  0.04892499, -0.04888413],
        [-0.06420647, -0.07558623, -0.05121271,  0.04137637],
        [ 0.03444702, -0.0088213 , -0.01221281, -0.03687661],
        [-0.07297999, -0.0813506 , -0.03264644,  0.01776345],
        

### Compare two vectors of same sentiment

In [15]:
# second review 'very good food'
print(embedded_vectors[0][1])
# third review  'delicious cusine'
print(embedded_vectors[0][2])
# They are nearly similiar 

[-0.01352127  0.04585799 -0.04243093 -0.04240325]
[-0.01363826  0.04615723 -0.03486953 -0.0316488 ]
