# ONE HOT ENCODING

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short phrases that the rest of the notebook encodes.
docs = [
    "glass of orange juice",
    "bottle of mango juice",
    "glass of mango juice",
    "drink bottle of banana shake",
    "I want a glass of cold water",
    "The king and the queen",
    "man and woman",
]

In [3]:
# Size of the index space: passed to one_hot() and to the Embedding layer.
vocab_size = 10_000

In [4]:
# Turn every phrase into a list of integer word indices (one index per word).
# Repeated words receive the same index across phrases, as the printed
# result shows for 'of' and 'juice'.
encoded_docs = [
    one_hot(phrase, vocab_size)
    for phrase in docs
]
print(encoded_docs)

[[2049, 1868, 3760, 9302], [9126, 1868, 6515, 9302], [2049, 1868, 6515, 9302], [8776, 9126, 1868, 4595, 7898], [8647, 6492, 8187, 2049, 1868, 810, 2748], [1817, 6740, 1896, 1817, 7800], [2716, 1896, 6369]]


# WORD EMBEDDINGS

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, InputLayer, Embedding, Bidirectional, LSTM, Dense

import numpy as np

In [6]:
# embedding_length: size of each word vector produced by the Embedding layer.
# max_doc_len: fixed number of word indices kept per document.
embedding_length, max_doc_len = 5, 10

# Right-pad with zeros (and cut from the right if too long) so that every
# document becomes a row of exactly max_doc_len indices.
pad_encoded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_doc_len,
    padding='post',
    truncating='post',
)

print(pad_encoded_docs)

[[2049 1868 3760 9302    0    0    0    0    0    0]
 [9126 1868 6515 9302    0    0    0    0    0    0]
 [2049 1868 6515 9302    0    0    0    0    0    0]
 [8776 9126 1868 4595 7898    0    0    0    0    0]
 [8647 6492 8187 2049 1868  810 2748    0    0    0]
 [1817 6740 1896 1817 7800    0    0    0    0    0]
 [2716 1896 6369    0    0    0    0    0    0    0]]


In [7]:
# Sequence-classifier skeleton:
#   word indices -> Embedding -> Bidirectional LSTM -> LSTM -> 5-way softmax.
model = Sequential()

# Each sample is a fixed-length row of max_doc_len integer word indices.
model.add(Input(shape=(max_doc_len,)))

# Look up a dense vector of size embedding_length for every word index.
# The sequence length is already fixed by the Input layer above, so the
# redundant (and, in current Keras, deprecated) `input_length` argument is
# not passed.
model.add(Embedding(vocab_size, embedding_length))

# return_sequences=True keeps one output per timestep so that the following
# LSTM still receives a sequence; the two directions are concatenated
# (2 * 64 features per timestep).
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))

# Collapse the sequence into a single 100-dimensional vector.
model.add(LSTM(100))

# Probability distribution over 5 output classes.
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [8]:
# model.fit() is deliberately not called in this cell; only the layer stack,
# output shapes and parameter counts are inspected.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 5)             50000     
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 128)           35840     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense (Dense)                (None, 5)                 505       
Total params: 177,945
Trainable params: 177,945
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Sanity check: one row per document, max_doc_len columns.
pad_encoded_docs.shape

(7, 10)

In [12]:
# Show the padded index matrix; the trailing zeros are the 'post' padding.
pad_encoded_docs

array([[2049, 1868, 3760, 9302,    0,    0,    0,    0,    0,    0],
       [9126, 1868, 6515, 9302,    0,    0,    0,    0,    0,    0],
       [2049, 1868, 6515, 9302,    0,    0,    0,    0,    0,    0],
       [8776, 9126, 1868, 4595, 7898,    0,    0,    0,    0,    0],
       [8647, 6492, 8187, 2049, 1868,  810, 2748,    0,    0,    0],
       [1817, 6740, 1896, 1817, 7800,    0,    0,    0,    0,    0],
       [2716, 1896, 6369,    0,    0,    0,    0,    0,    0,    0]])

In [10]:
# Forward pass only: model.fit() is not called in the cells shown, so the
# weights are still at their initial values and each row of class
# probabilities comes out close to uniform (about 0.2 per class).
model.predict(pad_encoded_docs)

array([[0.19999623, 0.19958383, 0.20013404, 0.1995562 , 0.20072965],
       [0.20005645, 0.19950429, 0.20008186, 0.19962567, 0.2007317 ],
       [0.20000744, 0.19956036, 0.20002045, 0.19962035, 0.2007914 ],
       [0.20008624, 0.19948606, 0.20032817, 0.1995429 , 0.20055664],
       [0.20015542, 0.19984493, 0.2001458 , 0.19967352, 0.20018037],
       [0.19984193, 0.1996233 , 0.20010845, 0.19953312, 0.20089321],
       [0.20001295, 0.19954999, 0.20012335, 0.19957848, 0.20073523]],
      dtype=float32)