# Text Preprocessing using OHE and Word Embedding Layer

In [1]:
!pip install tensorflow



In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [14]:
# Toy corpus: six short sentences that every later cell encodes, pads and embeds.
corpus = [
    'Hello everyone',
    'Welcome to the conferrence',
    'Hope everyone is doing good',
    'People like coffee',
    'I like data science',
    'There are lot of resources available',
]

In [15]:
# Echo the corpus to confirm its contents before encoding.
corpus

['Hello everyone',
 'Welcome to the conferrence',
 'Hope everyone is doing good',
 'People like coffee',
 'I like data science',
 'There are lot of resources available']

In [16]:
# Vocabulary size.
# Passed to one_hot() as the size of the hashing space and to the Embedding
# layer as its input dimension. A larger value lowers the chance that two
# different words are hashed to the same index, at the cost of a larger
# embedding table.
vocabulary_size = 50000 

# ONE HOT ENCODING 

In [17]:
# Turn each sentence into a list of integer word indices, one per word.
# one_hot() hashes words into the range given by vocabulary_size; distinct
# words are not guaranteed distinct indices.
# NOTE(review): the Keras docs state the default hash function is Python's
# built-in hash(), which is not stable across interpreter sessions, so the
# printed indices are expected to differ after a kernel restart - confirm.
ohr = [
    one_hot(sentence, vocabulary_size)
    for sentence in corpus
]
print(ohr)

[[40344, 34226], [5142, 17458, 37537, 14889], [43352, 34226, 457, 49123, 12664], [12857, 15719, 42206], [1019, 15719, 13555, 21969], [12163, 35612, 45388, 45584, 35174, 22657]]


# WORD EMBEDDING

In [11]:
import pyforest

In [12]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
from tensorflow.keras.models import Sequential

In [18]:
# Fixed length (in tokens) that every encoded sentence is padded to.
# Despite the name, this is a per-sentence sequence length, not the number of
# sentences: it is used as `maxlen` for pad_sequences and as `input_length`
# for the Embedding layer. The longest sentence in the corpus has 6 words.
corpus_size = 7

In [20]:
# Post-padding: append zeros after each encoded sentence until it is
# corpus_size tokens long, giving one fixed-width integer row per sentence.
embedded_corpus = pad_sequences(
    ohr,
    maxlen=corpus_size,
    padding='post',
)

In [28]:
# Inspect the padded matrix: one row per sentence, zeros fill the unused tail.
embedded_corpus

array([[40344, 34226,     0,     0,     0,     0,     0],
       [ 5142, 17458, 37537, 14889,     0,     0,     0],
       [43352, 34226,   457, 49123, 12664,     0,     0],
       [12857, 15719, 42206,     0,     0,     0,     0],
       [ 1019, 15719, 13555, 21969,     0,     0,     0],
       [12163, 35612, 45388, 45584, 35174, 22657,     0]])

In [29]:
# Padded indices of the second sentence ('Welcome to the conferrence').
embedded_corpus[1]

array([ 5142, 17458, 37537, 14889,     0,     0,     0])

In [23]:
# Start from an empty Sequential model; layers are added in the cells below.
mod = Sequential()

In [24]:
# Add the embedding layer: a lookup table with one row per possible word index
# (vocabulary_size rows) and 10 features per row.
# 10 dimensions is enough for this tiny corpus; larger datasets usually
# justify a larger feature dimension.
# input_length matches the padded sentence length (corpus_size).
mod.add(
    Embedding(
        input_dim=vocabulary_size,
        output_dim=10,
        input_length=corpus_size,
    )
)

In [25]:
# Compile the model with the Adam optimizer and mean-squared-error loss.
# (Adam is an optimizer, not an activation function.)
mod.compile(optimizer='adam', loss='mse')

In [27]:
# Print the layer table. The embedding layer holds vocabulary_size * 10
# = 500,000 trainable weights.
mod.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 10)             500000    
                                                                 
Total params: 500,000
Trainable params: 500,000
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Look up the 10-dimensional embedding of every token in the second sentence.
# The model expects a batch of sequences shaped (batch, corpus_size). Indexing
# with [1] yields a 1-D array, which Keras would read as 7 separate samples
# rather than one 7-token sentence. Slicing with [1:2] keeps the batch axis
# (shape (1, 7)); the trailing [0] drops it again so the displayed result is
# the (7, 10) matrix for this sentence.
# NOTE(review): no fit() call is visible in the cells shown here, so these are
# presumably the layer's initial (untrained) weights - confirm.
mod.predict(embedded_corpus[1:2])[0]



array([[-0.04333601, -0.01581476, -0.04310143, -0.02180597, -0.02483678,
        -0.01572995, -0.04696418, -0.005769  ,  0.03474187, -0.03474562],
       [-0.04196861,  0.03407853,  0.02518752, -0.01129743, -0.00817338,
        -0.03059706, -0.01365741, -0.00691295,  0.01149023, -0.01147121],
       [-0.00940199, -0.02797021, -0.00280652, -0.04923694,  0.02661897,
         0.04491141, -0.01164074, -0.0181792 , -0.00260801,  0.02021552],
       [ 0.00093553,  0.01434285, -0.00361079,  0.02785819,  0.02855114,
         0.02153236,  0.02621999, -0.01950793, -0.02271258, -0.00971758],
       [-0.01741911, -0.00345195,  0.04373522,  0.00886941, -0.04492586,
        -0.04713974,  0.0348576 , -0.04236095,  0.02248765,  0.04028858],
       [-0.01741911, -0.00345195,  0.04373522,  0.00886941, -0.04492586,
        -0.04713974,  0.0348576 , -0.04236095,  0.02248765,  0.04028858],
       [-0.01741911, -0.00345195,  0.04373522,  0.00886941, -0.04492586,
        -0.04713974,  0.0348576 , -0.04236095

In [31]:
# Embed the whole padded corpus in one batch: the result has one
# 10-dimensional vector per token position of every sentence.
mod.predict(embedded_corpus)



array([[[ 4.42097075e-02, -2.34134197e-02, -1.24706253e-02,
          4.61209081e-02,  3.37159745e-02, -9.44912434e-04,
          6.92424923e-03, -2.68378984e-02,  2.70717256e-02,
          1.31981447e-03],
        [-2.84114480e-02,  5.70427254e-03, -2.89896615e-02,
          4.69726957e-02,  1.29777826e-02, -2.96117198e-02,
          3.51515152e-02, -4.16276939e-02,  3.28285582e-02,
         -1.56338699e-02],
        [-1.74191110e-02, -3.45195457e-03,  4.37352173e-02,
          8.86940956e-03, -4.49258573e-02, -4.71397415e-02,
          3.48575972e-02, -4.23609503e-02,  2.24876516e-02,
          4.02885787e-02],
        [-1.74191110e-02, -3.45195457e-03,  4.37352173e-02,
          8.86940956e-03, -4.49258573e-02, -4.71397415e-02,
          3.48575972e-02, -4.23609503e-02,  2.24876516e-02,
          4.02885787e-02],
        [-1.74191110e-02, -3.45195457e-03,  4.37352173e-02,
          8.86940956e-03, -4.49258573e-02, -4.71397415e-02,
          3.48575972e-02, -4.23609503e-02,  2.248765