### RNN - Integer Encoding Approach

In [23]:
import numpy as np
doc = ['hello guys','babmb rnn','gg bois','walk good','dog best','king holy moly',
       'superman is dope','doing great']

In [24]:
len(doc)

8

In [25]:
# tokenizer - split word by word
from keras.preprocessing.text import Tokenizer
tk = Tokenizer(oov_token = '<pal>') # oov_token = out of vocab 
# any new words will come(out of voab) will be denoted by pal

In [26]:
tk.fit_on_texts(doc)

In [27]:
tk.word_index

{'<pal>': 1,
 'hello': 2,
 'guys': 3,
 'babmb': 4,
 'rnn': 5,
 'gg': 6,
 'bois': 7,
 'walk': 8,
 'good': 9,
 'dog': 10,
 'best': 11,
 'king': 12,
 'holy': 13,
 'moly': 14,
 'superman': 15,
 'is': 16,
 'dope': 17,
 'doing': 18,
 'great': 19}

In [28]:
# any new word that will come will be denoted by 1

In [29]:
tk.word_counts

OrderedDict([('hello', 1),
             ('guys', 1),
             ('babmb', 1),
             ('rnn', 1),
             ('gg', 1),
             ('bois', 1),
             ('walk', 1),
             ('good', 1),
             ('dog', 1),
             ('best', 1),
             ('king', 1),
             ('holy', 1),
             ('moly', 1),
             ('superman', 1),
             ('is', 1),
             ('dope', 1),
             ('doing', 1),
             ('great', 1)])

In [30]:
# replace words with the token indexes in doc
seq = tk.texts_to_sequences(doc)
seq

[[2, 3],
 [4, 5],
 [6, 7],
 [8, 9],
 [10, 11],
 [12, 13, 14],
 [15, 16, 17],
 [18, 19]]

In [31]:
# padding to equal the length of sequences-
from keras.utils import pad_sequences

In [32]:
seq1 = pad_sequences(seq,padding='pre')
seq2 = pad_sequences(seq,padding='post')

In [33]:
seq1   # padding = pre

array([[ 0,  2,  3],
       [ 0,  4,  5],
       [ 0,  6,  7],
       [ 0,  8,  9],
       [ 0, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [ 0, 18, 19]], dtype=int32)

In [34]:
seq2   # padding = post

array([[ 2,  3,  0],
       [ 4,  5,  0],
       [ 6,  7,  0],
       [ 8,  9,  0],
       [10, 11,  0],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19,  0]], dtype=int32)

### Integer encoding on keras.imdb dataset

In [35]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [36]:
(x_train, y_train),(x_test,y_test) = imdb.load_data() # pre-processed data

In [37]:
x_train.shape

(25000,)

In [38]:
len(x_train[0])

218

In [39]:
len(x_train[1])

189

In [40]:
# padding required since we have different length in reviews
from keras.utils import pad_sequences

In [41]:
x_train = pad_sequences(x_train, padding='post',maxlen=100)
x_test = pad_sequences(x_test, padding='post',maxlen=100)
# don't use maxlen,using maxlen to only get first 100 words from review ; fast processing
# maxlen is used to reduce the computational process ,coupled with padding post
# and giving suitable maxlen it will capture only dense numbers and pruning sparse numbers.

In [42]:
len(x_train[1]) # len is comming 100 for all reviews due to maxlen

100

In [43]:
model = Sequential()
model.add(SimpleRNN(32,input_shape=(100,1),return_sequences=False)) # to get the final output not each layer output(via True)
model.add(Dense(1,activation='sigmoid'))
model.summary()
# write return_sequences=True , when you want each part of words recognition (manytomany)
# here only want to process 100 words in sentence and give the final output sentiment(manyto1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________


In [44]:
# input layer(1)x32 -> output again passed as input rnn(32x32) -> output(32x1)
# ->32 bias -> 1 = 1121

In [45]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics = ['accuracy'])
model.fit(x_train,y_train, validation_data=(x_test,y_test),batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f27496d3100>

### RNN - Embedding Approach

In [46]:
doc = ['hello guys','babmb rnn','gg bois','walk good','dog best','king holy moly',
       'superman is dope','doing great']

In [47]:
# tokenizer - split word by word
from keras.preprocessing.text import Tokenizer
tk1 = Tokenizer() 

In [48]:
tk1.fit_on_texts(doc)

In [49]:
tk1.word_index

{'hello': 1,
 'guys': 2,
 'babmb': 3,
 'rnn': 4,
 'gg': 5,
 'bois': 6,
 'walk': 7,
 'good': 8,
 'dog': 9,
 'best': 10,
 'king': 11,
 'holy': 12,
 'moly': 13,
 'superman': 14,
 'is': 15,
 'dope': 16,
 'doing': 17,
 'great': 18}

In [50]:
seq = tk1.texts_to_sequences(doc)
seq

[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12, 13], [14, 15, 16], [17, 18]]

In [52]:
from keras.utils import pad_sequences
seq1 = pad_sequences(seq,padding='post')
seq1

array([[ 1,  2,  0],
       [ 3,  4,  0],
       [ 5,  6,  0],
       [ 7,  8,  0],
       [ 9, 10,  0],
       [11, 12, 13],
       [14, 15, 16],
       [17, 18,  0]], dtype=int32)

In [54]:
# embedding - always consider the dense vector(non-zero, non sparse)
model = Sequential()
model.add(Embedding(18,output_dim=2,input_length=3))
# output_dim is hidden layer neuron
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 2)              36        
                                                                 
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.compile(optimizer='adam',metrics=['accuracy'])
pred = model.predict(seq1)
print(pred)
# Embedding gave dense vector -

[[[ 0.03470682  0.02604291]
  [-0.00655509  0.02036904]
  [-0.01391447  0.0388681 ]]

 [[-0.03049067 -0.01596469]
  [-0.02683282  0.03326324]
  [-0.01391447  0.0388681 ]]

 [[-0.04039925  0.04698325]
  [-0.00536622  0.02841197]
  [-0.01391447  0.0388681 ]]

 [[ 0.01974135  0.04763727]
  [ 0.03672074 -0.03850418]
  [-0.01391447  0.0388681 ]]

 [[ 0.01828288  0.01271949]
  [-0.03547676  0.00301512]
  [-0.01391447  0.0388681 ]]

 [[-0.03979013 -0.0234611 ]
  [ 0.04500384  0.01792424]
  [ 0.01466273 -0.01980059]]

 [[-0.03323817 -0.01857818]
  [ 0.0296809  -0.00926771]
  [-0.04046168 -0.00979358]]

 [[-0.00497278 -0.02353524]
  [ 0.          0.        ]
  [-0.01391447  0.0388681 ]]]


In [None]:
# using Embedding method with imdb dataset-

In [57]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [59]:
(x_train, y_train),(x_test,y_test) = imdb.load_data()

In [60]:
x_train = pad_sequences(x_train, padding='post',maxlen=100)
x_test = pad_sequences(x_test, padding='post',maxlen=100)

In [61]:
x_train.shape

(25000, 100)

In [62]:
model = Sequential()
model.add(Embedding(10000,output_dim=2,input_length=100)) 
#choosing 10000 out of 25000 randomly, maxfeatures =100
model.add(SimpleRNN(32,return_sequences=False))
model.add(Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 2)            20000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21,153
Trainable params: 21,153
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Significant increase in accuracy- by using Embedding method over integer encoder
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=32,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2748af64c0>