### Converting text into vectors using integer encoding and feeding them to a SimpleRNN.

In [32]:
import numpy as np

# Toy corpus of ten short slogans used to demonstrate tokenization and padding.
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

In [33]:
# Tokenizer splits each text into word tokens and assigns every word an integer index.
# With its default settings it lower-cases the text and strips punctuation;
# it does NOT remove stop words.
from keras.preprocessing.text import Tokenizer

# oov_token = "out-of-vocabulary" token. Any word passed to texts_to_sequences() that
# was not seen during fit_on_texts() is mapped to this token's index instead of being dropped.
tokenizer =Tokenizer(oov_token='Arjun')

In [34]:
# Build the vocabulary from the corpus; this fills word_index, word_counts and document_count.
tokenizer.fit_on_texts(docs)

In [35]:
# Word -> integer index mapping. The OOV token takes index 1 and the remaining words are
# numbered from 2 in order of decreasing frequency; index 0 is not assigned to any word.
tokenizer.word_index

{'Arjun': 1,
 'india': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'kohli': 7,
 'sachin': 8,
 'dhoni': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'modi': 15,
 'ji': 16,
 'inquilab': 17,
 'zindabad': 18}

In [36]:
# Number of occurrences of each word across the whole corpus, listed in order of first appearance.
tokenizer.word_counts

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [37]:
# Number of documents (texts) the tokenizer was fitted on.
tokenizer.document_count

10

In [38]:
# Integer-encode the corpus: replace every word in each document with its index from word_index.
sequences = tokenizer.texts_to_sequences(docs)
print(sequences)

[[10, 2], [2, 2], [4, 4, 11], [3, 12, 3, 2, 3], [13, 14, 5, 6], [7, 7], [8, 8], [9, 9], [15, 16, 5, 6], [17, 18]]


In [39]:
# Pad every sequence with zeros up to the length of the longest one (5 here),
# so the result is a rectangular array.
from keras.utils import pad_sequences

# padding='post' puts the zeros at the end of each sequence.
sequences = pad_sequences(sequences, padding='post')
print(sequences)

[[10  2  0  0  0]
 [ 2  2  0  0  0]
 [ 4  4 11  0  0]
 [ 3 12  3  2  3]
 [13 14  5  6  0]
 [ 7  7  0  0  0]
 [ 8  8  0  0  0]
 [ 9  9  0  0  0]
 [15 16  5  6  0]
 [17 18  0  0  0]]


In [40]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, SimpleRNN

In [41]:
# Load the Keras IMDB dataset as (sequences, labels) pairs for the train and test splits.
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [42]:
# The reviews arrive already integer-encoded: each one is a variable-length list of word indices.
print(x_train[1])

[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 10156, 4, 1153, 9, 194, 775, 7, 8255, 11596, 349, 2637, 148, 605, 15358, 8003, 15, 123, 125, 68, 23141, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 36893, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 25249, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 46151, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]


In [43]:
# Force every review to exactly 50 tokens: shorter reviews get trailing zeros
# (padding='post'), longer ones are cut down. The short length also keeps training fast.
x_train, x_test = (
    pad_sequences(reviews, maxlen=50, padding='post')
    for reviews in (x_train, x_test)
)

In [44]:
# Same review after padding/truncation: it now holds exactly 50 token ids (the tail of the original list).
print(x_train[1])

[ 8255     5 25249   656   245  2350     5     4  9837   131   152   491
    18 46151    32  7464  1212    14     9     6   371    78    22   625
    64  1382     9     8   168   145    23     4  1690    15    16     4
  1355     5    28     6    52   154   462    33    89    78   285    16
   145    95]


In [45]:
# Baseline model: a SimpleRNN fed the raw integer word ids, followed by one sigmoid unit.
model = Sequential([
    # 50 time-steps with 1 feature (the word id) per step. return_sequences=False keeps
    # only the final hidden state; sentiment analysis needs one prediction per review,
    # not an output at every time-step.
    SimpleRNN(32, input_shape=(50, 1), return_sequences=False),
    # Single sigmoid output for the binary sentiment label.
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_3 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, evaluating on the test split after each one.
model.fit(
    x_train,
    y_train,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x31824e250>

### Using embedding.

In [52]:
# Fresh tokenizer, this time without an OOV token.
tokenizer = Tokenizer()

In [53]:
# Rebuild the vocabulary from the same corpus.
tokenizer.fit_on_texts(docs)

In [54]:
# Vocabulary size: the number of distinct words in the corpus.
len(tokenizer.word_index)

17

In [55]:
# Integer-encode the corpus with the new vocabulary (indices now start at 1 for the most frequent word).
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[9, 1],
 [1, 1],
 [3, 3, 10],
 [2, 11, 2, 1, 2],
 [12, 13, 4, 5],
 [6, 6],
 [7, 7],
 [8, 8],
 [14, 15, 4, 5],
 [16, 17]]

In [63]:
# Post-pad with zeros to the length of the longest document (5).
sequences = pad_sequences(sequences,padding='post')
print(sequences)

[[ 9  1  0  0  0]
 [ 1  1  0  0  0]
 [ 3  3 10  0  0]
 [ 2 11  2  1  2]
 [12 13  4  5  0]
 [ 6  6  0  0  0]
 [ 7  7  0  0  0]
 [ 8  8  0  0  0]
 [14 15  4  5  0]
 [16 17  0  0  0]]


In [60]:
# Embedding-only model, used to inspect the vector each word index is mapped to.
# input_dim is 18 = 17 vocabulary words + 1, because index 0 is taken by the padding
# value (this tokenizer has no OOV token). Each of the 5 positions becomes a 2-d vector.
model = Sequential([
    Embedding(input_dim=18, output_dim=2, input_length=5),
])

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 5, 2)              36        
                                                                 
Total params: 36 (144.00 Byte)
Trainable params: 36 (144.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [61]:
# compile() takes (optimizer, loss, ...) positionally, so a second positional 'accuracy'
# would be registered as the LOSS. Pass it by keyword as a metric instead.
# No loss is configured because this model is only used with predict() to look at the
# untrained embedding vectors; it is never fitted.
model.compile(optimizer='adam', metrics=['accuracy'])

In [62]:
# Look up the (untrained) embedding of every token: each word index is replaced by a
# 2-dimensional vector. Equal indices, including the padding index 0, give equal vectors.
pred = model.predict(sequences)
print(pred)

[[[-0.01510189 -0.01504215]
  [ 0.04290788  0.01403778]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]]

 [[ 0.04290788  0.01403778]
  [ 0.04290788  0.01403778]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]]

 [[ 0.03129901  0.00858117]
  [ 0.03129901  0.00858117]
  [ 0.03141688 -0.02979585]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]]

 [[ 0.00856427 -0.01753614]
  [-0.02623016 -0.0024384 ]
  [ 0.00856427 -0.01753614]
  [ 0.04290788  0.01403778]
  [ 0.00856427 -0.01753614]]

 [[ 0.02463695 -0.02652116]
  [-0.04525285 -0.04378453]
  [-0.0203311  -0.03013411]
  [-0.02868924 -0.03771811]
  [ 0.00112004  0.01680036]]

 [[-0.04188278 -0.03115167]
  [-0.04188278 -0.03115167]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]]

 [[ 0.00867699 -0.00260112]
  [ 0.00867699 -0.00260112]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]
  [ 0.00112004  0.01680036]]

 [[ 0.

In [71]:
# Reload the raw IMDB data; x_train / x_test were overwritten by their padded versions earlier.
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [72]:
# Bring every review to exactly 50 tokens: zero-pad short ones at the end, cut long ones down.
x_train, x_test = (
    pad_sequences(reviews, maxlen=50, padding='post')
    for reviews in (x_train, x_test)
)

In [73]:
# Sanity check: expect (number of reviews, 50) after padding.
x_train.shape

(25000, 50)

In [75]:
# Embedding -> SimpleRNN -> sigmoid classifier.
model = Sequential([
    # Map each word id to a trainable 2-dimensional vector. input_dim=100000 is the
    # number of rows in the lookup table, so it has to exceed the largest word id in
    # the data; with output_dim=2 this layer holds 100000 * 2 = 200000 weights.
    Embedding(100000, 2, input_length=50),
    # Keep only the final hidden state of the 32-unit RNN.
    SimpleRNN(32, return_sequences=False),
    # Single sigmoid output for the binary sentiment label.
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 2)             200000    
                                                                 
 simple_rnn_8 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 201153 (785.75 KB)
Trainable params: 201153 (785.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [76]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs with the test split as validation data; keep the returned
# History object so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
