### Integer Encoding RNN Demo

In [1]:
import numpy as np

In [2]:
# Toy corpus for the tokenizer demo: ten short slogans, one document per entry.
docs = [
    "go india",
    "india india",
    "hip hip hurray",
    "jeetega bhai jeetega india jeetega",
    "bharat mata ki jai",
    "kohli kohli",
    "sachin sachin",
    "dhoni dhoni",
    "modi ji ki jai",
    "inquilab zindabad",
]

In [5]:
# Create the tokenizer that will map each word of the documents to an integer id.
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='ashis')  # 'ashis' is the out-of-vocabulary (OOV) placeholder

# Any word that is not in the fitted vocabulary is replaced by the OOV token
# 'ashis' when texts are converted to sequences.

In [6]:
# Build the vocabulary (word index and word counts) from the example documents.
tokenizer.fit_on_texts(docs)

In [7]:
tokenizer.word_index

# Shows every unique word in the documents along with its integer index.
# The OOV token 'ashis' takes index 1; the remaining words follow in order of
# descending frequency (e.g. 'india', seen 4 times, is index 2).

{'ashis': 1,
 'india': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'kohli': 7,
 'sachin': 8,
 'dhoni': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'modi': 15,
 'ji': 16,
 'inquilab': 17,
 'zindabad': 18}

In [8]:
tokenizer.word_counts

# Shows how many times each word appeared across all documents,
# listed in the order in which the words were first seen.

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [9]:
tokenizer.document_count
# Number of documents (rows/lines) the tokenizer was fitted on -- 10 here, one per entry in `docs`.

10

In [10]:
# Convert each document into a sequence of word indices, keeping the words in
# the order in which they appear in the text.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

In [13]:
# The documents have different lengths, so pad them with zeros to a common length.
from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(sequences , padding='post')  

# padding='post' appends the zeros at the end of each sequence;
# padding='pre' would insert them at the front instead.
# No maxlen is given, so every row is padded to the longest sequence (5 ids here).

In [14]:
# Inspect the padded result: a 10 x 5 integer array, zero-filled on the right.
sequences

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]], dtype=int32)

### Sentiment Analysis using integer encoding 

In [15]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense , SimpleRNN , Embedding , Flatten , Input

In [16]:
(X_train , y_train) , (X_test , y_test) = imdb.load_data()

# The IMDB dataset ships already tokenized: each review is a sequence of
# integer word ids, so no Tokenizer step is needed here.
# y_train / y_test are used as the binary sentiment targets further down.

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 10us/step


In [18]:
# 25,000 training reviews; the array is still 1-D because the reviews have not
# been padded to a common length yet.
X_train.shape

(25000,)

In [19]:
# Give every review the same length (MAX_REVIEW_LEN ids) so the reviews form a
# regular 2-D array.
# - Shorter reviews are zero-padded at the end (padding='post').
# - Longer reviews are cut with truncating='pre'. This is the Keras default and is
#   only written out here to make it visible: ids are dropped from the FRONT, so
#   the LAST 50 words of a long review are kept, not the first 50.
#   Switch to truncating='post' if the first 50 words are wanted instead.
MAX_REVIEW_LEN = 50

X_train = pad_sequences(X_train , padding='post' , truncating='pre' , maxlen=MAX_REVIEW_LEN)
X_test = pad_sequences(X_test , padding='post' , truncating='pre' , maxlen=MAX_REVIEW_LEN)



In [20]:
# After padding/truncating, every review holds exactly 50 ids -> a regular 2-D array.
X_train.shape

(25000, 50)

In [None]:
# Baseline model: the padded integer word ids are fed straight into a SimpleRNN.
# Each review is treated as a sequence of timesteps with a single feature
# (the raw word id) per timestep.
model = Sequential()

# Declare the input with an explicit Input layer. Passing `input_shape=` to a
# layer is deprecated in Keras 3 and emits a UserWarning.
# The number of timesteps is taken from the padded data instead of repeating
# the literal 50, so it always matches the padding length.
model.add(Input(shape=(X_train.shape[1] , 1)))

# return_sequences=False -> only the output of the last timestep is returned.
# return_sequences=True  -> the output of every timestep is returned, which is
#                           what a further recurrent layer stacked on top needs.
model.add(SimpleRNN(32 , return_sequences=False))

# Single sigmoid unit: probability of the positive class.
model.add(Dense(1 , activation='sigmoid'))

model.summary()

  super().__init__(**kwargs)


In [23]:
# Binary target -> binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs, using the test split as validation data after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test , y_test),
    epochs=5,
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.5033 - loss: 0.6961 - val_accuracy: 0.5020 - val_loss: 0.6946
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.5127 - loss: 0.6927 - val_accuracy: 0.5068 - val_loss: 0.6949
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5074 - loss: 0.6934 - val_accuracy: 0.5028 - val_loss: 0.6943
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.5123 - loss: 0.6931 - val_accuracy: 0.5018 - val_loss: 0.6953
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.5121 - loss: 0.6926 - val_accuracy: 0.5008 - val_loss: 0.6949


<keras.src.callbacks.history.History at 0x20468e8a930>

In [None]:
'''  

Feedback :-
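Accuracy is very low (about 50%, i.e. chance level), since we are using very little data:
each review is cut to 50 tokens and fed to the RNN as raw integer word indices.
Next, we will learn Embeddings for the words, which should give better results.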

'''