# Sentiment Analysis Using Recurrent Neural Networks (RNNs)

## Munsif Raza

In [1]:
# Importing Libraries
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

# Dataset
We shall use the IMDB dataset. It consists of 50,000 movie reviews, of which 25,000 are used for training and 25,000 for testing. Half of these reviews are labeled as positive and the other half as negative, so the dataset is balanced: the number of positive and negative labels is equal.

In [2]:
# Dataset configuration and download
VOCAB_SIZE = 88584  # passed to load_data as num_words; also the embedding's input size
MAXLEN = 250        # number of tokens every review is padded / trimmed to
BATCH_SIZE = 64     # intended mini-batch size for training

# load_data returns a (data, labels) pair for each of the two splits.
imdb_train, imdb_test = imdb.load_data(num_words=VOCAB_SIZE)
train_data, train_labels = imdb_train
test_data, test_labels = imdb_test

In [3]:
# Token counts of the first two training reviews: the lengths differ per review.
for review in train_data[:2]:
    print(len(review))

218
189


# Pre-Processing
As you can see, our reviews have different lengths, and we can't feed variable-length inputs to the neural network directly. We have to make every review the same size, which we do with two rules:
1. If a review has more than 250 words, trim it to 250.
2. If a review has fewer than 250 words, pad it with 0's until it reaches 250.

By doing so, every review ends up with the same length.

In [4]:
# Bring every review in both splits to exactly MAXLEN tokens:
# shorter reviews are zero-padded, longer ones are cut down.
train_data, test_data = (
    sequence.pad_sequences(split, maxlen=MAXLEN)
    for split in (train_data, test_data)
)

# Creating model
We shall use a word embedding layer as the first layer in our model, followed by an LSTM layer that feeds into a single dense unit to produce the predicted sentiment.
Here, 32 is the output dimension of the vectors generated by the embedding layer.

In [5]:
# Embedding -> LSTM -> one sigmoid unit that outputs the sentiment score.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=32))
model.add(tf.keras.layers.LSTM(units=32))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [6]:
# Print a per-layer summary of the model: output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


# Training the model.
Now that we have a model, let's train it.

In [7]:
# Binary cross-entropy matches the single sigmoid output; 'acc' is reported
# alongside the loss during training and evaluation.
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=['acc'])

# BATCH_SIZE is declared with the other dataset constants but was never
# passed to fit(), so training silently used the Keras default batch size.
# 20% of the training reviews are held out for validation after each epoch.
history = model.fit(
    train_data,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
# Score the trained model on the held-out test reviews.
# evaluate() returns the loss followed by the metrics given to compile().
results = model.evaluate(x=test_data, y=test_labels)
print(results)

[0.3916893005371094, 0.8586400151252747]


# Making predictions

Let's use our model to make predictions on our own reviews.
Since the training reviews are encoded as integers, we need to convert any review that we write into the same form so the model can understand it.
To do that, we'll load the word encodings from the dataset and use them to encode our own text.

In [9]:
# Word -> integer lookup table that ships with the IMDB dataset.
word_index = imdb.get_word_index()

def encode_text(text):
    """Convert a raw review string into a fixed-length array of word ids.

    The text is split into words with Keras' text_to_word_sequence, each
    word is replaced by its entry in ``word_index`` (0 when the word is not
    in the table), and the result is padded / trimmed to MAXLEN entries.

    Returns a 1-D array of length MAXLEN.
    """
    # NOTE(review): these are the raw get_word_index() ids. imdb.load_data()
    # by default shifts word ids by index_from=3, so the ids produced here
    # are presumably not in the same id space as train_data - confirm before
    # feeding them to the model unchanged.
    words = keras.preprocessing.text.text_to_word_sequence(text)
    ids = [word_index.get(word, 0) for word in words]
    padded_batch = sequence.pad_sequences([ids], MAXLEN)
    return padded_batch[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [10]:
# Inverse of the encoding step: map integer ids back to words.

# id -> word table, built by flipping the (word, id) pairs of word_index.
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_integers(integers):
    """Turn a sequence of word ids back into a space-separated string.

    Entries equal to 0 (the padding value) are skipped; every other id is
    looked up in ``reverse_word_index``.
    """
    padding_id = 0
    words = [reverse_word_index[idx] for idx in integers if idx != padding_id]
    return " ".join(words)

print(decode_integers(encoded))

that movie was just amazing so amazing


In [12]:
# Now let's make predictions

def predict(text):
    """Print whether the model rates ``text`` as a positive or negative review.

    The text is encoded with ``encode_text`` and then translated into the id
    space the model was trained on before being passed to ``model.predict``.
    Prints 'It is Positive Review' when the predicted score is above 0.5 and
    'It is Negative Review' otherwise. Returns nothing.
    """
    # Defaults of imdb.load_data(): ids 0/1/2 are reserved (padding, start,
    # out-of-vocabulary) and every real word id is shifted up by index_from.
    index_from = 3
    oov_id = 2

    # encode_text() yields raw get_word_index() ids, with 0 for padding and
    # unknown words. Without the shift below, each word would be looked up
    # in the embedding as a different word than the one seen in training.
    raw_ids = np.asarray(encode_text(text))
    model_ids = np.where(raw_ids > 0, raw_ids + index_from, 0)
    # Ids that fall outside the embedding table after shifting become OOV,
    # mirroring what load_data(num_words=VOCAB_SIZE) does to rare words.
    model_ids = np.where(model_ids >= VOCAB_SIZE, oov_id, model_ids)

    # The model expects a batch; reshape instead of hard-coding the length
    # so this keeps working if MAXLEN changes.
    batch = model_ids.reshape(1, -1)
    score = float(model.predict(batch)[0][0])
    if score > 0.5:
        print('It is Positive Review')
    else:
        print('It is Negative Review')
    
positive_review = "That movie was I just loved it and definately will watch again"
predict(positive_review)

negative_review = "That movie was bad, I hate that one of the worst movies it is, it was waste of time. it really sucked I would not watch it again"
predict(negative_review)

It is Positive Review
It is Negative Review


# Conclusion
We trained an RNN (an LSTM) on the labeled IMDB movie reviews and then used it to predict the sentiment of our own reviews.