<a href="https://colab.research.google.com/github/ElizabethA01/Machine_learning/blob/main/RNN_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Project**

This project uses natural language processing, in the form of a recurrent neural network (RNN), to predict whether movie reviews are positive or negative.

It uses the IMDB movie review dataset.


In [78]:
%tensorflow_version 2.x 
from keras.datasets import imdb
import keras.preprocessing
import tensorflow as tf
import os
import numpy as np

# Hyper-parameters shared by the cells below.
VOCAB_SIZE = 88584  # passed to load_data as num_words and used as the Embedding input size
MAXLEN = 250  # every review is padded or truncated to this many tokens
BATCH_SIZE = 64  # NOTE(review): not referenced by any visible cell (model.fit is called without batch_size)

# Each review is loaded as a list of integer word ids, with one label per review.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=VOCAB_SIZE
)

In [79]:
# Inspect the first training review: it is stored as a list of integer word ids
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [80]:
# PAD / TRUNCATE EVERY REVIEW TO THE SAME LENGTH
# The network cannot take inputs of different lengths, so reviews shorter than
# MAXLEN words are padded with 0 and longer ones are trimmed to MAXLEN
# (pad_sequences pads and truncates at the front by default).

# `sequence` is never imported in this notebook, so pad_sequences is reached
# through the already-imported `tf` namespace to avoid a NameError.
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data, maxlen=MAXLEN)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data, maxlen=MAXLEN)

len(train_data[0])

250

# **Creating the model**

Use a word embedding layer as the first layer in the model, followed by an LSTM layer that feeds into a single dense node to produce the predicted sentiment.

32 is the output dimension of the vectors generated by the embedding layer.

In [81]:
# Embedding -> LSTM -> single sigmoid unit, assembled one layer at a time.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))  # one 32-dimensional vector per word id
model.add(tf.keras.layers.LSTM(32))
# Sigmoid output lies between 0 and 1; above 0.5 is considered a positive review.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [82]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          2834688   
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


# **Train the model**

Compile and then fit the model 

In [83]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train for 10 epochs, holding out 20% of the training data for validation.
history = model.fit(
    train_data,
    train_labels,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# **Evaluate the model**

Evaluate the model on the test data to see how well it performs on reviews it has not seen during training.

In [None]:
# Score the trained model on the padded test reviews and their labels.
# NOTE(review): presumably `results` is the loss followed by the 'acc' metric set in model.compile — confirm
results = model.evaluate(test_data, test_labels)
print(results)



# **Making Predictions**

Now we can use the network to make predictions on our own reviews. 

We have to convert our own reviews into the same integer encoding the network was trained on. Load the word index from the dataset and use it to encode our own text.

In [None]:
# imdb.load_data() shifts every word id by index_from=3 (its default) so that
# 0, 1 and 2 can stand for padding, start-of-review and out-of-vocabulary,
# whereas imdb.get_word_index() returns the unshifted ids. Apply the same shift
# here so that our own text is encoded with the ids the model was trained on.
INDEX_FROM = 3
word_index = {word: index + INDEX_FROM for word, index in imdb.get_word_index().items()}

def encode_text(text):
  """Encode a raw review string as a fixed-length array of word ids.

  The text is split into lower-cased word tokens, each token is looked up in
  word_index, and the resulting id list is padded/truncated to MAXLEN.
  Words that are unknown, or whose id falls outside the VOCAB_SIZE rows of the
  embedding layer, are encoded as 0 (the padding id), so decode_integers
  skips them.

  Returns a 1-D array of length MAXLEN.
  """
  tokens = keras.preprocessing.text.text_to_word_sequence(text)
  ids = []
  for word in tokens:
    index = word_index.get(word, 0)
    # ids >= VOCAB_SIZE have no row in the embedding table
    ids.append(index if index < VOCAB_SIZE else 0)
  # `sequence` is never imported in this notebook; go through `tf` instead.
  # Padding is what lets the result be passed through the network.
  return tf.keras.preprocessing.sequence.pad_sequences([ids], maxlen=MAXLEN)[0]

text = 'that movie was just amazing, so amazing'
encoded = encode_text(text)
print(encoded)

In [None]:
# Inverse lookup (integer id -> word) built from word_index, plus a decoder that uses it

reverse_word_index = {index: word for word, index in word_index.items()}

def decode_integers(integers):
  """Turn a sequence of word ids back into a space-separated string.

  Ids equal to 0 (padding) are skipped. Every other id must be a key of
  reverse_word_index; otherwise a KeyError is raised.
  """
  PAD = 0
  words = [reverse_word_index[num] for num in integers if num != PAD]
  return ' '.join(words)

print(decode_integers(encoded))


In [None]:
# make a prediction

def predict(text):
  """Print the model's sentiment score for a raw review string.

  The text is encoded with encode_text and passed to the model as a batch
  containing a single review. The printed value is the sigmoid output of the
  network: above 0.5 is considered a positive review, otherwise negative.
  """
  encoded_text = encode_text(text)
  # model.predict expects a batch, so place the review in a (1, MAXLEN) array.
  # MAXLEN replaces the literal 250 so this stays in step with the padding length.
  pred = np.zeros((1, MAXLEN))
  pred[0] = encoded_text
  result = model.predict(pred)
  print(result[0])

positive_review = 'That movie was so good! I loved it and would watch it again because it was so great'
predict(positive_review)

negative_review = 'That movie sucked! I hated it and wouldn\'t watch it again'
predict(negative_review)
