# RNN Sentiment Analysis (Start - In Progress)
#### 05/11/2020 - 86% Accuracy 
Example from https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt

In [3]:
def plot_graphs(history, metric):
  """Plot one metric and its validation counterpart against the epoch index.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch sequences (presumably the value returned by ``model.fit``).
    metric: key of the training metric; ``'val_' + metric`` is plotted too.
  """
  val_metric = 'val_' + metric
  records = history.history

  # Training curve first, validation curve second, so the legend order matches.
  plt.plot(records[metric])
  plt.plot(records[val_metric], '')

  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])
  plt.show()

In [4]:
# Load the 32k-subword IMDB reviews config together with its metadata.
dataset, info = tfds.load(
    'imdb_reviews/subwords32k',
    with_info=True,
    as_supervised=True,
)
train_examples = dataset['train']
test_examples = dataset['test']



Downloading and preparing dataset imdb_reviews/subwords32k/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to C:\Users\shepa135\tensorflow_datasets\imdb_reviews\subwords32k\1.0.0...


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…







HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to C:\Users\shepa135\tensorflow_datasets\imdb_reviews\subwords32k\1.0.0.incompleteT1HCXT\imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to C:\Users\shepa135\tensorflow_datasets\imdb_reviews\subwords32k\1.0.0.incompleteT1HCXT\imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to C:\Users\shepa135\tensorflow_datasets\imdb_reviews\subwords32k\1.0.0.incompleteT1HCXT\imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

Dataset imdb_reviews downloaded and prepared to C:\Users\shepa135\tensorflow_datasets\imdb_reviews\subwords32k\1.0.0. Subsequent calls will reuse this data.


In [21]:
# The text encoder is taken from the 'text' feature of the dataset metadata.
encoder = info.features['text'].encoder
print('Vocabulary size:', encoder.vocab_size)

Vocabulary size: 32650


In [38]:
# Inspect the feature spec of the review text.
text_feature = info.features['text']
print(text_feature)

Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=32650>)


In [22]:
sample_string = 'Hello TensorFlow.'

# Text -> token ids.
encoded_string = encoder.encode(sample_string)
print('Encoded string is', encoded_string)

# Token ids -> text.
original_string = encoder.decode(encoded_string)
print('The original string: "', original_string, '"', sep='')

Encoded string is [16092, 31789, 642, 9656, 32513, 32440]
The original string: "Hello TensorFlow."


In [23]:
# Round-trip check: decoding the encoded ids must give back the input text.
assert sample_string == original_string

In [24]:
# Show the piece of text each individual token id decodes to.
for token_id in encoded_string:
    piece = encoder.decode([token_id])
    print(token_id, '------>', piece)

16092 ------> Hello 
31789 ------> Tens
642 ------> or
9656 ------> Flo
32513 ------> w
32440 ------> .


In [25]:
# Buffer size passed to `shuffle` when building the training pipeline.
BUFFER_SIZE = 10000
# Batch size passed to `padded_batch` for both the train and test pipelines.
BATCH_SIZE = 64

In [26]:
# Padding spec for the two dataset components: first is variable-length, second is scalar.
padded_shapes = ([None], [])

# Only the training split is shuffled before batching.
shuffled_train = train_examples.shuffle(BUFFER_SIZE)
train_dataset = shuffled_train.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)

test_dataset = test_examples.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)

In [28]:
# Embedding -> bidirectional LSTM -> dense head, assembled layer by layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(encoder.vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# No activation on the final unit, so the model emits a raw score.
model.add(tf.keras.layers.Dense(1))

In [29]:
# The loss is configured with from_logits=True, i.e. it expects raw scores.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(1e-4)

model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

In [30]:
# Train for 10 epochs; validation is capped at 30 steps of the test dataset.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10




Epoch 2/10




Epoch 3/10




Epoch 4/10




Epoch 5/10




Epoch 6/10




Epoch 7/10




Epoch 8/10




Epoch 9/10




Epoch 10/10






In [31]:
# NOTE: `history.save(...)` raised AttributeError ('History' object has no attribute 'save').
# To persist the trained network, save the model object instead, e.g.:
# model.save('models/test_model_32kimdb_sentiment_05112020')

AttributeError: 'History' object has no attribute 'save'

In [32]:
# Evaluate on the whole test dataset (no step limit here).
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)





Test Loss: 0.5671058297157288
Test Accuracy: 0.8600000143051147


In [16]:
def pad_to_size(vec, size):
  """Return a new list holding ``vec`` right-padded with zeros to ``size``.

  The input sequence is not modified, so calling this repeatedly on the same
  list is safe. If ``vec`` already has ``size`` or more elements, an unpadded
  copy is returned; nothing is truncated.

  Args:
    vec: sequence of values to pad.
    size: minimum length of the returned list.

  Returns:
    A new list of length ``max(len(vec), size)``.
  """
  # Clamp at zero so an over-long input simply gets no padding.
  missing = max(size - len(vec), 0)
  return list(vec) + [0] * missing


def sample_predict(sample_pred_text, pad):
  """Run the notebook's model on a single piece of text.

  Args:
    sample_pred_text: text to encode with the module-level ``encoder``.
    pad: when truthy, the encoded ids are passed through
      ``pad_to_size(..., 64)`` before prediction.

  Returns:
    Whatever ``model.predict`` returns for a batch containing this one
    example. The notebook's model ends in ``Dense(1)`` without an activation
    and is trained with ``from_logits=True``, so this is a raw score (logit),
    not a probability.
  """
  token_ids = encoder.encode(sample_pred_text)
  if pad:
    token_ids = pad_to_size(token_ids, 64)

  # Cast to float32, then add a leading batch axis of size one.
  batch = tf.expand_dims(tf.cast(token_ids, tf.float32), 0)
  return model.predict(batch)

## Sample Tests (the model outputs logits: >0 is positive and <0 is negative sentiment)


In [26]:

# Score a short movie review without padding the encoded text.
sample_pred_text = (
    'The movie was great! The animation and the graphics '
    'were out of this world. I would recommend this movie.'
)
predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

[[0.72507745]]


In [24]:
# predict on a sample text without padding (pad=False)

sample_pred_text = ("When are the bad Fake Journalists, who received unwarranted Pulitzer Prizes for Russia, Russia, Russia, and the Impeachment Scam, going to turn in their tarnished awards so they can be given to the real journalists who got it right. I’ll give you the names, there are plenty of them!")
predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

[[2.4124477]]


In [28]:
# Score a sample text with the encoded ids zero-padded first (pad=True).

sample_pred_text = "We are getting great marks for the handling of the CoronaVirus pandemic, especially the very early BAN of people from China, the infectious source, entering the USA. Compare that to the Obama/Sleepy Joe  known as H1N1 Swine Flu."
predictions = sample_predict(sample_pred_text, pad=True)
print(predictions)

[[1.2739829]]
