## Importing Modules

In [None]:
import io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
print(tf.__version__)

2.3.0


### Downloading Dataset from tensorflow_datasets

In [None]:
# Load the IMDB reviews as (text, label) pairs, together with the dataset metadata.
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

# Pull the two splits out of the returned mapping.
train_data = imdb['train']
test_data = imdb['test']

## Extracting Train and Test Sentences and their corresponding Labels

In [None]:
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

# Walk each split exactly once, decoding the review text from bytes to str and
# collecting the label as a NumPy scalar into the matching pair of lists.
for split, sentences, labels in (
    (train_data, training_sentences, training_labels),
    (test_data, testing_sentences, testing_labels),
):
  for sentence, label in split:
    sentences.append(sentence.numpy().decode('utf8'))
    labels.append(label.numpy())

# Keras expects array-like labels, so convert the Python lists once at the end.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


## Initializing Tokenizer and converting sentences into padded sequences

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 20000
embed_dims = 16
truncate = 'post'
pad = 'post'
oov_token = '<OOV>'
max_length = 150

# Fit the vocabulary on the training text only; words not kept by the
# tokenizer are mapped to the OOV token when sequences are generated.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_kwargs = dict(maxlen=max_length, padding=pad, truncating=truncate)

train_sequences = tokenizer.texts_to_sequences(training_sentences)
padded_train = pad_sequences(train_sequences, **pad_kwargs)

test_sequences = tokenizer.texts_to_sequences(testing_sentences)
padded_test = pad_sequences(test_sequences, **pad_kwargs)


## Decoding Sequences back into Texts by creating a reverse word index

In [None]:
# Invert the tokenizer's word -> index mapping so indices can be turned back into words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Turn a sequence of token indices back into a space-separated string.

  Indices that have no entry in ``reverse_word_index`` are rendered as '?'.
  """
  words = (reverse_word_index.get(token_id, '?') for token_id in text)
  return ' '.join(words)

# Compare the decoded (tokenized + padded) form of one review with its raw text.
print(decode_review(padded_train[3]))
print(training_sentences[3])

this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm chair and mellow for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and 

### Making DNN Model

In [None]:
# Small feed-forward classifier: embed each token, flatten the
# (max_length, embed_dims) matrix, then two dense layers ending in a sigmoid.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embed_dims, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=6, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Summary of the Model's Processing

In [None]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 16)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 2400)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 14406     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 334,413
Trainable params: 334,413
Non-trainable params: 0
_________________________________________________________________


### Initializing a callback to stop training early (to limit overfitting), then fitting the model on the data

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Keras callback that stops training once training accuracy exceeds 99%.

  The check runs at the end of every epoch and reads the 'accuracy' entry of
  the epoch logs (the training metric, not 'val_accuracy').
  """

  def on_epoch_end(self, epoch, logs=None):
    """Request early stopping when the epoch's training accuracy is above 0.99.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Dict of metric results for the epoch; may be None.
    """
    # Avoid a mutable default argument and tolerate logs=None.
    accuracy = (logs or {}).get('accuracy')
    # 'accuracy' is missing when the model was not compiled with that metric;
    # comparing None with a float would raise TypeError, so skip the check.
    if accuracy is not None and accuracy > 0.99:
      print("\nReached 99% accuracy so cancelling training!")
      self.model.stop_training = True
# Single callback instance that may end training before all epochs have run.
callbacks = myCallback()

# Train on the padded training reviews and evaluate on the test split after each epoch.
model.fit(
    x=padded_train,
    y=training_labels_final,
    epochs=10,
    callbacks=[callbacks],
    validation_data=(padded_test, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Reached 99% accuracy so cancelling training!


<tensorflow.python.keras.callbacks.History at 0x7f4d7422a6a0>

### Extracting Embeddings from the Model

In [None]:
# The Embedding layer is the first layer of the Sequential model.
embed_layer = model.layers[0]
# get_weights() returns a list of arrays; the first entry is the embedding matrix.
embed_weights = embed_layer.get_weights()[0]
# Expected shape: (vocab_size, embed_dims).
print(embed_weights.shape)

(20000, 16)


### Exporting meta.tsv and vecs.tsv (embeddings) to visualize it in tensorflow projector in spherical form

In [None]:
# Write one line per word: the word itself to meta.tsv and its tab-separated
# embedding vector to vecs.tsv, so both files stay row-aligned.
#
# Index 0 is skipped because it has no entry in reverse_word_index (it is the
# padding value). The upper bound is also capped by the number of words the
# tokenizer actually saw, so a corpus with fewer than vocab_size distinct
# words does not raise KeyError.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# The context manager closes both files even if the loop raises part-way.
with io.open("vecs.tsv", mode='w', encoding='utf-8') as out_v, \
     io.open("meta.tsv", mode='w', encoding='utf-8') as out_m:
  for word_num in range(1, last_index):
    word = reverse_word_index[word_num]
    embeddings = embed_weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

In [None]:
# Offer the exported files for download when running inside Google Colab;
# outside Colab the import fails and the step is skipped on purpose.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Testing the model on different Sentences ( if y_hat is above 0.5 review has been predicted positive and below 0.5 is a negative predicted review)

### Creating function to convert sentences into padded sequences with the same hyperparameters

In [None]:
def get_pad_sequence(sentence_val):
  """Tokenize a single sentence and pad/truncate it like the training data.

  Uses the fitted ``tokenizer`` and the same ``max_length``, ``pad`` and
  ``truncate`` settings as the training pipeline, and returns the padded
  batch containing that one sentence.
  """
  # texts_to_sequences works on a list of texts, so wrap the single sentence.
  return pad_sequences(
      tokenizer.texts_to_sequences([sentence_val]),
      maxlen=max_length,
      padding=pad,
      truncating=truncate,
  )

Trying Positive Review

In [None]:
# Hand-written positive example, preprocessed with the same tokenizer and padding settings.
sentence = "I really think this is amazing. honest."
padded_test_1 = get_pad_sequence(sentence)

A prediction close to 1 (about 0.98 here) means it's a very positive review, so the classifier handles this positive example well

In [None]:
# Sigmoid output in [0, 1]; values above 0.5 are read as a positive review.
model.predict(padded_test_1)

array([[0.9824178]], dtype=float32)

Trying Negative Review

In [None]:
# Hand-written negative example, preprocessed with the same tokenizer and padding settings.
sentence = "The movie was so boring , bad and not worth watching. I hated the movie and no one should have to sit through that"
padded_test_2 = get_pad_sequence(sentence)

In [None]:
# Sigmoid output in [0, 1]; values below 0.5 are read as a negative review.
model.predict(padded_test_2)

array([[0.00090715]], dtype=float32)

A prediction of about 0.0009 (very close to 0) means it's a very negative review, so the model is correct