#  MACHINE LEARNING INTERNSHIP
8bitWorriors
# WEEK 1 Task
#  dataset - IMDB reviews
#  by Shivansh Hingve


## Imports

Here, first we import common libraries that will be used throughout the exercise.

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

## Download and Prepare the Dataset

Next, download the `plain_text` version of the `IMDB Reviews` dataset.

In [None]:
# Fetch the IMDB reviews dataset along with its metadata. `as_supervised=True`
# yields each example as a (review, label) pair instead of a feature dict.
imdb, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

In [None]:
# Get the train and test sets
train_data, test_data = imdb['train'], imdb['test']

# Initialize sentences and labels lists
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# Every example is a (review, label) pair of tensors: decode the review bytes
# into a Python string and unwrap the label with `.numpy()`.
for sentence, label in train_data:
  training_sentences.append(sentence.numpy().decode('utf8'))
  training_labels.append(label.numpy())

# Same extraction for the test examples
for sentence, label in test_data:
  testing_sentences.append(sentence.numpy().decode('utf8'))
  testing_labels.append(label.numpy())

# Fit the encoder on the training labels only, then reuse that exact mapping
# for the test labels. Re-fitting on the test split could silently produce a
# different label -> integer mapping if the two splits ever differed.
le = LabelEncoder()
training_labels_final = le.fit_transform(training_labels)
testing_labels_final = le.transform(testing_labels)

Unlike the subword-encoded set, I need to build the vocabulary from scratch and generate padded sequences. We already know how to do that with the `Tokenizer` class and the `pad_sequences()` method.

In [None]:
# Parameters
vocab_size = 10000   # keep only the most frequent words when converting to sequences
max_length = 120     # every review is padded/truncated to this many tokens
trunc_type = 'post'  # drop tokens from the end of reviews longer than max_length
oov_tok = "<OOV>"    # placeholder token for words outside the vocabulary

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary for the training sentences only, so the
# vocabulary is not influenced by the test set
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Generate and pad the training sequences
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Generate and pad the test sequences with the SAME truncation setting as the
# training sequences; otherwise long test reviews would keep their last
# `max_length` tokens while long training reviews keep their first ones.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

## Plot Utility

Before defining the models, we will define the function below so we can easily visualize the accuracy and loss history after training.

In [None]:
import matplotlib.pyplot as plt

# Plot Utility
def plot_graphs(history, string):
  """Plot one metric's training and validation curves against the epochs.

  Args:
    history: object whose `.history` mapping holds the per-epoch values
      under the keys `string` and `'val_' + string` (e.g. the value
      returned by `model.fit`).
    string: name of the metric to plot, such as 'accuracy' or 'loss'.
  """
  val_key = 'val_' + string
  curves = history.history

  # Training curve first, validation curve second, matching the legend order
  for key in (string, val_key):
    plt.plot(curves[key])

  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

## Bidirectional LSTM Model
Lastly, I will use two stacked bidirectional LSTM layers to extract features from the dataset. Unlike a convolution model, no [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D) layer is needed to reduce the results here: the second LSTM layer returns only its final output, which is passed directly on to the dense layers.

In [None]:
# Parameters
embedding_dim = 16
lstm_units = 32     # units per direction in each LSTM layer
lstm_dropout = 0.1  # dropout applied to the LSTM inputs
dense_dim = 6

# NOTE: `filters` and `kernel_size` are convolution settings that the model
# below does not use; they are kept only in case another cell refers to them.
filters = 128
kernel_size = 5

# Model definition: embedding -> two stacked bidirectional LSTMs -> dense classifier
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True so the second LSTM layer receives the whole sequence
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, dropout=lstm_dropout, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, dropout=lstm_dropout)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    # Single sigmoid output, later read as the probability of a positive review
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Set the training parameters
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

In [None]:
NUM_EPOCHS = 20
BATCH_SIZE = 128

# Fit the model on the padded training reviews, using the padded test split
# as validation data so that `history` also records the `val_*` metrics.
history = model.fit(
    padded,
    training_labels_final,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(testing_padded, testing_labels_final),
)

## plotting the accuracy and loss history

In [None]:
# Show the accuracy curves first, then the loss curves
for metric in ('accuracy', 'loss'):
  plot_graphs(history, metric)

## Model Prediction

Here, we take a small sample of review comments and use the trained model to predict how positive each one is.

In [None]:
# Hand-written review sentences used to spot-check the trained model
validation_sentences = [
    "The film is a worthwhile watch.",
    "On a scale from zero to five, I give this film a five.",
    "The film literally brought tears to my eyes.",
    "I highly recommend this film",
]

In [None]:
# Tokenize each sentence as one whole text. `texts_to_sequences` expects a
# list of texts; handing it a bare string makes it iterate over the string and
# treat every single character as a separate text.
val_tok = tokenizer.texts_to_sequences(validation_sentences)

# Pad exactly like the training sequences (same length and truncation, default
# padding side) so the model sees the input layout it was trained on.
val_pad = pad_sequences(val_tok, maxlen=max_length, truncating=trunc_type)

# One prediction row per sentence; each row holds the single sigmoid output
predictions = model.predict(val_pad)

for val_set, predict in zip(validation_sentences, predictions):
  print(f"Sentence : {val_set}")
  # Index the row to get a scalar before converting it to a whole percentage
  print(f"Percentage of positivity in sentence : {int(predict[0] * 100)} %")