<a href="https://colab.research.google.com/github/GOURAVRISHI/NLP_lectures/blob/main/BiRNN_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import os
# Working directory for this notebook: a folder on Google Drive as seen from Colab.
# NOTE(review): assumes Drive is already mounted at /content/drive -- no mount call is visible here; confirm.
ROOT= "/content/drive/MyDrive/Colab Notebooks/text_generation"
# Relative paths used later (e.g. Config.BASE_LOG_DIR) resolve against this directory.
os.chdir(ROOT)

In [77]:
import tensorflow as tf
import numpy as np

import time
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt 

In [78]:
# Name of the TFDS dataset to load (IMDB movie reviews).
dataset_name= "imdb_reviews"

# with_info=True additionally returns the dataset metadata object (`info`);
# as_supervised=True yields (text, label) pairs instead of feature dicts.
dataset, info= tfds.load(dataset_name , with_info= True, as_supervised= True)

In [79]:
# List the splits returned by tfds.load.
dataset.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [80]:
# Display the full dataset metadata (features, split sizes, citation, ...).
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [81]:
# Dataset name as recorded in the metadata.
info.name

'imdb_reviews'

In [82]:
# BibTeX citation string for the dataset.
info.citation

'@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}'

In [91]:
# Keep the two labelled splits under short names for the rest of the notebook.
train_ds = dataset["train"]
test_ds = dataset["test"]

In [92]:
# Peek at the first three (text, label) pairs of the training split.
for example, label in train_ds.take(3):
  review_bytes = example.numpy()
  sentiment = label.numpy()
  print(f"sample text:\n{review_bytes}\n")
  print(f"label:\n{sentiment}\n")

sample text:
b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

label:
0

sample text:
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film 

In [85]:
# Print (rather than show the repr) so the newlines in the description render.
print(info.description)

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.


In [93]:
class Config:
  """Hyper-parameters and output locations shared by the cells of this notebook."""

  # --- input pipeline ---
  BUFFER_SIZE = 10_000  # shuffle buffer size
  BATCH_SIZE = 64

  # --- text encoding / embedding ---
  VOCAB_SIZE = 1_000  # max_tokens given to the TextVectorization layer
  OUTPUT_DIM = 64  # size of each embedding vector

  # --- training ---
  EPOCHS = 10

  # --- output folders (relative to the current working directory) ---
  BASE_LOG_DIR = "base_log_dir"
  TRAINED_MODEL_DIR = os.path.join(BASE_LOG_DIR, "models")
  CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, "ckpt")
  TB_ROOT_LOG_DIR = os.path.join(BASE_LOG_DIR, "tb_log_dir")

In [94]:
# Build the input pipelines from the raw splits (not from `train_ds`/`test_ds`
# themselves) so that re-running this cell never batches an already-batched dataset.
# Training data is shuffled within a BUFFER_SIZE window and then batched;
# test data is only batched.
# AUTOTUNE lets tf.data pick the prefetch buffer size dynamically.

train_ds = dataset["train"].shuffle(Config.BUFFER_SIZE).batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds = dataset["test"].batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [95]:
# Inspect one training batch: its texts, its labels, and how many items it holds.
# `example` and `label` stay bound after the loop and are reused by later cells.
for example, label in train_ds.take(1):
  batch_labels = label.numpy()
  print(f"sample text:\n{example.numpy()}\n")
  print(f"label:\n{batch_labels}\n")
  print(f"label length:{len(batch_labels)}\n")


sample text:
[b"'Renaissance (2006)' was created over a period of six years, co-funded by France, Luxembourg and the United Kingdom at a cost of around \xc2\x8014 million. The final result is a staggering accomplishment of comic-book style animation, aesthetically similar to what Robert Rodriguez and Frank Miller achieved with 'Sin City (2005),' but this film employed motion capture with live-actors to translate their faces and movements into an entirely animated format. Presented in stark black-and-white, the film looks as though it has been hoisted from the very pages of the graphic novel on which it was based, and the futuristic city of Paris looms ominously above us. Directed by French filmmaker Christian Volckman, in his feature-length debut, 'Renaissance' draws significantly from other films in the science-fiction genre, and the tech-noir storyline isn't something we haven't seen before, but, from a technical standpoint, it is faultless.<br /><br />The year is 2054. The city of P

In [89]:
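# NOTE: a tf.data.Dataset cannot be indexed, so `train_ds[["label"]]` does not work;
# iterate instead, e.g. `for text, label in train_ds.take(1): ...`.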

In [96]:
# text encoding

encoder = tf.keras.layers.TextVectorization(max_tokens=Config.VOCAB_SIZE)

encoder.adapt(train_ds.map(lambda text, label: text))

In [97]:
# Display the adapted TextVectorization layer object.
encoder

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f7c02d7c2d0>

In [98]:
# First 20 entries of the vocabulary learned by the encoder.
# In the output below, index 0 is '' (the padding entry) and index 1 is '[UNK]'
# (out-of-vocabulary); the remaining entries are tokens from the training texts.

vocab= np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [99]:
# First three reviews of the batch left bound to `example` by the batch-inspection loop.
example.numpy()[:3]

array([b"'Renaissance (2006)' was created over a period of six years, co-funded by France, Luxembourg and the United Kingdom at a cost of around \xc2\x8014 million. The final result is a staggering accomplishment of comic-book style animation, aesthetically similar to what Robert Rodriguez and Frank Miller achieved with 'Sin City (2005),' but this film employed motion capture with live-actors to translate their faces and movements into an entirely animated format. Presented in stark black-and-white, the film looks as though it has been hoisted from the very pages of the graphic novel on which it was based, and the futuristic city of Paris looms ominously above us. Directed by French filmmaker Christian Volckman, in his feature-length debut, 'Renaissance' draws significantly from other films in the science-fiction genre, and the tech-noir storyline isn't something we haven't seen before, but, from a technical standpoint, it is faultless.<br /><br />The year is 2054. The city of Paris is

In [100]:
# Encode those three reviews as integer token ids; the output shows shorter reviews padded with 0.
encoder(example.numpy()[:3])

<tf.Tensor: shape=(3, 515), dtype=int64, numpy=
array([[  1,   1,  14, ...,   0,   0,   0],
       [  4,   1,  64, ...,   0,   0,   0],
       [  1, 365, 827, ...,   3,  65, 102]])>

In [101]:
# Vocabulary size, counting the '' and '[UNK]' entries.
len(encoder.get_vocabulary())

1000

In [102]:
embedding_layer = tf.keras.layers.Embedding(
    input_dim = len(encoder.get_vocabulary()), # 1000
    output_dim = Config.OUTPUT_DIM, # 64
    mask_zero = True
) # mask_zero=True treats token id 0 as padding and produces a mask for it,
# which is how the downstream layers cope with variable-length sequences.
# Id 0 is the '' entry of the encoder vocabulary and id 1 is '[UNK]'.
# No <sos>/<eod> markers appear in the encoder vocabulary shown in this notebook.


In [103]:
# Recurrent part: an LSTM with 64 units run in both directions over the sequence.
bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
# Classification head: one hidden ReLU layer, then a single unit without activation.
hidden_dense = tf.keras.layers.Dense(64, activation="relu")
logit_dense = tf.keras.layers.Dense(1)

# Raw strings -> token ids -> embeddings -> BiLSTM -> dense head.
Layers = [encoder, embedding_layer, bi_lstm, hidden_dense, logit_dense]

model= tf.keras.Sequential(Layers)

In [104]:
# Layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_6 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 138,369
Trainable params: 138,369
Non-tr

In [106]:
# Report, in model order, whether each layer supports masking.
# Per the output below, only the first layer (the text encoder) reports False.

for layer in model.layers: 
  print(layer.supports_masking)  

False
True
True
True
True


In [107]:
# The model's last Dense layer has no activation, so it outputs raw logits.
# The loss is told so via from_logits=True, and the accuracy metric must match:
# the plain "accuracy" string resolves to binary accuracy with a 0.5 cut-off,
# which is meant for probabilities. For logits the decision boundary is 0.0.
# name="accuracy" keeps the history keys ("accuracy" / "val_accuracy") unchanged.
model.compile(
    loss= tf.keras.losses.BinaryCrossentropy(from_logits= True),
    optimizer= tf.keras.optimizers.Adam(1e-4),
    metrics= [tf.keras.metrics.BinaryAccuracy(name= "accuracy", threshold= 0.0)]
)

In [108]:
# Preview of the run-identifier format used by callbacks() for TensorBoard folders:
# the current local time with spaces replaced by underscores and colons removed.
# NOTE(review): `time` is already imported in the imports cell; this re-import is redundant.
import time 
time.asctime().replace(" ", "_").replace(":", "")

'Mon_May_30_210028_2022'

In [109]:
# NOTE(review): exact duplicate of the previous cell (only the timestamp in the output differs);
# candidate for removal.
import time 
time.asctime().replace(" ", "_").replace(":", "")

'Mon_May_30_210029_2022'

In [110]:
# logs folder will be created 
def callbacks(base_dir= "."):
  """Build the Keras callbacks used during training.

  Creates (if missing) a unique TensorBoard log folder and the checkpoint
  folder, both located under `base_dir`, and returns the callbacks that
  write to them.

  Args:
    base_dir: directory under which Config.TB_ROOT_LOG_DIR and
      Config.CHECKPOINT_DIR are placed. With the default "." the resulting
      paths are the same relative paths as Config defines them.

  Returns:
    list: [TensorBoard callback, ModelCheckpoint callback].
  """
  # tensorboard callback: one sub-folder per run, named after the current time
  unique_log= time.asctime().replace(" ", "_").replace(":", "")
  # normpath drops a leading "./" so the default base_dir leaves paths untouched
  tensorboard_log_dir= os.path.normpath(
      os.path.join(base_dir, Config.TB_ROOT_LOG_DIR, unique_log))
  os.makedirs(tensorboard_log_dir, exist_ok= True)

  tb_cb = tf.keras.callbacks.TensorBoard(log_dir= tensorboard_log_dir)

  # ckpt callback: the same target path is reused on every save
  ckpt_dir= os.path.normpath(os.path.join(base_dir, Config.CHECKPOINT_DIR))
  os.makedirs(ckpt_dir, exist_ok= True)
  ckpt_file = os.path.join(ckpt_dir, "model")

  # save_best_only=True: only overwrite the checkpoint when the monitored
  # quantity improves (Keras monitors "val_loss" unless told otherwise).
  ckpt_cb= tf.keras.callbacks.ModelCheckpoint(
      filepath= ckpt_file,
      save_best_only= True
  )

  return [tb_cb, ckpt_cb]

In [111]:
# Build the callbacks once; this call also creates the log and checkpoint folders.
callback_list= callbacks()

In [None]:
# Train for Config.EPOCHS epochs, logging to TensorBoard and checkpointing via callback_list.
# validation_steps=30 limits each end-of-epoch validation pass to 30 batches of test_ds
# rather than the full test split.
history = model.fit(train_ds,
                    epochs= Config.EPOCHS, 
                    validation_data = test_ds,
                    validation_steps = 30,
                    callbacks= callback_list)

Epoch 1/10

In [None]:
# Score the trained model on the complete test split and report both numbers.
test_loss, test_acc = model.evaluate(test_ds)

for title, value in (("test loss", test_loss), ("test accuracy", test_acc)):
  print(f"{title}: {value}")

In [None]:
def get_plot(history, metric):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose `.history` mapping holds per-epoch value lists,
      as returned by `model.fit`.
    metric: key of the training curve (e.g. "accuracy" or "loss"); the
      validation curve is read from the key "val_<metric>".
  """
  val_metric = f"val_{metric}"
  curves = history.history
  # Draw the training curve first, then the validation curve.
  for key in (metric, val_metric):
    plt.plot(curves[key])
  plt.xlabel("Epochs -->")
  plt.ylabel(f"{metric} -->")
  plt.legend([metric, val_metric])


In [None]:
# Training vs. validation accuracy per epoch.
get_plot(history, metric= "accuracy")

In [None]:
# Training vs. validation loss per epoch.
get_plot(history, metric= "loss")

In [None]:
# Enable the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Open TensorBoard on the log root that the TensorBoard callback writes to (Config.TB_ROOT_LOG_DIR).
%tensorboard --logdir base_log_dir/tb_log_dir

In [None]:
# A short, clearly positive review used to sanity-check the trained model.
sample_text = (
    "The movie was cool. "
    "The animation and the graphics were out of the world. "
    "I would recommend this movie."
)

In [None]:
# The final Dense(1) layer has no activation, so this returns a raw logit, not a probability:
# values above 0 lean towards label 1, values below 0 towards label 0
# (label 0 is the negative class, as the sample review printed earlier shows).
model.predict([sample_text])