<a href="https://colab.research.google.com/github/AAKAAASSHHH24/NLP-BASICS/blob/main/Sentiment_analysis_BiRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Run the notebook from the project folder so that relative paths resolve against it.
# NOTE(review): presumably a Google Drive folder mounted in Colab - confirm it is mounted first.
ROOT = "/content/drive/MyDrive/projects/NLP"
os.chdir(ROOT)

In [3]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt

In [7]:
# Load the IMDB reviews dataset together with its metadata object.
# as_supervised=True makes every element a (text, label) pair.
dataset_name = 'imdb_reviews'
dataset, info = tfds.load(
    dataset_name,
    with_info=True,
    as_supervised=True,
)

In [8]:
# List the splits contained in the loaded dataset dictionary.
dataset.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [9]:
# Display the dataset metadata that tfds.load returned because with_info=True.
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        '

In [10]:
# Pick the two labelled splits out of the dataset dictionary.
train_ds = dataset['train']
test_ds = dataset['test']

In [12]:
# Peek at the first three (text, label) pairs of the training split.
for example, label in train_ds.take(3):
  review_bytes = example.numpy()
  print(f"sample_text:\n{review_bytes} \n", f"label:\n{label} \n")


sample_text:
b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 
 label:
0 

sample_text:
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the fil

In [13]:
class Config:
  """Hyper-parameters and output locations shared by the notebook cells."""

  # --- input pipeline ---
  BUFFER_SIZE = 10_000  # handed to Dataset.shuffle
  BATCH_SIZE = 64       # handed to Dataset.batch

  # --- text encoding / model ---
  VOCAB_SIZE = 1_000    # max_tokens of the TextVectorization layer
  OUTPUT_DIM = 64       # output_dim of the Embedding layer
  EPOCHS = 10

  # --- output directories, all nested under BASE_LOG_DIR ---
  BASE_LOG_DIR = "base_log_dir"
  TRAINED_MODEL_DIR = os.path.join(BASE_LOG_DIR, "models")
  CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, "ckpt")
  TB_ROOT_LOG_DIR = os.path.join(BASE_LOG_DIR, "tb_log_dir")


In [14]:
# Input pipeline: shuffle and batch the training split; the test split is only batched.
# prefetch(AUTOTUNE) leaves the prefetch buffer size to tf.data.

train_ds = (
    train_ds
    .shuffle(Config.BUFFER_SIZE)
    .batch(Config.BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# No shuffling for the test split.
test_ds = (
    test_ds
    .batch(Config.BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [16]:
# Inspect one batch: its texts, its labels, and how many labels it holds.
# The loop variable `example` is read again by a later cell, so its name is kept.
for example, label in train_ds.take(1):
  batch_texts = example.numpy()
  labels_in_batch = len(label.numpy())
  print(f"sample_text:\n{batch_texts} \n", f"label:\n{label} \n")
  print(f"label:\n{labels_in_batch} \n")


sample_text:
[b"The original Lensman series of novels is a classic of the genre. It's pure adventure SF with some substance (here and there) and I've always wondered why Hollywood hasn't filmed it verbatim because it's just the kind of thing they love: massive explosions, super-weapons, uber-heroics, hero gets the girl, aliens (great CGI potential), good versus evil in the purest form, etc etc. Instead (and bear in mind I'm a Japan-o-phile and anime lover) we get this horrendous kiddies movie that rips the guts out of the story, mixes in Star-Wars (ironic as the latter ripped off the books occasionally) pastiches and dumbs the whole thing down to 'Thundercats' level. To see Kimball Kinnison, the epitome of the Galactic Patrol officer and second stage Lensman portrayed as a small boy is pitiful (etc). I just can't understand why the makers did this because they obviously had the rights to the story and could have made far more money (FAR!) by telling straight. It makes no sense."
 b'Im 

In [17]:
# text encoding
encoder = tf.keras.layers.TextVectorization(max_tokens= Config.VOCAB_SIZE)
encoder.adapt(train_ds.map( lambda text, label:text)) #adapting our train data to encoder

In [19]:
# Show the first 40 entries of the vocabulary held by the encoder.
vocabulary_tokens = encoder.get_vocabulary()
vocab = np.array(vocabulary_tokens)
vocab[:40]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be',
       'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so',
       'like'], dtype='<U14')

In [20]:
# note: '' (index 0) is the padding entry and [UNK] (index 1) stands for words that are not in the vocabulary

In [22]:
# Encode the first three reviews of the batch still held in `example`
# (left over from the batch-inspection loop in an earlier cell).
first_three_reviews = example.numpy()[:3]
encoder(first_three_reviews)

<tf.Tensor: shape=(3, 165), dtype=int64, numpy=
array([[  2, 198,   1, 204,   5,   1,   7,   4, 350,   5,   2, 533,  30,
          1,   1,   1,  17,  47,   1, 132,   3,  48,   3, 195, 203,   1,
        134, 355,   1, 810,   9,   1,  80,  30,  41,   2, 238,   5, 151,
         35, 116,   1,   1,   1,   1, 666, 202,   2, 247,   1,  85,   1,
        963,  50,   1, 435,   8,   2,   1, 801, 572, 572, 297,   3,   1,
          8, 349, 142,   4,   1,   3,   1,   1,  72,  76,  11,   1,   1,
         18,  12,   1,   2,   1,  46,   5,   2,  64,   1,   8,   1,   1,
         15,   2,   1,   1, 127,   2,   1,   1,   1,   3,   1,   2, 219,
        151, 187,   6,   1, 680,   6,  68,   1,   1,   2,   1,   5,   2,
          1,   1,   1,   3, 326, 870,   1, 972,  15,   4, 386, 444,   7,
          1, 572,  10,  41, 175, 373, 134,   2,   1, 117,  11,  80,  35,
        518,  67,   2,   1,   6,   2,  64,   3,  96,  26,  91, 234,  52,
        276, 234,  33,   1, 790,   9, 159,  57, 278],
       [142, 147,   9,

In [28]:
# Embedding table: one Config.OUTPUT_DIM-sized vector per vocabulary entry.
# mask_zero=True marks index 0 (the '' padding entry of the vocabulary) as padding,
# which is how the variable sequence lengths are handled by the layers that follow.
vocabulary_size = len(encoder.get_vocabulary())
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocabulary_size,
    output_dim=Config.OUTPUT_DIM,
    mask_zero=True,
)

In [25]:
# Model stack: raw text -> token ids -> embeddings -> bidirectional LSTM -> dense head.
bidirectional_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_dense = tf.keras.layers.Dense(64, activation="relu")
logit_output = tf.keras.layers.Dense(1)  # no activation on the final layer

Layers = [encoder, embedding_layer, bidirectional_lstm, hidden_dense, logit_output]

model = tf.keras.Sequential(Layers)

In [26]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 64)          64000     
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 138,369
Trainable params: 138,369
Non-trai

In [27]:
# Print, one line per layer, the layer's `supports_masking` flag.
for layer in model.layers:
  masking_flag = layer.supports_masking
  print(masking_flag)

False
True
True
True
True


ABOVE WE CHECKED WHETHER EACH LAYER SUPPORTS MASKING. Only the first layer (the TextVectorization encoder) reports False.

In [29]:
# The model's last layer has no activation, so the loss is told to expect logits.
binary_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
adam_optimizer = tf.keras.optimizers.Adam(1e-4)

model.compile(loss=binary_loss, optimizer=adam_optimizer, metrics=["accuracy"])

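If the last layer of a neural network has no activation function, its raw outputs are called logits. That is the case here: the final `Dense(1)` layer has no activation, which is why the loss is created with `from_logits=True`. If the last layer applied a sigmoid or softmax instead, the outputs would be probabilities rather than logits.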

In [30]:
# for callbacks: preview a timestamp string with spaces turned into underscores and colons removed

import time
timestamp = time.asctime()
timestamp.replace(' ', "_").replace(":", "")

'Wed_Dec_21_182139_2022'

In [None]:
def callbacks(base_dir = "."):
  """Prepare the callbacks used for training.

  The part visible here builds a uniquely named log directory under
  Config.TB_ROOT_LOG_DIR and a TensorBoard callback that logs into it.

  NOTE(review): `base_dir` is not used and nothing is returned in the
  visible portion; the definition may continue beyond this excerpt - confirm.
  """
  # tensorboard_callback
  # Timestamp with spaces -> underscores and colons removed, used as the run's folder name.
  unique_log = time.asctime().replace(' ', "_").replace(":", "")
  # Fixed: this line used the undefined name `od` instead of `os`.
  tensorboard_log_dir = os.path.join(Config.TB_ROOT_LOG_DIR, unique_log)

  os.makedirs(tensorboard_log_dir, exist_ok = True)

  tb_cb = tf.keras.callbacks.TensorBoard(log_dir = tensorboard_log_dir )
