# IMPORT DEPENDENCIES

In [9]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import regex as re
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

## IMDB Dataset Download

In [10]:
# Load the IMDB reviews dataset as (review, label) pairs (as_supervised=True).
# The 'test' split is cut in half: the first 50% becomes the validation set and
# the remaining 50% is kept as the held-out test set.
trainDS, valDS, testDS = tfds.load('imdb_reviews',
                                   split=['train', 'test[:50%]', 'test[50%:]'],
                                   as_supervised=True)

In [11]:
trainDS # -> Display the dataset object to inspect its element spec (string review, int64 label)

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [12]:
# Preview the first two raw (review, label) examples of the training split.
for example in trainDS.take(2):
    rawReview, sentiment = example
    print('Review:', rawReview.numpy())
    print('Label:', sentiment.numpy())
    print('\n')

Review: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Label: 0


Review: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbis

2025-02-22 18:20:19.606716: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-02-22 18:20:19.606937: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## TEXT STANDARDIZATION

### 1. Convert to Lower Case
### 2. Remove HTML Tags
### 3. Remove Punctuation
### 4. Stemming : Reduce a word to its stem (a crude base form that need not be a real word, e.g. "Tensed" -> "tens") -> Porter Stemmer

In [13]:
# Stem two sample words with a single shared stemmer instance.
# The printed stems are lower-cased and need not be dictionary words.
stemmer = PorterStemmer()
for word in ('Coming', 'Tensed'):
    print(stemmer.stem(word))

come
tens


### 5. Lemmatization : Similar to stemming, but uses a vocabulary and the word's part of speech to return a valid dictionary form (the lemma)

In [19]:
# WordNetLemmatizer needs the 'wordnet' and 'omw-1.4' corpora. Fetch each one only
# when it is not already installed, so the notebook runs end-to-end on a fresh
# environment without re-downloading on every execution.
for corpusName in ('wordnet', 'omw-1.4'):
    try:
        nltk.data.find(f'corpora/{corpusName}')
    except LookupError:
        nltk.download(corpusName)

In [15]:
# NOTE(review): the captured output of this cell is identical to the inputs, so no
# lemmatization is actually demonstrated. WordNet lookups appear to be case-sensitive;
# lower-casing the words first (e.g. 'coming') should yield the lemma. Also confirm
# that pos=wordnet.ADJ is really intended for 'Tensed' -- TODO verify.
print(WordNetLemmatizer().lemmatize('Coming', pos=wordnet.VERB))
print(WordNetLemmatizer().lemmatize('Tensed', pos=wordnet.ADJ))

Coming
Tensed


In [17]:
def standardization(inputData):
    """Normalise raw review text before it is tokenised.

    Steps, applied in order:
      1. lower-case the text,
      2. replace every HTML-like tag (``<...>``) with a single space,
      3. delete every character listed in ``string.punctuation``.

    Args:
        inputData: string tensor (or batch of strings) holding the raw review text.

    Returns:
        The result of the final ``tf.strings.regex_replace`` call, i.e. the cleaned text.
    """
    # Character class matching any single punctuation character.
    punctuationClass = "[%s]" % re.escape(string.punctuation)

    cleaned = tf.strings.lower(inputData)
    cleaned = tf.strings.regex_replace(cleaned, "<[^>]+>", " ")
    return tf.strings.regex_replace(cleaned, punctuationClass, "")

In [18]:
# Show two raw reviews next to their standardized form.
for example in trainDS.take(2):
    rawReview, sentiment = example
    print('Review:', rawReview.numpy())
    print('Label:', sentiment.numpy())
    print('Standardized Review:', standardization(rawReview).numpy())
    print('\n')

Review: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Label: 0
Standardized Review: b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline th

2025-02-22 18:20:31.286725: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-02-22 18:20:31.287181: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## TOKENIZATION

### 1. Character Tokenization -> Smaller Vocabulary
#### "i love this movie" -> i, l, o, v, e, t, h, i, .......
### 2. Word Tokenization -> Larger Vocabulary
#### "i love this movie" -> i, love, this, movie
### 3. Subword Tokenization -> Middle Ground
#### "i love this movie" -> i, lov, e, thi, s, mov, ie
### 4. N-Gram Tokenization -> Combines N words as single word
#### "i love this movie" -> i love, love this, this movie (2-Gram)

## NUMERICALIZATION OF TOKENS

### 1. One-Hot Encoding -> Returns matrix of size (number of tokens, vocab)
### 2. Bag-Of-Words -> Returns a single vector of size (vocab), with each value as count of the words in sentence
### 3. tf-idf Encoding -> Term Frequency / Inverse Document Frequency
#### Term Frequency = No. of times word occurs in the sentence / No. of words in the sentence
#### Inverse Document Frequency = log(No. of sentences / No. of sentences with the word)
#### Final Encoding = tf * idf
### 4. Embeddings -> Aims to reduce sparsity and dimensions
#### Embedded Matrix = One-Hot Matrix * Embedding Matrix
#### => (4, 10000) * (10000, 300) = (4, 300), where 4 is the number of tokens, 10000 is the vocabulary size and 300 is the embedding dimension
#### ** This can also encode semantic relation between words **
#### *** Embedding Matrix is a trainable layer ***

## TEXT VECTORIZATION LAYER -> Combines Tokenization & Numericalization

In [24]:
VOCAB_SIZE = 10000 # -> Maximum vocabulary size (used as max_tokens of the vectorization layer and input_dim of the embedding)
SEQUENCE_LENGTH = 200 # -> Length of every vectorized review (used as output_sequence_length)

In [25]:
# Text vectorization layer: cleans the text with the custom `standardization` function,
# keeps at most VOCAB_SIZE tokens, and emits one integer id per token (output_mode='int').
# Each output has length SEQUENCE_LENGTH; shorter reviews are padded with 0.
vectorizationLayer = tf.keras.layers.TextVectorization(standardize=standardization,
                                                        max_tokens=VOCAB_SIZE,
                                                        output_sequence_length=SEQUENCE_LENGTH,
                                                        output_mode='int')

In [26]:
# -> Keep only the review text; the label is not needed to build the vocabulary
trainingData = trainDS.map(lambda text, _: text)
# -> Adapt the layer to the training reviews: this builds a vocabulary of the most
# -> common words (limited to VOCAB_SIZE) and maps each of them to an integer.
# -> Because the output mode is 'int', calling the layer afterwards returns a sequence
# -> of integers (limited to SEQUENCE_LENGTH), one per word in the vocabulary.
vectorizationLayer.adapt(trainingData)

2025-02-23 17:46:32.047953: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [27]:
vectorizationLayer.vocabulary_size() # -> Returns the size of the adapted vocabulary (capped by max_tokens=VOCAB_SIZE)

10000

In [28]:
def vectorizer(review, label):
    """Vectorize a review and pass its label through unchanged.

    Args:
        review: raw review text to feed through ``vectorizationLayer``.
        label: label belonging to the review; returned as-is.

    Returns:
        A ``(vectorized review, label)`` tuple.
    """
    tokenIds = vectorizationLayer(review)
    return tokenIds, label

In [29]:
# NOTE: both names are rebound to the mapped datasets, so re-running this cell without
# re-running the load cell first would feed already-vectorized data into `vectorizer`.
# NOTE(review): testDS is not vectorized here -- map it too before evaluating on it.
trainDS = trainDS.map(vectorizer) # -> Maps the vectorizer function to the training data
valDS = valDS.map(vectorizer) # -> Maps the vectorizer function to the validation data

In [30]:
# Inspect one vectorized example: the review is now a fixed-length array of token ids.
for example in trainDS.take(1):
    tokenIds, sentiment = example
    print('Review:', tokenIds.numpy())
    print('Label:', sentiment.numpy())
    print('\n')

Review: [  11   13   33  414  380   17   89   26    1    8   32 1336 3543   41
  489    1  190   23   84  151   18   11  219  318   26   64  241  211
    8  476   53   64   84  111   97   21 5574   11   92  633  729   11
   17    7   33  396 9231  167 2462  408    2   88 1192  135   65  143
   51    2    1 7488   65  247   64 2835   15    1 2869    1    1 1436
 4861    3   39    1 1546   16 3543   13  156   18    4 1192  894 7958
    8    4   17   12   13 4064    5   99  146 1229   10  236  663   12
   47   23   92   38   11 7269  151   38 1336    1   49  397   10   97
 1170  856  140    9    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    

2025-02-23 17:46:32.336214: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [31]:
# tf.data transformations return a NEW dataset instead of modifying the one they are
# called on, so the result has to be assigned back; otherwise the prefetch is silently
# discarded and the datasets used for training are unchanged.
trainDS = trainDS.prefetch(tf.data.AUTOTUNE) # -> Prefetch training data to overlap input processing with training
valDS = valDS.prefetch(tf.data.AUTOTUNE) # -> Prefetch validation data as well
valDS # -> Display the prefetched validation dataset (element spec)

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

## TRAINING THE EMBEDDING LAYER

In [33]:
embeddingDimension = 128 # -> Size of the vector learned for each token
embeddingModel = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None,), dtype=tf.int64), # -> Variable-length sequence of integer token ids
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=embeddingDimension), # -> Trainable lookup: token id -> vector of size embeddingDimension
    tf.keras.layers.GlobalAveragePooling1D(), # -> Averages the token vectors over the sequence axis
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid') # -> Single sigmoid unit for the binary label
])
# -> This model takes a sequence of integer token ids as input and outputs a single
# -> sigmoid value between 0 and 1 (a score for the positive class), not an integer
# NOTE(review): the Embedding layer is created without mask_zero, so the 0-padding ids
# presumably contribute to the average -- consider mask_zero=True; TODO confirm.

In [34]:
embeddingModel.summary() # -> This will print a per-layer summary of the model

In [35]:
embeddingModel.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
# -> This will compile the model with the Adam optimizer and binary crossentropy loss
# -> (binary crossentropy matches the single sigmoid output unit); accuracy is tracked as a metric

In [36]:
# Train for 10 epochs on shuffled mini-batches of 32 (shuffle buffer of 1000 examples);
# the validation set is batched the same way but not shuffled.
# NOTE(review): in the captured log val_loss is lowest at epoch 2 and rises afterwards
# while training accuracy keeps climbing, i.e. the model overfits -- consider fewer
# epochs or an EarlyStopping callback.
embeddingModel.fit(trainDS.shuffle(1000).batch(32),
                    epochs=10,
                    validation_data=valDS.batch(32),
                    verbose=1)

Epoch 1/10


2025-02-23 17:56:49.847210: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.7019 - loss: 0.5629 - val_accuracy: 0.8219 - val_loss: 0.3781
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8771 - loss: 0.2983 - val_accuracy: 0.8598 - val_loss: 0.3322
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9087 - loss: 0.2322 - val_accuracy: 0.8626 - val_loss: 0.3407
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9224 - loss: 0.2068 - val_accuracy: 0.8520 - val_loss: 0.3722
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9343 - loss: 0.1832 - val_accuracy: 0.8566 - val_loss: 0.3791
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9399 - loss: 0.1706 - val_accuracy: 0.8262 - val_loss: 0.4846
Epoch 7/10
[1m782/782[0m [32m

<keras.src.callbacks.history.History at 0x36b60d250>

In [41]:
embeddingModel.layers[0].output # -> Displays the symbolic output tensor of layers[0]; its last dimension (128 = embeddingDimension) shows this is the Embedding layer

<KerasTensor shape=(None, None, 128), dtype=float32, sparse=False, name=keras_tensor_1>