In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string
import matplotlib.pyplot as plt

  from pandas.core import (


In [2]:
# Single seed reused by the dataset split and by every random number generator.
seed = 42
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call;
# seeding only NumPy and TensorFlow leaves Python's own generator unseeded.
tf.keras.utils.set_random_seed(seed)

In [3]:
# Dataset folder, relative to the notebook's working directory.
# No leading slash: the value is appended to `cwd`, and a leading "/" would
# make it look like (and be joined as) an absolute path.
data_dir = 'data'

In [4]:
# Remember the working directory (dataset paths are built from it) and show
# its contents followed by the directory itself.
cwd = os.getcwd()
print(os.listdir(), cwd, sep='\n')

['SentAnalysisNN.ipynb', 'data']
/Users/adithyashanker/Desktop/NLP/Course3


In [5]:
# Source folders; labels are inferred from the sub-directory names.
train_dir = f'{cwd}/{data_dir}/train'
test_dir = f'{cwd}/{data_dir}/test'

# Options shared by all three splits.
split_options = dict(labels='inferred', label_mode='int', batch_size=32)
# Options shared by the training/validation pair: both calls use the same
# fraction and the same seed.
holdout_options = dict(validation_split=0.2, seed=seed)

# Training set: the 80% of the train folder that is not held out.
raw_training_set = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='training', **split_options, **holdout_options
)

# Validation set: the held-out 20% of the train folder.
raw_validation_set = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='validation', **split_options, **holdout_options
)

# Test set: the whole test folder, no hold-out.
raw_test_set = tf.keras.utils.text_dataset_from_directory(
    test_dir, **split_options
)

Found 5000 files belonging to 2 classes.
Using 4000 files for training.
Found 5000 files belonging to 2 classes.
Using 1000 files for validation.
Found 5000 files belonging to 2 classes.


In [6]:
# Peek at the first three raw reviews (as bytes) and their integer labels.
for text_batch, label_batch in raw_training_set.take(1):
    # Convert each tensor to NumPy once instead of once per printed line.
    sample_reviews = text_batch.numpy()
    sample_labels = label_batch.numpy()
    for i in range(3):
        print(f"Review:\n {sample_reviews[i]}")
        print(f"Label: {sample_labels[i]}\n")


Review:
 b'This is a reunion, a team, and a great episode of Justice. From hesitation to resolution, Clark has made a important leap from a troubled teenager who was afraid of a controlled destiny, to a Superman who, like Green Arrow, sets aside his emotions to his few loved ones, ready to save the whole planet. This is not just a thrilling story about teamwork, loyalty, and friendship; this is also about deciding what\'s more important in life, a lesson for Clark. I do not want the series to end, but I hope the ensuing episodes will strictly stick to what Justice shows without any "rewind" pushes and put a good end here of Smallville---and a wonderful beginning of Superman.<br /><br />In this episode, however, we should have seen more contrast between Lex and the Team. Nine stars should give it enough credit.'
Label: 1

Review:
 b'"Hey Babu Riba" is a film about a young woman, Mariana (nicknamed "Esther" after a famous American movie star), and four young men, Glenn, Sacha, Kicha, and

In [7]:
def custom_standardization(data):
    """Normalize raw review text before tokenization.

    Steps, in order: lowercase the text, replace every literal ``<br />`` tag
    with a single space, then delete every character found in
    ``string.punctuation``.

    Args:
        data: String input accepted by ``tf.strings`` ops.

    Returns:
        The cleaned text, as returned by ``tf.strings.regex_replace``.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_class, '')

In [8]:
# Vectorizer that maps raw review strings to integer token ids.
text_layer = layers.TextVectorization(
    max_tokens=10000,                    # upper bound on the vocabulary size
    output_mode='int',                   # emit integer token ids
    output_sequence_length=250,          # fixed output length per review
    standardize=custom_standardization,  # cleaning applied before tokenizing
)

In [9]:
# The vocabulary is built from text only, so drop the labels first.
train_text = raw_training_set.map(lambda review, _label: review)

# Build the vocabulary from the training reviews.
text_layer.adapt(train_text)

# Report how many tokens the vocabulary holds.
vocabulary = text_layer.get_vocabulary()
print(f"Vocabulary size: {len(vocabulary)}")

Vocabulary size: 10000


In [10]:
def vectorize_text(text, label):
    """Convert review text to token ids with ``text_layer``; pass the label through.

    Args:
        text: Review text; a trailing axis is appended before vectorizing.
        label: Label belonging to ``text``; returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # Append a trailing axis, then run the adapted vectorization layer.
    expanded = tf.expand_dims(text, axis=-1)
    return text_layer(expanded), label

# Pull a single batch and keep its first (review, label) pair.
# This starts a new iteration over the dataset, so the review shown here need
# not be the one printed by earlier cells.
text_batch, label_batch = next(iter(raw_training_set))
first_review = text_batch[0]
first_label = label_batch[0]

# Raw review text.
print(f"Review:\n{first_review}")

# Human-readable class name for the integer label.
label_name = raw_training_set.class_names[first_label]
print(f"\nLabel: {label_name}")

# The same review after vectorization.
vectorized_example = vectorize_text(first_review, first_label)
print(f"\nVectorized review\n{vectorized_example}")

Review:
b"This movie is about a side of Ireland that Americans don't normally see, the narrow-minded religiously prejudiced side of the 'friendliest race in the world'. The movie, by the admission of the inhabitants of Fethard who are old enough to remember the events, is fairly accurate (though they insist that the film-makers invented some of the more violent scenes just to spice up the action).<br /><br />The movie was very unpopular in Ireland as it portrayed the Catholic church in a bad light, but the simple fact is that representatives of the Catholic church *did* organise vetoes of minorities (before Protestants it was the Jews).<br /><br />The film is a fascinating insight into the whole issue of religion in Ireland"

Label: pos

Vectorized review
(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  10,   17,    7,   42,    3,  421,    5, 2801,   12, 1931,   87,
        1709,   65,    2,    1,    1,    1,  421,    5,    2,    1, 1487,
           8,    2,  185,    2,   17,

In [11]:
# Vectorize every split with the same function so all three share one vocabulary.
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_training_set, raw_validation_set, raw_test_set)
)

In [12]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized examples and prefetch them for all three splits.
# val_ds is included because fit() consumes it after every epoch; leaving it
# out would re-read and re-vectorize the validation files each time.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [13]:
# Size of each learned token embedding.
embedding_dim = 16

# Classifier: embed each token id, average the embeddings over the sequence,
# then map the average to one sigmoid probability.
model_sequential = tf.keras.Sequential([
    # input_dim matches the max_tokens=10000 vocabulary cap of text_layer.
    # output_dim now reads embedding_dim, so changing the constant takes effect.
    # NOTE(review): padding ids are averaged in with real tokens; consider
    # mask_zero=True so short reviews are not diluted - confirm before changing.
    layers.Embedding(input_dim=10000, output_dim=embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])

# The output is already a probability, so binary cross-entropy is used with
# its default (non-logit) settings.
model_sequential.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

# Print out the summary of the model
model_sequential.summary()

In [14]:
model = model_sequential # alias: the training, evaluation and export cells refer to `model`

In [15]:
# Number of full passes over the training data.
epochs = 25

# Train, scoring on the validation split after each epoch.
# verbose=2 prints one summary line per epoch.
history = model.fit(x=train_ds, epochs=epochs, validation_data=val_ds, verbose=2)

Epoch 1/25
125/125 - 2s - 16ms/step - accuracy: 0.5543 - loss: 0.6895 - val_accuracy: 0.5870 - val_loss: 0.6839
Epoch 2/25
125/125 - 1s - 9ms/step - accuracy: 0.6313 - loss: 0.6763 - val_accuracy: 0.6540 - val_loss: 0.6683
Epoch 3/25
125/125 - 0s - 4ms/step - accuracy: 0.6875 - loss: 0.6555 - val_accuracy: 0.6960 - val_loss: 0.6459
Epoch 4/25
125/125 - 0s - 4ms/step - accuracy: 0.7250 - loss: 0.6260 - val_accuracy: 0.7100 - val_loss: 0.6186
Epoch 5/25
125/125 - 0s - 3ms/step - accuracy: 0.7523 - loss: 0.5908 - val_accuracy: 0.7450 - val_loss: 0.5894
Epoch 6/25
125/125 - 0s - 4ms/step - accuracy: 0.7820 - loss: 0.5530 - val_accuracy: 0.7710 - val_loss: 0.5571
Epoch 7/25
125/125 - 0s - 3ms/step - accuracy: 0.8123 - loss: 0.5154 - val_accuracy: 0.7930 - val_loss: 0.5258
Epoch 8/25
125/125 - 0s - 3ms/step - accuracy: 0.8395 - loss: 0.4796 - val_accuracy: 0.8060 - val_loss: 0.5039
Epoch 9/25
125/125 - 0s - 3ms/step - accuracy: 0.8570 - loss: 0.4467 - val_accuracy: 0.8130 - val_loss: 0.4776


In [16]:
# Score the trained model on the test split; evaluate() returns the loss
# followed by the compiled accuracy metric.
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics

print(f"Loss: {loss}", f"Accuracy: {accuracy}", sep="\n")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8396 - loss: 0.3726
Loss: 0.3679862916469574
Accuracy: 0.8407999873161316


In [17]:
# Chain the vectorization layer and the trained classifier so the combined
# model accepts raw strings.
export_model = tf.keras.Sequential(layers=[text_layer, model])

# Compile with the same loss, optimizer and metric as the classifier.
# The classifier ends in a sigmoid, hence from_logits=False.
export_model.compile(
    optimizer="adam",
    loss=losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [18]:
# Raw review strings to score; wrapped in a string tensor for predict().
examples = ['excellent']
examples_tensor = tf.constant(examples)

# One row of output per example; column 0 holds the sigmoid score.
results = export_model.predict(examples_tensor, verbose=False)
for example, result in zip(examples, results):
    score = result[0]
    # The printed label is the score rounded to the nearest integer.
    print(f'Result: {score:.3f},   Label: {int(np.round(score))},   Review: {example}')

Result: 0.585,   Label: 1,   Review: excellent


In [20]:
# Show the unrounded array returned by export_model.predict for the examples.
print(results)

[[0.5846141]]
