In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string
import matplotlib.pyplot as plt

In [None]:
# One shared seed: it is given to both TensorFlow and NumPy here and is passed
# again to the train/validation split when the datasets are loaded.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [None]:
# Dataset folder. Note the leading slash: the loading cell builds its paths as
# f'{cwd}/{data_dir}/...', so this is resolved underneath the current working
# directory, not at the filesystem root.
data_dir = '/data'

In [None]:
# Remember the working directory (the dataset paths are built from it) and show
# its contents followed by the directory itself, one per line.
cwd = os.getcwd()
print(os.listdir(), cwd, sep='\n')

In [None]:
# Arguments shared by every split: labels are inferred from the sub-directory
# names and returned as integers, in batches of 32.
common_kwargs = dict(labels='inferred', label_mode='int', batch_size=32)

# The training and validation subsets are both cut from the `train` directory,
# using the same 80/20 split fraction and the same seed for the two calls.
split_kwargs = dict(common_kwargs, validation_split=0.2, seed=seed)
train_dir = f'{cwd}/{data_dir}/train'

# Create the training set from the 80% portion of the split.
raw_training_set = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='training', **split_kwargs
)

# Create the validation set. Use 20% of the data that was not used for training.
raw_validation_set = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='validation', **split_kwargs
)

# Create the test set from its own directory; no split is applied here.
raw_test_set = tf.keras.utils.text_dataset_from_directory(
    f'{cwd}/{data_dir}/test', **common_kwargs
)

In [None]:
# Peek at the first three reviews of one training batch together with their labels.
for text_batch, label_batch in raw_training_set.take(1):
    # Convert each tensor to NumPy once rather than on every iteration.
    sample_texts = text_batch.numpy()
    sample_labels = label_batch.numpy()
    for idx in range(3):
        print(f"Review:\n {sample_texts[idx]}")
        print(f"Label: {sample_labels[idx]}\n")

In [52]:
def custom_standardization(data):
    """Normalise raw review text before it is tokenised.

    Three steps are applied in order:
      1. convert the text to lowercase,
      2. replace every literal ``<br />`` tag with a single space,
      3. delete every character listed in ``string.punctuation``.

    Args:
        data: String input accepted by the ``tf.strings`` ops.

    Returns:
        The result of the three ``tf.strings`` operations above.
    """
    # Character class matching any single punctuation character; re.escape keeps
    # characters such as ']' or '\' from being read as regex syntax.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    lowercased = tf.strings.lower(data)
    without_breaks = tf.strings.regex_replace(lowercased, '<br />', ' ')
    return tf.strings.regex_replace(without_breaks, punctuation_pattern, '')

In [55]:
# Layer that turns raw strings into fixed-length sequences of integer token ids.
text_layer = layers.TextVectorization(
    output_mode='int',                   # emit integer token indices
    max_tokens=10000,                    # upper bound on the vocabulary size
    output_sequence_length=250,          # fixed output length of 250 per example
    standardize=custom_standardization,  # lowercase, drop <br /> tags and punctuation
)

In [57]:
# Keep only the text of each (text, label) pair; the labels are not needed to
# build the vocabulary.
train_text = raw_training_set.map(lambda text, _label: text)

# Build the vocabulary from the training split.
text_layer.adapt(train_text)

# Report how many entries the learned vocabulary contains.
vocab_size = len(text_layer.get_vocabulary())
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 10000


In [58]:
def vectorize_text(text, label):
    """Convert raw text into token ids with ``text_layer``, keeping the label.

    Args:
        text: Raw text tensor, as yielded by the raw datasets this function is
            mapped over (presumably a batch of strings of shape (batch,),
            since those datasets are created with a batch size).
        label: Label for ``text``; returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple, where ``vectorized_text`` is the
        output of ``text_layer``.
    """
    # Append a trailing axis of size 1 (e.g. (batch,) -> (batch, 1)) before
    # handing the strings to the vectorization layer.
    text_column = tf.expand_dims(text, -1)
    return text_layer(text_column), label

In [59]:
# Run every raw split through the same vectorization step so that all three
# datasets yield (token ids, label) pairs.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_training_set, raw_validation_set, raw_test_set)
)

In [60]:
# Let tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# cache() keeps the vectorized elements after the first pass so later passes do
# not recompute them; prefetch() prepares upcoming batches while the current
# one is being consumed.
# The validation split gets the same treatment as the other two: it is
# re-read in full on every evaluation pass, so leaving it uncached repeats the
# file loading and vectorization work each time.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)