In [1]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace

def prepareData(dir):
  data = text_dataset_from_directory(dir)
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

dir = r"C:\Users\elias\Downloads\aclImdb_v1\aclImdb"
train_data = prepareData(dir + r'\train')
test_data = prepareData(dir + r'\test')

for text_batch, label_batch in train_data.take(1):
  print(text_batch.numpy()[0])
  print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

Found 75000 files belonging to 3 classes.
Found 25000 files belonging to 2 classes.
b"I came across this film by chance as I was channel surfing during a lazy day at home. I sat down to watch it thinking it was simply an average, not-terrible-but-not-fantastic movie. After the first half an hour, I was pleasantly surprised at the fact that the film was actually NOT 'just another movie'.  This film demonstrates how a dedicated director, stellar acting by talented individuals and beautifully written screenplay can transform a movie into an extremely enjoyable viewing experience.  This movie evoked a lot laughs from me and left me glued to the television. It's not a big-budget film but I can honestly say that this film easily tramples other similar movies. This proves that you don't need deep pockets and mass promotions to create a classic.  For those film junkies like me out there, I HIGHLY recommend watching this. It fuses dramatic comedy with hints of romance and generally a feel-good 

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

model = Sequential()
model.add(Input(shape=(1,), dtype="string"))

In [3]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_tokens = 1000
max_len = 100

vectorize_layer = TextVectorization(
  # Max vocab size. Any words outside of the max_tokens most common ones
  # will be treated the same way: as "out of vocabulary" (OOV) tokens.
  max_tokens=max_tokens,
  # Output integer indices, one per string token
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

In [4]:
# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

model.add(vectorize_layer)

In [5]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU
from tensorflow.keras import optimizers


# Previous layer: TextVectorization
# max_tokens = 1000
# model.add(vectorize_layer)

# Note that we're using max_tokens + 1 here, since there's an
# out-of-vocabulary (OOV) token that gets added to the vocab.
model.add(Embedding(max_tokens + 1, 128))
# 64 is the "units" parameter, which is the
# dimensionality of the output space.
model.add(LSTM(64))

model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [6]:
model.compile(
  optimizer=optimizers.RMSprop(learning_rate=2e-5),
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [7]:
train_data

<_MapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [8]:
model.fit(train_data, epochs=8, batch_size=128)

Epoch 1/8


Epoch 2/8
Epoch 3/8
Epoch 4/8

KeyboardInterrupt: 