In [1]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout

In [3]:
# import tarfile
# fname = "../datasets/tar/aclImdb_v1.tar.gz"
# tar = tarfile.open(fname, "r:gz")
# tar.extractall()
# tar.close()

In [4]:
def prepareData(dir):
    data = text_dataset_from_directory(dir)
    return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

train_data = prepareData("../datasets/Movies/train")
test_data = prepareData("../datasets/Movies/test")

Found 75000 files belonging to 3 classes.
Found 25000 files belonging to 2 classes.


In [12]:
for text_batch, label_batch in train_data.take(2):
    print(text_batch.numpy()[0])
    print()
    print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

b"If you where run over by the Miramax foreign film juggernaut, then you missed this brilliant gem tucked away in one those twenty seats cinema theater.  A film is very much like a painting, meant to be seen not discussed or explained. So let us just leave it at 'see it'.  Benito Zambrano's talent on the other hand merits more than a discussion. A sensitive director and a poignant writer. In many ways 'Solas' reminded me of another gem in the dust 'Heavy'.  Benito managed to keep the movie so simple, that it hurts. His flare for observing and then relaying in his film the raw human angst, is inspiring.  The actors for there part, rose to the greatness of the moment.  BZ makes us cling to hope by our finger nails while steadily adding to our feet the weight of reality. But then, isn't that life!  To look for hope in 'Solas' is to look for simplicity in 'Guernica'. It's there, you just need to see it.  And like all good things in life this one is elusive too. No video or a DVD release ye

In [11]:
model = Sequential()

# ----- 1. INPUT
# We need this to use the TextVectorization layer next.
model.add(Input(shape=(1,), dtype="string"))

In [13]:
# ----- 2. TEXT VECTORIZATION

# This layer processes the input string and turns it into a sequence of max_len integers, each of which maps to a certain token.
max_tokens = 1000
max_len = 100
vectorize_layer = TextVectorization(
  # Max vocab size. Any words outside of the max_tokens most common ones
  # will be treated the same way: as "out of vocabulary" (OOV) tokens.
  max_tokens=max_tokens,
  # Output integer indices, one per string token
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

In [14]:
# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

model.add(vectorize_layer)

In [15]:
# ----- 3. EMBEDDING

# Note that we're using max_tokens + 1 here, since there's an out-of-vocabulary (OOV) token that gets added to the vocab.
model.add(Embedding(max_tokens + 1, 128))

In [16]:
# ----- 4. RECURRENT LAYER

model.add(LSTM(64))
# 64 is the "units" parameter, which is the dimensionality of the output space.

In [17]:
# ----- 5. DENSE HIDDEN LAYER
model.add(Dense(64, activation="relu"))

In [18]:
# ----- 6. Compile and train the model.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x272694e78e0>

In [20]:
model.save_weights('rnn')

model.load_weights('rnn')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x27165b25040>

In [21]:
# ----- 7. EVALUATE
model.evaluate(test_data)



[7.627357482910156, 0.0]

In [22]:
# ----- 8. PREDICT
print(model.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

[[3.2074769 1.626152  1.2981178 1.4491545 1.7925177 1.5445901 1.701822
  2.4309747 2.0220075 1.6786261 1.428061  2.0254533 1.6126689 2.8055391
  2.7372334 2.6900656 1.4870291 2.1604598 3.3382287 1.9424106 2.2953417
  1.4545922 2.01653   1.6751395 1.5149219 1.4742379 3.03322   1.9492502
  1.6977496 0.        1.8107419 1.4547609 1.6398175 1.4038697 1.9365258
  1.7763956 3.1605396 1.7768819 2.1136408 2.8857856 1.9248207 1.2178427
  1.3763291 2.2874427 1.7787526 1.7716644 1.7903781 2.2046525 2.142461
  2.2708464 1.4177294 1.423459  2.067016  3.3653083 1.947506  1.8641466
  2.2855113 0.        1.7711399 1.5712162 1.5699801 1.7094971 1.8179137
  1.9028884]]


In [23]:
print(model.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

print(model.predict([
  "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
]))

[[3.2074769 1.626152  1.2981178 1.4491545 1.7925177 1.5445901 1.701822
  2.4309747 2.0220075 1.6786261 1.428061  2.0254533 1.6126689 2.8055391
  2.7372334 2.6900656 1.4870291 2.1604598 3.3382287 1.9424106 2.2953417
  1.4545922 2.01653   1.6751395 1.5149219 1.4742379 3.03322   1.9492502
  1.6977496 0.        1.8107419 1.4547609 1.6398175 1.4038697 1.9365258
  1.7763956 3.1605396 1.7768819 2.1136408 2.8857856 1.9248207 1.2178427
  1.3763291 2.2874427 1.7787526 1.7716644 1.7903781 2.2046525 2.142461
  2.2708464 1.4177294 1.423459  2.067016  3.3653083 1.947506  1.8641466
  2.2855113 0.        1.7711399 1.5712162 1.5699801 1.7094971 1.8179137
  1.9028884]]
[[3.207477  1.6261523 1.2981175 1.4491541 1.7925181 1.5445901 1.7018219
  2.4309747 2.0220075 1.6786261 1.4280611 2.0254533 1.6126689 2.8055391
  2.7372336 2.6900656 1.4870292 2.1604595 3.3382287 1.9424106 2.2953422
  1.454592  2.0165298 1.6751397 1.514922  1.4742379 3.03322   1.94925
  1.6977495 0.        1.8107418 1.4547609 1.6398177 1.