In [54]:
import os
import numpy as np
import pandas as pd
from tensorflow import keras

from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout

In [55]:
# Using text_dataset_from_directory tool (https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory)
dir = "DATA/movie-reviews-dataset"
train_data = text_dataset_from_directory(dir+"/train")
test_data = text_dataset_from_directory(dir+"/test")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [56]:
#Let's take a look at some data
for text_batch, label_batch in train_data.take(1):
  print(text_batch.numpy()[1])
  print(label_batch.numpy()[1]) # 0 = negative, 1 = positive
# we need to remove the <br>

b"I rented this movie last week. I saw Kevin Spacey and Morgan Freeman were on it, so it seemed promising. And it was, until Justin Timberlake came on scene. He is a really bad actor and shouldn't be allowed to make a movie ever again. I mean, he is one of the most boring, uninspired actors I've ever seen. He puts absolutely no emotion to any of his lines whatsoever. Why the hell was he cast for the role of Josh Pollack? I think Matt Damon would have been a better choice.<br /><br />Kevin Spacey was another big disappointment. His character is so dull, it seems like a bad mix of his character in American Beauty and John Doe in Se7en. It might sound cool, but believe me, it's not.<br /><br />Now, Dylan McDermott's acting is very good. It's about one of the very few good things about this movie. He is just inspired.<br /><br />Morgan Freeman is good but nothing special. He has some really cool lines though.<br /><br />About the story, although it was a bit obvious and exaggerated at time

In [57]:
# Cleaning the data (html)
def prepareData(dir):
  data = text_dataset_from_directory(dir)
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

train_data = prepareData(dir+'/train')
test_data = prepareData(dir+'/test')

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [58]:
# Building the model
model = Sequential()
# input layer (1 string):
model.add(Input(shape=(1,), dtype="string"))

In [59]:
# Text vectorization
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization
# The first layer processes the input string and turns it into a sequence of
# max_len integers, each of which maps to a certain token.
# It will automatically lowercases text, splits on whitespace and strips punctuation (when calling adapt()).

max_features = 1000
# We set a limit to 1000 tokens (i.e. the size of our vocabulary).
# Any word outside the vocabulary will be treated as OOV ("out of vocabulary") token which is counted inside the 1000.
# Note on max_features: a value too low will exclude potentially useful words from our vocabulary, while
# a value too high may increase the complexity and training time of our model.

max_len = 100
# Note on max_len: a value too low of a max_len will impact our model’s performance on longer reviews, while
# again a value too high may increase the complexity and training time of our model.

vectorize_layer = TextVectorization(
  max_tokens=max_features,
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

model.add(vectorize_layer)

In [60]:
# Embedding
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

# This layer turns each integer (representing a token) from the previous layer
# into dense vectors of fixed siwe. 
# Note that we're using max_tokens + 1 here, since there's an
# OOV token that gets added to the vocab.
model.add(Embedding(max_features + 1, 128))

# Recurrent layer
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
# note: 64 is the dimensionality of the output space (i.e. units)
model.add(LSTM(64))

# Standard fully connected dense hidden layer
model.add(Dense(64, activation="relu"))

# Output
# sigmoid is a perfect activation function because it outputs a number between 0 (bad review) and 1 (good review)
model.add(Dense(1, activation="sigmoid"))

In [61]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_5 (TextVe (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          128128    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 181,761
Trainable params: 181,761
Non-trainable params: 0
_________________________________________________________________


In [62]:
# Compiling and training the model.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_data, epochs=10)
# (my laptop is quite old...)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a3b895a90>

In [64]:
# Testing the model

print(model.predict([
  "i loved this movie ! It was really amazing",
])) # it should be close to 1
print(model.predict([
  "a very bad movie ! Save your money",
])) # it should be close to 0

# Let's be nasty...
print("Ambiguous reviews")
print("1", model.predict([
  "The acting was extremely good but the photography and music were meh",
])) 
print("1 worse", model.predict([
  "The acting was extremely good but the photography and music were bad",
])) 
print("1 better", model.predict([
  "The acting was extremely good but the photography and music were on the average",
])) 
print("2", model.predict([
  "An amazing cast and music but the plot was really obvious !",
])) 
print("2 reversed", model.predict([
  "The plot was really obvious but an amazing cast and music !",
])) 
print("3", model.predict([
  "I don't know if it was good or bad...",
]))
print("Non-sense")
print("4", model.predict([
  "Great bad movie !",
])) 
print("5", model.predict([
  "I love pinapples !",
])) 
print("6", model.predict([
  "I hate pears !",
])) 
print("7", model.predict([
  "The cat is under the table",
]))

[[0.97839177]]
[[0.01224911]]
Ambiguous reviews
1 [[0.54231006]]
1 worse [[0.01726171]]
1 better [[0.54523134]]
2 [[0.05067167]]
2 reversed [[0.93946326]]
3 [[0.03092992]]
Non-sense
4 [[0.14377838]]
5 [[0.5409306]]
6 [[0.5390476]]
7 [[0.538474]]
