In [3]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses


In [4]:
# Report the TensorFlow version this kernel is running.
print(tf.__version__)

2.9.1


In [6]:
# Fetch the IMDB review archive and unpack it next to the notebook
# (cache_dir='.' with an empty cache_subdir).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The reviews are read from the 'aclImdb' folder that sits beside the download.
download_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(download_parent, 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [7]:
# Inspect the top-level entries of the extracted dataset folder.
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [8]:
# Path of the 'train' sub-folder inside the dataset directory.
train_dir = os.path.join(dataset_dir, 'train')

In [9]:
# Print one positive training review to see what the raw text looks like.
sample = os.path.join(train_dir, 'pos/1181_9.txt')
# Pin the codec explicitly: without it open() falls back to the platform's
# locale encoding, which is not UTF-8 everywhere (e.g. cp1252 on Windows).
with open(sample, encoding='utf-8') as f:
    print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [10]:
# Drop the 'unsup' folder from the training directory so that only the
# labelled sub-folders remain (the loader below reports 2 classes).
removeDir = os.path.join(train_dir, 'unsup')
# Guard the delete so the cell can be re-run: rmtree raises
# FileNotFoundError when the folder has already been removed.
if os.path.isdir(removeDir):
    shutil.rmtree(removeDir)

In [12]:
# Batch size and shuffle seed shared by the dataset-loading cells.
batchSize = 32
seed = 42

# Training subset of an 80/20 split of the training folder. The same seed
# must be used for the validation subset so the two splits do not overlap.
# Uses train_dir instead of repeating the literal 'aclImdb/train' so the
# path stays in sync with wherever the archive was extracted.
rawTrainDs = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batchSize,
    validation_split=.2,
    subset='training',
    seed=seed,
)
                                                        

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [13]:
# Preview the first three reviews and labels from one training batch.
for text_batch, label_batch in rawTrainDs.take(1):
  # Convert each tensor to NumPy once rather than on every loop iteration.
  review_array = text_batch.numpy()
  label_array = label_batch.numpy()
  for i in range(3):
    print("Review", review_array[i])
    print("Label", label_array[i])

Review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label 0
Review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [14]:
# Validation subset of the same 80/20 split of the training folder; it must
# use the same seed as the training subset so the two do not overlap.
# Uses train_dir instead of repeating the literal 'aclImdb/train'.
rawVals = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batchSize,
    validation_split=.2,
    subset='validation',
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [46]:

# Held-out test set, loaded from the 'test' folder of the extracted dataset.
# The path is derived from dataset_dir rather than hard-coding 'aclImdb/test'.
rawTests = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batchSize,
)

Found 25000 files belonging to 2 classes.


Note: When using the validation_split and subset arguments, make sure to either specify a random seed, or to pass shuffle=False, so that the validation and training splits have no overlap.

rawTests = tf.keras.utils.text_dataset_from_directory('aclImdb/test', batch_size=batchSize)

In [30]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    The text is lower-cased, every ``<br />`` tag is replaced by a single
    space, and all characters in ``string.punctuation`` are removed.

    Args:
        input_data: String input accepted by the ``tf.strings`` ops.

    Returns:
        The result of the final ``tf.strings.regex_replace`` call.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    return tf.strings.regex_replace(cleaned, punctuation_pattern, '')

In [33]:
# Vocabulary cap and output sequence length used by the vectorization layer.
maxFeats = 10000
sequenceLength = 250

# Text -> integer-id layer; it applies custom_standardization to the input
# before tokenizing.
vectorizeLayer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=maxFeats,
    output_mode='int',
    output_sequence_length=sequenceLength,
)

In [34]:
# Keep only the text half of each (text, label) pair, then let the
# vectorization layer adapt to that training text.
trainText = rawTrainDs.map(lambda review, _label: review)
vectorizeLayer.adapt(trainText)

In [36]:
def vectorize_text(text, label):
    """Run ``text`` through ``vectorizeLayer`` and pass ``label`` along unchanged.

    A trailing axis is appended to ``text`` before vectorization. The shape
    of the text is printed before and after that expansion.

    Args:
        text: Tensor of review strings (a single review or a batch).
        label: Label(s) associated with ``text``.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    print(text.shape)
    expanded = tf.expand_dims(text, axis=-1)
    print(expanded.shape)
    vectorized = vectorizeLayer(expanded)
    return vectorized, label

In [42]:
# Pull one batch and look at its first review: raw text, class name and
# the vectorized form.
textBatch, labelBatch = next(iter(rawTrainDs))
firstReview = textBatch[0]
firstLabel = labelBatch[0]

print("Review", firstReview)
print("label", rawTrainDs.class_names[firstLabel])
# vectorize_text prints the tensor shapes itself, so it is called only after
# the two lines above to keep the output order unchanged.
vectorizedReview = vectorize_text(firstReview, firstLabel)
print("Vectorized Review", vectorizedReview)

Review tf.Tensor(b'"Emma" was a product of what might be called by the First Great Jane Austen Cycle of the mid-nineties, and it was recently shown on British television, doubtless because of the interest in the author created by the Second Great Jane Austen Cycle which started with "Pride and Prejudice" two years ago. We currently have in the cinemas the Austen biopic "Becoming Jane", and ITV have recently produced three TV movies based on Austen novels. These include "Northanger Abbey", the only one of the six major novels not to have been filmed previously, so the cycle should now be complete. No doubt, however, there will be more to come in the near future. (There is, after all, her juvenile "Love and Freindship" (sic), the short novella "Lady Susan", and someone, somewhere, has doubtless supplied endings to her two unfinished fragments "The Watsons" and "Sanditon". Then there are all those Austen sequels churned out by modern writers\xc2\x85\xc2\x85\xc2\x85).<br /><br />The main c

In [45]:
# Look up a couple of token ids and report the vocabulary size.
# Fetch the vocabulary once instead of calling get_vocabulary() for each line.
vocabulary = vectorizeLayer.get_vocabulary()
print("1287 ---> ",vocabulary[1287])
print("313 ---> ",vocabulary[313])
print('Vocab size: {}'.format(len(vocabulary)))



1287 --->  lovely
313 --->  american
Vocab size: 10000


In [47]:
# Apply vectorize_text to the train, validation and test datasets (in that order).
trainDs, valDs, testDs = [
    rawDs.map(vectorize_text) for rawDs in (rawTrainDs, rawVals, rawTests)
]

(None,)
(None, 1)
(None,)
(None, 1)
(None,)
(None, 1)


In [49]:
# Cache each dataset and prefetch with an automatically tuned buffer size.
autoTune = tf.data.AUTOTUNE
trainDs, valDs, testDs = [
    ds.cache().prefetch(buffer_size=autoTune) for ds in (trainDs, valDs, testDs)
]

In [53]:
# Size of each learned token embedding vector.
embeddingDim = 16

# Embedding -> dropout -> average over the sequence axis -> dropout ->
# a single-unit dense output.
model = tf.keras.Sequential()
model.add(layers.Embedding(maxFeats + 1, embeddingDim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 16)          160016    
                                                                 
 dropout_4 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_2   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_5 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
__________________________________________________

In [None]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True