In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import glob
import os
import shutil
import re
import string
import matplotlib.pyplot as plt
from typing import Tuple

from tensorflow.keras import (
    layers, 
    losses, 
    utils,
)

import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization

%load_ext tensorboard

In [166]:
BATCH_SIZE = 1
SEED = 42

In [167]:
print(tf.config.list_logical_devices("CPU"))
print(tf.config.list_physical_devices("GPU"))

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


#TODO: use imdb.vocab and read imdb error and readme 
### Dataset review

Given [dataset](http://ai.stanford.edu/~amaas/data/sentiment/) consists of 50,000 reviews split evenly into 25k train and 25k test sets. Generally, there are redudant amount of data for testing.  
- So it makes sense to split data to 80/20 or 90/10.

The next one is that, bag of words `imdb.vocab`  and `already-tokenized bag of words (BoW)` are in the dataset. We can test given approach

In [168]:
def dataset_preprocess(url:str)->None:
  """Download and remove metadata"""
  dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')
  dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
  train_dir = os.path.join(dataset_dir, 'train')
  remove_dir = os.path.join(train_dir, 'unsup')
  shutil.rmtree(remove_dir) 
  os.remove(os.path.join(dataset_dir,'imdb.vocab'))
  os.remove(os.path.join(dataset_dir,'imdbEr.txt'))
  os.remove(os.path.join(dataset_dir,'README'))

  remove_feat = glob.glob(f'{dataset_dir}/*/*.feat')
  remove_urls = glob.glob(f'{dataset_dir}/*/urls_*.txt')

  for filePath in remove_feat + remove_urls:
    os.remove(filePath)

In [169]:
def load_ds(dataset_path:str) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
  """Load dataset"""
  ds = tf.keras.utils.text_dataset_from_directory(
    dataset_path, 
    batch_size=BATCH_SIZE, 
    seed=SEED, follow_links=True)
  return ds

In [None]:
dataset_preprocess(url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")

In [None]:
ds = load_ds(dataset_path='aclImdb')

In [None]:
#todo split and join ds
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=SEED)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [None]:
raw_train_ds, raw_val_ds, raw_test_ds = get_dataset_partitions_tf(ds,50000)

In [None]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [None]:
max_features = 20000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [None]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
embedding_dim = 16

In [None]:
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.LSTM(100),
    layers.Dropout(0.2),
    layers.Dense(1)],
    name='lstm_model')

model.summary()

In [None]:
logdir = "logs/lstm-model"
checkpoint_path = "models/lstm-model/training__{epoch:02d}__{loss:.6f}/cp.ckpt"


tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=20,monitor='binary_accuracy')
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='binary_accuracy', factor=0.5, patience=10, verbose=1, mode='auto',
    min_delta=0.0001, cooldown=0, min_lr=0.00001
)

In [None]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True, label_smoothing=0.2),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0),
              )

In [None]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=40,
    callbacks=[
      tensorboard_callback, 
      cp_callback, 
      early_stop_callback, 
      reduce_lr_callback],
    )

In [None]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
%tensorboard --logdir logs/

In [160]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)



  numdigits = int(np.log10(self.target)) + 1


OverflowError: ignored

In [161]:
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]

export_model.predict(examples)

array([[0.5174026 ],
       [0.51741004],
       [0.5173913 ]], dtype=float32)