### Requirements (for Colab users)
- Follow these instructions to install requirements if you are using Colab
- Then, restart runtime and skip this section

In [0]:
!pip install -q tensorflow

In [2]:
!git clone https://github.com/tensorflow/models.git

Cloning into 'models'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects:   2% (1/50)[Kremote: Counting objects:   4% (2/50)[Kremote: Counting objects:   6% (3/50)[Kremote: Counting objects:   8% (4/50)[Kremote: Counting objects:  10% (5/50)[Kremote: Counting objects:  12% (6/50)[Kremote: Counting objects:  14% (7/50)[Kremote: Counting objects:  16% (8/50)[Kremote: Counting objects:  18% (9/50)[Kremote: Counting objects:  20% (10/50)[Kremote: Counting objects:  22% (11/50)[Kremote: Counting objects:  24% (12/50)[Kremote: Counting objects:  26% (13/50)[Kremote: Counting objects:  28% (14/50)[Kremote: Counting objects:  30% (15/50)[Kremote: Counting objects:  32% (16/50)[Kremote: Counting objects:  34% (17/50)[Kremote: Counting objects:  36% (18/50)[Kremote: Counting objects:  38% (19/50)[Kremote: Counting objects:  40% (20/50)[Kremote: Counting objects:  42% (21/50)[Kremote: Counting objects:  44% (22/50)[Kremote: Counting o

In [3]:
# install requirements to use tensorflow/models repository
!pip install --user -q -r models/official/requirements.txt
# you may have to restart the runtime afterwards

[K     |████████████████████████████████| 92kB 4.9MB/s 
[K     |████████████████████████████████| 102kB 5.3MB/s 
[K     |████████████████████████████████| 1.0MB 45.1MB/s 
[K     |████████████████████████████████| 21.6MB 153kB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


## data loading and preprocessing

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import sys
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

In [2]:
print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

TF Version:  2.2.0-rc1
Eager mode:  True
Hub version:  0.7.0
GPU is available


In [3]:
train_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train', 'test'),
    as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…







HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete03J32D/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete03J32D/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete03J32D/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
# a sample of data
for text, label in train_data.take(1):
  print(text)
  print(label)

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)


In [0]:
"""
Each line of the dataset is composed of the review text and its label
- Data preprocessing consists of transforming text to BERT input features:
input_word_ids, input_mask, segment_ids
- In the process, tokenizing the text is done with the provided BERT model tokenizer
"""

label_list = [0, 1] # Label categories
max_seq_length = 128 # maximum length of (token) input sequences
train_batch_size = 32

# Get BERT layer and tokenizer:
# More details here: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# This provides a function to convert row to input features and label
def to_feature_map_fn(label_list, max_seq_length, tokenizer):
  def to_feature(text, label):
    example = classifier_data_lib.InputExample(guid = None,
                                              text_a = text.numpy(), 
                                              text_b = None, 
                                              label = label.numpy())
    feature = classifier_data_lib.convert_single_example(5, example, label_list,
                                      max_seq_length, tokenizer)
    
    return (feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id)

  def to_feature_map(text, label):
    input_ids, input_mask, segment_ids, label_id = tf.py_function(to_feature, inp=[text, label], 
                                  Tout=[tf.int32, tf.int32, tf.int32, tf.int32])
  
    # py_func doesn't set the shape of the returned tensors.
    input_ids.set_shape([128])
    input_mask.set_shape([128])
    segment_ids.set_shape([128])
    label_id.set_shape([])

    x = {
          'input_word_ids': input_ids,
          'input_mask': input_mask,
          'input_type_ids': segment_ids
      }
    return (x, label_id)
  return to_feature_map
  

In [0]:
to_feature_map = to_feature_map_fn(label_list, max_seq_length, tokenizer)

# train
train_data = (train_data.map(to_feature_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
                        .shuffle(100)
                        .batch(32, drop_remainder=True)
                        .prefetch(tf.data.experimental.AUTOTUNE))

# test
test_data = (test_data.map(to_feature_map,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
                        .batch(32, drop_remainder=True)
                        .prefetch(tf.data.experimental.AUTOTUNE))

In [7]:
# data spec
train_data.element_spec

({'input_mask': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_type_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_word_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None)},
 TensorSpec(shape=(32,), dtype=tf.int32, name=None))

## Model

In [8]:
# Building the model

tf.keras.backend.clear_session()

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="input_type_ids")

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

output = tf.keras.layers.Dense(2, activation='softmax', name="output")(pooled_output)
#drop = tf.keras.layers.Dropout(0.5)(pooled_output)

model = tf.keras.Model(
      inputs={
          'input_word_ids': input_word_ids,
          'input_mask': input_mask,
          'input_type_ids': input_type_ids
      },
      outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

### Training

In [0]:
optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

In [10]:
# Train model
epochs = 3
history = model.fit(train_data,
                    epochs=epochs,
                    verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
model.evaluate(test_data, verbose=1)



[0.4276823401451111, 0.877000629901886]