In [None]:
# data tools
import numpy as np
import pandas as pd

# tensorflow
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
import tensorflow_datasets as tfds
import tensorflow_hub as hub

# BERT
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

In [None]:
# sample text used to demonstrate what the tokenizer produces
test_sentence = 'Test tokenization sentence. Followed by another sentence'
# sequence length (in tokens) requested for the demo encoding
max_length_test = 20

In [None]:
# WordPiece tokenizer matching the uncased base BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Encode the test sentence into fixed-length BERT inputs.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes the cut at `max_length` explicit instead of relying on
# the tokenizer's backward-compatibility fallback (which emits a warning).
bert_input = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,      # add [CLS], [SEP]
    max_length=max_length_test,   # sequence length (in tokens) to pad/truncate to
    truncation=True,              # cut inputs longer than max_length
    padding='max_length',         # add [PAD] tokens up to max_length
    return_attention_mask=True,   # mask so attention does not focus on [PAD] tokens
)

In [None]:
# show the value returned by `tokenizer.encode_plus` for the test sentence
print('encoded', bert_input)

## Classification task

__Download the IMDB reviews dataset with ```tensorflow_datasets```__

In [None]:
# Load the IMDB reviews train and test splits as (review, label) pairs.
# `with_info=True` is required here: only then does `tfds.load` return the
# dataset metadata as a second value, which this statement unpacks into `ds_info`.
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews',
                                          split=(tfds.Split.TRAIN, tfds.Split.TEST),
                                          as_supervised=True,
                                          with_info=True)

__Inspect info__

In [None]:
# print `ds_info`, the second value unpacked from the `tfds.load` call
print('info', ds_info)

__Show examples__

In [None]:
# preview the first 50 characters and the label of five training examples
for review, label in tfds.as_numpy(ds_train.take(5)):
    preview = review.decode()[:50]
    print('review', preview, label)

__Helper functions__

In [None]:
def convert_example_to_feature(review, max_length=160):
    """Tokenize one review into fixed-length BERT inputs.

    Args:
        review: The review text as a ``str``.
        max_length: Sequence length (in tokens) every example is truncated or
            padded to. Defaults to 160.

    Returns:
        The encoding produced by ``tokenizer.encode_plus``; callers read its
        ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        # explicit truncation instead of the implicit (warning-emitting)
        # fallback that kicks in when only `max_length` is given
        truncation=True,
        # replaces the deprecated `pad_to_max_length=True`
        padding='max_length',
        return_token_type_ids=True,
        return_attention_mask=True,
    )

# Arrange one encoded example as the (features, label) pair that is fed to
# TFBertForSequenceClassification.
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Bundle the three encoded inputs into a feature dict and pair it with the label.

    Returns:
        A ``(features, label)`` tuple where ``features`` has the keys
        ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``.
    """
    features = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }
    return features, label

def encode_examples(ds, limit=-1):
    """Encode every (review, label) pair of ``ds`` and return a new ``tf.data.Dataset``.

    Args:
        ds: Dataset iterated via ``tfds.as_numpy``; each element is unpacked as
            ``(review, label)`` and ``review`` is decoded from bytes.
        limit: When positive, only the first ``limit`` elements are encoded;
            otherwise the whole dataset is used.

    Returns:
        A dataset built from the collected lists and mapped through
        ``map_example_to_dict``, i.e. yielding ``(features, label)`` pairs.
    """
    if limit > 0:
        ds = ds.take(limit)

    # accumulate per-example encodings; the final dataset is sliced from these lists
    all_input_ids, all_attention_masks, all_token_type_ids, all_labels = [], [], [], []
    for raw_review, raw_label in tfds.as_numpy(ds):
        encoded = convert_example_to_feature(raw_review.decode())
        all_input_ids.append(encoded['input_ids'])
        all_token_type_ids.append(encoded['token_type_ids'])
        all_attention_masks.append(encoded['attention_mask'])
        all_labels.append([raw_label])

    # tuple order must match the parameter order of `map_example_to_dict`
    slices = (all_input_ids, all_attention_masks, all_token_type_ids, all_labels)
    encoded_ds = tf.data.Dataset.from_tensor_slices(slices)
    return encoded_ds.map(map_example_to_dict)

__Preprocess dataset using helper functions__

In [None]:
# training split: encode, shuffle with a 10000-element buffer, then batch by 32
ds_train_encoded = (
    encode_examples(ds_train)
    .shuffle(10000)
    .batch(32)
)
# test split: encode and batch by 32 (no shuffling)
ds_test_encoded = (
    encode_examples(ds_test)
    .batch(32)
)

__Define and compile model__

In [None]:
# learning rates commonly recommended for Adam when fine-tuning BERT: 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# a single epoch keeps the illustration short; more epochs may help as long as
# the model does not start to overfit
number_of_epochs = 1

# pretrained BERT weights loaded into the sequence-classification model class
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# labels are integer class ids rather than one-hot vectors, so the sparse
# variants of categorical cross-entropy and accuracy are used
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Adam is the recommended optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# wire optimizer, loss and metric into the model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

__Train__

In [None]:
# `ds_train_encoded` and `ds_test_encoded` are `tf.data.Dataset`s that were already
# batched with `.batch(32)`, so `batch_size` must not be passed to `fit` here:
# for dataset inputs the dataset itself defines the batches.
bert_history = model.fit(ds_train_encoded,
                         epochs=number_of_epochs,
                         validation_data=ds_test_encoded)

## Question answering

__Download pretrained model__

The pretrained question-answering model is hosted on TensorFlow Hub [here](https://tfhub.dev/see--/bert-uncased-tf2-qa/1)

In [None]:
import os
import urllib.request
import zipfile

# Fetch and unpack the tokenizer vocabulary used by the question-answering model.
# Pure-Python download/extract instead of shelling out to `wget`/`unzip`:
#  - failures raise an exception instead of being silently ignored by `os.system`
#  - the cell is idempotent: nothing is downloaded or extracted again on re-run
#  - no dependency on external command-line tools
tokenizer_url = 'https://github.com/see--/natural-question-answering/releases/download/v0.0.1/tokenizer_tf2_qa.zip'
tokenizer_zip = 'tokenizer_tf2_qa.zip'

# `tokenizer_tf2_qa/vocab.txt` is the file the tokenizer is later loaded from
if not os.path.exists('tokenizer_tf2_qa/vocab.txt'):
    if not os.path.exists(tokenizer_zip):
        urllib.request.urlretrieve(tokenizer_url, tokenizer_zip)
    with zipfile.ZipFile(tokenizer_zip) as archive:
        # extract into the current working directory, like `unzip` did
        archive.extractall()

__Question answering function__

Modified from an example found [here](https://tfhub.dev/see--/bert-uncased-tf2-qa/1)

In [None]:
# NOTE: both statements rebind names (`tokenizer`, `model`) that the
# classification section above also assigns.

# tokenizer built from the vocabulary file of the unpacked archive
tokenizer = BertTokenizer.from_pretrained(
    'tokenizer_tf2_qa/vocab.txt'
)
# question-answering model loaded from TensorFlow Hub
model = hub.load(
    "https://tfhub.dev/see--/bert-uncased-tf2-qa/1"
)

__Define paragraphs__

Use HTML-style paragraph tags - ```<p>``` and ```</p>``` - to mark where each paragraph starts and ends

In [None]:
# Context passage for the question-answering model; every paragraph is wrapped
# in <p>...</p> tags (a stray '.' after the first closing tag was removed).
paragraph = '''<p>The computer is named Deep Thought.</p>
               <p>After 46 million years of training it found the answer.</p>
               <p>The answer shocked everyone. It was 42!</p>'''

__Define questions__

In [None]:
# questions to be answered from the paragraph, one per entry
questions = [
    'How long did it take to find the answer?',
    'What was the answer to the great question?',
    'What was the name of the computer?',
]

__Answer questions!__

In [None]:
# The paragraph is the same for every question, so tokenize it once up front
# instead of once per loop iteration.
paragraph_tokens = tokenizer.tokenize(paragraph)

for question in questions:
    question_tokens = tokenizer.tokenize(question)
    # input layout: [CLS] question [SEP] paragraph [SEP]
    tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + paragraph_tokens + ['[SEP]']
    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    # segment ids: 0 for [CLS] + question + [SEP], 1 for paragraph + final [SEP]
    input_type_ids = [0] * (1 + len(question_tokens) + 1) + [1] * (len(paragraph_tokens) + 1)

    # turn each id list into an int32 tensor with a leading batch dimension of 1
    input_word_ids, input_mask, input_type_ids = map(lambda t: tf.expand_dims(
        tf.convert_to_tensor(t, dtype=tf.int32), 0), (input_word_ids, input_mask, input_type_ids))
    outputs = model([input_word_ids, input_mask, input_type_ids])
    # using `[1:]` will enforce an answer. `outputs[0][0][0]` is the ignored '[CLS]' token logit
    # `int(...)` converts the scalar argmax tensors to plain Python ints for list slicing
    short_start = int(tf.argmax(outputs[0][0][1:])) + 1
    short_end = int(tf.argmax(outputs[1][0][1:])) + 1
    answer_tokens = tokens[short_start: short_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    print(f'Question: {question}')
    print(f'Answer: {answer}')
    print("\n")

__Another example__

Example from an article in [The Guardian](https://www.theguardian.com/environment/2021/apr/20/carbon-emissions-to-soar-in-2021-by-second-highest-rate-in-history)

In [None]:
# Context passage for the question-answering model; each paragraph is wrapped in <p>...</p> tags.
paragraph = '''<p>Carbon dioxide emissions are forecast to jump this year by the second biggest annual rise in history, as global economies pour stimulus cash into fossil fuels in the recovery from the Covid-19 recession.</p>
                <p>The leap will be second only to the massive rebound 10 years ago after the financial crisis, and will put climate hopes out of reach unless governments act quickly, the International Energy Agency has warned.</p>
                <p>Surging use of coal, the dirtiest fossil fuel, for electricity is largely driving the emissions rise, especially across Asia but also in the US. Coal’s rebound causes particular concern because it comes despite plunging prices for renewable energy, which is now cheaper than coal.</p>'''

In [None]:
# questions to be answered from the paragraph, one per entry
questions = [
    "What is the problem?",
    "Who has given this warning?",
]

In [None]:
# The paragraph is the same for every question, so tokenize it once up front
# instead of once per loop iteration.
paragraph_tokens = tokenizer.tokenize(paragraph)

for question in questions:
    question_tokens = tokenizer.tokenize(question)
    # input layout: [CLS] question [SEP] paragraph [SEP]
    tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + paragraph_tokens + ['[SEP]']
    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    # segment ids: 0 for [CLS] + question + [SEP], 1 for paragraph + final [SEP]
    input_type_ids = [0] * (1 + len(question_tokens) + 1) + [1] * (len(paragraph_tokens) + 1)

    # turn each id list into an int32 tensor with a leading batch dimension of 1
    input_word_ids, input_mask, input_type_ids = map(lambda t: tf.expand_dims(
        tf.convert_to_tensor(t, dtype=tf.int32), 0), (input_word_ids, input_mask, input_type_ids))
    outputs = model([input_word_ids, input_mask, input_type_ids])
    # using `[1:]` will enforce an answer. `outputs[0][0][0]` is the ignored '[CLS]' token logit
    # `int(...)` converts the scalar argmax tensors to plain Python ints for list slicing
    short_start = int(tf.argmax(outputs[0][0][1:])) + 1
    short_end = int(tf.argmax(outputs[1][0][1:])) + 1
    answer_tokens = tokens[short_start: short_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    print(f'Question: {question}')
    print(f'Answer: {answer}')
    print("\n")

__Another example__

This time taken from the introduction to the Wikipedia page for [Karl Marx](https://en.wikipedia.org/wiki/Karl_Marx)

In [None]:
# Context passage for the question-answering model (plain text, no <p> tags in this example).
paragraph = '''Karl Heinrich Marx (German: [maʁks]; 5 May 1818 – 14 March 1883[13]) was a German philosopher, economist, historian, sociologist, political theorist, journalist and socialist revolutionary. Born in Trier, Germany, Marx studied law and philosophy at university. He married Jenny von Westphalen in 1843. Due to his political publications, Marx became stateless and lived in exile with his wife and children in London for decades, where he continued to develop his thought in collaboration with German thinker Friedrich Engels and publish his writings, researching in the reading room of the British Museum. His best-known titles are the 1848 pamphlet The Communist Manifesto and the three-volume Das Kapital (1867–1883). Marx's political and philosophical thought had enormous influence on subsequent intellectual, economic and political history. His name has been used as an adjective, a noun, and a school of social theory.'''

In [None]:
# questions to be answered from the paragraph, one per entry
questions = [
    "Where was Marx born?",
    "Who was his main collaborator?",
    "How do you pronounce Marx in German?",
]

In [None]:
# The paragraph is the same for every question, so tokenize it once up front
# instead of once per loop iteration.
paragraph_tokens = tokenizer.tokenize(paragraph)

for question in questions:
    question_tokens = tokenizer.tokenize(question)
    # input layout: [CLS] question [SEP] paragraph [SEP]
    tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + paragraph_tokens + ['[SEP]']
    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    # segment ids: 0 for [CLS] + question + [SEP], 1 for paragraph + final [SEP]
    input_type_ids = [0] * (1 + len(question_tokens) + 1) + [1] * (len(paragraph_tokens) + 1)

    # turn each id list into an int32 tensor with a leading batch dimension of 1
    input_word_ids, input_mask, input_type_ids = map(lambda t: tf.expand_dims(
        tf.convert_to_tensor(t, dtype=tf.int32), 0), (input_word_ids, input_mask, input_type_ids))
    outputs = model([input_word_ids, input_mask, input_type_ids])
    # using `[1:]` will enforce an answer. `outputs[0][0][0]` is the ignored '[CLS]' token logit
    # `int(...)` converts the scalar argmax tensors to plain Python ints for list slicing
    short_start = int(tf.argmax(outputs[0][0][1:])) + 1
    short_end = int(tf.argmax(outputs[1][0][1:])) + 1
    answer_tokens = tokens[short_start: short_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    print(f'Question: {question}')
    print(f'Answer: {answer}')
    print("\n")