In [1]:

# https://github.com/yahah100/text_summarization/blob/master/t5/t5_tf_huggingface.ipynb 

import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow_datasets as tfds
import time



In [2]:
BATCH_SIZE = 16

SHUFFEL_SIZE = 1024

learning_rate = 3e-5

In [3]:
# pip install ipywidgets

In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    
pad_token_id = tokenizer.pad_token_id

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [5]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

model.summary()

Model: "tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  16449536  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  18881280  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  25175808  
Total params: 60,506,624
Trainable params: 60,506,624
Non-trainable params: 0
_________________________________________________________________


In [7]:
cnn_dailymail = tfds.load(name="cnn_dailymail")

INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: cnn_dailymail/3.1.0
INFO:absl:Load dataset info from /tmp/tmpwpk3cnnhtfds
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Generating dataset cnn_dailymail (/home/ash/tensorflow_datasets/cnn_dailymail/3.1.0)


[1mDownloading and preparing dataset 558.32 MiB (download: 558.32 MiB, generated: 1.27 GiB, total: 1.82 GiB) to /home/ash/tensorflow_datasets/cnn_dailymail/3.1.0...[0m


Dl Completed...: |          | 0/0 [00:00<?, ? url/s]

Dl Size...: |          | 0/0 [00:00<?, ? MiB/s]

Extraction completed...: |          | 0/0 [00:00<?, ? file/s]

INFO:absl:Downloading https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ into /home/ash/tensorflow_datasets/downloads/ucexport_download_id_0BwmD_VLjROrfTHk4NFg2SndKG8BdJPpt2iRo6Dpzz23CByJuAePEilB-pxbcBCHaWDs.tmp.ee2249d148194c169835e71ee14d1050...
INFO:absl:Downloading https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs into /home/ash/tensorflow_datasets/downloads/ucexport_download_id_0BwmD_VLjROrfM1BxdkxVaTY2zVV-G71RIXPssrrvSAjt19Cy91r-9CQ2F9DMKA0uFk0.tmp.2f1cc8fba06a48ef974de3826a95dcf5...
INFO:absl:Skipping download of https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt: File cached in /home/ash/tensorflow_datasets/downloads/raw.gith.com_abis_cnn-dail_mast_url_list_axPXvtewhJkMKXBVu-9E9DpxMtJAWnlUsOLSlGYGgCb0.txt
INFO:absl:Skipping extraction for /home/ash/tensorflow_datasets/downloads/raw.gith.com_abis_cnn-dail_mast_url_list_axPXvtewhJkMKXBVu-9E9DpxMtJAWnlUsOLSlGYGgCb0.txt (method=NO_EXTRACT).
INFO:ab






Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/287113 [00:00<?, ? examples/s]

Shuffling cnn_dailymail-train.tfrecord...:   0%|          | 0/287113 [00:00<?, ? examples/s]

INFO:absl:Done writing cnn_dailymail-train.tfrecord. Number of examples: 287113 (shards: [17945, 17944, 17945, 17944, 17945, 17944, 17945, 17944, 17945, 17945, 17944, 17945, 17944, 17945, 17944, 17945])


Generating validation examples...:   0%|          | 0/13368 [00:00<?, ? examples/s]

Shuffling cnn_dailymail-validation.tfrecord...:   0%|          | 0/13368 [00:00<?, ? examples/s]

INFO:absl:Done writing cnn_dailymail-validation.tfrecord. Number of examples: 13368 (shards: [13368])


Generating test examples...:   0%|          | 0/11490 [00:00<?, ? examples/s]

Shuffling cnn_dailymail-test.tfrecord...:   0%|          | 0/11490 [00:00<?, ? examples/s]

INFO:absl:Done writing cnn_dailymail-test.tfrecord. Number of examples: 11490 (shards: [11490])
INFO:absl:Constructing tf.data.Dataset for split None, from /home/ash/tensorflow_datasets/cnn_dailymail/3.1.0


[1mDataset cnn_dailymail downloaded and prepared to /home/ash/tensorflow_datasets/cnn_dailymail/3.1.0. Subsequent calls will reuse this data.[0m


In [8]:
train_tfds = cnn_dailymail['train']
test_tfds = cnn_dailymail['test']
val_tfds = cnn_dailymail['validation']

In [9]:
len_train = len(list(train_tfds))
len_test = len(list(test_tfds))
len_val = len(list(val_tfds))


In [10]:
def normalize_text(text):
    """Lowercase and remove quotes from a TensorFlow string."""
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
    return text.numpy().decode('UTF-8')

def tokenize_articles(text):
    text = normalize_text(text)
    ids = tokenizer.encode_plus((model.config.prefix + text), return_tensors="tf", max_length=512) 

    return tf.squeeze(ids['input_ids']), tf.squeeze(ids['attention_mask'])
        
def tokenize_highlights(text):
    text = normalize_text(text)
    ids = tokenizer.encode(text, return_tensors="tf", max_length=150)
    return tf.squeeze(ids)



def map_func(features):

    article_ids, attention_mask = tf.py_function(tokenize_articles, inp=[features["article"]], Tout=(tf.int32, tf.int32))
    highlights_ids = tf.py_function(tokenize_highlights, inp=[features["highlights"]], Tout=tf.int32)

    return article_ids, attention_mask, highlights_ids

In [11]:
train_ds = train_tfds.map(map_func)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[150]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

val_ds = val_tfds.map(map_func)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[150]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

test_ds = test_tfds.map(map_func)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[150]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

In [20]:


@tf.function
def train_step(input_ids, input_mask, y):
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py
    y_ids = y[:, :-1]
    lm_labels = tf.identity(y[:, 1:])
    lm_labels = tf.where(tf.equal(y[:, 1:],pad_token_id), -100, lm_labels)

    with tf.GradientTape() as tape:
        # prediction_scores: (bs, 150, 32128)
        # decoder_past_key_value_states: (bs, 512, 512), (bs, 8, 150, 64)
        # z: (bs, 512, 512)
        print(model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, labels=lm_labels, training=True))
        predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, labels=lm_labels, training=True)
        loss = loss_object(y[:, 1:], predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(y[:, 1:], predictions)



In [21]:
@tf.function
def val_step(input_ids, input_mask, y):
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py
    y_ids = y[:, :-1]
    lm_labels = tf.identity(y[:, 1:])
    lm_labels = tf.where(tf.equal(y[:, 1:],pad_token_id), -100, lm_labels)
    
    predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, labels=lm_labels, training=False)
    v_loss = loss_object(y[:, 1:], predictions)

    val_loss(v_loss)
    val_accuracy(y[:, 1:], predictions)

In [22]:
EPOCHS = 1
log_interval = 200
for epoch in range(EPOCHS):
    # reset metrics
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    val_loss.reset_states()
    val_accuracy.reset_states()
    
    val_batches = iter(val_ds)
    
    start_time = time.time()
    for i, (input_ids, input_mask, y) in enumerate(train_ds):
        # training
        train_step(input_ids, input_mask, y)
        
        # validation
        if i % log_interval == 0:
            x_val, x_mask_val, y_val = next(val_batches)
            val_step(x_val, x_mask_val, y_val)
            elapsed = time.time() - start_time
            print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'train acc {:5.2f} | val acc {:5.2f} |'
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch, i, int(len_train/BATCH_SIZE),
                    elapsed * 1000 / log_interval,
                    train_accuracy.result() * 100, val_accuracy.result() * 100, 
                    train_loss.result(),  val_loss.result()))
            start_time = time.time()

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


TFSeq2SeqLMOutput(loss=<tf.Tensor 'tf_t5for_conditional_generation/sparse_categorical_crossentropy/weighted_loss/Mul:0' shape=(None,) dtype=float32>, logits=<tf.Tensor 'tf_t5for_conditional_generation/shared/Reshape_1:0' shape=(16, 149, 32128) dtype=float32>, past_key_values=((<tf.Tensor 'tf_t5for_conditional_generation/encoder/dropout_24/dropout_1/Mul_1:0' shape=(16, 512, 512) dtype=float32>,), ((<tf.Tensor 'tf_t5for_conditional_generation/decoder/block_._0/layer_._0/SelfAttention/transpose_1:0' shape=(16, 8, 149, 64) dtype=float32>, <tf.Tensor 'tf_t5for_conditional_generation/decoder/block_._0/layer_._0/SelfAttention/transpose_2:0' shape=(16, 8, 149, 64) dtype=float32>, <tf.Tensor 'tf_t5for_conditional_generation/decoder/block_._0/layer_._1/EncDecAttention/transpose_1:0' shape=(16, 8, 512, 64) dtype=float32>, <tf.Tensor 'tf_t5for_conditional_generation/decoder/block_._0/layer_._1/EncDecAttention/transpose_2:0' shape=(16, 8, 512, 64) dtype=float32>), (<tf.Tensor 'tf_t5for_conditional_

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


ValueError: in user code:

    <ipython-input-20-c1fffe6199ba>:13 train_step  *
        predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, labels=lm_labels, training=True)

    ValueError: too many values to unpack (expected 3)
