In [1]:


import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow_datasets as tfds
import time
import os
import re

import logging
logging.basicConfig(level=logging.ERROR)



In [2]:
# --- Model checkpoint -------------------------------------------------------
# Name of the pretrained checkpoint passed to `from_pretrained`.
model_size = "t5-small"

# --- Optimisation -----------------------------------------------------------
# Step size handed to the Adam optimizer.
learning_rate = 3e-5

# --- Input pipeline ---------------------------------------------------------
# Number of examples per batch.
BATCH_SIZE = 8
# Buffer size used when shuffling a dataset (identifier spelling kept because
# other cells reference it).
SHUFFEL_SIZE = 1024

In [4]:
# Load the pretrained tokenizer for the checkpoint named by `model_size`.
tokenizer = T5Tokenizer.from_pretrained(model_size)

# Load the pretrained conditional-generation model for the same checkpoint.
# NOTE(review): the recorded output of this cell is an InternalError raised
# while copying a tensor from CPU to GPU; this is often a symptom of the GPU
# being out of memory -- confirm before re-running.
model = TFT5ForConditionalGeneration.from_pretrained(model_size)

# When the checkpoint config ships task-specific parameters, overlay its
# "summarization" entry (an empty dict if that key is absent) onto the config.
task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))

# Id the tokenizer uses for padding; read later when building label tensors.
pad_token_id = tokenizer.pad_token_id

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run LogicalNot: Dst tensor is not initialized. [Op:LogicalNot]

In [None]:
# Adam optimizer configured with the notebook-level `learning_rate`.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# Cross-entropy over integer class ids; `from_logits=True` means predictions
# are supplied as unnormalised scores rather than probabilities.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running aggregates updated by the training step.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Running aggregates updated by the validation step.
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

# Print the Keras layer/parameter summary of the loaded model.
model.summary()

In [None]:
from pathlib import Path
import re
import logging
logging.basicConfig(level=logging.ERROR)

# Reload the tokenizer so this cell can run without the model-loading cell
# (it repeats the `tokenizer` / `pad_token_id` assignments made there).
tokenizer = T5Tokenizer.from_pretrained(model_size)
pad_token_id = tokenizer.pad_token_id
# Task prompt prepended to every article before it is tokenized.
prefix = "summarize: "

def transfrom(x):
    """Strip the leading field and surrounding single quotes from a raw line.

    The line is split on "; "; the first field is discarded and the remaining
    fields are re-joined with single spaces. Any span enclosed in single
    quotes (greedy, within one line) is then replaced by its contents.

    Args:
        x: One raw text line.

    Returns:
        The cleaned text; an empty string when the line has no "; " separator.
    """
    _, *remaining_fields = x.split("; ")
    joined = " ".join(remaining_fields)
    return re.sub("'(.*)'", r"\1", joined)


def tokenize_articles(text):
    """Encode one article, prefixed with the task prompt, as model inputs.

    The text is tokenized with `max_length=512` and padding to that length.

    Args:
        text: The article text (without the task prefix).

    Returns:
        A pair `(input_ids, attention_mask)`: the tokenizer's two tensors,
        each passed through `tf.squeeze`.
    """
    encoded = tokenizer.encode_plus(
        (prefix + text),
        return_tensors="tf",
        max_length=512,
        pad_to_max_length=True,
    )
    input_ids = tf.squeeze(encoded['input_ids'])
    attention_mask = tf.squeeze(encoded['attention_mask'])
    return input_ids, attention_mask
        
def tokenize_highlights(text):
    """Encode one reference summary and derive decoder inputs and labels.

    The text is tokenized with `max_length=150` and padding to that length.

    Args:
        text: The summary text.

    Returns:
        A triple `(y, y_ids, lm_labels)`:
          * `y`         -- the squeezed token-id tensor,
          * `y_ids`     -- `y` without its last position,
          * `lm_labels` -- `y` without its first position, with every
                           position equal to `pad_token_id` replaced by -100.
    """
    token_ids = tf.squeeze(
        tokenizer.encode(text, return_tensors="tf", max_length=150, pad_to_max_length=True)
    )

    decoder_inputs = token_ids[:-1]
    shifted_targets = token_ids[1:]

    # Mark padding positions in the shifted targets with the sentinel -100.
    is_padding = tf.equal(shifted_targets, pad_token_id)
    lm_labels = tf.where(is_padding, -100, tf.identity(shifted_targets))

    return token_ids, decoder_inputs, lm_labels


def get_data(name):
    """Read and clean the article and highlight files of one data split.

    The files are `../data/<name>/articles_german` and
    `../data/<name>/highlights_german`. Every line is right-stripped and
    passed through `transfrom`.

    Args:
        name: Name of the split directory under `../data` (e.g. "train").

    Returns:
        A pair `(articles, highlights)` of lists of cleaned strings, one
        entry per line of the corresponding file.

    Raises:
        OSError: If either file cannot be opened.
    """
    article_path = "../data/%s/articles_german" % name
    highlights_path = "../data/%s/highlights_german" % name

    def _read_clean_lines(path):
        # The context manager closes the handle even if cleaning a line fails;
        # the previous bare `open(...)` calls left both files unclosed.
        # NOTE(review): the platform default encoding is still used, as
        # before; pass an explicit encoding once the corpus encoding is known.
        with open(path) as handle:
            return [transfrom(line.rstrip()) for line in handle]

    articles = _read_clean_lines(article_path)
    highlights = _read_clean_lines(highlights_path)
    return articles, highlights
    
    
def get_tokinized_ds(articles, highlights):
    """Tokenize all articles and highlights into five parallel lists.

    Args:
        articles: Iterable of article strings.
        highlights: Iterable of summary strings.

    Returns:
        A tuple `(x, x_mask, y, y_ids, y_labels)` of lists. `x` and `x_mask`
        hold the two outputs of `tokenize_articles` per article; `y`, `y_ids`
        and `y_labels` hold the three outputs of `tokenize_highlights` per
        highlight.
    """
    # All articles are encoded first, then all highlights.
    encoded_articles = [tokenize_articles(article) for article in articles]
    encoded_highlights = [tokenize_highlights(summary) for summary in highlights]

    x = [ids for ids, _ in encoded_articles]
    x_mask = [mask for _, mask in encoded_articles]

    y = [full for full, _, _ in encoded_highlights]
    y_ids = [inputs for _, inputs, _ in encoded_highlights]
    y_labels = [labels for _, _, labels in encoded_highlights]

    return x, x_mask, y, y_ids, y_labels

def get_translated_ds(name):
    """Load one data split from disk and return it tokenized.

    Args:
        name: Split name forwarded to `get_data`.

    Returns:
        The five-list tuple produced by `get_tokinized_ds` for that split.
    """
    return get_tokinized_ds(*get_data(name))

In [None]:
# Read and tokenize the "train" split; `train` is the five-list tuple
# (x, x_mask, y, y_ids, y_labels) returned by `get_tokinized_ds`.
train = get_translated_ds("train")

In [None]:
# Wrap the tokenized training tuple in a tf.data pipeline; each dataset element
# is one slice across the five lists in `train`.
train_ds = tf.data.Dataset.from_tensor_slices(train)

In [None]:
# Set `skip = True` to bypass the TFRecord export below.
skip = False
# Folder the TFRecord file is written into.
drive_path = "../data"
if not skip:
    def save_tfrecord_to_bucket(features_dataset, gdrive_folder, file_name):
        """Serialize a five-component dataset into one TFRecord file.

        Each element of `features_dataset` must unpack into
        `(x, x_mask, y, y_ids, y_labels)`; every component is stored as an
        int64 list feature under the key of the same name.

        Args:
            features_dataset: Iterable of five-component elements.
            gdrive_folder: Destination folder.
            file_name: File stem; ".tfrecord" is appended.
        """
        def _int64_feature(values):
            # Eager tensors are converted to a NumPy array in one step rather
            # than iterated element by element; plain iterables pass through.
            if hasattr(values, "numpy"):
                values = values.numpy()
            return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

        # `tf.io.TFRecordWriter` is the TF2 name of the writer previously
        # reached through the `tf.compat.v1.python_io` alias.
        with tf.io.TFRecordWriter(f"{gdrive_folder}/{file_name}.tfrecord") as tfwriter:
            for x, x_mask, y, y_ids, y_labels in features_dataset:
                feature_key_value_pair = {
                    'x': _int64_feature(x),
                    'x_mask': _int64_feature(x_mask),
                    'y': _int64_feature(y),
                    'y_ids': _int64_feature(y_ids),
                    'y_labels': _int64_feature(y_labels),
                }
                features = tf.train.Features(feature=feature_key_value_pair)
                example = tf.train.Example(features=features)

                tfwriter.write(example.SerializeToString())
        print(f"Saved {file_name}.")

    save_tfrecord_to_bucket(train_ds, drive_path, "train_cnn_daily_mail")

In [None]:
# Display the length of the first component of `val`.
# NOTE(review): `val` is not assigned in any visible cell -- presumably it
# should come from `get_translated_ds` like `train`; confirm the split name.
len(val[0])

In [None]:


# Build one tf.data pipeline per split by slicing across the tuple components.
# NOTE(review): `val` and `test` are not assigned in any visible cell; only
# `train` is. Confirm where they are meant to be created.
val_ds = tf.data.Dataset.from_tensor_slices(val)
train_ds = tf.data.Dataset.from_tensor_slices(train)
test_ds = tf.data.Dataset.from_tensor_slices(test)



In [None]:
def write_ds(ds, filename):
    """Write dataset `ds` to `filename` via `tf.data.experimental.TFRecordWriter`.

    Args:
        ds: The dataset handed to the writer's `write` method.
        filename: Destination path given to the writer.
    """
    tf.data.experimental.TFRecordWriter(filename).write(ds)
    
# Persist the validation dataset with tf.data's own save routine under the
# path "val.tfrecord". The `write_ds` helper defined in this cell is not used.
tf.data.experimental.save(val_ds, "val.tfrecord")

In [None]:
# train_ds = tf.data.Dataset.from_tensor_slices(train)\
#     .map(map_func)\
#     .shuffle(SHUFFEL_SIZE)\
#     .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
#     .prefetch(tf.data.experimental.AUTOTUNE)

def _model_inputs(*components):
    """Reduce a dataset element to `(x, x_mask, y_ids, y_labels)`.

    Elements built from `get_translated_ds` output carry five components
    `(x, x_mask, y, y_ids, y_labels)`. The unshifted `y` is dropped so the
    element matches the four `padded_shapes` entries below and the
    four-argument train/validation steps. Elements that do not have five
    components are returned unchanged.
    """
    if len(components) == 5:
        x, x_mask, _, y_ids, y_labels = components
        return x, x_mask, y_ids, y_labels
    return components

# Without the `map`, a five-component dataset fails in `padded_batch` because
# `padded_shapes` only describes four components.
# NOTE(review): this assumes `val_ds` was built the same way as `train_ds`.
val_ds = val_ds\
    .map(_model_inputs)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

# test_ds = tf.data.Dataset.from_tensor_slices(get_translated_ds("test"))\
# .shuffle(SHUFFEL_SIZE)\
# .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
# .prefetch(tf.data.experimental.AUTOTUNE)

In [None]:


# Sanity check: print the first element of the validation pipeline.
for i in val_ds.take(1):
    print(i)



In [None]:
@tf.function
def train_step(input_ids, input_mask, y_ids, lm_labels):
    """Run one optimisation step and update the training metrics.

    Args:
        input_ids: Encoder token ids for the batch.
        input_mask: Encoder attention mask for the batch.
        y_ids: Decoder input ids (the targets without their last position).
        lm_labels: Shifted targets aligned with `y_ids`, where padding
            positions hold the sentinel -100.

    Side effects:
        Applies gradients through the module-level `optimizer` and updates
        `train_loss` and `train_accuracy`.
    """
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py

    # The previous body indexed `y[:, 1:]`, but `y` is neither a parameter nor
    # a global, so the step raised NameError. `lm_labels` already holds those
    # shifted targets. The -100 sentinel is not a valid class id, so it is
    # replaced by 0 and the padded positions are given zero weight instead.
    label_mask = tf.not_equal(lm_labels, -100)
    targets = tf.where(label_mask, lm_labels, tf.zeros_like(lm_labels))

    with tf.GradientTape() as tape:
        # Shape notes kept from the original author (not verified here):
        # prediction_scores: (bs, 150, 32128)
        # decoder_past_key_value_states: (bs, 512, 512), (bs, 8, 150, 64)
        # z: (bs, 512, 512)
        predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=True)
        weights = tf.cast(label_mask, predictions.dtype)
        # With the loss object's default reduction the weighted sum is divided
        # by the total number of positions, padded ones included.
        loss = loss_object(targets, predictions, sample_weight=weights)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(targets, predictions, sample_weight=weights)

In [None]:


@tf.function
def val_step(input_ids, input_mask, y_ids, lm_labels):
    """Evaluate one batch and update the validation metrics.

    Args:
        input_ids: Encoder token ids for the batch.
        input_mask: Encoder attention mask for the batch.
        y_ids: Decoder input ids (the targets without their last position).
        lm_labels: Shifted targets aligned with `y_ids`, where padding
            positions hold the sentinel -100.

    Side effects:
        Updates `val_loss` and `val_accuracy`; no gradients are computed.
    """
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py

    # The previous body indexed `y[:, 1:]`, but `y` is neither a parameter nor
    # a global, so the step raised NameError. `lm_labels` already holds those
    # shifted targets. The -100 sentinel is not a valid class id, so it is
    # replaced by 0 and the padded positions are given zero weight instead.
    label_mask = tf.not_equal(lm_labels, -100)
    targets = tf.where(label_mask, lm_labels, tf.zeros_like(lm_labels))

    predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=False)
    weights = tf.cast(label_mask, predictions.dtype)
    # With the loss object's default reduction the weighted sum is divided by
    # the total number of positions, padded ones included.
    v_loss = loss_object(targets, predictions, sample_weight=weights)

    val_loss(v_loss)
    val_accuracy(targets, predictions, sample_weight=weights)



In [None]:
# Number of passes over `train_ds`.
EPOCHS = 1
# A validation batch is evaluated and a progress line printed every
# `log_interval` training batches.
log_interval = 200

# Progress denominator only. Read from the dataset itself because the
# previously used `len_train` is not defined anywhere; the value is negative
# when tf.data cannot determine the size.
batches_per_epoch = int(tf.data.experimental.cardinality(train_ds))

# NOTE(review): the loop expects `train_ds` and `val_ds` to yield batched
# four-component elements (input_ids, input_mask, y_ids, labels).
for epoch in range(EPOCHS):
    # reset metrics
    train_loss.reset_states()
    train_accuracy.reset_states()

    val_loss.reset_states()
    val_accuracy.reset_states()

    val_batches = iter(val_ds)

    start_time = time.time()
    for i, (input_ids, input_mask, y_ids, labels) in enumerate(train_ds):
        # training
        # The step is called directly. The earlier
        # `strategy.run(train_step(...))` invoked the step and then handed its
        # None result to `strategy.run` as the function to execute, and
        # `strategy` itself is not defined in any visible cell.
        train_step(input_ids, input_mask, y_ids, labels)

        # validation
        if i % log_interval == 0:
            try:
                x_val, x_mask_val, y_val, y_label = next(val_batches)
            except StopIteration:
                # Validation data ran out before the epoch ended: start over.
                val_batches = iter(val_ds)
                x_val, x_mask_val, y_val, y_label = next(val_batches)
            val_step(x_val, x_mask_val, y_val, y_label)

            elapsed = time.time() - start_time
            # Only one batch has run before the first report (i == 0).
            batches_since_report = log_interval if i else 1
            print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'train acc {:5.2f} | val acc {:5.2f} |'
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch, i, batches_per_epoch,
                    elapsed * 1000 / batches_since_report,
                    train_accuracy.result() * 100, val_accuracy.result() * 100,
                    train_loss.result(),  val_loss.result()))
            start_time = time.time()