In [1]:
# Installation
%%capture

!pip install transformers==4.5.1
!pip install datasets==1.6.2
!pip install tokenizers==0.10.2
!pip install torch==1.8.1+cu111
!pip install psutil==5.8.0
!pip install rouge_score
!pip install sacrebleu
!pip install openpyxl
!pip install xlrd
!pip install git-python
!pip install -U ipython==7.20
!pip install cmake
!pip install SentencePiece

In [2]:
# Imports
import gc
import csv
import torch
import psutil
import datasets
import transformers
import pandas as pd

from datasets import ClassLabel
from IPython.display import display, HTML

In [3]:
# Drive
from google.colab import drive
drive.mount("/content/drive")
path_drive = "/content/drive/My Drive/Temp/"

Mounted at /content/drive


In [7]:
# Config
language = "german"  # english, german, multilingual
model_name = "deepset/gbert-base"
tokenizer_name = "deepset/gbert-base"
batch_size = 4  # 8

ratio_corpus_wiki = 1.00
ratio_corpus_news = 1.00
ratio_corpus_mlsum = 1.00
ratio_corpus_eng = 1.00

path_output = path_drive + "Models"
path_checkpoint = path_output + "/checkpoint-40000"

text_english = """Almost as soon as the World Trade Center's Twin Towers fell on September 11, 2001, thousands of firefighters, police officers, construction workers, search-and-rescue dogs and volunteers headed to Ground Zero to look for survivors. Because they didn’t know how many people were trapped alive in the wreckage, firefighters and other rescue workers had to search carefully through the unstable piles of rubble for air pockets, called \"voids\", where they might find people who had been unable to escape from the collapsing buildings. To be safe, they didn’t use any heavy equipment at first. Some dug with their bare hands, while others formed bucket brigades to move small amounts of debris as efficiently as possible. Unfortunately, there were not many survivors to find: Two firemen were pulled from their truck in a cavity beneath some wreckage, and a few people were pinned at the edges of the pile. By September 12, workers had rescued all of the people who were trapped at the site. After that, the Ground Zero workers had a new and more heartbreaking mission: to sift carefully through the debris in search of human remains. The fallen buildings were unstable, and engineers worried that the weight of trucks and cranes would cause the wreckage to shift and collapse again, so the workers had to keep using the bucket brigades. Meanwhile, huge fires continued to burn at the center of the pile. Jagged, sharp pieces of iron and steel were everywhere. The work was so dangerous that many firefighters and police officers wrote their names and phone numbers on their forearms in case they fell into the hole or were crushed."""
text_german = """Der 11. September 2001 war ein schöner Spätsommertag in New York – bis um 08:46 Uhr ein Flugzeug in den Nordturm des World Trade Centers flog. Zunächst ging man von einem tragischen Unfall aus. Dann aber flog eine 2. Boeing in den Südturm. Die Bilder, die an diesem Tag und an den folgenden Jahrestagen um die Welt gingen, machen noch heute sprach- und fassungslos. Für mehr als 2.500 Menschen wurden die brennenden Hochhaustürme zur Todesfalle; fast 400 Feuerwehrleute und Polizeibeamte verloren bei den Rettungsarbeiten ihr Leben. Dieser traurige Tag ging fortan als '9/11' in die Geschichte ein. New York stand nach dem Anschlag auf das World Trade Center verständlicherweise unter Schock und vor einem Desaster unglaublichen Ausmaßes. Die Trümmer qualmten noch bis in den Dezember 2001 hinein und es sollte rund 9 Monate dauern, bis die insgesamt 1,8 Mio. Tonnen Schutt weggeräumt waren. Seither klafft an der Stelle, wo zuvor die 'Twin Tower' standen, eine riesige Wunde in Manhattans Stadtbild. Nach den Aufräumarbeiten auf dem World Trade Center Gelände blieb eine riesige Grube zurück: Ground Zero. Am Zaun sind die Ereignisse des 11. September dokumentiert. Seit 2011 gibt es einen Gedenkpavillon, in 2012 wurde das 'National September 11 Memorial and Museum' eröffnet."""

'''
- bert-base-multilingual-cased
- deepset/gbert-base
- xlm-roberta-base
- facebook/mbart-large-50
'''

'\n- bert-base-multilingual-cased\n- deepset/gbert-base\n- xlm-roberta-base\n- facebook/mbart-large-50\n'

In [5]:
# Helpers
def load_data(language, ratio_corpus_wiki=0.0, ratio_corpus_news=0.0, ratio_corpus_mlsum=0.0, ratio_corpus_eng=0.0):
    if str(language) == "english":
        train_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="train")
        val_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="validation[:10%]")
        test_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="test[:5%]")

        train_data = train_data.rename_column("article", "text")
        train_data = train_data.rename_column("highlights", "summary")
        val_data = val_data.rename_column("article", "text")
        val_data = val_data.rename_column("highlights", "summary")
        test_data = test_data.rename_column("article", "text")
        test_data = test_data.rename_column("highlights", "summary")

        return train_data, val_data, test_data

    else:
        # CORPUS: WIKI
        data_txt, data_ref = [], []

        with open(path_drive + "Corpus/data_train.csv", "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_ALL)
            next(reader, None)

            for row in reader:
                data_txt.append(row[0])
                data_ref.append(row[1])

        ds_wiki = datasets.arrow_dataset.Dataset.from_pandas(
            pd.DataFrame(
                list(zip(data_txt, data_ref)),
                columns=["text", "summary"]
            )
        )

        # CORPUS: NEWS
        df_news = pd.read_excel(path_drive + "Corpus/data_train_test.xlsx", engine="openpyxl")
        df_news = df_news[["article", "highlights"]]
        df_news.columns = ["text", "summary"]
        df_news = df_news[~df_news["summary"].str.contains("ZEIT")]
        df_news = df_news.dropna()
        ds_news = datasets.arrow_dataset.Dataset.from_pandas(df_news)
        ds_news = ds_news.remove_columns("__index_level_0__")

        # CORPUS: MLSUM
        ds_mlsum = datasets.load_dataset("mlsum", "de", split="train")
        ds_mlsum = ds_mlsum.remove_columns(["topic", "url", "title", "date"])

        text_corpus_mlsum = []
        summary_corpus_mlsum = []

        for entry in ds_mlsum:
            text = entry["text"]
            summary = entry["summary"]

            if summary in text:
                text = text[len(summary) + 1:len(text)]

            text_corpus_mlsum.append(text)
            summary_corpus_mlsum.append(summary)

        ds_mlsum = datasets.arrow_dataset.Dataset.from_pandas(
            pd.DataFrame(
                list(zip(text_corpus_mlsum, summary_corpus_mlsum)),
                columns=["text", "summary"]
            )
        )

        # ACTION: CONCAT
        german_data = datasets.concatenate_datasets([
            ds_wiki.select(
                range(0, int(len(ds_wiki) * ratio_corpus_wiki))),
            ds_news.select(
                range(0, int(len(ds_news) * ratio_corpus_news))),
            ds_mlsum.select(
                range(0, int(len(ds_mlsum) * ratio_corpus_mlsum)))
        ])

        if str(language) == "multilingual":
            english_data = datasets.load_dataset(
                "cnn_dailymail", "3.0.0", split="train"
            )

            english_data = english_data.rename_column("article", "text")
            english_data = english_data.rename_column("highlights", "summary")

            prepared_data = datasets.concatenate_datasets([
                german_data.shuffle(),
                english_data.select(
                    range(0, int(len(english_data) * ratio_corpus_eng))
                ).shuffle()
            ])

        else:
            prepared_data = german_data.shuffle()

        # ACTION: SPLIT
        train_size = int(len(prepared_data) * 0.900)
        valid_size = int(len(prepared_data) * 0.025)
        test_size = int(len(prepared_data) * 0.075)

        train_data = prepared_data.select(
            range(0, train_size))
        val_data = prepared_data.select(
            range(train_size, train_size + valid_size))
        test_data = prepared_data.select(
            range(train_size + valid_size, train_size + valid_size + test_size))

        del prepared_data

        return train_data.shuffle(), val_data.shuffle(), test_data.shuffle()


def explore_corpus(data):
    df = pd.DataFrame(data)

    text_list = []
    summary_list = []

    for index, row in df.iterrows():
        text = row["text"]
        summary = row["summary"]
        text_list.append(len(text))
        summary_list.append(len(summary))

    df = pd.DataFrame(data[:1])

    for column, typ in data.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])


def test_cuda():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    print("Device:", device)
    print("Version:", torch.__version__)


def empty_cache():
    gc.collect()
    torch.cuda.empty_cache()
    psutil.virtual_memory()

    print(torch.cuda.get_device_properties(0).total_memory)
    print(torch.cuda.memory_reserved(0))
    print(torch.cuda.memory_allocated(0))

    %whos


def load_tokenizer_and_model(from_checkpoint=False):
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_name, strip_accent=False  # add_prefix_space=True
    )

    if from_checkpoint:
        if "mbart" in model_name:
            tf2tf = transformers.AutoModelForSeq2SeqLM.from_pretrained(
                path_checkpoint
            )

        else:
            tf2tf = transformers.EncoderDecoderModel.from_pretrained(
                path_checkpoint
            )

    else:
        if "mbart" in model_name:
            tf2tf = transformers.AutoModelForSeq2SeqLM.from_pretrained(
                model_name
            )

        else:
            tf2tf = transformers.EncoderDecoderModel.from_encoder_decoder_pretrained(
                model_name, model_name, tie_encoder_decoder=True
            )

    return tokenizer, tf2tf


def configure_model(tf2tf, tokenizer):
    tf2tf.config.decoder_start_token_id = tokenizer.cls_token_id
    tf2tf.config.bos_token_id = tokenizer.bos_token_id
    tf2tf.config.eos_token_id = tokenizer.sep_token_id
    tf2tf.config.pad_token_id = tokenizer.pad_token_id
    # tf2tf.config.vocab_size = tf2tf.config.encoder.vocab_size

    tf2tf.config.max_length = 128
    tf2tf.config.min_length = 56
    tf2tf.config.no_repeat_ngram_size = 3
    tf2tf.config.early_stopping = True
    tf2tf.config.length_penalty = 2.0
    tf2tf.config.num_beams = 2

    return tf2tf

In [6]:
# Training
tokenizer, tf2tf = load_tokenizer_and_model(from_checkpoint=False)

train_data, val_data, test_data = load_data(
    language=language,
    ratio_corpus_wiki=ratio_corpus_wiki,
    ratio_corpus_news=ratio_corpus_news,
    ratio_corpus_mlsum=ratio_corpus_mlsum,
    ratio_corpus_eng=ratio_corpus_eng
)

test_cuda()
explore_corpus(train_data)
empty_cache()
rouge = datasets.load_metric("rouge")

tf2tf = configure_model(tf2tf, tokenizer)
tf2tf.to("cuda")


def process_data_to_model_inputs(batch):
    encoder_max_length = 512
    decoder_max_length = 128

    inputs = tokenizer(batch["text"], padding="max_length",
                       truncation=True, max_length=encoder_max_length)

    outputs = tokenizer(batch["summary"], padding="max_length",
                        truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels]
                       for labels in batch["labels"]]

    return batch


train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["text", "summary"]
)

train_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["text", "summary"]
)

val_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }


training_args = transformers.Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir=path_output,
    warmup_steps=1000,
    save_steps=5000,
    logging_steps=1000,
    eval_steps=5000,
    save_total_limit=1,
    learning_rate=5e-5,
    adafactor=True,
    fp16=True
)

trainer = transformers.Seq2SeqTrainer(
    model=tf2tf,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=362.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239836.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=83.0, style=ProgressStyle(description_w…




The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1595.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2199.0, style=ProgressStyle(description…


Downloading and preparing dataset mlsum/de (download: 330.52 MiB, generated: 897.34 MiB, post-processed: Unknown size, total: 1.20 GiB) to /root/.cache/huggingface/datasets/mlsum/de/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=17741147.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=311059697.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=17771216.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset mlsum downloaded and prepared to /root/.cache/huggingface/datasets/mlsum/de/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688. Subsequent calls will reuse this data.
Device: cuda
Version: 1.8.1+cu101
17071734784
0
0
Variable                   Type                   Data/Info
-----------------------------------------------------------
ClassLabel                 type                   <class 'datasets.features.ClassLabel'>
HTML                       type                   <class 'IPython.core.display.HTML'>
batch_size                 int                    4
configure_model            function               <function configure_model at 0x7f5b3e159680>
csv                        module                 <module 'csv' from '/usr/lib/python3.7/csv.py'>
datasets                   module                 <module 'datasets' from '<...>es/datasets/__init__.py'>
display                    function               <function display at 0x7f5bc64c13b0>
drive                

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, max=74992.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2083.0), HTML(value='')))




	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure,Runtime,Samples Per Second
5000,2.6701,2.657371,0.0896,0.1557,0.1077,4769.9286,1.747
10000,2.6861,2.625022,0.092,0.1616,0.111,4884.8952,1.706
15000,2.6608,2.603165,0.0928,0.161,0.1115,4996.6774,1.668
20000,2.616,2.583583,0.0953,0.165,0.1144,4918.0461,1.694
25000,2.5991,2.566334,0.0944,0.1623,0.1129,4877.1963,1.708
30000,2.5696,2.534883,0.0963,0.1661,0.1154,4851.5061,1.717
35000,2.5558,2.518347,0.0976,0.1686,0.1171,4823.7063,1.727
40000,2.5361,2.503518,0.0985,0.167,0.1174,4746.9265,1.755


KeyboardInterrupt: ignored

In [None]:
# Evaluation
tokenizer, tf2tf = load_tokenizer_and_model(from_checkpoint=True)

train_data, val_data, test_data = load_data(
    language=language,
    ratio_corpus_wiki=ratio_corpus_wiki,
    ratio_corpus_news=ratio_corpus_news,
    ratio_corpus_mlsum=ratio_corpus_mlsum,
    ratio_corpus_eng=ratio_corpus_eng
)

test_cuda()
explore_corpus(train_data)
empty_cache()
rouge = datasets.load_metric("rouge")

tf2tf = configure_model(tf2tf, tokenizer)
tf2tf.to("cuda")


def generate_summary(batch):
    inputs = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = tf2tf.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch


results = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size
)

print(
    rouge.compute(
        predictions=results["pred_summary"],
        references=results["summary"],
        rouge_types=["rouge2"]
    )["rouge2"].mid
)

In [8]:
# Example
tokenizer, tf2tf = load_tokenizer_and_model(from_checkpoint=True)

tf2tf = configure_model(tf2tf, tokenizer)
tf2tf.to("cuda")

text = None
parts = []


def split_long_texts(text):
    limit = 512

    if len(text) > limit:
        end_index = max([
            text.rfind(".", 0, limit),
            text.rfind("!", 0, limit),
            text.rfind("?", 0, limit)
        ])

        parts.append(text[0:end_index + 1].strip())
        text = text[end_index + 1:len(text)].strip()
        split_long_texts(text)

    else:
        parts.append(text)


text = text_english if language == "english" else text_german
split_long_texts(text)

if len(parts) > 1:
    article = parts
    highlights = [None] * len(parts)

else:
    parts = [text]
    article = [text] * 2
    highlights = [None] * 2

test_cuda()
empty_cache()
rouge = datasets.load_metric("rouge")

df = pd.DataFrame({"text": article, "summary": highlights})
test_data = datasets.arrow_dataset.Dataset.from_pandas(df)


def generate_summary(batch):
    inputs = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = tf2tf.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch


summary = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size
)

result = ""

for i in range(0, len(parts)):
    result = result + " " + summary[i]["pred_summary"]

print(summary[0]["pred_summary"])
print("====================")
print(result)

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


Device: cuda
Version: 1.8.1+cu101
17071734784
2164260864
1750203392
Variable                       Type                        Data/Info
--------------------------------------------------------------------
ClassLabel                     type                        <class 'datasets.features.ClassLabel'>
HTML                           type                        <class 'IPython.core.display.HTML'>
article                        list                        n=3
batch_size                     int                         4
compute_metrics                function                    <function compute_metrics at 0x7f5b3b2b9560>
configure_model                function                    <function configure_model at 0x7f5b3e159680>
csv                            module                      <module 'csv' from '/usr/lib/python3.7/csv.py'>
datasets                       module                      <module 'datasets' from '<...>es/datasets/__init__.py'>
display                        function        

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Der 11. September 2001 war der 11. Jahrestag des 11. 9. September. Die Bilder aus dem World Trade Center zeigen, wie die Ereignisse in New York und New York entstanden sind - und wie man sie heute noch erleben kann. Ein Besuch... und fassungslos.
 Der 11. September 2001 war der 11. Jahrestag des 11. 9. September. Die Bilder aus dem World Trade Center zeigen, wie die Ereignisse in New York und New York entstanden sind - und wie man sie heute noch erleben kann. Ein Besuch... und fassungslos. New York war der " 9 / 11 " - Terror - Tag. Doch die Katastrophe begann mit dem Anschlag auf das World Trade Center. Jetzt ist der Terror in der Heimat der Terrormiliz besonders groß. Die Verantwortlichen sind sich einig : Es ist nicht das einzige Problem, das sich in den vergangenen Jahren in New York ereignete. Das " National September 11 Memorial and Museum " in Manhattan ist das " National 9 Memorial of the World Trade Center ". Es ist die einzige Gedenkstätte des 11. September 11. Die Gedenkstä