In [1]:
# Installation
%%capture

!pip install transformers==4.5.1
!pip install datasets==1.6.2
!pip install tokenizers==0.10.2
!pip install torch==1.8.1+cu111
!pip install psutil==5.8.0
!pip install rouge_score
!pip install sacrebleu
!pip install openpyxl
!pip install xlrd
!pip install git-python
!pip install -U ipython==7.20
!pip install cmake
!pip install SentencePiece

In [2]:
# Imports
import gc
import csv
import torch
import psutil
import datasets
import transformers
import pandas as pd

from datasets import ClassLabel
from IPython.display import display, HTML

In [3]:
# Drive
from google.colab import drive
drive.mount("/content/drive")
path_drive = "/content/drive/My Drive/Temp/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Config
language = "german"  # english
model_name = "bert-base-multilingual-cased"
tokenizer_name = "bert-base-german-cased"
batch_size = 2  # 16

ratio_corpus_wiki = 0.1  # 1.0
ratio_corpus_news = 0.1  # 1.0

path_output = "./"
path_checkpoint = path_drive + "Models"

text_english = """Almost as soon as the World Trade Center's Twin Towers fell on September 11, 2001, thousands of firefighters, police officers, construction workers, search-and-rescue dogs and volunteers headed to Ground Zero to look for survivors. Because they didn’t know how many people were trapped alive in the wreckage, firefighters and other rescue workers had to search carefully through the unstable piles of rubble for air pockets, called \"voids\", where they might find people who had been unable to escape from the collapsing buildings. To be safe, they didn’t use any heavy equipment at first. Some dug with their bare hands, while others formed bucket brigades to move small amounts of debris as efficiently as possible. Unfortunately, there were not many survivors to find: Two firemen were pulled from their truck in a cavity beneath some wreckage, and a few people were pinned at the edges of the pile. By September 12, workers had rescued all of the people who were trapped at the site. After that, the Ground Zero workers had a new and more heartbreaking mission: to sift carefully through the debris in search of human remains. The fallen buildings were unstable, and engineers worried that the weight of trucks and cranes would cause the wreckage to shift and collapse again, so the workers had to keep using the bucket brigades. Meanwhile, huge fires continued to burn at the center of the pile. Jagged, sharp pieces of iron and steel were everywhere. The work was so dangerous that many firefighters and police officers wrote their names and phone numbers on their forearms in case they fell into the hole or were crushed."""
text_german = """Der 11. September 2001 war ein schöner Spätsommertag in New York – bis um 08:46 Uhr ein Flugzeug in den Nordturm des World Trade Centers flog. Zunächst ging man von einem tragischen Unfall aus. Dann aber flog eine 2. Boeing in den Südturm. Die Bilder, die an diesem Tag und an den folgenden Jahrestagen um die Welt gingen, machen noch heute sprach- und fassungslos. Für mehr als 2.500 Menschen wurden die brennenden Hochhaustürme zur Todesfalle; fast 400 Feuerwehrleute und Polizeibeamte verloren bei den Rettungsarbeiten ihr Leben. Dieser traurige Tag ging fortan als '9/11' in die Geschichte ein. New York stand nach dem Anschlag auf das World Trade Center verständlicherweise unter Schock und vor einem Desaster unglaublichen Ausmaßes. Die Trümmer qualmten noch bis in den Dezember 2001 hinein und es sollte rund 9 Monate dauern, bis die insgesamt 1,8 Mio. Tonnen Schutt weggeräumt waren. Seither klafft an der Stelle, wo zuvor die 'Twin Tower' standen, eine riesige Wunde in Manhattans Stadtbild. Nach den Aufräumarbeiten auf dem World Trade Center Gelände blieb eine riesige Grube zurück: Ground Zero. Am Zaun sind die Ereignisse des 11. September dokumentiert. Seit 2011 gibt es einen Gedenkpavillon, in 2012 wurde das 'National September 11 Memorial and Museum' eröffnet."""

In [6]:
# Helpers
def load_data(language, ratio_corpus_wiki=0.0, ratio_corpus_news=0.0):
    if str(language) == "english":
        train_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="train")
        val_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="validation[:10%]")
        test_data = datasets.load_dataset(
            "cnn_dailymail", "3.0.0", split="test[:5%]")

        return train_data, val_data, test_data

    elif str(language) == "german":
        data_txt, data_ref = [], []

        with open(path_drive + "Corpus/data_train.csv", "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_ALL)
            next(reader, None)

            for row in reader:
                data_txt.append(row[0])
                data_ref.append(row[1])

        tuples_wiki = list(zip(data_txt, data_ref))
        tuples_wiki = tuples_wiki[0:int(len(tuples_wiki) * ratio_corpus_wiki)]

        dataframe = pd.DataFrame(
            tuples_wiki, columns=["article", "highlights"]
        )

        tuples_news = pd.read_excel(
            path_drive + "Corpus/data_train_test.xlsx", engine="openpyxl"
        )

        tuples_news = tuples_news[0:int(len(tuples_news) * ratio_corpus_news)]
        del tuples_news["Unnamed: 0"]

        dataframe = pd.concat([dataframe, tuples_news])
        dataframe = dataframe.dropna()
        dataframe = dataframe[~dataframe["highlights"].str.contains("ZEIT")]

        german_data = datasets.arrow_dataset.Dataset.from_pandas(
            dataframe[["article", "highlights"]]
        )

        german_data = german_data.shuffle()

        train_size = int(len(dataframe) * 0.9)
        valid_size = int(len(dataframe) * 0.05)
        test_size = int(len(dataframe) * 0.05)

        train_data = german_data.select(
            range(0, train_size))
        val_data = german_data.select(
            range(train_size, train_size + valid_size))
        test_data = german_data.select(
            range(train_size + valid_size, len(dataframe)))
        
        del german_data

        return train_data.shuffle(), val_data.shuffle(), test_data.shuffle()


def explore_corpus(data):
    df = pd.DataFrame(data)

    text_list = []
    summary_list = []

    for index, row in df.iterrows():
        text = row["article"]
        summary = row["highlights"]
        text_list.append(len(text))
        summary_list.append(len(summary))

    df = pd.DataFrame(data[:1])

    for column, typ in data.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])


def test_cuda():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    print("Device:", device)
    print("Version:", torch.__version__)


def empty_cache():
    gc.collect()
    torch.cuda.empty_cache()
    psutil.virtual_memory()

    print(torch.cuda.get_device_properties(0).total_memory)
    print(torch.cuda.memory_reserved(0))
    print(torch.cuda.memory_allocated(0))

    %whos

In [7]:
# Training
tokenizer = transformers.BertTokenizer.from_pretrained(
    tokenizer_name
)

tf2tf = transformers.EncoderDecoderModel.from_encoder_decoder_pretrained(
    model_name, model_name, tie_encoder_decoder=False  # True
)

train_data, val_data, test_data = load_data(
    language=language,
    ratio_corpus_wiki=ratio_corpus_wiki,
    ratio_corpus_news=ratio_corpus_news
)

explore_corpus(train_data)
rouge = datasets.load_metric("rouge")

tf2tf.config.decoder_start_token_id = tokenizer.cls_token_id
tf2tf.config.bos_token_id = tokenizer.bos_token_id
tf2tf.config.eos_token_id = tokenizer.sep_token_id
tf2tf.config.pad_token_id = tokenizer.pad_token_id
tf2tf.config.vocab_size = tf2tf.config.encoder.vocab_size

tf2tf.config.max_length = 142
tf2tf.config.min_length = 56
tf2tf.config.no_repeat_ngram_size = 3
tf2tf.config.early_stopping = True
tf2tf.config.length_penalty = 2.0
tf2tf.config.num_beams = 4

tf2tf.to("cuda")


def process_data_to_model_inputs(batch):
    encoder_max_length = 512
    decoder_max_length = 128

    inputs = tokenizer(batch["article"], padding="max_length",
                       truncation=True, max_length=encoder_max_length)

    outputs = tokenizer(batch["highlights"], padding="max_length",
                        truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels]
                       for labels in batch["labels"]]

    return batch


train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "highlights"]
)

train_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    remove_columns=["article", "highlights"]
)

val_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }
  

test_cuda()
empty_cache()
steps = 2000

training_args = transformers.Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir=path_output,
    warmup_steps=1000,
    save_steps=steps,
    logging_steps=1000,
    eval_steps=steps,
    eval_accumulation_steps=10,
    save_total_limit=1,
    fp16=True
)

trainer = transformers.Seq2SeqTrainer(
    model=tf2tf,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2444517405.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at facebook/mbart-large-cc25 were not used when initializing MBartForCausalLM: ['final_logits_bias', 'model.shared.weight', 'model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.encoder.layers.0.self_attn.k_proj.weight', 'model.encoder.layers.0.self_attn.k_proj.bias', 'model.encoder.layers.0.self_attn.v_proj.weight', 'model.encoder.layers.0.self_attn.v_proj.bias', 'model.encoder.layers.0.self_attn.q_proj.weight', 'model.encoder.layers.0.self_attn.q_proj.bias', 'model.encoder.layers.0.self_attn.out_proj.weight', 'model.encoder.layers.0.self_attn.out_proj.bias', 'model.encoder.layers.0.self_attn_layer_norm.weight', 'model.encoder.layers.0.self_attn_layer_norm.bias', 'model.encoder.layers.0.fc1.weight', 'model.encoder.layers.0.fc1.bias', 'model.encoder.layers.0.fc2.weight', 'model.encoder.layers.0.fc2.bias', 'model.encoder.layers.0.final_layer_norm.weight', 'model.encoder.layers.0.final_layer_norm.bias', 'model.encoder.lay

HBox(children=(FloatProgress(value=0.0, max=5058.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Device: cuda
Version: 1.8.1+cu101
17071734784
4303355904
4277977088
Variable                       Type                   Data/Info
---------------------------------------------------------------
ClassLabel                     type                   <class 'datasets.features.ClassLabel'>
HTML                           type                   <class 'IPython.core.display.HTML'>
batch_size                     int                    2
compute_metrics                function               <function compute_metrics at 0x7f914e5ff050>
csv                            module                 <module 'csv' from '/usr/lib/python3.7/csv.py'>
datasets                       module                 <module 'datasets' from '<...>es/datasets/__init__.py'>
display                        function               <function display at 0x7f9398293dd0>
drive                          module                 <module 'google.colab.dri<...>s/google/colab/drive.py'>
empty_cache                    function             

AttributeError: ignored

In [None]:
# Evaluation
tokenizer = transformers.BertTokenizer.from_pretrained(
    tokenizer_name
)

tf2tf = transformers.EncoderDecoderModel.from_pretrained(
    path_checkpoint + "/checkpoint-50000"
)

train_data, val_data, test_data = load_data(
    language=language,
    ratio_corpus_wiki=ratio_corpus_wiki,
    ratio_corpus_news=ratio_corpus_news
)

test_cuda()
explore_corpus(train_data)
empty_cache()
configure_model(tf2tf, tokenizer)
rouge = datasets.load_metric("rouge")


def generate_summary(batch):
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = tf2tf.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch


results = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size
)

print(
    rouge.compute(
        predictions=results["pred_summary"],
        references=results["highlights"],
        rouge_types=["rouge2"]
    )["rouge2"].mid
)

In [None]:
# Example
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(
    tokenizer_name
)

tf2tf = transformers.EncoderDecoderModel.from_pretrained(
    path_checkpoint
)

tf2tf.to("cuda")
text = None
parts = []


def split_long_texts(text):
    limit = 512

    if len(text) > limit:
        end_index = max([
            text.rfind(".", 0, limit),
            text.rfind("!", 0, limit),
            text.rfind("?", 0, limit)
        ])

        parts.append(text[0:end_index + 1].strip())
        text = text[end_index + 1:len(text)].strip()
        split_long_texts(text)

    else:
        parts.append(text)


text = text_english if language == "english" else text_german
split_long_texts(text)

if len(parts) > 1:
    temp = {
        "article": parts,
        "highlights": ["Zusammenfassung"] * len(parts)
    }

else:
    temp = {
        "article": [text, text],
        "hightlights": ["Zusammenfassung", "Zusammenfassung"]
    }

test_cuda()
empty_cache()
rouge = datasets.load_metric("rouge")

test_data = datasets.arrow_dataset.Dataset.from_pandas(
    pd.DataFrame.from_dict(
        temp, columns=["article", "highlights"], orient="index"
    )
)


def generate_summary(batch):
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = tf2tf.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch


summary = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size
)

for i in range(0, len(parts) - 1):
    print(f"HYP: {summary[i]['pred_summary']}")