In [2]:
# Installation
%%capture

!pip install transformers==4.5.1
!pip install datasets==1.6.2
!pip install tokenizers==0.10.2
!pip install torch==1.8.1+cu111
!pip install psutil==5.8.0
!pip install rouge_score
!pip install sacrebleu
!pip install openpyxl
!pip install xlrd
!pip install git-python
!pip install -U ipython==7.20
!pip install cmake
!pip install SentencePiece

In [3]:
# Imports
import csv
import datasets
import gc
import matplotlib.pyplot as plt
import psutil
import pandas as pd
import string
import torch
import transformers

from collections import Counter
from datasets import ClassLabel
from nltk import ngrams
from IPython.display import display, HTML
from typing import List, Tuple

In [4]:
# Drive
# Mount Google Drive into the Colab filesystem; the corpus files and model
# directories referenced in this notebook live below "/content/drive/My Drive".
from google.colab import drive
drive.mount("/content/drive")
# Folder prefix shared by the corpus and model paths used in this notebook.
path_drive = "/content/drive/My Drive/Temp/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Config
# `language` selects which loader load_data() dispatches to.
language: str = "german"  # english, german, multilingual
# Pretrained identifiers handed to the model / tokenizer loading helpers.
model_name: str = "bert-base-multilingual-cased"
tokenizer_name: str = "bert-base-multilingual-cased"
batch_size: int = 8

# Fraction of each corpus that the loaders keep: wik / nws / mls are combined
# into the German data set, eng limits the CNN/DailyMail training split.
ratio_corpus_wik: float = 0.25
ratio_corpus_nws: float = 0.25
ratio_corpus_mls: float = 0.25
ratio_corpus_eng: float = 0.25

# path_checkpoint is what load_tokenizer_and_model(from_checkpoint=True) restores from.
# NOTE(review): path_output is not referenced in this section - presumably the
# directory trained models are written to; confirm against the training cells.
path_output: str = "/content/drive/My Drive/Temp/Models/"
path_checkpoint: str = "/content/drive/My Drive/Temp/Models/DE/BERT"

# Train / validation / test fractions (they add up to 1.0).
train_size: float = 0.900
val_size: float = 0.025
test_size: float = 0.075

# Bare string literal listing alternative model names; as the last expression
# of the cell its repr is echoed as the cell output.
'''
- bert-base-multilingual-cased
- deepset/gbert-base
- xlm-roberta-base
- facebook/mbart-large-cc25
'''

'\n- bert-base-multilingual-cased\n- deepset/gbert-base\n- xlm-roberta-base\n- facebook/mbart-large-cc25\n'

In [5]:
# Helpers
def load_data() -> Tuple[datasets.Dataset, datasets.Dataset, datasets.Dataset]:
    """Load the data splits for the configured ``language``.

    Dispatches on the module-level ``language`` setting to the matching loader.

    Returns:
        The ``(train, validation, test)`` tuple produced by the selected loader.

    Raises:
        ValueError: If ``language`` is none of "english", "german" or
            "multilingual". Previously the function fell through and returned
            ``None``, which only failed later when the caller unpacked it.
    """
    if language == "english":
        return load_english_data()

    if language == "german":
        return load_german_data()

    if language == "multilingual":
        return load_multilingual_data()

    raise ValueError(
        f"Unsupported language {language!r}; "
        "expected 'english', 'german' or 'multilingual'."
    )


def load_english_data() -> Tuple[datasets.Dataset, datasets.Dataset, datasets.Dataset]:
    """Load the CNN/DailyMail (3.0.0) splits in the ``text`` / ``summary`` schema.

    The full training split is loaded and then cut down to its leading
    ``ratio_corpus_eng`` fraction; validation and test use the first half of
    their respective splits. ``article`` / ``highlights`` are renamed to
    ``text`` / ``summary`` and the ``id`` column is dropped.

    Returns:
        Shuffled ``(train, validation, test)`` datasets.
    """
    split_specs = ("train", "validation[:50%]", "test[:50%]")

    raw_train, raw_val, raw_test = (
        datasets.load_dataset(
            "cnn_dailymail", "3.0.0",
            split=spec,
            ignore_verifications=True
        )
        for spec in split_specs
    )

    # Keep only the leading share of the training split.
    n_keep = int(len(raw_train) * ratio_corpus_eng)
    raw_train = raw_train.select(range(0, n_keep))

    def to_summarization_schema(dataset):
        # Same column layout as the German corpora: "text" and "summary" only.
        dataset = dataset.rename_column("article", "text")
        dataset = dataset.rename_column("highlights", "summary")
        return dataset.remove_columns("id")

    train_data, val_data, test_data = [
        to_summarization_schema(dataset)
        for dataset in (raw_train, raw_val, raw_test)
    ]

    return train_data.shuffle(), val_data.shuffle(), test_data.shuffle()


def load_german_data() -> Tuple[datasets.Dataset, datasets.Dataset, datasets.Dataset]:
    """Build the German train/validation/test splits from the three corpora.

    From each corpus (wik, nws, mls) the leading ``ratio_corpus_*`` fraction is
    kept; the parts are concatenated, shuffled and then cut into consecutive
    ranges according to the configured ``train_size`` / ``val_size`` /
    ``test_size`` fractions.

    Returns:
        ``(train, validation, test)`` datasets.
    """
    corpora_with_ratio = [
        (load_corpus_wik(), ratio_corpus_wik),
        (load_corpus_nws(), ratio_corpus_nws),
        (load_corpus_mls(), ratio_corpus_mls),
    ]

    # Shuffle the combined corpus before slicing (as load_multilingual_data
    # does). Without it the validation and test ranges are cut from the tail of
    # the concatenation, i.e. from whichever corpus was appended last.
    german_data = datasets.concatenate_datasets([
        corpus.select(range(0, int(len(corpus) * ratio)))
        for corpus, ratio in corpora_with_ratio
    ]).shuffle()

    # Split sizes come from the config cell instead of duplicated literals.
    # Local names differ from the config names so the globals stay readable here.
    total = len(german_data)
    n_train = int(total * train_size)
    n_val = int(total * val_size)
    n_test = int(total * test_size)

    train_data = german_data.select(
        range(0, n_train)
    )

    val_data = german_data.select(
        range(n_train, n_train + n_val)
    )

    test_data = german_data.select(
        range(n_train + n_val, n_train + n_val + n_test)
    )

    return train_data, val_data, test_data


def load_corpus_wik() -> datasets.Dataset:
    """Read the "wik" corpus CSV from Drive into a shuffled dataset.

    The first two fields of every row are used as ``text`` and ``summary``.

    Returns:
        A shuffled dataset with the columns ``text`` and ``summary``.
    """
    rows = []

    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(
        "/content/drive/My Drive/Temp/Corpus/data_train.csv",
        "r", encoding="utf-8", newline=""
    ) as f:
        reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_ALL)
        next(reader, None)  # discard the first row (presumably the header)

        for row in reader:
            # Blank or truncated lines yield fewer than two fields; skip them
            # instead of failing with an IndexError.
            if len(row) < 2:
                continue

            rows.append((row[0], row[1]))

    df_wik = pd.DataFrame(rows, columns=["text", "summary"])

    ds_wik = datasets.arrow_dataset.Dataset.from_pandas(df_wik)

    return ds_wik.shuffle()


def load_corpus_nws() -> datasets.Dataset:
    """Read the "nws" corpus spreadsheet from Drive into a shuffled dataset.

    Keeps the ``article`` / ``highlights`` columns (renamed to ``text`` /
    ``summary``), drops incomplete rows and rows whose summary contains
    "ZEIT".

    Returns:
        A shuffled dataset with the columns ``text`` and ``summary``.
    """
    df_raw = pd.read_excel("/content/drive/My Drive/Temp/Corpus/data_train_test.xlsx", engine="openpyxl")

    df_nws = (
        df_raw[["article", "highlights"]]
        .rename(columns={"article": "text", "highlights": "summary"})
        # Drop incomplete rows BEFORE building the mask: str.contains yields
        # NaN for missing summaries and "~" cannot negate NaN.
        .dropna()
    )

    # na=False additionally covers non-string cells, for which str.contains
    # also returns NaN; regex=False because "ZEIT" is a literal substring.
    has_marker = df_nws["summary"].str.contains("ZEIT", regex=False, na=False)
    df_nws = df_nws[~has_marker]

    ds_nws = datasets.arrow_dataset.Dataset.from_pandas(df_nws)

    # The filtered frame's index may be materialised as an extra column;
    # remove it only when it is actually present.
    if "__index_level_0__" in ds_nws.column_names:
        ds_nws = ds_nws.remove_columns("__index_level_0__")

    return ds_nws.shuffle()


def load_corpus_mls() -> datasets.Dataset:
    """Load the German MLSUM training split with the summary removed from the text.

    When the summary occurs verbatim inside the article text, that occurrence
    (plus the one character following it) is cut out so the text does not
    contain its own reference summary.

    Returns:
        A shuffled dataset with the columns ``text`` and ``summary``.
    """
    ds_mls = datasets.load_dataset("mlsum", "de", split="train")
    ds_mls = ds_mls.remove_columns(["topic", "url", "title", "date"])

    text_corpus_mls = []
    summary_corpus_mls = []

    for entry in ds_mls:
        text = entry["text"]
        summary = entry["summary"]

        # Locate the summary instead of assuming it is a prefix: the previous
        # `summary in text` test matched anywhere, but the slice always removed
        # the first len(summary) + 1 characters. An empty summary is skipped so
        # it no longer chops off the first character of the text.
        position = text.find(summary) if summary else -1

        if position >= 0:
            # "+ 1" also removes the character following the summary
            # (presumably the separating space). For a leading summary this is
            # exactly the original text[len(summary) + 1:].
            end = position + len(summary) + 1
            text = text[:position] + text[end:]

        text_corpus_mls.append(text)
        summary_corpus_mls.append(summary)

    df_mls = pd.DataFrame(
        {"text": text_corpus_mls, "summary": summary_corpus_mls},
        columns=["text", "summary"]
    )

    ds_mls = datasets.arrow_dataset.Dataset.from_pandas(df_mls)

    return ds_mls.shuffle()


def load_multilingual_data() -> Tuple[datasets.Dataset, datasets.Dataset, datasets.Dataset]:
    """Build mixed German/English train/validation/test splits.

    Only the training parts returned by the English and German loaders are
    used; they are concatenated, shuffled and cut into consecutive ranges
    according to the configured ``train_size`` / ``val_size`` / ``test_size``
    fractions.

    Returns:
        ``(train, validation, test)`` datasets.
    """
    english_data, _, _ = load_english_data()
    german_data, _, _ = load_german_data()

    multilingual_data = datasets.concatenate_datasets([
        german_data, english_data
    ]).shuffle()

    # Split sizes come from the config cell instead of duplicated literals.
    # Local names differ from the config names so the globals stay readable here.
    total = len(multilingual_data)
    n_train = int(total * train_size)
    n_val = int(total * val_size)
    n_test = int(total * test_size)

    train_data = multilingual_data.select(
        range(0, n_train)
    )

    val_data = multilingual_data.select(
        range(n_train, n_train + n_val)
    )

    test_data = multilingual_data.select(
        range(n_train + n_val, n_train + n_val + n_test)
    )

    return train_data, val_data, test_data


def test_cuda() -> None:
    """Print the torch device that would be used and the installed torch version.

    Also asks torch to release its cached CUDA memory.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)

    torch.cuda.empty_cache()

    print("Device:", device)
    print("Version:", torch.__version__)


def explore_corpus(data: datasets.Dataset) -> None:
    """Compute character lengths and a label-decoded one-row sample of ``data``.

    NOTE(review): neither the collected lengths nor the sample frame is
    returned, printed or displayed, so the function currently has no visible
    effect.

    Args:
        data: Dataset with ``text`` and ``summary`` columns.
    """
    frame = pd.DataFrame(data)

    # Character counts of every text / summary pair.
    text_list, summary_list = [], []

    for _, record in frame.iterrows():
        text_list.append(len(record["text"]))
        summary_list.append(len(record["summary"]))

    # One-row sample in which ClassLabel columns are mapped from ids to names.
    sample = pd.DataFrame(data[:1])

    for column, feature in data.features.items():
        if not isinstance(feature, ClassLabel):
            continue

        sample[column] = sample[column].transform(lambda i: feature.names[i])


def empty_cache() -> None:
    """Run the garbage collector and release torch's cached CUDA memory."""
    # Collect first so that unreachable objects are gone before the CUDA
    # cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

    # The memory statistics are queried but the result is not used.
    psutil.virtual_memory()


def load_tokenizer_and_model(from_checkpoint: bool = False) -> Tuple[transformers.AutoTokenizer, transformers.EncoderDecoderModel]:
    """Load the tokenizer and the sequence-to-sequence model.

    Args:
        from_checkpoint: When true, the model weights are restored from
            ``path_checkpoint``; otherwise the pretrained ``model_name`` is
            used (as both encoder and decoder with tied weights for non-mBART
            models).

    Returns:
        ``(tokenizer, model)``. For model names containing "mbart" the model
        is an mBART sequence-to-sequence model rather than an
        ``EncoderDecoderModel``.
    """
    # The tokenizer option is spelled `strip_accents`; the previous
    # `strip_accent` keyword was silently ignored, so the request to keep
    # accents/umlauts never reached the tokenizer.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_name, strip_accents=False
    )

    is_mbart = "mbart" in model_name

    if from_checkpoint:
        model_class = (
            transformers.AutoModelForSeq2SeqLM if is_mbart
            else transformers.EncoderDecoderModel
        )
        tf2tf = model_class.from_pretrained(
            path_checkpoint
        )

    elif is_mbart:
        tf2tf = transformers.MBartForConditionalGeneration.from_pretrained(
            model_name
        )

    else:
        # Warm-start encoder and decoder from the same checkpoint and share
        # their weights.
        tf2tf = transformers.EncoderDecoderModel.from_encoder_decoder_pretrained(
            model_name, model_name, tie_encoder_decoder=True
        )

    return tokenizer, tf2tf


def configure_model(tf2tf: transformers.EncoderDecoderModel, tokenizer: transformers.AutoTokenizer):
    """Copy special-token ids and fixed generation settings onto ``tf2tf.config``.

    Args:
        tf2tf: Model whose ``config`` is modified in place.
        tokenizer: Tokenizer supplying the cls/bos/sep/pad token ids.

    Returns:
        The same ``tf2tf`` object, for convenience.
    """
    settings = {
        # Special tokens taken from the tokenizer.
        "decoder_start_token_id": tokenizer.cls_token_id,
        "bos_token_id": tokenizer.bos_token_id,
        "eos_token_id": tokenizer.sep_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        # Fixed generation settings.
        "max_length": 128,
        "min_length": 56,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "length_penalty": 2.0,
        "num_beams": 2,
    }

    for attribute, value in settings.items():
        setattr(tf2tf.config, attribute, value)

    return tf2tf


def split_long_texts(parts: List[str], text: str, limit: int = 512):
    """Split ``text`` into chunks of at most ``limit`` characters.

    Chunks end at the last ".", "!" or "?" found within the first ``limit``
    characters. If that window holds no sentence terminator, the cut falls
    back to the last whitespace character and, failing that, to a hard cut at
    ``limit`` characters. The original implementation recursed forever in that
    situation because nothing was consumed.

    Args:
        parts: List the chunks are appended to (modified in place).
        text: Text to split; the length is measured in characters.
        limit: Maximum chunk length in characters (default 512).

    Returns:
        The same ``parts`` list with the chunks appended.

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    # Iterative rather than recursive, so very long texts cannot exhaust the
    # interpreter's recursion limit.
    while len(text) > limit:
        end_index = max(text.rfind(mark, 0, limit) for mark in ".!?")

        if end_index < 0:
            # No sentence boundary in the window: prefer a word boundary.
            end_index = max(text.rfind(space, 0, limit) for space in " \t\n")

            if end_index <= 0:
                # Not even a usable whitespace: cut exactly `limit` characters
                # so that every iteration makes progress.
                end_index = limit - 1

        chunk = text[:end_index + 1].strip()

        # A chunk cut at a sentence terminator is never empty; this only
        # filters whitespace-only chunks produced by the fallback.
        if chunk:
            parts.append(chunk)

        text = text[end_index + 1:].strip()

    parts.append(text)

    return parts

In [6]:
# Methods
def clean(text: str) -> str:
    """Lower-case ``text``, collapse whitespace runs and drop ASCII punctuation.

    Words are re-joined with single spaces before the characters listed in
    ``string.punctuation`` are deleted, so removing a stand-alone punctuation
    token can leave two adjacent spaces.
    """
    lowered = " ".join(map(str.lower, text.split()))
    punctuation_remover = str.maketrans("", "", string.punctuation)

    return lowered.translate(punctuation_remover)


def analyze_text_lenghts(corpus: List[str]) -> None:
    """Plot a histogram of the word counts per text and print their mean.

    Args:
        corpus: Texts to analyze; words are whitespace-separated tokens.
    """
    lengths = [len(text.split()) for text in corpus]

    # An empty corpus used to end in a ZeroDivisionError when the mean was
    # computed; report it and stop before plotting anything.
    if not lengths:
        print("Corpus is empty - nothing to analyze.")
        return

    plt.figure(figsize=(10, 6))

    plt.hist(lengths, bins=40)
    plt.xlabel("Anzahl der Wörter", fontsize=18)
    plt.ylabel("Anzahl der Texte", fontsize=18)
    plt.xticks(rotation=0, fontsize=12)
    plt.yticks(rotation=0, fontsize=12)
    plt.tight_layout()

    plt.show()

    # Mean number of words per text.
    print(sum(lengths) / len(lengths))


def analyze_most_common_words(corpus: List[str], k: int = 20) -> None:
    """Plot a bar chart of the ``k`` most frequent words in ``corpus``.

    Texts are normalised with ``clean`` first; tokens of a single character
    are ignored.

    Args:
        corpus: Texts to analyze.
        k: Number of words shown in the chart.
    """
    words = [
        token
        for document in corpus
        for token in clean(document).split()
        if len(token) > 1
    ]

    ranking = Counter(words).most_common(k)

    labels = [word for word, _ in ranking]
    counts = [count for _, count in ranking]

    plt.figure(figsize=(10, 6))

    plt.bar(labels, counts)
    plt.xlabel("Wörter", fontsize=18)
    plt.ylabel("Anzahl", fontsize=18)
    plt.xticks(rotation=90, fontsize=12)
    plt.yticks(rotation=0, fontsize=12)
    plt.tight_layout()

    plt.show()


def analyze_n_grams(corpus: List[str], n: int = 3, n_most_common_n_grams: int = 20) -> None:
    """Plot a bar chart of the most frequent word n-grams in ``corpus``.

    Texts are split on whitespace without further cleaning. A text that
    raises while its n-grams are built is reported via ``print`` and the
    remaining texts are still processed.

    Args:
        corpus: Texts to analyze.
        n: Number of words per n-gram.
        n_most_common_n_grams: Number of n-grams shown in the chart.
    """
    collected = []

    for document in corpus:
        try:
            collected.extend(ngrams(document.split(), n))

        except Exception as error:
            print(error)

    ranking = Counter(collected).most_common(n_most_common_n_grams)

    labels = [" ".join(gram) for gram, _ in ranking]
    counts = [count for _, count in ranking]

    plt.figure(figsize=(10, 6))

    plt.bar(labels, counts)
    plt.xlabel("N-Gramme", fontsize=18)
    plt.ylabel("Anzahl", fontsize=18)
    plt.xticks(rotation=90, fontsize=12)
    plt.yticks(rotation=0, fontsize=12)
    plt.tight_layout()

    plt.show()

In [7]:
# Exploration
# Only the first element returned by load_data() (the training split) is
# explored; the other two elements are discarded.
data, _, _ = load_data()
# Materialise the "text" column as a plain Python list for the helpers below.
corpus = list(data["text"])

# Word-count histogram, most frequent words, then the most frequent
# bigrams and trigrams.
analyze_text_lenghts(corpus)
analyze_most_common_words(corpus)
analyze_n_grams(corpus, n=2)
analyze_n_grams(corpus, n=3)

Reusing dataset mlsum (/root/.cache/huggingface/datasets/mlsum/de/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)
