In [None]:
# Imports
%%capture

!pip install --upgrade tensorflow
!pip install tensorflow-gpu
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
!pip install datasets
!pip install tokenizers
!pip install git+https://github.com/huggingface/transformers
!pip install git-python==1.0.3
!pip install rouge_score
!pip install sacrebleu
!pip install wget

# !python -m wget https://github.com/huggingface/transformers/tree/master/src/transformers/trainer_seq2seq.py
# !python -m wget https://github.com/huggingface/transformers/tree/master/src/transformers/training_args_seq2seq.py

In [None]:
# Imports
import pandas as pd
import datasets
import csv
import transformers as ft

from IPython.display import display, HTML
from transformers import EncoderDecoderModel, BertTokenizer, BertTokenizerFast
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import ClassLabel

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Release cached GPU memory held by the allocator before doing any work.
torch.cuda.empty_cache()
print("Device:", device)

Device: cuda


In [None]:
# Load swiss data
# Reads the first two columns of the CSV on Google Drive into `dataframe`
# with the columns "article" and "highlights".
from google.colab import drive

path_drive = "/content/drive"
drive.mount(path_drive)
path_corpus = path_drive + "/My Drive/Temp/Corpus/data_train.csv"

# newline="" is what the csv module expects for files it reads: it lets the
# reader itself handle line breaks that occur inside quoted fields.
with open(path_corpus, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_ALL)
    next(reader, None)  # skip the first row (presumably the header)

    # Blank or one-column rows are skipped instead of raising IndexError.
    tuples = [(row[0], row[1]) for row in reader if len(row) >= 2]

# Kept as separate lists for any later cell that still refers to them.
data_txt = [article for article, _ in tuples]
data_ref = [summary for _, summary in tuples]

dataframe = pd.DataFrame(tuples, columns=["article", "highlights"])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load german data
# Reads the Excel corpus from Google Drive into `dataframe` with the columns
# "article" and "highlights". Note that this assignment replaces any
# `dataframe` built by an earlier cell; the commented-out alternative on the
# constructor line concatenates instead.
from google.colab import drive

path_drive = "/content/drive"
drive.mount(path_drive)
path_corpus = path_drive + "/My Drive/Temp/Corpus/data_train_test.xlsx"

# errors="ignore": a sheet saved without the "Unnamed: 0" index column no
# longer aborts the cell with a KeyError.
tuples = pd.read_excel(path_corpus).drop(columns=["Unnamed: 0"], errors="ignore")
dataframe = pd.DataFrame(tuples, columns=["article", "highlights"]) # dataframe = pd.concat([dataframe, tuples])
dataframe = dataframe.dropna()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Clean redactional data
# Drops every row whose summary contains the literal text "ZEIT" and prints
# the row count before and after.
print(len(dataframe))
# regex=False: "ZEIT" is a plain substring, not a pattern.
# na=False: cells that are not strings count as "no match", so the mask stays
# boolean and the negation below cannot fail on missing values.
is_redactional = dataframe["highlights"].str.contains("ZEIT", regex=False, na=False)
dataframe = dataframe[~is_redactional]
print(len(dataframe))

12408
12408


In [None]:
# Convert the currently loaded `dataframe` into a shuffled `datasets.Dataset`.
# (No concatenation happens here: whichever load cell ran last defines `dataframe`.)
# preserve_index=False keeps the leftover pandas index (non-contiguous after
# dropna/filtering) from being stored as an extra "__index_level_0__" column.
german_data = datasets.Dataset.from_pandas(dataframe[["article", "highlights"]], preserve_index=False)
# A fixed seed makes the row order, and therefore the train/val/test split
# derived from it, identical across kernel restarts.
german_data = german_data.shuffle(seed=42)
print(dataframe)

                                                 article                                         highlights
0      Israelische Kampfjets haben in der Nacht Ziele...   Als Reaktion auf Attacken aus dem Gazastreife...
1      Monty Ott ist Vorsitzender von Keshet Deutschl...   Für Jüdinnen und Juden hat Deutschland nur di...
2      Was können Sie tun?\n- Sie können zur Startsei...  In seiner Amtszeit habe er eine “Mentalität de...
3      Es gibt das Gerücht, dass Xi Jinping in Wuhan ...   China wird von der Kommunistischen Partei aut...
4      Wie konnten sie nur, die Anführenden der Demok...   Die Wahlnacht von Iowa hat alle Beteiligten b...
...                                                  ...                                                ...
12533  Das in der Kritik stehende Kommando Spezialkrä...  KSK-Soldaten konnten laut Medienrecherchen unt...
12534  Franka Lu ist eine chinesische Journalistin un...  China hat die Pandemie erfolgreich unterdrückt...
12535  Nach drei Angriffen m

In [None]:
# Split the shuffled dataset into consecutive train / validation / test slices.
# Sizes are the truncated 90 % / 7.5 % / 2.5 % shares of the dataframe length;
# the test slice simply takes every row after the validation slice.
train_size, valid_size, test_size = (
    int(len(dataframe) * share) for share in (0.9, 0.075, 0.025)
)
valid_end = train_size + valid_size

train_data = german_data.select(range(train_size))
val_data = german_data.select(range(train_size, valid_end))
test_data = german_data.select(range(valid_end, len(dataframe)))

In [None]:
# Load english data
# Assigns train_data / val_data / test_data from the CNN/DailyMail corpus,
# replacing whatever an earlier cell stored under those names.
cnn_dailymail = ("cnn_dailymail", "3.0.0")  # (dataset name, configuration)

train_data = datasets.load_dataset(*cnn_dailymail, split="train")
val_data = datasets.load_dataset(*cnn_dailymail, split="validation[:5%]")
test_data = datasets.load_dataset(*cnn_dailymail, split="test[:2%]")

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0a01b1abede4f646130574f203de57a293ded8a7a11e3406a539453afdfeb2c0)
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0a01b1abede4f646130574f203de57a293ded8a7a11e3406a539453afdfeb2c0)
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0a01b1abede4f646130574f203de57a293ded8a7a11e3406a539453afdfeb2c0)


In [None]:
# Explore corpus
# Prints the mean length (via len(), i.e. characters for string columns) of
# the articles and of the reference summaries in the training set.
df = pd.DataFrame(train_data)

text_list = [len(article) for article in df["article"]]
summary_list = [len(highlights) for highlights in df["highlights"]]

print(sum(text_list) / len(text_list))
print(sum(summary_list) / len(summary_list))

3915.6129
264.5824


In [None]:
# Explore corpus
# Renders the first training example as an HTML table, showing ClassLabel
# columns by label name instead of integer id.
train_data.info.description
df = pd.DataFrame(train_data[:1])

# del df["id"]

for name, feature in train_data.features.items():
    if not isinstance(feature, ClassLabel):
        continue
    label_names = feature.names
    df[name] = df[name].transform(lambda label_id: label_names[label_id])

display(HTML(df.to_html()))

Unnamed: 0,article,highlights,id
0,"It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force ""to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."" It's a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but because he wants to. ""While I believe I have the authority to carry out this military action without specific congressional authorization, I know that the country will be stronger if we take this course, and our actions will be even more effective,"" he said. ""We should have this debate, because the issues are too big for business as usual."" Obama said top congressional leaders had agreed to schedule a debate when the body returns to Washington on September 9. The Senate Foreign Relations Committee will hold a hearing over the matter on Tuesday, Sen. Robert Menendez said. Transcript: Read Obama's full remarks . Syrian crisis: Latest developments . U.N. inspectors leave Syria . Obama's remarks came shortly after U.N. inspectors left Syria, carrying evidence that will determine whether chemical weapons were used in an attack early last week in a Damascus suburb. 
""The aim of the game here, the mandate, is very clear -- and that is to ascertain whether chemical weapons were used -- and not by whom,"" U.N. spokesman Martin Nesirky told reporters on Saturday. But who used the weapons in the reported toxic gas attack in a Damascus suburb on August 21 has been a key point of global debate over the Syrian crisis. Top U.S. officials have said there's no doubt that the Syrian government was behind it, while Syrian officials have denied responsibility and blamed jihadists fighting with the rebels. British and U.S. intelligence reports say the attack involved chemical weapons, but U.N. officials have stressed the importance of waiting for an official report from inspectors. The inspectors will share their findings with U.N. Secretary-General Ban Ki-moon Ban, who has said he wants to wait until the U.N. team's final report is completed before presenting it to the U.N. Security Council. The Organization for the Prohibition of Chemical Weapons, which nine of the inspectors belong to, said Saturday that it could take up to three weeks to analyze the evidence they collected. ""It needs time to be able to analyze the information and the samples,"" Nesirky said. He noted that Ban has repeatedly said there is no alternative to a political solution to the crisis in Syria, and that ""a military solution is not an option."" Bergen: Syria is a problem from hell for the U.S. Obama: 'This menace must be confronted' Obama's senior advisers have debated the next steps to take, and the president's comments Saturday came amid mounting political pressure over the situation in Syria. Some U.S. lawmakers have called for immediate action while others warn of stepping into what could become a quagmire. Some global leaders have expressed support, but the British Parliament's vote against military action earlier this week was a blow to Obama's hopes of getting strong backing from key NATO allies. 
On Saturday, Obama proposed what he said would be a limited military action against Syrian President Bashar al-Assad. Any military attack would not be open-ended or include U.S. ground forces, he said. Syria's alleged use of chemical weapons earlier this month ""is an assault on human dignity,"" the president said. A failure to respond with force, Obama argued, ""could lead to escalating use of chemical weapons or their proliferation to terrorist groups who would do our people harm. In a world with many dangers, this menace must be confronted."" Syria missile strike: What would happen next? Map: U.S. and allied assets around Syria . Obama decision came Friday night . On Friday night, the president made a last-minute decision to consult lawmakers. What will happen if they vote no? It's unclear. A senior administration official told CNN that Obama has the authority to act without Congress -- even if Congress rejects his request for authorization to use force. Obama on Saturday continued to shore up support for a strike on the al-Assad government. He spoke by phone with French President Francois Hollande before his Rose Garden speech. ""The two leaders agreed that the international community must deliver a resolute message to the Assad regime -- and others who would consider using chemical weapons -- that these crimes are unacceptable and those who violate this international norm will be held accountable by the world,"" the White House said. Meanwhile, as uncertainty loomed over how Congress would weigh in, U.S. military officials said they remained at the ready. 5 key assertions: U.S. intelligence report on Syria . Syria: Who wants what after chemical weapons horror . Reactions mixed to Obama's speech . A spokesman for the Syrian National Coalition said that the opposition group was disappointed by Obama's announcement. ""Our fear now is that the lack of action could embolden the regime and they repeat his attacks in a more serious way,"" said spokesman Louay Safi. 
""So we are quite concerned."" Some members of Congress applauded Obama's decision. House Speaker John Boehner, Majority Leader Eric Cantor, Majority Whip Kevin McCarthy and Conference Chair Cathy McMorris Rodgers issued a statement Saturday praising the president. ""Under the Constitution, the responsibility to declare war lies with Congress,"" the Republican lawmakers said. ""We are glad the president is seeking authorization for any military action in Syria in response to serious, substantive questions being raised."" More than 160 legislators, including 63 of Obama's fellow Democrats, had signed letters calling for either a vote or at least a ""full debate"" before any U.S. action. British Prime Minister David Cameron, whose own attempt to get lawmakers in his country to support military action in Syria failed earlier this week, responded to Obama's speech in a Twitter post Saturday. ""I understand and support Barack Obama's position on Syria,"" Cameron said. An influential lawmaker in Russia -- which has stood by Syria and criticized the United States -- had his own theory. ""The main reason Obama is turning to the Congress: the military operation did not get enough support either in the world, among allies of the US or in the United States itself,"" Alexei Pushkov, chairman of the international-affairs committee of the Russian State Duma, said in a Twitter post. In the United States, scattered groups of anti-war protesters around the country took to the streets Saturday. ""Like many other Americans...we're just tired of the United States getting involved and invading and bombing other countries,"" said Robin Rosecrans, who was among hundreds at a Los Angeles demonstration. What do Syria's neighbors think? Why Russia, China, Iran stand by Assad . Syria's government unfazed . 
After Obama's speech, a military and political analyst on Syrian state TV said Obama is ""embarrassed"" that Russia opposes military action against Syria, is ""crying for help"" for someone to come to his rescue and is facing two defeats -- on the political and military levels. Syria's prime minister appeared unfazed by the saber-rattling. ""The Syrian Army's status is on maximum readiness and fingers are on the trigger to confront all challenges,"" Wael Nader al-Halqi said during a meeting with a delegation of Syrian expatriates from Italy, according to a banner on Syria State TV that was broadcast prior to Obama's address. An anchor on Syrian state television said Obama ""appeared to be preparing for an aggression on Syria based on repeated lies."" A top Syrian diplomat told the state television network that Obama was facing pressure to take military action from Israel, Turkey, some Arabs and right-wing extremists in the United States. ""I think he has done well by doing what Cameron did in terms of taking the issue to Parliament,"" said Bashar Jaafari, Syria's ambassador to the United Nations. Both Obama and Cameron, he said, ""climbed to the top of the tree and don't know how to get down."" The Syrian government has denied that it used chemical weapons in the August 21 attack, saying that jihadists fighting with the rebels used them in an effort to turn global sentiments against it. British intelligence had put the number of people killed in the attack at more than 350. On Saturday, Obama said ""all told, well over 1,000 people were murdered."" U.S. Secretary of State John Kerry on Friday cited a death toll of 1,429, more than 400 of them children. No explanation was offered for the discrepancy. Iran: U.S. 
military action in Syria would spark 'disaster' Opinion: Why strikes in Syria are a bad idea .","Syrian official: Obama climbed to the top of the tree, ""doesn't know how to get down""\nObama sends a letter to the heads of the House and Senate .\nObama to seek congressional approval on military action against Syria .\nAim is to determine whether CW were used, not by whom, says U.N. spokesman .",0001d1afc246a7964130f43ae940af6bc6c57f01


In [None]:
# Load the fast BERT tokenizer that is used for both articles and summaries.
tokenizer_checkpoint = "bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [None]:
# Prepare data
encoder_max_length = 512  # token budget for the article (encoder side)
decoder_max_length = 128  # token budget for the summary (decoder side)
batch_size = 4 # 16

def process_data_to_model_inputs(batch):
    """Tokenize a batch of article/summary pairs into encoder-decoder inputs.

    Reads ``batch["article"]`` and ``batch["highlights"]``, pads/truncates them
    to ``encoder_max_length`` / ``decoder_max_length`` tokens, and adds the keys
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` to ``batch``. ``labels`` equals the
    summary token ids with every padding id replaced by -100.

    Returns the same ``batch`` object with the added keys.
    """
    shared_options = {"padding": "max_length", "truncation": True}
    encoded_articles = tokenizer(batch["article"], max_length=encoder_max_length, **shared_options)
    encoded_summaries = tokenizer(batch["highlights"], max_length=decoder_max_length, **shared_options)

    # Build the label rows: identical to the summary ids except at padding positions.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for summary_ids in encoded_summaries.input_ids:
        masked_labels.append([-100 if token_id == pad_id else token_id for token_id in summary_ids])

    batch["input_ids"] = encoded_articles.input_ids
    batch["attention_mask"] = encoded_articles.attention_mask
    # NOTE(review): whether the model expects explicit decoder_input_ids next to
    # labels depends on the installed transformers version -- confirm.
    batch["decoder_input_ids"] = encoded_summaries.input_ids
    batch["decoder_attention_mask"] = encoded_summaries.attention_mask
    batch["labels"] = masked_labels

    return batch

In [None]:
# Training data
# Shuffles the training set, tokenizes it and exposes the model inputs as
# torch tensors.
train_data = train_data.shuffle(seed=42)  # fixed seed -> same order on every run

# The map batch size is independent of the training batch size. Every example
# is padded to a fixed length, so the tokenized rows do not depend on how they
# are grouped; the library default is used here (as in the validation cell)
# instead of the tiny training batch size, which only multiplied tokenizer calls.
train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    remove_columns=["article", "highlights"] # "id"
)

train_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0a01b1abede4f646130574f203de57a293ded8a7a11e3406a539453afdfeb2c0/cache-7af9f5bf341e5915.arrow


HBox(children=(FloatProgress(value=0.0, max=71779.0), HTML(value='')))




In [None]:
# Validation data
# Shuffles the validation set, keeps at most 2000 rows, tokenizes them and
# exposes the model inputs as torch tensors.
val_data = val_data.shuffle(seed=42)  # fixed seed -> same evaluation subset on every run

# Cap at 2000 rows, but never ask for more rows than the split contains
# (a fixed range(2000) is out of range for smaller validation splits).
# Selecting before tokenizing yields the same rows and avoids tokenizing
# examples that would be thrown away afterwards.
val_data = val_data.select(range(min(2000, len(val_data))))

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    remove_columns=["article", "highlights"] # id
)

val_data.set_format(
    type="torch",
    columns=["input_ids",
             "attention_mask",
             "decoder_input_ids",
             "decoder_attention_mask",
             "labels"]
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Load models
# Builds an encoder-decoder model from the same pretrained BERT checkpoint on
# both sides (weights not tied), writes it to a local directory and loads it
# back from there.
base_checkpoint = "bert-base-multilingual-cased"
warm_start_dir = "bert2bert_multilingual"

tf2tf = EncoderDecoderModel.from_encoder_decoder_pretrained(
    base_checkpoint,
    base_checkpoint,
    tie_encoder_decoder=False,
)
tf2tf.save_pretrained(warm_start_dir)
tf2tf = EncoderDecoderModel.from_pretrained(warm_start_dir)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias

In [None]:
# Configure models
# Copies the tokenizer's special-token ids and the encoder's vocabulary size
# onto the top-level model config.
special_token_settings = {
    "decoder_start_token_id": tokenizer.cls_token_id,
    "eos_token_id": tokenizer.sep_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "vocab_size": tf2tf.config.encoder.vocab_size, # tokenizer.vocab_size
}

for option, value in special_token_settings.items():
    setattr(tf2tf.config, option, value)

In [None]:
# Configure beam search
# Generation defaults stored on the model config.
generation_settings = dict(
    max_length=142,
    min_length=56,
    no_repeat_ngram_size=3,
    early_stopping=True,
    length_penalty=2.0,
    num_beams=4,
)

for option, value in generation_settings.items():
    setattr(tf2tf.config, option, value)

In [None]:
# Prepare metric
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for an evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids in which -100 marks positions
            to ignore). ``label_ids`` is presumably a NumPy array, since
            boolean-mask assignment is applied to it.

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure`` (the ``mid`` aggregate), each rounded to 4 decimals.
    """
    # Work on a copy so that replacing -100 below does not modify the array
    # owned by the caller.
    labels_ids = pred.label_ids.copy()
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is not a valid token id; map it to the pad id, which is then dropped
    # as a special token during decoding.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
# Load checkpoint
# Replaces `tf2tf` with the model stored under checkpoint-4000 on Google Drive
# and defines `path_output`, which the training-arguments cell uses as its
# output directory.
from google.colab import drive

path_drive = "/content/drive"
drive.mount(path_drive)

path_output = path_drive + "/My Drive/Temp/Models"
path_checkpoint = path_output + "/checkpoint-4000"

tf2tf = EncoderDecoderModel.from_pretrained(path_checkpoint)
# Use the device detected in the CUDA cell instead of a hard-coded "cuda",
# so the cell also runs on a CPU-only runtime.
tf2tf.to(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# Release cached GPU memory, then display the host's current memory statistics.
import psutil

torch.cuda.empty_cache()
host_memory = psutil.virtual_memory()
host_memory

svmem(total=13653569536, available=8010371072, percent=41.3, used=8964763648, free=168841216, active=9098362880, inactive=3929788416, buffers=35004416, cached=4484960256, shared=13643776, slab=187285504)

In [None]:
# Setup arguments
# Uses the class imported at the top of the notebook (the same object as the
# one reachable through the `ft` alias). `path_output` comes from the
# "Load checkpoint" cell; `batch_size` from the "Prepare data" cell.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,   # evaluate on generated sequences
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir=path_output, # "./"
    warmup_steps=1000,
    save_steps=2000,
    logging_steps=100,
    eval_steps=2000,
    save_total_limit=1,
    # Mixed precision requires a CUDA device; enable it only when one is
    # present so the cell does not fail on a CPU-only runtime.
    fp16=torch.cuda.is_available()
)

In [None]:
# Start training
# Wires model, arguments, metric function, datasets and tokenizer into a
# Seq2SeqTrainer and runs the training loop.
trainer_options = dict(
    model=tf2tf,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)
trainer = ft.Seq2SeqTrainer(**trainer_options)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=750, training_loss=5.587833760579427, metrics={'train_runtime': 585.0085, 'train_samples_per_second': 1.282, 'total_flos': 4425924222720000.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 4096, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 15745024, 'train_mem_gpu_alloc_delta': 4702387712, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 4992374272})

In [None]:
# Load checkpoint
# Replaces `tf2tf` with the model stored under checkpoint-4000 on Google Drive.
# NOTE(review): this cell runs after training and loads a fixed checkpoint
# path, so it discards the in-memory model that was just trained -- confirm
# that evaluating checkpoint-4000 (rather than the latest weights) is intended.
from google.colab import drive

path_drive = "/content/drive"
drive.mount(path_drive)

path_output = path_drive + "/My Drive/Temp/Models"
path_checkpoint = path_output + "/checkpoint-4000"

tf2tf = EncoderDecoderModel.from_pretrained(path_checkpoint)
# Use the device detected in the CUDA cell instead of a hard-coded "cuda",
# so the cell also runs on a CPU-only runtime.
tf2tf.to(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# Evaluate training
def generate_summary(batch):
    """Generate summaries for a batch of articles.

    Tokenizes ``batch["article"]`` with the same padding/truncation settings
    and length limit (``encoder_max_length``) as the training preprocessing,
    runs ``tf2tf.generate`` and stores the decoded texts (special tokens
    removed) under ``batch["pred_summary"]``.

    Returns the same ``batch`` object with the added ``pred_summary`` key.
    """
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Send the inputs to wherever the model currently lives instead of
    # assuming a GPU.
    model_device = tf2tf.device
    input_ids = inputs.input_ids.to(model_device)
    attention_mask = inputs.attention_mask.to(model_device)

    outputs = tf2tf.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch

# Generate a summary for every test article (the "article" column is dropped
# from the result), print the first prediction next to its reference, and
# display the ROUGE-2 "mid" aggregate over the whole test set.
results = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article"]
)

first_result = results[0]
print(first_result["pred_summary"])
print(first_result["highlights"])
print("====================")

rouge_scores = rouge.compute(
    predictions=results["pred_summary"],
    references=results["highlights"],
    rouge_types=["rouge2"],
)
rouge_scores["rouge2"].mid

HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


NEW : " It's the same same - sex sex sex, " police says. The boy's father's mother says she's a teenager. The girl's name's " I'm not a girl, " says says. She's not a teen teen's girlfriend, " she says.
James Best, who played the sheriff on "The Dukes of Hazzard," died Monday at 88 .
"Hazzard" ran from 1979 to 1985 and was among the most popular shows on TV .


Score(precision=0.009788744117622783, recall=0.013462376021361085, fmeasure=0.011080057178249322)