In [33]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())

12.1
True


In [34]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

def load_data(src_file, tgt_file):
    """ Loads the D-wikipedia files (scr = source articles),
    (tgt = target articles) and appends the article pairs in a list as dictionaries.
    Returns: 40 000 article pairs.
    """
    with open(src_file, 'r', encoding='utf-8') as f_src:
        src_lines = f_src.readlines()
    
    with open(tgt_file, 'r', encoding='utf-8') as f_tgt:
        tgt_lines = f_tgt.readlines()

    all_data = []
    for original_article, simplified_article in zip(src_lines, tgt_lines):
        entry = {
            "text": original_article,
            "summary": simplified_article,
        }
        all_data.append(entry)
    #print("length of all_data",len(all_data))
    return all_data[:40000]


In [35]:
training_data, test_data = train_test_split(load_data("train.src", "train.tgt"), test_size=0.2)        

# creating datasets 

train_dataset = Dataset.from_list(training_data)
test_dataset = Dataset.from_list(test_data)
#print(train_dataset[1])
#print(len(train_dataset))
#print(len(test_dataset))

In [36]:
for i in range(3):
    print("Summary:", train_dataset[i]['summary'])
    print("Original:", train_dataset[i]['text'])

Summary: the 1916 summer olympics , officially known as the games of the vi olympiad , were not held.the games were planned for berlin in germany.in 1912 , workers began building the sports facilities for the games . the `` deutsches stadion '' ( `` german stadium '' ) began in 1912 . in june 1913 , the stadium was officially opened with 60,000 people at the ceremonies.the games were cancelled because of world war i . 

Original: the 1916 summer olympics ( german : `` olympische sommerspiele 1916 '' ) , officially known as the games of the vi olympiad , were scheduled to be held in berlin , german empire , but were eventually cancelled due to the outbreak of world war i. berlin was selected as the host city during the 14th ioc session in stockholm on 4 july 1912 , defeating bids from alexandria , amsterdam , brussels , budapest and cleveland . after the 1916 games were cancelled , berlin would eventually host the 1936 summer olympics , twenty years later . 

Summary: brendon boyd urie 

In [37]:
from transformers import AutoTokenizer

checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint) #legacy=False, use_fast=False) 



In [38]:
prefix = "simplify the language: "
def preprocess_function(examples):
    """ Preprocesses the data and adds the prefix.
        Tokenizes the data with mt5-tokenizer.
    """
    #inputs = []
    #for doc in examples["text"]:
    #    inputt = prefix + doc
    #    inputs.append(inputt)
    
    # input tokenizing    
    encoding = tokenizer(
    [prefix + sequence for sequence in examples["text"]],
    padding="longest",
    max_length=1200,
    truncation=True,
    return_tensors="pt")

    input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

    # targets tokenizing
    target_encoding = tokenizer(
        [sequence for sequence in examples["summary"]],
        padding="longest",
        max_length=600,
        truncation=True,
        return_tensors="pt",
    )
    labels = target_encoding.input_ids

    # replace padding token id's of the labels by -100 so it's ignored by the loss
    labels[labels == tokenizer.pad_token_id] = -100
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [39]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

In [40]:
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [41]:
from transformers import DataCollatorForSeq2Seq
# transforms inputs to tensors, batches
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [42]:
from evaluate import load
import numpy as np

sari = load("sari")

In [43]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred  
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    sources = [example["text"] for example in test_dataset]  
    references = [[example["summary"]] for example in test_dataset] 
    
    print("sources",sources)
    print("predictions",decoded_preds)
    print("references",references)

    # calculating SARI
    sari_score = sari.compute(sources=sources, predictions=decoded_preds, references=references)

    return {
        "sari": sari_score,
    }

In [44]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from peft import get_peft_model, LoraConfig

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


In [45]:
# LORA SETTINGS

lora_config = LoraConfig(
    r=16,  
    lora_alpha=32,  
    lora_dropout=0.1,  
    task_type="SEQ_2_SEQ_LM",
)

model = get_peft_model(model, lora_config)


In [48]:
training_args = Seq2SeqTrainingArguments(
    output_dir="final-task-fine-tuned-model-40k-traindata",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8, # for memory issues
    weight_decay=0.01, #to prevent overfitting
    save_total_limit=2,
    num_train_epochs=5,
    overwrite_output_dir=True,
    predict_with_generate=True,
    logging_strategy="steps",
    fp16=True, 
    warmup_steps=150, 
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [49]:
trainer.train()



Epoch,Training Loss,Validation Loss,Sari
1,3.741,2.818675,{'sari': 37.15475633080281}
2,3.452,2.695584,{'sari': 38.797998502753465}
3,3.3328,2.656935,{'sari': 39.71900946529877}
4,3.287,2.613796,{'sari': 40.07296562727385}


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Trainer is attempting to log a value of "{'sari': 37.15475633080281}" of type <class 'dict'> for key "eval/sari" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Trainer is attempting to log a value of "{'sari': 38.797998502753465}" of type <class 'dict'> for key "eval/sari" as a scalar. This invo

KeyboardInterrupt: 

In [55]:
text1 = "Mohammed Saleh, the acting director of Al-Awda Hospital in Jabalia, says his hospital is inundated with casualties every day, as Israel carries out raids in the area. He warns the attacks on the local refugee camp are putting the healthcare system 'on the brink of disaster'. 'Medical teams are dealing with complex injuries amid severe shortages of medicines, medical supplies and fuel, in addition to the lack of food for patients and medical staff in the hospital,' Saleh says in a statement to the Palestinian Press Agency. Many doctors at the Kamal Adwan, Al-Awda and the Indonesian hospitals have refused to leave their patients, despite Israel's recent offensive in Jabalia. 'We are talking about more than 300 medical staff working at Kamal Adwan Hospital, and we can't provide even a single meal for them to be able to offer medical services safely,' hospital director Hussam Abu Safiya tells Reuters. Israel says it's targeting Hamas in Jabalia, and earlier said it killed '50 terrorists in close-quarters encounters and aerial strikes over the past day'"

In [None]:
text2 = "Galileo was an American robotic space program that studied the planet Jupiter and its moons, as well as several other Solar System bodies. Named after the Italian astronomer Galileo Galilei, the Galileo spacecraft consisted of an orbiter and an atmospheric entry probe. It was delivered into Earth orbit on October 18, 1989, by Space Shuttle Atlantis on the STS-34 mission, and arrived at Jupiter on December 7, 1995, after gravity assist flybys of Venus and Earth, and became the first spacecraft to orbit Jupiter. The spacecraft then launched the first probe to directly measure its atmosphere. Despite suffering major antenna problems, Galileo achieved the first asteroid flyby, of 951 Gaspra, and discovered the first asteroid moon, Dactyl, around 243 Ida. In 1994, Galileo observed Comet Shoemaker–Levy 9's collision with Jupiter."

In [None]:
text3 = "Finland, officially the Republic of Finland, is a Nordic country in Northern Europe. It borders Sweden to the northwest, Norway to the north, and Russia to the east, with the Gulf of Bothnia to the west and the Gulf of Finland to the south, opposite Estonia. Finland covers a total area of 338,145 square kilometres (130,559 sq mi), including a land area of 303,815 square kilometres (117,304 sq mi),and has a population of 5.6 million. Helsinki is the capital and largest city. The vast majority of the population are ethnic Finns. The official languages are Finnish and Swedish; 84.9 percent of the population speak the first as their mother tongue and 5.1 percent the latter. Finland's climate varies from humid continental in the south to boreal in the north. The land cover is predominantly boreal forest biome, with more than 180,000 recorded lakes."

In [None]:
text4 = "Donald Trump enjoys a huge lead among men, while women tell pollsters they prefer Kamala Harris by a similarly large margin. The political gender gap reflects a decade of social upheaval and could help decide the US election. For the first woman of colour to secure a presidential nomination, and only the second woman to ever get this close, Kamala Harris goes to great lengths not to talk about her identity. “Listen, I am running because I believe that I am the best person to do this job at this moment for all Americans, regardless of race and gender,” the vice-president said in a CNN interview last month. In this piece, the BBC's US special correspondent Katty Kay delves deeper into how this November’s election has turned into a referendum on gender norms, and the social upheavals of recent years."

In [50]:
text5 = "Nineteen people died and six others were injured when a bus crashed on a highway in Mexico’s central state of Zacatecas on Saturday, local authorities said. The accident occurred in the early morning hours when the bus carrying the victims collided with the back of a tractor-trailer carrying corn, which had come loose. Zacatecas Governor David Monreal earlier on Saturday had initially reported a preliminary death toll of 24 people, but the state attorney general’s office later revised the tally in a statement. The attorney general’s office said it was “carrying out investigations to arrest the driver” of the tractor-trailer. Efforts were ongoing on Saturday morning to recover some of the bodies that had fallen into a ravine, a local government official who asked not to be named told Reuters. Video footage showed rescue teams and security forces, including military personnel, securing the area while rescuers worked to recover the bodies. The bus was headed for Ciudad Juarez, a city on the US-Mexico border in the state of Chihuahua. The victims did not include migrants, according to the attorney general’s office."

In [2]:
text6 = "At least 62 people are known to have died after torrential rain caused devastating flash floods in south-eastern Spain. In the town of Chiva near Valencia more than a year's worth of rain fell in just eight hours and local officials say it is 'impossible' to put a final figure on the number of people who have perished. Footage uploaded to social media shows floodwaters causing chaos in the wider region, knocking down bridges and dragging cars through the streets. Other video appears to show people clinging to trees to avoid being swept away. Much of the country has been badly hit by heavy rain and hailstorms, triggering rapid flooding across multiple areas."

In [9]:
text7 = "A manhunt is underway in northern Austria after a hunter allegedly fatally shot two people and fled the scene, local police said Monday. Franz Hofer, mayor of Kirchberg ob der Donau, was killed in the village of Altenfelden in Austria’s rural Muhlviertel region, near the border with Germany and the Czech Republic. A second man was also shot dead a short while later, Upper Austria police spokesperson Ulrike Handelbauer told CNN. A large-scale police operation with helicopters and special forces is underway, she said. Police said Roland Drexler, 56, is suspected of having killed the two men and made a getaway in a Volkswagen Caddy. “The man is believed to be extremely dangerous and armed,” police said. A dispute over hunting rights appeared to have sparked the incident, police added. It was not immediately clear why a long-running feud had escalated. According to Kronen Zeitung, a local outlet, the suspect was known to hunters in the area. “He was a difficult person,” said a hunter from the area who wished not to be named. The shooting shocked officials at the People’s Party (OVP) regional headquarters in Linz. “It’s madness,” said state party leader Florian Hiegelsperger. Herbert Sieghartsleitner, the state hunting master, said the incident was “unbelievable.” “I am deeply shocked by what has happened. I knew Franz Hofer very well personally,” he said, according to Kronen Zeitung."

In [10]:
from transformers import AutoTokenizer

fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./final-task-fine-tuned-model-40k-traindata/checkpoint-4000")
inputs = fine_tuned_tokenizer(text7, return_tensors="pt").input_ids



In [25]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("./final-task-fine-tuned-model-40k-traindata/checkpoint-4000")
generated_ids = model.generate(
    inputs,
    do_sample=False,
    repetition_penalty=2.0,
    max_new_tokens=300,
    min_new_tokens=200,   # min length for generation
    num_beams=5,     # n beam search
    no_repeat_ngram_size=3,  # prevent repetitive n-grams
)


In [26]:
fine_tuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

'<extra_id_0> Hofer, mayor of Kirchberg ob der Donau, was killed in the village of Altenfelden in Austria’s rural Muhlviertel region, near the border with Germany and the Czech Republic. The suspect was fatally shot dead a short while later, police said. It was known to have killed two people and fled the scene. The incident was shocked officials at the People’s Party (OVP) regional headquarters in Linz. The man was shot dead after a hunter who had killed the two people. A second man was attacked by a small-scale police operation with special forces on the front of Austria. He was arrested for a long-running feud that had escalated during the shooting. There was a large scale police operation involving helicopters and Special forces. The first man was named Roland Drexler, 56, where he was murdered.'

In [28]:
# COMPARING PERFORMANCE WITH THE BASE MODEL (MT5-SMALL)

base_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
base_inputs = base_tokenizer(text7, return_tensors="pt").input_ids

In [29]:
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
#outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
generated_ids = model.generate(
   inputs,
    do_sample=False,
    #temperature=0.2,
    #dola_layers="low",
    #top_k=40,
    repetition_penalty=2.0,
    max_new_tokens=300,
    min_new_tokens=200,   # min length for generation
    num_beams=5,     # n beam search
    no_repeat_ngram_size=3,  # prevent repetitive n-grams
)

In [30]:
base_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

'<extra_id_0> Hofer, mayor of Kirchberg ob der Donau, was killed in the village of Altenfelden in the region Muhlviertel region, near the border with Germany and the Czech Republic. “He was a difficult person,” said Herbert Sieghartsleitner. The suspect was known to have killed two people and fled the scene. The man was fatally shot dead a short while later, police said. It was reported that he had killed a long-running feud at the local headquarters in Linz. The incident was shocked officials at the People’s Party (OVP) regional head quarters in the area. He was also known for hunting rights. Police said the shooting was dangerous and armed. A second man was shot dead after a hunter allegedly fatally attacked a small-scale police operation with helicopters and special forces is underway.'