In [1]:
from datasets import load_dataset,Dataset, load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
corpus = load_from_disk("../xlsum_fa_en_50k.hf")

In [2]:
corpus

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['summary', 'text'],
        num_rows: 11812
    })
    validation: Dataset({
        features: ['summary', 'text'],
        num_rows: 11812
    })
})

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [24]:


def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["text"], padding="max_length",
                     max_length=1024, truncation=True)
  outputs = tokenizer(batch["summary"], padding="max_length",
                    max_length=128, truncation=True)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

#processing training data
corpus_encoded = corpus.map(
    process_data_to_model_inputs,
    batched=True,
    remove_columns=["text", "summary"]
)
corpus_encoded.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



In [25]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [26]:
from transformers import TrainingArguments, Trainer
import wandb


model.to("cuda")

training_args = TrainingArguments(
output_dir='./mT5-50k-10epochs', num_train_epochs=10,
per_device_train_batch_size=1, per_device_eval_batch_size=1,
weight_decay=0.01, logging_steps=10, push_to_hub=False,
evaluation_strategy='steps', eval_steps=3000, save_steps=1e6,
gradient_accumulation_steps=16)

# instantiate trainer
trainer = Trainer(model=model, args=training_args,
tokenizer=tokenizer, data_collator=seq2seq_data_collator,
train_dataset=corpus_encoded["train"],
eval_dataset=corpus_encoded["validation"])

In [27]:
result = trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mali-fartout[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,4.3315,3.073902
1000,3.588,2.794803
1500,3.5127,2.734263
2000,3.4209,2.688713
2500,3.5061,2.67475
3000,3.2974,2.664366


In [28]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to("cuda")

def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
    batch_size=16, device=device,
    column_text="text",
    column_summary="summary"):
    
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        length_penalty=0.8, num_beams=8, max_length=128)
        
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)
                            for s in summaries]
        
#         decoded_summaries = [d.replace("</s>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
    score = metric.compute()
    return score

In [29]:
import pandas as pd
score = evaluate_summaries_pegasus(corpus["test"], rouge, trainer.model,
tokenizer,
column_summary="summary", batch_size=8)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1477/1477 [47:26<00:00,  1.93s/it]


In [30]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["mT5"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
mT5,0.101939,0.021832,0.080517,0.080526


In [31]:
trainer.save_model("mT5-50k")