In [1]:
from datasets import load_dataset,Dataset, load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
corpus = load_from_disk("../xlsum_fa_en.hf")

In [2]:
corpus

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 353773
    })
    test: Dataset({
        features: ['summary', 'text'],
        num_rows: 17441
    })
    validation: Dataset({
        features: ['summary', 'text'],
        num_rows: 17441
    })
})

In [3]:
from transformers import AutoConfig  ,RobertaTokenizerFast, EncoderDecoderConfig, EncoderDecoderModel


config = AutoConfig.from_pretrained('xlm-roberta-base')
tokenizer = RobertaTokenizerFast.from_pretrained("xlm-roberta-base")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base", "xlm-roberta-base")
print("loading the model is done!");

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.
Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.encoder.layer.0.crossattention.output.dense.weight', 'roberta.encoder.layer.9.crossattention.self.value.weight', 'roberta.encoder.layer.10.crossattention.self.value.weight', 'roberta.encoder.layer.7.crossattention.self.query.weight', 'roberta.encoder.layer.5.crossattention.self.query.bias', 'roberta.encoder.layer.5.crossattention.self.key.bias', 'roberta.encoder.layer.2.crossattention.self.key.weight', 'roberta.encoder.layer.6.crossattention.self.key.bias', 'roberta.encoder.layer.8.crossattention.self.key.weight', 'roberta.encoder.layer.8.crossattentio

loading the model is done!


In [4]:

tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
#parameter setting


def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["text"], padding="max_length",
                     max_length=256, truncation=True)
  outputs = tokenizer(batch["summary"], padding="max_length",
                      max_length=128, truncation=True)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

#processing training data
corpus_encoded = corpus.map(
    process_data_to_model_inputs,
    batched=True,
    remove_columns=["text", "summary"]
)
corpus_encoded.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)



Map:   0%|          | 0/353773 [00:00<?, ? examples/s]

Map:   0%|          | 0/17441 [00:00<?, ? examples/s]

Map:   0%|          | 0/17441 [00:00<?, ? examples/s]

In [5]:
corpus_encoded

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
        num_rows: 353773
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
        num_rows: 17441
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
        num_rows: 17441
    })
})

In [6]:
import datasets
# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid
#     wandb.log({"rouge2_precision": round(rouge_output.precision, 4),
#                "rouge2_recall": round(rouge_output.recall, 4),
#                "rouge2_fmeasure": round(rouge_output.fmeasure, 4)})

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = datasets.load_metric("rouge")


In [7]:
from transformers.generation import GenerationConfig
# set special tokens
model.config.decoder_start_token_id = tokenizer.bos_token_id                                             
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id 



# set decoding params                               
model.config.max_length = 256
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.config.vocab_size = model.config.encoder.vocab_size

In [8]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import wandb


model.to("cuda")

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs = 1,
    evaluation_strategy="steps",
    logging_strategy ="steps",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=10,
    weight_decay=0.01,
    gradient_checkpointing=True,
    fp16=True,
    
    max_steps=100000,
    logging_steps=100,
    predict_with_generate=True,
    warmup_steps=500, 
    seed=43
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=corpus_encoded['train'],
    eval_dataset=corpus_encoded['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# wandb.init(project="Summeraziation-fa-en",name='xmlrobreta-changing_args-second_try')
result = trainer.train()
# wandb.finish()



[34m[1mwandb[0m: Currently logged in as: [33mali-fartout[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


In [None]:
import torch
torch.cuda.empty_cache()