In [1]:
from datasets import load_dataset,Dataset, load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
corpus = load_from_disk("../xlsum_fa_en.hf")

In [2]:
corpus

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 353773
    })
    test: Dataset({
        features: ['summary', 'text'],
        num_rows: 17441
    })
    validation: Dataset({
        features: ['summary', 'text'],
        num_rows: 17441
    })
})

In [3]:
from transformers import AutoConfig  ,RobertaTokenizerFast, EncoderDecoderConfig, EncoderDecoderModel


config = AutoConfig.from_pretrained('xlm-roberta-base')

tokenizer = RobertaTokenizerFast.from_pretrained("xlm-roberta-base")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base", "xlm-roberta-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f5c1e29b-1acf-4e20-a33d-4ad3c5011796)')' thrown while requesting HEAD https://huggingface.co/xlm-roberta-base/resolve/main/config.json
Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.encoder.layer.6.crossattention.self.key.bias', 'roberta.encoder.layer.3.crossattention.output.dense.weight', 'roberta.encoder.layer.11.crossattention.self.key.bias', 'roberta.encoder.layer.3.crossattention.self.value.weight', 'roberta.encoder.layer.6.crossattention.self.query.bias', 'roberta.encoder.layer.4.crossattention.output.de

In [4]:
model

EncoderDecoderModel(
  (encoder): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
    

In [5]:

tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
#parameter setting
batch_size=256  #
encoder_max_length=40
decoder_max_length=8

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

#processing training data
corpus_encoded = corpus.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["text", "summary"]
)
corpus_encoded.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)



Map:   0%|          | 0/353773 [00:00<?, ? examples/s]

Map:   0%|          | 0/17441 [00:00<?, ? examples/s]

Map:   0%|          | 0/17441 [00:00<?, ? examples/s]

In [6]:
corpus_encoded_17k = DatasetDict()
corpus_encoded_17k['train'] = corpus_encoded['train'].select(range(17000))
corpus_encoded_17k['test'] = corpus_encoded['test'].select(range(17000))
corpus_encoded_17k['validation'] = corpus_encoded['validation'].select(range(17000))

In [7]:
import datasets
# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid
    wandb.log({"rouge2_precision": round(rouge_output.precision, 4),
               "rouge2_recall": round(rouge_output.recall, 4),
               "rouge2_fmeasure": round(rouge_output.fmeasure, 4)})

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = datasets.load_metric("rouge")


In [17]:
# set special tokens
model.config.decoder_start_token_id = tokenizer.bos_token_id                                             
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id 
# sensible parameters for beam search
# set decoding params                               
model.config.max_length = 40
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.config.vocab_size = model.config.encoder.vocab_size

In [18]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [19]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import wandb


model.to("cuda")

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    fp16=True,
    predict_with_generate=True
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=corpus_encoded_17k['train'],
    eval_dataset=corpus_encoded_17k['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [20]:
wandb.init(project="Summeraziation-fa-en",name='xmlrobreta-changing_args-second_try')
result = trainer.train()
wandb.finish()


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…



Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,5.364,5.287632,0.0025,0.0019,0.0021
2,4.9994,5.28547,0.0063,0.0056,0.0059




ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [None]:
# # map data correctly
# batch_size = 24
# tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# model.to("cuda")
# def generate_summary(batch):
#     # Tokenizer will automatically set [BOS] <text> [EOS]
#     inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=40, return_tensors="pt")
#     input_ids = inputs.input_ids.to("cuda")
#     attention_mask = inputs.attention_mask.to("cuda")
#     outputs = model.generate(input_ids, attention_mask=attention_mask)
#     # all special tokens including will be removed
#     output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

#     batch["pred"] = output_str

#     return batch

# results = corpus['test'].map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["text"])
# pred_str = results["pred"]
# label_str = results["summary"]

In [None]:
# import datasets
# rouge = datasets.load_metric("rouge")
# print("ROUGE 1 SCORE: ",rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1"])["rouge1"].mid)
# print("ROUGE 2 SCORE: ",rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid)
# print("ROUGE L SCORE: ",rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rougeL"])["rougeL"].mid)