In [1]:
from datasets import load_dataset, load_metric

In [2]:
import numpy as np

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [4]:
# Training split of the 'bakhitovd/data_science_arxiv' dataset; later cells
# read its "article" and "abstract" columns.
train_dataset = load_dataset(
    "bakhitovd/data_science_arxiv",
    split="train",
)

Found cached dataset json (C:/Users/bakhi/.cache/huggingface/datasets/bakhitovd___json/bakhitovd--data_science_arxiv-d562cf23e63fbcaf/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


In [5]:
# Held-out test split of the same dataset.
# NOTE(review): this split is not referenced by any later cell visible here.
test_dataset = load_dataset(
    "bakhitovd/data_science_arxiv",
    split="test",
)

Found cached dataset json (C:/Users/bakhi/.cache/huggingface/datasets/bakhitovd___json/bakhitovd--data_science_arxiv-d562cf23e63fbcaf/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


In [6]:
# Validation split; passed to the trainer as its evaluation dataset.
val_dataset = load_dataset(
    "bakhitovd/data_science_arxiv",
    split="validation",
)

Found cached dataset json (C:/Users/bakhi/.cache/huggingface/datasets/bakhitovd___json/bakhitovd--data_science_arxiv-d562cf23e63fbcaf/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


In [6]:
# Tokenizer matching the LED checkpoint that is fine-tuned below; it is used
# both to encode articles/abstracts and to decode predictions for scoring.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/led-base-16384",
)

In [14]:
# Load the pretrained LED-base checkpoint that will be fine-tuned.
# use_cache=False is forwarded to the model config (decoder key/value caching
# is switched off for this run).
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", use_cache=False)

# Turn on gradient checkpointing through the model API. Passing
# gradient_checkpointing=True as a config kwarg to from_pretrained is the
# deprecated way of requesting the same thing.
led.gradient_checkpointing_enable()

In [22]:
# Generation hyperparameters, written onto the model config one attribute at
# a time (same attributes, values and order as a sequence of plain assignments).
generation_defaults = {
    "num_beams": 2,
    "max_length": 512,
    "min_length": 100,
    "length_penalty": 2.0,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
}
for option_name, option_value in generation_defaults.items():
    setattr(led.config, option_name, option_value)

In [7]:
# Per-device batch size used for both the dataset map() calls and the trainer.
batch_size = 1
# Abstracts are padded/truncated to this many tokens when building labels.
max_output_length = 512
# Articles are padded/truncated to this many tokens. The value was computed by
# the author; the derivation is not shown in this notebook.
max_input_length = 7168

In [8]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and abstracts into model training features.

    Args:
        batch: Mapping with "article" and "abstract" entries, each a list of
            strings (the shape produced by ``Dataset.map(batched=True)``).

    Returns:
        The same ``batch`` mapping with four entries added:
        "input_ids" and "attention_mask" (articles, padded/truncated to
        ``max_input_length``), "global_attention_mask" (1 on the first token
        of every sample, 0 elsewhere) and "labels" (abstract token ids padded/
        truncated to ``max_output_length`` with pad positions set to -100).
    """
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["abstract"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample only. Each row gets
    # its own list (no shared references between samples), and an empty batch
    # or empty row simply yields an empty mask instead of raising IndexError.
    batch["global_attention_mask"] = [
        [1] + [0] * (len(ids) - 1) if ids else []
        for ids in inputs.input_ids
    ]

    # We have to make sure that the PAD token is ignored: pad positions in the
    # labels are replaced by -100.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch

In [18]:
def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for generated summaries.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions`` arrays of
            token ids. Positions equal to -100 are treated as padding.

    Returns:
        Dict with "rouge2_precision", "rouge2_recall" and "rouge2_fmeasure",
        each rounded to 4 decimal places.
    """
    pad_id = tokenizer.pad_token_id

    # np.where builds new arrays, so the caller's pred.label_ids is left
    # untouched (an in-place overwrite would corrupt it for later use).
    # Predictions get the same treatment because -100 is not a decodable id.
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    pred_ids = np.where(pred.predictions == -100, pad_id, pred.predictions)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [9]:
def _tokenize_split(dataset):
    """Tokenize one dataset split and expose the model columns as torch tensors.

    Runs process_data_to_model_inputs over the split in batches of
    ``batch_size``, drops the raw "article"/"abstract" text columns, and sets
    the output format to torch for the four feature columns.
    """
    tokenized = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "abstract"],
    )
    tokenized.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
    )
    return tokenized


# Same preprocessing for both splits; the names are rebound to the tokenized
# versions because the trainer cell below reads train_dataset / val_dataset.
train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)

Loading cached processed dataset at C:\Users\bakhi\.cache\huggingface\datasets\bakhitovd___json\bakhitovd--data_science_arxiv-d562cf23e63fbcaf\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-f77776da8f89a889.arrow


In [17]:
# ROUGE scorer looked up by compute_metrics each time evaluation runs.
# NOTE(review): load_metric is deprecated in recent `datasets` releases —
# confirm the pinned version before upgrading.
rouge = load_metric(
    "rouge",
)

In [19]:
training_args = Seq2SeqTrainingArguments(
    # Checkpointing: written to the current directory every 500 steps,
    # keeping at most two checkpoints.
    output_dir="./",
    save_steps=500,
    save_total_limit=2,
    # Optimisation: batch_size samples per device, accumulated over 4 steps,
    # for 3 epochs in mixed precision.
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    fp16=True,
    # Evaluation: every 500 steps, generating summaries so that
    # compute_metrics can score decoded text.
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # Logging cadence for the training loss.
    logging_steps=250,
)

In [20]:
# Wire the model, tokenizer, tokenized splits and ROUGE callback into a
# sequence-to-sequence trainer driven by training_args.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=led,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [20]:
# Run fine-tuning with the trainer built above. The call is left as a bare
# expression so the returned TrainOutput is displayed as the cell's output.
trainer.train()

***** Running training *****
  Num examples = 30280
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 22710
  Number of trainable parameters = 161844480
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
500,2.8628,2.743932,0.1824,0.1284,0.1452
1000,2.7997,2.661958,0.1892,0.1311,0.149
1500,2.7303,2.655223,0.1924,0.1256,0.1462
2000,2.6413,2.617599,0.1857,0.1331,0.1496
2500,2.6169,2.581202,0.1869,0.1404,0.154
3000,2.6105,2.582049,0.1832,0.1475,0.1567
3500,2.6279,2.55202,0.182,0.1325,0.1474
4000,2.5583,2.513738,0.175,0.1529,0.1558
4500,2.5879,2.521286,0.1801,0.1479,0.1557
5000,2.5143,2.480737,0.1856,0.1528,0.1609


  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 1196
  Batch size = 1
Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500\config.json
Model weights saved in ./checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./checkpoint-500\tokenizer_config.json
Special tokens file saved in ./checkpoint-500\special_tokens_map.json
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 1196
  Batch size = 1
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000\config.json
Model weights saved in ./checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./checkpoint-1000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1196
  Batch size = 1
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500\config.json
Model weights saved in ./checkpoint-1500\py

TrainOutput(global_step=22710, training_loss=2.3153236849409846, metrics={'train_runtime': 495743.7726, 'train_samples_per_second': 0.183, 'train_steps_per_second': 0.046, 'total_flos': 4.292515625906995e+17, 'train_loss': 2.3153236849409846, 'epoch': 3.0})

In [20]:
# Save the tokenizer files next to the fine-tuned weights so the output
# directory is self-contained and can be reloaded on its own. The trailing
# semicolon suppresses the tuple of written paths the tokenizer call returns.
tokenizer.save_pretrained("LED_7k_epoch_3");
# Save the fine-tuned model weights and config.
led.save_pretrained("LED_7k_epoch_3")

Configuration saved in LED_7k_epoch_3\config.json
Model weights saved in LED_7k_epoch_3\pytorch_model.bin
