In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the "punkt" resource through NLTK's downloader.
# NOTE(review): presumably required by the `sent_tokenize` import above — confirm.
nltk.download("punkt")

In [None]:
from datasets import load_dataset

# Load the 'arxiv' configuration of the "scientific_papers" dataset.
dataset = load_dataset(path="scientific_papers", name='arxiv')

# Show which columns the training split exposes.
print(f"Features in arxiv : {dataset['train'].column_names}")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Tokenizer of the 't5-small' checkpoint; used for both inputs and targets.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Text prepended to every article before tokenization (see preprocess_function).
prefix = "summarize: "

def preprocess_function(examples):
  """Tokenize a batch of articles and their abstracts.

  Args:
    examples: mapping with an 'article' and an 'abstract' entry, each a
      sequence of strings of the same batch.

  Returns:
    The tokenizer output for the prefixed articles (truncated to 1024
    tokens) with an extra 'labels' entry holding the 'input_ids' of the
    abstracts (truncated to 128 tokens).
  """
  # Every article gets the module-level task prefix in front of it.
  prefixed_articles = [prefix + article for article in examples['article']]
  model_inputs = tokenizer(prefixed_articles, max_length=1024, truncation=True)

  # Abstracts are encoded while the tokenizer is in target mode.
  # NOTE(review): as_target_tokenizer is deprecated in newer transformers
  # releases in favour of the `text_target` argument — confirm the installed
  # version before switching.
  with tokenizer.as_target_tokenizer():
    target_encoding = tokenizer(examples['abstract'], max_length=128, truncation=True)

  model_inputs['labels'] = target_encoding['input_ids']
  return model_inputs

# Tokenize the training split batch-wise, then strip the columns the raw split
# carried so that only the fields added by preprocess_function remain.
tokenized_ds = (
    dataset['train']
    .map(preprocess_function, batched=True)
    .remove_columns(dataset['train'].column_names)
)

In [None]:
# Tokenize the validation split with the same preprocessing as the training
# split, then drop the raw columns of the *validation* split from that result.
# The column removal has to be applied to the validation data itself: calling
# it on the tokenized training set would replace the evaluation data with
# training data (or fail because those columns are no longer present there).
# Building the value in one expression keeps the cell idempotent on re-runs.
tokenized_eval = (
    dataset['validation']
    .map(preprocess_function, batched=True)
    .remove_columns(dataset['validation'].column_names)
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Load the 't5-small' checkpoint and move it to the selected device.
model_t5 = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
model_t5 = model_t5.to(device)

# Collator that batches the tokenized examples for the seq2seq model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_t5)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run. With a per-device batch size of 1
# and 16 gradient-accumulation steps, each optimizer update sees 16 examples
# per device.
trainer_args = TrainingArguments(
    output_dir='t5-arxiv', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500,
    # A step count is an integer; the very large value keeps the same
    # checkpoint interval as before (previously written as the float 1e6).
    save_steps=1_000_000,
    gradient_accumulation_steps=16,
    # The datasets passed to the Trainer already had their raw columns removed.
    remove_unused_columns=False,
    # fp16 mixed precision is only accepted on CUDA devices. The notebook
    # falls back to "cpu" when no GPU is available, so enable it only when the
    # selected device is "cuda" instead of failing on CPU-only machines.
    fp16=(device == "cuda"),
)

In [None]:
# Wire the model, arguments, tokenizer, collator and both tokenized splits
# into a single Trainer instance.
trainer = Trainer(
    model=model_t5,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=tokenized_ds,
    eval_dataset=tokenized_eval,
)

In [None]:
# Start the training run with the Trainer built in the preceding cell.
trainer.train()