In [1]:
!pip install transformers[sentencepiece] datasets py7zr -q

zsh:1: no matches found: transformers[sentencepiece]


In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's "punkt" package up front; presumably this is the sentence
# tokenizer data that `sent_tokenize` (imported above) relies on.
nltk.download("punkt")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Downloading package punkt to /Users/christy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from datasets import load_dataset

# Load the "arxiv" configuration of the scientific_papers dataset.
dataset = load_dataset("scientific_papers", 'arxiv')

# Show which columns the training split provides.
train_columns = dataset['train'].column_names
print(f"Features in arxiv : {train_columns}")

Downloading and preparing dataset scientific_papers/arxiv to /Users/christy/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text prepended to every article before it is tokenized.
prefix = "summarize: "

# Tokenizer matching the t5-small checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    """Tokenize a batch of papers into model inputs and summary labels.

    Args:
        examples: Batch mapping with 'article' and 'abstract' entries, each an
            iterable of strings (the function is applied with
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an added 'labels' entry holding the token ids of the
        abstracts (truncated to 128 tokens).
    """
    # Prepend the task prefix to every article before tokenizing.
    inputs = [prefix + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['abstract'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the training split in batches, then drop the original columns so
# only the fields produced by preprocess_function remain.
tokenized_ds = (
    dataset['train']
    .map(preprocess_function, batched=True)
    .remove_columns(dataset['train'].column_names)
)

In [None]:
# Tokenize the validation split in batches, then drop its original columns so
# only the fields produced by preprocess_function remain. The column names are
# read from the validation split itself rather than from the training split,
# so this step does not depend on both splits sharing one schema.
tokenized_eval = (
    dataset['validation']
    .map(preprocess_function, batched=True)
    .remove_columns(dataset['validation'].column_names)
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Load the pretrained t5-small checkpoint and move it onto the selected device.
model_t5 = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
model_t5 = model_t5.to(device)

# Batch collator built from the tokenizer and the model; handed to the Trainer.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_t5)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the t5-small fine-tuning run.
trainer_args = TrainingArguments(
    output_dir='t5-arxiv',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Gradients are accumulated over 16 steps of batch size 1 before each update.
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Integer step count (was the float 1e6); presumably chosen to be larger
    # than the whole run so that no intermediate checkpoints are written.
    save_steps=1_000_000,
    remove_unused_columns=False,
    # fp16 mixed precision needs a CUDA device; Transformers rejects fp16=True
    # when training falls back to the CPU, so enable it only on GPU.
    fp16=(device == "cuda"),
)

In [None]:
# Assemble the Trainer from the model, its arguments, the tokenizer/collator
# pair and the tokenized train/validation datasets.
trainer = Trainer(
    model=model_t5,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=tokenized_ds,
    eval_dataset=tokenized_eval,
)

In [None]:
# Run fine-tuning using the model, arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Save the trained model to ./t5_fine, the directory the summarization
# pipeline later loads as the fine-tuned checkpoint.
trainer.save_model('./t5_fine')

In [None]:
!pip install rouge_score
from datasets import load_metric

In [None]:
from transformers import pipeline

# Summarization pipelines: `pipe` loads the fine-tuned checkpoint saved in
# ./t5_fine, `pipe2` loads the stock t5-small checkpoint as the baseline.
pipe = pipeline('summarization', model='./t5_fine')
pipe2 = pipeline('summarization', model='t5-small')

NUM_EVAL_SAMPLES = 5       # number of test articles to score
MAX_ARTICLE_CHARS = 5000   # each article is cut to this many characters

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Load the metric once instead of on every loop iteration.
rouge_metric = load_metric('rouge')

# One entry per (sample, model) pair, accumulated over the whole run so the
# scores can be inspected or tabulated after the loop.
records = []

for i in range(NUM_EVAL_SAMPLES):
    sample_text = dataset["test"][i]["article"][:MAX_ARTICLE_CHARS]
    reference = dataset['test'][i]['abstract']
    pipe_out = pipe(sample_text)

    # Put each sentence on its own line. The separator must be a real newline
    # ('\n'): joining with the letter 'n' glues sentences together with a stray
    # character and corrupts the text that ROUGE scores (rougeLsum in
    # particular treats newlines as sentence boundaries).
    summaries = {}
    summaries['t5'] = '\n'.join(sent_tokenize(pipe2(sample_text)[0]['summary_text']))
    summaries['t5F'] = '\n'.join(sent_tokenize(pipe_out[0]['summary_text']))

    for model_name, summary in summaries.items():
        rouge_metric.add(prediction=summary, reference=reference)
        score = rouge_metric.compute()
        # Keep the mid F-measure of each ROUGE variant.
        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
        print(model_name, rouge_dict)
        records.append({"sample": i, "model": model_name, **rouge_dict})