# Summarization

In [1]:
from datasets import load_dataset

def _is_short_document(example):
    """Return True when the example's article text is at most 1000 characters long."""
    return len(example["document"]) <= 1000


# NOTE: the variables are named ``billsum*`` but the data loaded here is XSum.
# Small slices of each split are loaded, then long articles are dropped.
billsum = load_dataset("xsum", split="train[:2%]").filter(_is_short_document)
billsumTest = load_dataset("xsum", split="test[:1%]").filter(_is_short_document)
billsumEval = load_dataset("xsum", split="validation[:1%]").filter(_is_short_document)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
# Number of training examples remaining after the length filter.
len(billsum)

929

In [3]:
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments, Trainer
from transformers import GPT2Tokenizer, GPT2Config

# Tokenizer: pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    max_length=1000,
    padding_side='left',
)
tokenizer.pad_token = tokenizer.eos_token

# Model: pretrained GPT-2 LM head, told to use the same id for padding, then
# moved to the first CUDA device.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)
model = model.to("cuda:0")




## Preprocess

In [4]:
def preprocess_function(examples):
    """Tokenize a batch of documents and summaries into language-model features.

    Every example is turned into the single string
    ``"summarize: " + document + "TL.DR " + summary`` and the labels are a
    copy of the resulting input ids.

    Args:
        examples: batch mapping holding parallel ``"document"`` and
            ``"summary"`` lists of strings.

    Returns:
        The tokenizer output (truncated to at most 1000 tokens and padded to
        the longest sequence in the batch) with an added ``"labels"`` entry.
    """
    prompts = [
        "summarize: " + document + "TL.DR " + summary
        for document, summary in zip(examples["document"], examples["summary"])
    ]
    model_inputs = tokenizer(prompts, return_tensors='pt', truncation=True, max_length=1000, padding=True)

    # The targets are the same strings tokenized with the same settings, so
    # the labels equal the input ids; copy them instead of running the
    # tokenizer a second time.
    # NOTE(review): padded positions are not masked out of the labels here
    # (pad token == eos token), so they presumably contribute to the loss —
    # confirm whether they should be set to -100.
    model_inputs["labels"] = model_inputs["input_ids"].clone()
    return model_inputs

In [5]:
# Re-display the training set size; no cell above this one reassigns `billsum` after loading.
len(billsum)

929

To apply the preprocessing function to the entire dataset, use the 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. Setting `batched=True` speeds up `map` by processing multiple elements of the dataset at once:

In [6]:
# Tokenize every split with the same preprocessing function. Each tokenized
# split is derived from its own raw split so that the test and evaluation sets
# do not contain training examples.
tokenized_billsum = billsum.map(preprocess_function, batched=True)
tokenized_billsumTest = billsumTest.map(preprocess_function, batched=True)
tokenized_billsumEval = billsumEval.map(preprocess_function, batched=True)

In [7]:
import evaluate
import numpy as np

# ROUGE metric object; compute_metrics calls its `compute` method.
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean prediction length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict with every entry returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-padding tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred

    # -100 is the ignore index that may be used to pad labels and gathered
    # predictions; it is not a real token id and cannot be decoded, so swap it
    # for the pad token id before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count only real tokens; after the replacement above, former -100 entries
    # are treated as padding as well.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [8]:
from transformers import DataCollatorForSeq2Seq

# Batch collator (returning PyTorch tensors) that train() hands to the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt")

In [9]:
def train(output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          resume_from_checkpoint):
    """Fine-tune the notebook-level ``model`` and return the trainer used.

    Relies on names defined in other cells: ``model``, ``tokenizer``,
    ``data_collator``, ``compute_metrics``, ``tokenized_billsum`` (training
    data) and ``tokenized_billsumEval`` (evaluation data).

    Args:
        output_dir: directory passed to the training arguments as ``output_dir``.
        overwrite_output_dir: forwarded to the training arguments unchanged.
        per_device_train_batch_size: batch size per device; also used as the
            evaluation batch size.
        num_train_epochs: number of training epochs.
        resume_from_checkpoint: forwarded to ``Seq2SeqTrainer.train``.

    Returns:
        The ``Seq2SeqTrainer`` instance after training and saving the model.
    """
    hyperparameters = Seq2SeqTrainingArguments(
        # Caller-controlled settings.
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        # Fixed optimisation settings.
        learning_rate=2e-5,
        weight_decay=0.01,
        # Log and evaluate once per epoch; keep at most three checkpoints.
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_total_limit=3,
        predict_with_generate=True,
    )

    seq2seq_trainer = Seq2SeqTrainer(
        model=model,
        args=hyperparameters,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenized_billsum,
        eval_dataset=tokenized_billsumEval,
        compute_metrics=compute_metrics,
    )

    seq2seq_trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    seq2seq_trainer.save_model()
    return seq2seq_trainer

# Train: run 20 epochs with a per-device batch size of 10, using 'outputdir2'
# as the output directory and without resuming from a checkpoint. The returned
# trainer is kept so later cells can read its log history.
TrainerMain = train(
    output_dir='outputdir2',
    overwrite_output_dir=True,
    per_device_train_batch_size=10,
    num_train_epochs=20,
    resume_from_checkpoint = False
)

  3%|▎         | 53/1860 [06:04<3:18:45,  6.60s/it]

## Evaluate

In [None]:
# Presumably generates a summary for training example 33, limited to 50 new tokens.
# NOTE(review): `generate_text` is not defined in any cell visible in this
# notebook — confirm it exists before this cell runs on a fresh kernel.
print(generate_text(billsum[33]["document"], max_new_tokens = 50))

In [None]:
# Display the trainer's accumulated log records; the plotting cell reads the
# 'epoch', 'loss' and 'eval_loss' entries from this list.
TrainerMain.state.log_history

In [None]:
import matplotlib.pyplot as plt

data = TrainerMain.state.log_history

# Split the log into training records (which carry 'loss') and evaluation
# records (which carry 'eval_loss'). Each series gets its own epoch axis so
# the x and y lists always have matching lengths.
epoch = [entry['epoch'] for entry in data if 'loss' in entry]
loss_values = [entry['loss'] for entry in data if 'loss' in entry]
eval_epoch = [entry['epoch'] for entry in data if 'eval_loss' in entry]
eval_loss_values = [entry['eval_loss'] for entry in data if 'eval_loss' in entry]

# One figure with two side-by-side panels. The figure is shown only once, at
# the end, so both panels end up in the same figure.
fig, (ax_train, ax_eval) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: training loss per epoch.
ax_train.plot(epoch, loss_values, marker='o', linestyle='-', color='b')
ax_train.set_title('Funkcja Straty')
ax_train.set_xlabel('epoch')
ax_train.set_ylabel('loss')

# Right panel: evaluation loss per epoch. The title and y-label describe the
# plotted quantity (eval_loss), not ROUGE-L.
ax_eval.plot(eval_epoch, eval_loss_values, marker='o', linestyle='-', color='r')
# Horizontal reference line at y = 1.
ax_eval.axhline(y=1, color='green', linestyle='-', label='Linia pozioma')
ax_eval.set_title('Funkcja straty ewaluacyjnej')
ax_eval.set_xlabel('epoch')
ax_eval.set_ylabel('eval_loss')

fig.tight_layout()
plt.show()
