<a href="https://colab.research.google.com/github/EricCallaway/COSC_6319_Project/blob/NavyaMakkena_COSC_6319_Project/finetune_pegasus_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
!pip install datasets



In [None]:
!pip install rouge_score



In [None]:
!pip install sentencepiece



In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusConfig, PegasusTokenizer, Seq2SeqTrainer , Seq2SeqTrainingArguments
from datasets import load_dataset,load_metric
import nltk
import os
import numpy as np
nltk.download('punkt')

class dataset_pegasus(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    ``encodings`` is a mapping of field name -> per-example sequences (for
    instance the output of a tokenizer call on the source texts).  ``labels``
    is a similar mapping for the target texts and must contain an
    ``'input_ids'`` entry, which supplies both the dataset length and the
    ``'labels'`` tensor of every example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per target sequence.
        target_ids = self.labels['input_ids']
        return len(target_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        # Only the target token ids are used as labels.
        example['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return example
      
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
    """Tokenize the text/summary splits and wrap each one in a ``dataset_pegasus``.

    The training split is always built.  The validation and test splits are
    built only when both their texts and their labels are supplied; otherwise
    ``None`` is returned in their place.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, tokenizer)``.
    """
    # NOTE(review): the tokenizer is fetched through torch.hub rather than via
    # the transformers classes imported at the top of the file -- confirm that
    # this is intentional.
    tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', model_name)

    def _encode_pair(texts, labels):
        # Inputs and targets are tokenized separately, each truncated and padded.
        return dataset_pegasus(
            tokenizer(texts, truncation=True, padding=True),
            tokenizer(labels, truncation=True, padding=True),
        )

    splits = [_encode_pair(train_texts, train_labels)]
    for texts, labels in ((val_texts, val_labels), (test_texts, test_labels)):
        if texts is not None and labels is not None:
            splits.append(_encode_pair(texts, labels))
        else:
            splits.append(None)

    train_dataset, val_dataset, test_dataset = splits
    return train_dataset, val_dataset, test_dataset, tokenizer

# ROUGE metric object, loaded once at module level; compute_metrics() reads
# this global when scoring decoded predictions against references.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm against the installed
# version before upgrading.
metric = load_metric("rouge")

def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Every string is stripped of surrounding whitespace and then re-joined
    with one sentence per line (sentences found by ``nltk.sent_tokenize``),
    because rougeLSum expects a newline after each sentence.

    Returns:
        ``(preds, labels)`` as two new lists of strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(p) for p in preds]
    cleaned_labels = [_one_sentence_per_line(ref) for ref in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with ROUGE.

    Reads the module-level ``tokenizer`` and ``metric`` objects, so both must
    be defined before the trainer invokes this callback.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.  When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100,
        plus ``"gen_len"`` (mean count of non-pad tokens per prediction);
        all values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" sentinel used for loss masking / array padding; it
    # is not a vocabulary id, so map it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, output_dir='./results', freeze_encoder=False):
    """Build a ``Seq2SeqTrainer`` for fine-tuning a Pegasus checkpoint.

    Args:
        model_name: checkpoint name/path passed to ``from_pretrained``.
        tokenizer: tokenizer handed to the trainer.
        train_dataset: dataset used for training.
        val_dataset: optional evaluation dataset.  When ``None``, periodic
            evaluation is disabled and the trainer is built without an
            evaluation set.
        output_dir: directory for checkpoints and the saved model.
        freeze_encoder: when true, encoder parameters are excluded from
            gradient updates.

    Returns:
        The configured ``Seq2SeqTrainer``.
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # NOTE(review): this config requests 8 layers / 8 attention heads for the
    # encoder and decoder regardless of what `model_name` was trained with.
    # The captured load warning for google/pegasus-large lists layers 8-15 as
    # unused, so only part of the checkpoint is loaded -- confirm that this
    # truncation is intended.
    configuration = PegasusConfig(vocab_size=96103, encoder_layers=8, encoder_attention_heads=8, decoder_layers=8, decoder_attention_heads=8)
    model = PegasusForConditionalGeneration.from_pretrained(model_name, config=configuration).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    # Previously the trainer was only created when a validation set was given,
    # so the default val_dataset=None crashed on `return trainer`.  Build it
    # unconditionally and switch evaluation off when there is nothing to
    # evaluate on.
    has_validation = val_dataset is not None

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,           # output directory
        num_train_epochs=10,             # total number of training epochs
        per_device_train_batch_size=50,  # batch size per device during training, can increase if memory allows
        per_device_eval_batch_size=50,   # batch size for evaluation, can increase if memory allows
        save_steps=100,                  # number of updates steps before checkpoint saves
        save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
        evaluation_strategy='steps' if has_validation else 'no',  # evaluate during training only with a validation set
        eval_steps=5000,                 # number of update steps before evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        predict_with_generate=True,      # evaluate/predict with generate() so ROUGE sees generated text
    )

    trainer = Seq2SeqTrainer(
        model=model,                     # the instantiated Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=val_dataset,        # evaluation dataset (may be None)
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    return trainer

if __name__=='__main__':
    # NOTE: these statements intentionally stay at module level (not inside a
    # main() function): compute_metrics() looks up `tokenizer` as a global.

    dataset = load_dataset("csebuetnlp/xlsum",'english')

    # Slice the first 100 rows of each split *before* selecting a column, so
    # the full text/summary columns are never materialized as Python lists.
    train_rows = dataset['train'][:100]
    val_rows = dataset['validation'][:100]
    test_rows = dataset['test'][:100]
    train_report, train_summary = train_rows['text'], train_rows['summary']
    val_report, val_summary = val_rows['text'], val_rows['summary']
    test_report, test_summary = test_rows['text'], test_rows['summary']

    model_name = 'google/pegasus-large'
    train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(
        model_name,
        train_report, train_summary,
        val_report, val_summary,
        test_report, test_summary,
    )

    # Training
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset, freeze_encoder=True)
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluation
    metrics = trainer.evaluate(metric_key_prefix="eval")
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

    # Prediction
    predict_results = trainer.predict(test_dataset, metric_key_prefix="predict")
    metrics = predict_results.metrics
    trainer.log_metrics("predict", metrics)
    trainer.save_metrics("predict", metrics)
    if trainer.is_world_process_zero():
        # -100 is a padding/ignore sentinel, not a vocabulary id; replace it
        # with the pad token so decoding cannot fail on it.
        generated_ids = np.where(
            predict_results.predictions != -100,
            predict_results.predictions,
            tokenizer.pad_token_id,
        )
        predictions = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        predictions = [pred.strip() for pred in predictions]
        output_prediction_file =  "/content/drive/My Drive/Colab Notebooks/Transformer_Predictions/generated_predictions.txt"
        # Create the target directory up front so a missing folder does not
        # discard the results of a completed training run.
        os.makedirs(os.path.dirname(output_prediction_file), exist_ok=True)
        # Explicit encoding: summaries may contain non-ASCII characters.
        with open(output_prediction_file, "w", encoding="utf-8") as writer:
            writer.write("\n".join(predictions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Reusing dataset xlsum (/root/.cache/huggingface/datasets/csebuetnlp___xlsum/english/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce)


  0%|          | 0/3 [00:00<?, ?it/s]

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
Some weights of the model checkpoint at google/pegasus-large were not used when initializing PegasusForConditionalGeneration: ['model.decoder.layers.15.self_attn.k_proj.weight', 'model.decoder.layers.13.encoder_attn.out_proj.weight', 'model.encoder.layers.14.self_attn.out_proj.weight', 'model.encoder.layers.8.self_attn.v_proj.weight', 'model.decoder.layers.14.encoder_attn.q_proj.bias', 'model.decoder.layers.8.fc1.weight', 'model.decoder.layers.14.encoder_attn.q_proj.weight', 'model.decoder.layers.11.self_attn.q_proj.weight', 'model.decoder.layers.14.self_attn.k_proj.bias', 'model.encoder.layers.12.self_attn.k_proj.weight', 'model.decoder.layers.15.self_attn.k_proj.bias', 'model.encoder.layers.12.final_layer_norm.weight', 'model.decoder.layers.14.encoder_attn_layer_norm.bias', 'model.encoder.layers.15.final_layer_norm.weight', 'model.decoder.layers.14.self_attn.v_proj.bias', 'model.encoder.layers.8.fc1.bia

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 50


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  1345523GF
  train_loss               =    12.5375
  train_runtime            = 0:10:22.37
  train_samples_per_second =      1.607
  train_steps_per_second   =      0.032


***** Running Prediction *****
  Num examples = 100
  Batch size = 50


***** eval metrics *****
  epoch                   =       10.0
  eval_gen_len            =      18.64
  eval_loss               =    10.3205
  eval_rouge1             =     6.4464
  eval_rouge2             =        0.0
  eval_rougeL             =     6.4485
  eval_rougeLsum          =     6.5098
  eval_runtime            = 0:01:19.81
  eval_samples_per_second =      1.253
  eval_steps_per_second   =      0.025
***** predict metrics *****
  predict_gen_len            =       19.0
  predict_loss               =    10.3073
  predict_rouge1             =     9.5579
  predict_rouge2             =        0.0
  predict_rougeL             =     9.5913
  predict_rougeLsum          =     9.6169
  predict_runtime            = 0:01:21.20
  predict_samples_per_second =      1.231
  predict_steps_per_second   =      0.025
