<a href="https://colab.research.google.com/github/EricCallaway/COSC_6319_Project/blob/NavyaMakkena_COSC_6319_Project/finetune_pegasus_reviews_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rouge_score



In [None]:
!pip install datasets



In [None]:
!pip install transformers



In [None]:
!pip install sentencepiece



In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusConfig, PegasusTokenizer, Seq2SeqTrainer , Seq2SeqTrainingArguments
from datasets import load_dataset,load_metric
import nltk
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
nltk.download('punkt')

class dataset_pegasus(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized inputs with tokenized targets.

    ``encodings`` is a mapping of field name -> per-example sequences.
    ``labels`` is a mapping whose ``'input_ids'`` entry holds the per-example
    target ids; only that entry is read.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of target sequences.
        target_ids = self.labels['input_ids']
        return len(target_ids)

    def __getitem__(self, idx):
        # One tensor per encoding field, then the target ids under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample
      
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """Tokenize the text/label splits for fine-tuning.

  The tokenizer for ``model_name`` is fetched through ``torch.hub``. The
  validation and test splits are only built when both their texts and their
  labels are given; otherwise the corresponding return slot is ``None``.

  Returns:
    ``(train_dataset, val_dataset, test_dataset, tokenizer)``.
  """
  tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', model_name)

  def build_split(texts, labels):
    # Sources and targets are tokenized separately, each truncated and padded.
    source_tokens = tokenizer(texts, truncation=True, padding=True)
    target_tokens = tokenizer(labels, truncation=True, padding=True)
    return dataset_pegasus(source_tokens, target_tokens)

  has_val = val_texts is not None and val_labels is not None
  has_test = test_texts is not None and test_labels is not None

  print(type(train_texts))
  print(type(train_labels))

  train_dataset = build_split(train_texts, train_labels)

  val_dataset = None
  if has_val:
    val_dataset = build_split(val_texts, val_labels)

  test_dataset = None
  if has_test:
    test_dataset = build_split(test_texts, test_labels)

  return train_dataset, val_dataset, test_dataset, tokenizer

# Metric
# ROUGE scorer, loaded once when the cell/module runs; compute_metrics() reads
# this module-level object via metric.compute(...).
metric = load_metric("rouge")

def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and re-joined with one
    sentence per line. Returns ``(preds, labels)`` as two new lists.
    """
    def one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Decode predictions/references and score them with ROUGE.

    Relies on the module-level ``tokenizer`` and ``metric`` objects.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be
            a tuple, in which case its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid f-measure scaled to 0-100,
        plus ``"gen_len"`` (mean count of non-pad prediction tokens); all
        values rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Arrays handed over by the trainer may contain -100 (the loss ignore
    # index, also used as filler when batches of different length are
    # concatenated). The tokenizer cannot decode -100, and it would be counted
    # as a real token in gen_len, so map it back to the pad id first.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, output_dir='./results', freeze_encoder=False):
  """Build a ``Seq2SeqTrainer`` for fine-tuning a pretrained Pegasus model.

  Args:
    model_name: checkpoint name or path passed to ``from_pretrained``.
    tokenizer: tokenizer handed to the trainer (saved with checkpoints).
    train_dataset: dataset used for training.
    val_dataset: optional evaluation dataset. When ``None``, no periodic
      evaluation is configured and the trainer gets no ``eval_dataset``.
    output_dir: directory for checkpoints and the final model.
    freeze_encoder: when true, encoder parameters are excluded from training.

  Returns:
    The configured ``Seq2SeqTrainer``.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

  # Load the checkpoint with its own configuration. Forcing a hand-built
  # PegasusConfig (8 layers / 8 heads) onto a 16-layer / 16-head checkpoint
  # drops half of the pretrained layers and re-splits the attention weights,
  # which leaves the model unable to generate anything useful.
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  training_kwargs = dict(
    output_dir=output_dir,           # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
    save_steps=100,                  # number of updates steps before checkpoint saves
    save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    predict_with_generate=True,
  )
  trainer_kwargs = dict(
    model=model,                     # the instantiated Transformers model to be trained
    train_dataset=train_dataset,     # training dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  # Evaluation settings only make sense when there is something to evaluate
  # on; previously the trainer was never created in the no-validation case
  # and the function failed on `return trainer`.
  if val_dataset is not None:
    training_kwargs.update(
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=500,                  # number of update steps before evaluation
    )
    trainer_kwargs['eval_dataset'] = val_dataset

  training_args = Seq2SeqTrainingArguments(**training_kwargs)
  return Seq2SeqTrainer(args=training_args, **trainer_kwargs)

def _load_text_summary_pairs(split, start, stop):
  """Return (texts, summaries) for rows [start, stop) of `split`, skipping rows with a missing field."""
  # Slice rows first so only the requested rows are materialised, instead of
  # pulling whole columns into memory and slicing afterwards.
  rows = split[start:stop]
  frame = pd.DataFrame({'Text': rows['Text'], 'Summary': rows['Summary']}).dropna()
  return frame['Text'].tolist(), frame['Summary'].tolist()


if __name__=='__main__':

  dataset = load_dataset('csv', data_files='/content/drive/My Drive/Colab Notebooks/Reviews.csv')

  # Train and validation use disjoint row ranges; evaluating on the rows the
  # model was trained on would make the validation metrics meaningless.
  review_train, summary_train = _load_text_summary_pairs(dataset['train'], 0, 1000)
  review_validation, summary_validation = _load_text_summary_pairs(dataset['train'], 1000, 2000)

  model_name = 'google/pegasus-large'
  # `tokenizer` is deliberately a module-level name: compute_metrics() reads it.
  train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(
      model_name, review_train, summary_train, review_validation, summary_validation)

  # Training
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset, freeze_encoder=True)
  train_result = trainer.train()
  trainer.save_model()  # Saves the tokenizer too for easy upload

  metrics = train_result.metrics
  trainer.log_metrics("train", metrics)
  trainer.save_metrics("train", metrics)
  trainer.save_state()

  # Evaluation
  metrics = trainer.evaluate(metric_key_prefix="eval")
  trainer.log_metrics("eval", metrics)
  trainer.save_metrics("eval", metrics)

  # Prediction: predict() needs a tokenized dataset, not the raw review
  # strings (passing the raw list is what raised the AttributeError).
  predict_results = trainer.predict(val_dataset, metric_key_prefix="predict")
  metrics = predict_results.metrics
  trainer.log_metrics("predict", metrics)
  trainer.save_metrics("predict", metrics)
  if trainer.is_world_process_zero():
    predictions = tokenizer.batch_decode(
        predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    predictions = [pred.strip() for pred in predictions]
    output_prediction_file = "/content/drive/My Drive/Colab Notebooks/Transformer_Predictions/generated_predictions.txt"
    # Make sure the target folder exists, and write with an explicit encoding
    # so non-ASCII review text does not depend on the platform default.
    os.makedirs(os.path.dirname(output_prediction_file), exist_ok=True)
    with open(output_prediction_file, "w", encoding="utf-8") as writer:
        writer.write("\n".join(predictions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using custom data configuration default-7602425e518ea195
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-7602425e518ea195/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
loading configuration file https://huggingface.co/google/pegasus-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa0446657dd3714a950ba400a3fa72686d0f815da436514e4823a973ef23e20.f2dc0735a07d1a70170e8e0e4d5fb57ad90d8ea5201a0dbd4b33f2f499444852
Model config PegasusConfig {
  "_name_or_path": "google/pegasus-large",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder

<class 'list'>
<class 'list'>


loading weights file https://huggingface.co/google/pegasus-large/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/ef3a8274e003ba4d3ae63f2728378e73affec0029e797c0bbb80be8856130c4f.a99cb24bd92c7087e95d96a1c3eb660b51e498705f8bd068a58c69c20616f514
Some weights of the model checkpoint at google/pegasus-large were not used when initializing PegasusForConditionalGeneration: ['model.decoder.layers.14.encoder_attn.q_proj.bias', 'model.decoder.layers.11.self_attn_layer_norm.weight', 'model.decoder.layers.13.self_attn_layer_norm.weight', 'model.decoder.layers.13.encoder_attn.out_proj.bias', 'model.decoder.layers.13.self_attn.q_proj.bias', 'model.decoder.layers.8.encoder_attn.out_proj.weight', 'model.decoder.layers.8.encoder_attn.k_proj.weight', 'model.decoder.layers.8.self_attn.q_proj.weight', 'model.encoder.layers.14.fc1.bias', 'model.decoder.layers.9.self_attn.q_proj.bias', 'model.encoder.layers.15.self_attn.k_proj.bias', 'model.decoder.layers.10.self_attn.out_

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,8.5141,8.266937,0.0,0.0,0.0,0.0,1.0
1000,8.092,7.8144,0.0,0.0,0.0,0.0,1.0


Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-100/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-600] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-200/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-700] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model weights saved in ./results/checkpoint-300/pytorch_mode

***** train metrics *****
  epoch                    =        1.0
  total_flos               =  1345523GF
  train_loss               =     8.7836
  train_runtime            = 1:10:59.62
  train_samples_per_second =      0.235
  train_steps_per_second   =      0.235


***** Running Prediction *****
  Num examples = 1000
  Batch size = 1


***** eval metrics *****
  epoch                   =        1.0
  eval_gen_len            =        1.0
  eval_loss               =     7.8144
  eval_rouge1             =        0.0
  eval_rouge2             =        0.0
  eval_rougeL             =        0.0
  eval_rougeLsum          =        0.0
  eval_runtime            = 0:20:39.88
  eval_samples_per_second =      0.807
  eval_steps_per_second   =      0.807


AttributeError: ignored