# Fine-tuned PEGASUS_XSum on XSum English data
- Because the model was already fine-tuned on English data, fine-tuning this already fine-tuned model on specific data gave us good results
- It gave us a ROUGE-1 score of ~48

In [None]:
# Notebook shell commands: install the third-party packages for this notebook.
!pip install transformers
!pip install sentencepiece
!pip install inltk
!pip install datasets
!pip install rouge
!pip install rouge_score
!pip install nltk

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, Seq2SeqTrainingArguments,Seq2SeqTrainer
from datasets import load_dataset, list_datasets,load_metric
import torch
import sentencepiece as spm
import numpy as np
import nltk

nltk.download('punkt')

In [None]:
# Wrap our tokenized encodings in a torch Dataset object
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example sequence; every
            field is converted to a tensor and returned for each item.
        labels: Mapping that must contain an ``'input_ids'`` entry. Those ids
            become the ``'labels'`` tensor of each item and define the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per target sequence)."""
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        # Only the target token ids are used as labels.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [None]:
# Input: model name, input texts (X), labels (y)
# 1.) Take our text data
# 2.) Apply the tokenizer for our model to it
# 3.) Encodings: convert those tokens to numbers
# 4.) Prepare a Dataset from those encodings
# Return that Dataset object

def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """Tokenize text/label pairs and wrap each split in a PegasusDataset.

    Args:
        model_name: Name or path passed to ``PegasusTokenizer.from_pretrained``.
        train_texts, train_labels: Source texts and target texts of the
            training split; this split is always tokenized.
        val_texts, val_labels: Optional validation split.
        test_texts, test_labels: Optional test split.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple. The validation
        and test entries are ``None`` unless both their texts and labels
        were supplied.
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    def encode_pair(texts, labels):
        # Sources and targets are tokenized separately, each with truncation
        # and padding enabled, then combined into one dataset object.
        source_encodings = tokenizer(texts, truncation=True, padding=True)
        target_encodings = tokenizer(labels, truncation=True, padding=True)
        return PegasusDataset(source_encodings, target_encodings)

    train_dataset = encode_pair(train_texts, train_labels)

    val_dataset = None
    if val_texts is not None and val_labels is not None:
        val_dataset = encode_pair(val_texts, val_labels)

    test_dataset = None
    if test_texts is not None and test_labels is not None:
        test_dataset = encode_pair(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset

In [None]:
def prepare_fine_tuning(model_name, test_dataset, freeze_encoder=False, output_dir='./results'):
    """Build a Seq2SeqTrainer around a pretrained PEGASUS model.

    Args:
        model_name: Name or path of the checkpoint used for both the model
            and its tokenizer.
        test_dataset: Dataset registered as the trainer's ``eval_dataset``.
            It was previously accepted but ignored; ``trainer.predict`` calls
            that pass a dataset explicitly are unaffected.
        freeze_encoder: When true, gradients are disabled for every encoder
            parameter.
        output_dir: Output directory handed to the training arguments.

    Returns:
        The configured ``Seq2SeqTrainer``.
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    args = Seq2SeqTrainingArguments(
        output_dir=output_dir,           # output directory
        per_device_eval_batch_size=2,    # batch size for evaluation
        predict_with_generate=True,      # metrics are computed on generated text
    )

    return Seq2SeqTrainer(
        model=model,
        args=args,
        eval_dataset=test_dataset,
        # compute_metrics decodes generated token ids, so it is only
        # attached when generation is enabled.
        compute_metrics=compute_metrics if args.predict_with_generate else None,
        tokenizer=tokenizer,
    )

In [None]:
# Module-level tokenizer and ROUGE metric; compute_metrics() below reads both as globals.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
metric = load_metric('rouge')

def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and split into
    sentences with ``nltk.sent_tokenize``; the sentences are then joined
    with newlines.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists of reformatted strings.
    """
    def one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels
    
def compute_metrics(eval_preds):
    """Compute ROUGE scores and mean generated length for an evaluation run.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        ``"gen_len"`` (mean count of non-pad tokens per prediction). All
        values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the placeholder the Hugging Face examples use for ignored
    # label positions, and it can also pad gathered predictions. It is not a
    # valid token id and cannot be decoded, so map it to the pad id first.
    # Arrays without -100 pass through unchanged.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-pad token ids in each prediction row.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Checkpoint used for the tokenizer, the dataset preparation and the trainer below.
model_name = 'google/pegasus-xsum'
# NOTE(review): `model` is not referenced by any later cell shown here;
# prepare_fine_tuning() loads its own copy from `model_name` — confirm this load is needed.
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [None]:
# NOTE: load_dataset is already imported in the import cell above; this re-import is redundant.
from datasets import load_dataset

# Evaluate on only the first 100 examples of the XSum test split.
ds_test = load_dataset("xsum", split='test[:100]')
# ds_valid = load_dataset("xsum", split='validation[:10]')

# 'document' holds the source articles, 'summary' the reference summaries.
test_texts, test_labels = ds_test['document'], ds_test['summary']

Downloading:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/954 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


In [None]:
# The test data is passed in prepare_data's train slot (the only split that is
# always tokenized); the val/test results are None here and are discarded.
test_dataset,_,_ = prepare_data(model_name, test_texts, test_labels)

In [None]:
# Build the trainer with the default options (encoder not frozen, output in ./results).
trainer = prepare_fine_tuning(model_name,test_dataset)

In [None]:
%%time
# Generate summaries for the test dataset; metric names get the "predict_" prefix.
predict_results = trainer.predict(test_dataset,metric_key_prefix="predict")

***** Running Prediction *****
  Num examples = 100
  Batch size = 2


CPU times: user 41min 24s, sys: 32.3 s, total: 41min 56s
Wall time: 41min 57s


In [None]:
# Display the metrics dict returned by the prediction run.
predict_results.metrics

{'predict_gen_len': 22.92,
 'predict_loss': 7.481271743774414,
 'predict_rouge1': 48.2618,
 'predict_rouge2': 26.1769,
 'predict_rougeL': 41.4554,
 'predict_rougeLsum': 41.3227,
 'predict_runtime': 2517.5141,
 'predict_samples_per_second': 0.04,
 'predict_steps_per_second': 0.02}

In [None]:
# Decode the predicted token ids back into summary strings, dropping special tokens.
final_output = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# **Testing**

In [None]:
# Display the first source document.
test_texts[:1]

['Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.\nWorkers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.\nThe Welsh Government said more people than ever were getting help to address housing problems.\nChanges to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.\nPrison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.\nHowever, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.\nAndrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the need for ac

In [None]:
# Display the generated summary for the first document.
final_output[:1]

['There is a "desperate need" for more affordable housing for men leaving prison in Wales, a charity has said.']

In [None]:
# Display the reference summary for the first document.
test_labels[:1]

['There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.']