In [1]:
!pip install transformers
!pip install sentencepiece
!pip install inltk
!pip install datasets
!pip install rouge
!pip install rouge_score
!pip install nltk

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 46.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 63.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[?25l[K     |█                               | 10 kB 21.1 MB/s eta 0:00:01[K     |██                              | 20 kB 9.9 MB/s eta 0:00:01[K     |███▏                            | 30 kB 8.2 MB/s eta 0:00:01[K     |████▏                           | 40 kB 7.4 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 4.4 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 5.1 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.5 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 5.5 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 6.1 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 5.1 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 5.1 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 5.1 MB/s eta 0:00:01[

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, Seq2SeqTrainingArguments,Seq2SeqTrainer, AutoTokenizer
from datasets import load_dataset, list_datasets,load_metric
import torch
import sentencepiece as spm
import numpy as np
import nltk

nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Checkpoint name passed to prepare_data() and prepare_fine_tuning() in later cells.
model_name = 'google/pegasus-cnn_dailymail'
# NOTE(review): this module-level `model` is not referenced by any other visible
# cell -- prepare_fine_tuning() loads its own copy from `model_name`.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
# Module-level tokenizer: compute_metrics() and the final batch_decode cell read it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Wrap the tokenized inputs and labels in a torch Dataset object
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized target summaries.

    Args:
        encodings: mapping from field name to a per-example sequence
            (every field is indexed by example position).
        labels: mapping whose 'input_ids' entry holds the per-example
            target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of target sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # Convert every input field of example `idx` to a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and attach the target ids under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

#####################################################################################################################################################
#####################################################################################################################################################

def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Tokenize text/summary pairs and wrap them in PegasusDataset objects.

  Args:
    model_name: checkpoint name or path whose tokenizer is used.
    train_texts, train_labels: source documents and reference summaries
      (always tokenized).
    val_texts, val_labels: optional validation pair; tokenized only when
      both are given.
    test_texts, test_labels: optional test pair; tokenized only when both
      are given.

  Returns:
    Tuple (train_dataset, val_dataset, test_dataset); the val/test entries
    are None when the corresponding texts or labels were not supplied.
  """

  # Use the tokenizer that belongs to the requested checkpoint instead of a
  # hard-coded one, so the `model_name` argument is actually honoured.
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  prepare_val = val_texts is not None and val_labels is not None
  prepare_test = test_texts is not None and test_labels is not None

  def tokenize_data(texts, labels):
    # Inputs and targets are tokenized separately; padding=True pads each
    # call to the longest sequence in that call.
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    return PegasusDataset(encodings, decodings)

  # Create one dataset object per split that was supplied.
  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset

#####################################################################################################################################################
#####################################################################################################################################################

# ROUGE metric object; compute_metrics() reads this module-level name.
metric = load_metric('rouge')

def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and then re-joined with
    one sentence per line, because rougeLSum expects a newline after each
    sentence.

    Returns:
        Tuple (preds, labels) of the processed lists.
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)
    
def compute_metrics(eval_preds):
    """Compute ROUGE F-measures and the mean generated length.

    Relies on the module-level `tokenizer` and `metric` objects and on
    `postprocess_text`.

    Args:
        eval_preds: pair (preds, labels) of token-id arrays; `preds` may be
            a tuple, in which case its first element holds the token ids.

    Returns:
        Dict with one entry per ROUGE key (mid F-measure scaled to 0-100)
        plus "gen_len" (mean number of non-pad prediction tokens), all
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index that the trainer / data collators may use as
    # padding; it is not a valid token id and cannot be decoded, so map it
    # back to the real pad token first. Arrays without -100 are unchanged.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding tokens per prediction
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

#####################################################################################################################################################
#####################################################################################################################################################

def prepare_fine_tuning(model_name, test_dataset, freeze_encoder=False, output_dir='./results'):
  """
  Build a Seq2SeqTrainer around a Pegasus checkpoint for generation-based evaluation.

  Args:
    model_name: checkpoint name or path for both the model and its tokenizer.
    test_dataset: accepted for interface compatibility but not used here;
      the dataset is passed to the trainer's predict/evaluate call instead.
    freeze_encoder: when True, gradients are disabled for all encoder parameters.
    output_dir: directory handed to Seq2SeqTrainingArguments.

  Returns:
    A Seq2SeqTrainer with predict_with_generate enabled and an evaluation
    batch size of 2 per device.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
  # Load the tokenizer of the requested checkpoint rather than a hard-coded
  # one, so model and tokenizer always come from the same `model_name`.
  # Note: compute_metrics() decodes with the module-level tokenizer, not this one.
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  args = Seq2SeqTrainingArguments(
      output_dir=output_dir,           # output directory
      per_device_eval_batch_size=2,   # batch size for evaluation
      predict_with_generate=True,     # produce predictions via generate()
    )

  trainer = Seq2SeqTrainer(
      model=model,
      args=args,
      compute_metrics=compute_metrics if args.predict_with_generate else None,
      tokenizer=tokenizer,
    )
  return trainer

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Dataset

In [5]:
# Alternative source (disabled): first 100 test examples of CNN/DailyMail.
# ds_test = load_dataset('ccdv/cnn_dailymail','3.0.0' ,split='test[:100]')

# Evaluate on the first 100 examples of the XSum test split.
ds_test = load_dataset('xsum' ,split='test[:100]')

Downloading:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/954 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


In [6]:
# Column names for the disabled CNN/DailyMail alternative:
# test_texts, test_labels = ds_test['article'], ds_test['highlights']
# XSum: 'document' is used as the model input, 'summary' as the reference.
test_texts, test_labels = ds_test['document'], ds_test['summary']

In [7]:
# The test data is passed in prepare_data's train slot, so the first returned
# dataset is the tokenized test set; the val/test slots come back as None and
# are discarded.
test_dataset,_,_ = prepare_data(model_name, test_texts, test_labels)

In [8]:
# Builds a Seq2SeqTrainer with predict_with_generate=True. prepare_fine_tuning()
# accepts test_dataset but does not use it; the data is supplied to predict() later.
trainer = prepare_fine_tuning(model_name,test_dataset)

Predictions

In [9]:
%%time
# Generate summaries for the test set; metric names get the "predict" prefix.
# The recorded run of this cell took about 1h40 wall time for 100 examples.
predict_results = trainer.predict(test_dataset,metric_key_prefix="predict")

***** Running Prediction *****
  Num examples = 100
  Batch size = 2


CPU times: user 1h 39min 16s, sys: 1min 27s, total: 1h 40min 43s
Wall time: 1h 40min 20s


In [10]:
# Aggregate metrics of the prediction run (ROUGE scores, gen_len, loss, runtime).
predict_results.metrics

{'predict_gen_len': 61.82,
 'predict_loss': 8.115155220031738,
 'predict_rouge1': 21.3764,
 'predict_rouge2': 3.8746,
 'predict_rougeL': 14.0101,
 'predict_rougeLsum': 14.0902,
 'predict_runtime': 6020.103,
 'predict_samples_per_second': 0.017,
 'predict_steps_per_second': 0.008}

In [11]:
# Decode the generated token ids to text with the module-level tokenizer.
# NOTE(review): the recorded preview of final_output still shows literal "<n>"
# sequences in the decoded text -- confirm whether they should be replaced
# before per-example scoring and CSV export.
final_output = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [12]:
# Preview the first two generated summaries.
final_output[:2]

['Some ex-offenders are living rough for up to a year before finding accommodation.<n>Prison Link Cymru had 1,099 referrals in 2015-16.<n>Charity workers say investment in housing would be cheaper than jailing homeless repeat offenders.<n>Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.',
 'Three firearms, ammunition and a five-figure sum of money were recovered.<n>A 26-year-old man was arrested and appeared at Edinburgh Sheriff Court.']

Score calculation

In [13]:
# #list of dictionary

# [{a:'csdv',b:'cdvsvs',c:'feffsefesf'},
#  {},{}
# ]

####################################################################################

# lisst = []
# for t,l,p in zip(test_texts, test_labels, final_output):
#   dic = {}
#   dic['text'] = t
#   dic['Predicted Summary'] = p
#   dic['Actual Summary'] = l

#   lisst.append(dic)

# print(lisst)

####################################################################################

In [44]:
# Per-example ROUGE scoring: one row per (document, reference, prediction) triple.
rouge = load_metric('rouge')

# Column accumulators; the CSV export cell reads these names.
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# the export cell references it.
id = []
text = []
predicted_summary = []
actual_summary = []
r1 = []
r2 = []
rL = []
rLSum = []

# Iterate over the actual data instead of a hard-coded range(1, 101), so the
# loop follows the number of examples that were really predicted.
for i, (txt, actual, predicted) in enumerate(zip(test_texts, test_labels, final_output), start=1):
  # rouge.compute expects lists, so wrap the single example for scoring only.
  # NOTE(review): unlike compute_metrics(), no use_stemmer=True is passed here,
  # so these scores are not directly comparable with the aggregate ones.
  result = rouge.compute(predictions=[predicted], references=[actual])
  result1 = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  print(f'{i} -mid : {result1}')

  # Store plain strings (not one-element lists) so the exported CSV cells
  # contain the text itself rather than a Python list repr.
  id.append(i)
  text.append(txt)
  predicted_summary.append(predicted)
  actual_summary.append(actual)
  r1.append(result1['rouge1'])
  r2.append(result1['rouge2'])
  rL.append(result1['rougeL'])
  rLSum.append(result1['rougeLsum'])


# .........................................................................
# Individual score :

# from rouge import Rouge

# rouge = Rouge()
# rouge.get_scores(final_output, test_labels, avg=True)

1 -mid : {'rouge1': 25.0, 'rouge2': 10.256410256410255, 'rougeL': 15.0, 'rougeLsum': 15.0}
2 -mid : {'rouge1': 41.860465116279066, 'rouge2': 9.75609756097561, 'rougeL': 23.25581395348837, 'rougeLsum': 23.25581395348837}
3 -mid : {'rouge1': 21.333333333333332, 'rouge2': 8.21917808219178, 'rougeL': 16.0, 'rougeLsum': 16.0}
4 -mid : {'rouge1': 13.88888888888889, 'rouge2': 2.857142857142857, 'rougeL': 5.555555555555556, 'rougeLsum': 5.555555555555556}
5 -mid : {'rouge1': 16.842105263157894, 'rouge2': 4.301075268817205, 'rougeL': 10.526315789473683, 'rougeLsum': 10.526315789473683}
6 -mid : {'rouge1': 14.705882352941178, 'rouge2': 0.0, 'rougeL': 8.823529411764705, 'rougeLsum': 8.823529411764705}
7 -mid : {'rouge1': 15.384615384615385, 'rouge2': 4.49438202247191, 'rougeL': 4.395604395604395, 'rougeLsum': 4.395604395604395}
8 -mid : {'rouge1': 16.666666666666664, 'rouge2': 2.4390243902439024, 'rougeL': 9.523809523809524, 'rougeLsum': 9.523809523809524}
9 -mid : {'rouge1': 16.438356164383556, 

In [45]:
import csv
from itertools import zip_longest

# Export one CSV row per example: id, source text, both summaries and the
# four per-example ROUGE scores collected by the scoring loop.
headline = ('id', 'text', 'Predicted Summary', 'Actual Summary', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum')
data = [id, text, predicted_summary, actual_summary, r1, r2, rL, rLSum]
# zip_longest turns the column lists into rows, padding short columns with ''.
export_data = zip_longest(*data, fillvalue='')
# Pin the encoding: the texts can contain non-ASCII characters and the default
# encoding of open() depends on the platform locale. newline='' is required by
# the csv module to avoid extra blank lines.
with open('/content/Xsum100_on_PEGASUS_CNN.csv', 'w', newline='', encoding='utf-8') as file:
  writer = csv.writer(file)
  writer.writerow(headline)
  writer.writerows(export_data)

In [None]:
# # Tried for json file

# import json
# i=0
# with open("/content/File/sample.json", "a") as outfile:
#   for t,l,p in zip(test_texts, test_labels, final_output):
#     i+=1
#     dic = {}
#     dic['id'] = str(i)
#     dic['text'] = str(t)
#     dic['Predicted Summary'] = str(p)
#     dic['Actual Summary'] = str(l)
#     json_object = json.dumps(dic, indent = 4)
#     outfile.write(json_object)