# Fine-tuned PEGASUS-XSum with the XL-Sum Gujarati dataset
- We updated the tokenizer and ran inference directly on the Gujarati data

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install inltk
!pip install datasets
!pip install rouge
!pip install rouge_score
!pip install nltk

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 48.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 31.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 8.3 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 69.6 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 60.7 MB/s 
Installing collected packages: fsspec, xxhash, datasets
Successfully installed datasets-1.18.3 fsspec-2022.1.0 xxhash-2.0.2
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, Seq2SeqTrainingArguments,Seq2SeqTrainer
from datasets import load_dataset, list_datasets,load_metric
import torch
import sentencepiece as spm
import numpy as np
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Wrap tokenizer encodings in a torch Dataset so they can be indexed per example
class PegasusDataset(torch.utils.data.Dataset):
    """Index-addressable view over input encodings and label encodings.

    `encodings` is a mapping from field name to a per-example sequence;
    `labels` is a mapping whose 'input_ids' entry holds the per-example
    target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of label sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label ids under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [4]:
# Inputs: model name, input texts (X), label texts (y)
# 1.) Take the raw text data
# 2.) Tokenize it with the tokenizer built from `vocab_file`
# 3.) Encodings: convert those tokens to integer ids
# 4.) Build a Dataset from those encodings
# Returns the Dataset objects

def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None,
                 vocab_file='/content/drive/MyDrive/Tokenizer/gujarati_lm.model'):
  """
  Tokenize text/label pairs and wrap each pair in a PegasusDataset.

  Args:
    model_name: Kept for interface compatibility; it is not used here because
      the tokenizer is always built from `vocab_file`.
    train_texts, train_labels: Input texts and their label texts.
    val_texts, val_labels: Optional validation pair; used only if both are given.
    test_texts, test_labels: Optional test pair; used only if both are given.
    vocab_file: Path of the vocabulary file passed to PegasusTokenizer.
      Defaults to the path this notebook has always used.

  Returns:
    Tuple (train_dataset, val_dataset, test_dataset). The val/test entries are
    None when the corresponding texts or labels were not supplied.
  """

  # Build the tokenizer from the supplied vocabulary file
  tokenizer = PegasusTokenizer(vocab_file=vocab_file, name_or_path='PEGASUS', model_max_length=1024, bos_token='<s>')

  prepare_val = val_texts is not None and val_labels is not None
  prepare_test = test_texts is not None and test_labels is not None

  def tokenize_data(texts, labels):
    # Inputs and labels are tokenized separately, each truncated and padded.
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    return PegasusDataset(encodings, decodings)

  # Build one dataset object per split that was provided
  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset

In [6]:
def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results',
                        vocab_file='/content/drive/MyDrive/Tokenizer/gujarati_lm.model'):
  
  """
  Load the base model, resize its embeddings to the tokenizer, and build a Seq2SeqTrainer.

  Args:
    model_name: Checkpoint name or path passed to
      PegasusForConditionalGeneration.from_pretrained.
    train_dataset: Dataset handed to the trainer as `train_dataset`.
    val_dataset: Optional dataset handed to the trainer as `eval_dataset`.
    freeze_encoder: When True, gradients are disabled for every encoder parameter.
    output_dir: Output directory given to Seq2SeqTrainingArguments.
    vocab_file: Path of the vocabulary file passed to PegasusTokenizer.
      Defaults to the path this notebook has always used.

  Returns:
    The configured Seq2SeqTrainer. Metrics are computed by the module-level
    `compute_metrics` function.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
  tokenizer = PegasusTokenizer(vocab_file=vocab_file, name_or_path='PEGASUS', model_max_length=1024, bos_token='<s>')
  # Make the embedding table match the size of this tokenizer's vocabulary.
  model.resize_token_embeddings(len(tokenizer))

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # Previously the trainer was only built when val_dataset was None; passing a
  # validation set left `trainer` unbound and the return raised
  # UnboundLocalError. The trainer is now built in both cases.
  training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,           # output directory
    per_device_eval_batch_size=1,   # batch size for evaluation
    predict_with_generate=True,
  )

  trainer = Seq2SeqTrainer(
    model=model,                         # the instantiated model
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # validation dataset, or None
    tokenizer=tokenizer,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
  )

  return trainer

In [7]:
def postprocess_text(preds, labels):
    """Strip each prediction/label and put every sentence on its own line.

    Returns the processed predictions and labels as a pair of lists, in the
    same order as the inputs.
    """
    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = [pred.strip() for pred in preds]
    stripped_labels = [label.strip() for label in labels]

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)
    
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE.

    Relies on the module-level `tokenizer` (for decoding and its pad token id)
    and the module-level `metric` (the ROUGE metric object).

    Args:
        eval_preds: A (predictions, labels) pair of token-id arrays. If
            predictions is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        "gen_len" (mean count of non-pad tokens per prediction). All values
        are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index that can appear as padding in predictions or
    # labels; it is not a real token id, so swap it for the pad token before
    # decoding. Arrays without -100 pass through unchanged.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # NOTE(review): the default ROUGE tokenizer/stemmer appear to be designed
    # for English; confirm that Gujarati text is scored meaningfully.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [9]:
# Loads the pegasus-xsum checkpoint into a module-level `model`.
# NOTE(review): no other cell visible here reads this variable (prepare_fine_tuning
# loads its own copy from `model_name`), so this download looks redundant; confirm before removing.
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [8]:
# Module-level tokenizer: compute_metrics and the final batch_decode cell read it
# by name, so this cell must run before trainer.predict is called.
tokenizer = PegasusTokenizer(vocab_file='/content/drive/MyDrive/Tokenizer/gujarati_lm.model',name_or_path='PEGASUS', model_max_length=1024,bos_token='<s>')

In [10]:
from datasets import load_dataset

# First 100 examples of the test split of the 'gujarati' configuration of GEM/xlsum.
ds = load_dataset("GEM/xlsum",'gujarati',split='test[:100]')
# `lable` holds the 'target' column (used as labels below) and `text` the 'text' column (used as inputs).
lable, text = ds['target'],ds['text']

Downloading:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.2k [00:00<?, ?B/s]

Downloading and preparing dataset xlsum/gujarati (download: 15.85 MiB, generated: 128.11 MiB, post-processed: Unknown size, total: 143.96 MiB) to /root/.cache/huggingface/datasets/GEM___xlsum/gujarati/2.0.0/c5f94b79254b76efed292f24957fe663c4b35c83e91284fbefc51adf4aea8dc0...


Downloading:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xlsum downloaded and prepared to /root/.cache/huggingface/datasets/GEM___xlsum/gujarati/2.0.0/c5f94b79254b76efed292f24957fe663c4b35c83e91284fbefc51adf4aea8dc0. Subsequent calls will reuse this data.


In [11]:
# ROUGE metric object; compute_metrics looks it up as the module-level name `metric`.
metric = load_metric('rouge')

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [14]:
model_name = 'google/pegasus-xsum'
# The texts/labels are passed in the train slot, so the first returned dataset is the
# one used for prediction; no val/test inputs are given, so the other two results are None.
test_dataset,_,_ = prepare_data(model_name, text, lable)

In [13]:
# Builds the Seq2SeqTrainer. The dataset goes in the train_dataset slot, but the
# cells visible here only call predict() on the trainer.
trainer = prepare_fine_tuning(model_name,test_dataset)

Downloading:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [15]:
# Run prediction over the dataset; reported metric keys are prefixed with "predict".
predict_results = trainer.predict(test_dataset,metric_key_prefix="predict")

***** Running Prediction *****
  Num examples = 100
  Batch size = 1


In [19]:
# ROUGE entries are F-measures scaled to 0-100 by compute_metrics.
predict_results.metrics

{'predict_gen_len': 86.32,
 'predict_loss': 9.815193176269531,
 'predict_rouge1': 1.8333,
 'predict_rouge2': 0.0,
 'predict_rougeL': 1.8333,
 'predict_rougeLsum': 1.8333,
 'predict_runtime': 7489.6829,
 'predict_samples_per_second': 0.013,
 'predict_steps_per_second': 0.013}

In [18]:
# Decode the predicted token ids back into text with the module-level tokenizer.
final_output = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [20]:
# Show the first decoded prediction followed by its reference label.
print(final_output[:1])
print(lable[:1])

['મુખ્યમંત્રીએ કહ્યું હતું કે નર્મદાના પાણી પર નભતા રાજ્યના 10 હજાર થી વધુ ગામડા અને 167 જેટલા નગરોને પીવાના પાણીની તકલીફ ન પડે તેટલા માટે ઉનાળુ પાક ખેડૂતો ન કરે તેવી અપેક્ષા રખાય છે. તેમણે કહ્યું હતું કે રાજય સરકારે ચોમાસું અને શિયાળુ પાક માટે પાણી આપ્યું હતું અને શિયાળુ પાક પર કોઈ અસર ન પડે તેટલા માટે ઉનાળામાં પાણી ન આપવાનો નિર્ણય કર્યો છે.']
['દિવ્યભાસ્કરમાં પ્રકાશિત અહેવાલ મુજબ મુખ્યમંત્રી વિજય રૂપાણીએ ખેડૂતોને ઉનાળુ પાક ન કરવાની ચેતવણી આપી હતી.']
