### Evaluate LLM model for summarization using pipelines

In [1]:
# !pip install transformers datasets evaluate rouge_score

In [7]:
# load the dataset
from datasets import load_dataset

billsum = load_dataset("billsum")

# splits of the dataset
for split in billsum:
    print(f"Split : {split}")
    print(f"{billsum[split]}")
    print(f"Number of examples : {len(billsum[split])}")
    print()

Split : train
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 18949
})
Number of examples : 18949

Split : test
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 3269
})
Number of examples : 3269

Split : ca_test
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})
Number of examples : 1237



In [8]:
# random example from a split
import random

split = 'ca_test'
eg = random.randint(0, len(billsum[split]))

print(f"Text : {billsum[split][eg]['text']}")
print()
print(f"Summary : {billsum[split][eg]['summary']}")
print()
print(f"Title : {billsum[split][eg]['title']}")

Text : The people of the State of California do enact as follows:


SECTION 1.
Section 1031 of the Government Code is amended to read:
1031.
Each class of public officers or employees declared by law to be peace officers shall meet all of the following minimum standards:
(a) Be a citizen of the United States or a permanent resident alien who is eligible for and has applied for citizenship, except as provided in Section 2267 of the Vehicle Code.
(b) Be at least 18 years of age.
(c) Be fingerprinted for purposes of search of local, state, and national fingerprint files to disclose a criminal record.
(d) Be of good moral character, as determined by a thorough background investigation.
(e) Be a high school graduate, pass the General Education Development Test or other high school equivalency test approved by the State Department of Education that indicates high school graduation level, pass the California High School Proficiency Examination, or have attained a two-year, four-year, or advan

In [39]:
# summarize the random example
from transformers import pipeline

# summarizer = pipeline("summarization")
summarizer = pipeline("summarization", model="google-t5/t5-small")
pred_summary = summarizer('summarize: ' + billsum[split][eg]['text'])
# pred_summary = summarizer('summarize: ' + billsum[split][eg]['text'], min_length=75, max_length=300, do_sample=False)

print(pred_summary)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1406 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'section 2192 of the Streets and Highways Code is amended to read: 2192 . the commission shall allocate funds to projects that have identified and committed supplemental funding from appropriate local, federal, or private sources . it may give priority for funding to projects with higher levels of committment .'}]


In [6]:
# evaluate the result using rouge metric
import evaluate

rouge = evaluate.load('rouge')

result = rouge.compute(predictions=[pred_summary[0]['summary_text']], references=[billsum[split][eg]['summary']])
print(result)

{'rouge1': 0.284789644012945, 'rouge2': 0.0977198697068404, 'rougeL': 0.16828478964401297, 'rougeLsum': 0.20064724919093851}


### Evaluate any model on billsum dataset using rouge metric

In [2]:
import random
from transformers import pipeline
from datasets import load_dataset
import evaluate

# Load the billsum dataset
dataset = load_dataset("billsum")

# select split
split = 'ca_test'
# split = random.choice(['ca_test', 'test', 'train'])

print(f"Split : {split}")
print(f"{dataset[split]}")
print()

model_name = "google-t5/t5-small"
summarizer = pipeline("summarization", model=model_name, device='cuda')

rouge = evaluate.load("rouge")

summaries = []
references = []

n_egs = 10

# for idx in random.sample(range(len(dataset[split])), n_egs):
for idx in range(n_egs):
    text = dataset[split][idx]['text']
    reference_summary = dataset[split][idx]['summary']
    
    summarized_text = summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
    
    summaries.append(summarized_text)
    references.append(reference_summary)

results = rouge.compute(predictions=summaries, references=references)

print(f"\nRouge Score : {results}")

Split : ca_test
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})



Token indices sequence length is longer than the specified maximum sequence length for this model (1694 > 512). Running this sequence through the model will result in indexing errors



Rouge Score : {'rouge1': 0.173212573146669, 'rouge2': 0.06451277215315138, 'rougeL': 0.1182440243345169, 'rougeLsum': 0.14274456177259698}


### Evaluate any model on billsum dataset using bleu score

In [3]:
import random
from transformers import pipeline
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu

# Load the billsum dataset
dataset = load_dataset("billsum")

# select split
split = 'ca_test'
# split = random.choice(['ca_test', 'test', 'train'])

print(f"Split : {split}")
print(f"{dataset[split]}")
print()

model_name = "google-t5/t5-small"
summarizer = pipeline("summarization", model=model_name, device='cuda')

# Evaluate the model using BLEU score
references = []
hypotheses = []

n_egs = 10

# for idx in random.sample(range(len(dataset[split])), n_egs):
for idx in range(n_egs):
    text = dataset[split][idx]['text']
    reference_summary = dataset[split][idx]['summary']

    # Generate summary using the model
    generated_summary = summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']

    references.append([reference_summary.split()])
    hypotheses.append(generated_summary.split())

# Calculate BLEU score
bleu_score = corpus_bleu(references, hypotheses)
print(f"\nBLEU Score: {bleu_score:.4f}")

Split : ca_test
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})



Token indices sequence length is longer than the specified maximum sequence length for this model (1694 > 512). Running this sequence through the model will result in indexing errors



BLEU Score: 0.0011


## Summarization task from huggingface
https://huggingface.co/docs/transformers/en/tasks/summarization  

In [1]:
# Load BillSum dataset
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

billsum = billsum.train_test_split(test_size=0.2)

billsum["train"][0]

  from .autonotebook import tqdm as notebook_tqdm


{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nChapter 11 (commencing with Section 122380) is added to Part 6 of Division 105 of the Health and Safety Code, to read:\nCHAPTER  11. Pet Boarding Facilities\n122380.\nAs used in this chapter, the following definitions apply:\n(a) “Enrichment” means providing objects or activities, appropriate to the needs of the species, as well as the age, size, and condition of the pet, that stimulate the pet and promote the pet’s well-being.\n(b) “Permanent or fixed enclosure” means a structure, including, but not limited to, an exercise run, kennel, or room, used to restrict a pet, that provides for the effective separation of a pet from the pet’s waste products.\n(c) “Person” means an individual, partnership, firm, limited liability company, joint-stock company, corporation, association, trust, estate, or other legal entity.\n(d) “Pet” means any nonhuman animal housed in the pet boarding facility, including, but 

In [2]:
# Preprocess
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_billsum = billsum.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map: 100%|██████████| 989/989 [00:04<00:00, 221.37 examples/s]
Map: 100%|██████████| 248/248 [00:01<00:00, 173.18 examples/s]


In [3]:
# Evaluate
import evaluate

rouge = evaluate.load("rouge")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="D:/Work/Research Group/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
                                                  
 25%|██▌       | 62/248 [33:58<1:04:50, 20.92s/it]

{'eval_loss': 2.8460168838500977, 'eval_rouge1': 0.126, 'eval_rouge2': 0.0361, 'eval_rougeL': 0.1039, 'eval_rougeLsum': 0.1042, 'eval_gen_len': 19.0, 'eval_runtime': 639.3669, 'eval_samples_per_second': 0.388, 'eval_steps_per_second': 0.025, 'epoch': 1.0}


                                                    
 50%|█████     | 124/248 [1:06:42<43:09, 20.89s/it]

{'eval_loss': 2.6353065967559814, 'eval_rouge1': 0.1351, 'eval_rouge2': 0.0452, 'eval_rougeL': 0.111, 'eval_rougeLsum': 0.111, 'eval_gen_len': 19.0, 'eval_runtime': 635.154, 'eval_samples_per_second': 0.39, 'eval_steps_per_second': 0.025, 'epoch': 2.0}


                                                      
 75%|███████▌  | 186/248 [1:39:11<20:55, 20.25s/it]

{'eval_loss': 2.572197437286377, 'eval_rouge1': 0.1376, 'eval_rouge2': 0.047, 'eval_rougeL': 0.113, 'eval_rougeLsum': 0.113, 'eval_gen_len': 19.0, 'eval_runtime': 617.1085, 'eval_samples_per_second': 0.402, 'eval_steps_per_second': 0.026, 'epoch': 3.0}


                                                   
100%|██████████| 248/248 [2:13:30<00:00, 32.30s/it]

{'eval_loss': 2.5564796924591064, 'eval_rouge1': 0.1383, 'eval_rouge2': 0.0482, 'eval_rougeL': 0.1149, 'eval_rougeLsum': 0.1149, 'eval_gen_len': 19.0, 'eval_runtime': 642.2628, 'eval_samples_per_second': 0.386, 'eval_steps_per_second': 0.025, 'epoch': 4.0}
{'train_runtime': 8010.71, 'train_samples_per_second': 0.494, 'train_steps_per_second': 0.031, 'train_loss': 3.0183575537896927, 'epoch': 4.0}





TrainOutput(global_step=248, training_loss=3.0183575537896927, metrics={'train_runtime': 8010.71, 'train_samples_per_second': 0.494, 'train_steps_per_second': 0.031, 'total_flos': 1070824333246464.0, 'train_loss': 3.0183575537896927, 'epoch': 4.0})

In [14]:
import random
from transformers import pipeline
from datasets import load_dataset
import evaluate

# Load the billsum dataset
dataset = load_dataset("billsum")

# select split
split = 'ca_test'
# split = random.choice(['ca_test', 'test', 'train'])

print(f"Split : {split}")
print(f"{billsum[split]}")
print()

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

rouge = evaluate.load("rouge")

summaries = []
references = []

n_egs = 10

# for idx in random.sample(range(len(dataset[split])), n_egs):
for idx in range(n_egs):
    text = dataset[split][idx]['text']
    reference_summary = dataset[split][idx]['summary']
    
    summarized_text = summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
    
    summaries.append(summarized_text)
    references.append(reference_summary)

results = rouge.compute(predictions=summaries, references=references, use_stemmer=True)

print(results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Split : ca_test
Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})

{'rouge1': 0.29603126540733893, 'rouge2': 0.1404817532805786, 'rougeL': 0.18725051415981508, 'rougeLsum': 0.23762570502519492}
