# Setup Environment

In [14]:
import pandas as pd
import torch
import numpy as np

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    BartTokenizer,
)

from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install einops
!pip install rouge

# Import Data

In [5]:
# Remote CSV holding the training split of the BBC news articles/summaries data.
TRAIN_CSV_URL = "https://github.com/AnasElbattra/News-Understanding-NLP/raw/main/Summarization/Data/MahendrakharraBBC-News-Articles-Summaries/train.csv"

# Load the split into a DataFrame and display it as the cell output.
df = pd.read_csv(TRAIN_CSV_URL)
df

Unnamed: 0,articles,summaries
0,Faith schools citizenship warningSchools must ...,"Mr Bell said Muslim, Jewish and Evangelical Ch..."
1,Leaders meet over Turkish EU bidTony Blair has...,Mr Blair is an enthusiastic proponent of talks...
2,Guantanamo pair's passport banThe government h...,"The men's solicitor, Louise Christian, has rai..."
3,Terror powers expose 'tyranny'The Lord Chancel...,"Mr Forsyth links ""Islamic fundamentalism"" to t..."
4,Gurkhas to help tsunami victimsBritain has off...,Britain has offered to send a company of 120 G...
...,...,...
1795,London hope over ChepkemeiLondon Marathon orga...,London Marathon organisers are hoping that ban...
1796,Wilkinson return 'unlikely'Jonny Wilkinson loo...,"Despite not playing for England, Wilkinson is ..."
1797,Klinsmann issues Lehmann warningGermany coach ...,"Klinsmann added: ""If he is not playing regular..."
1798,Tomlinson stays focused on EuropeLong jumper C...,"The Birmingham athlete, who clocked a season's..."


# Models

In [16]:
# Hugging Face checkpoint ids of the summarization models compared in this notebook.
flan_t5_base = "ybagoury/flan-t5-base-tldr_news"
bart_large = "facebook/bart-large-cnn"
pegasus = "google/pegasus-large"

# Generation length bounds shared by the three pipelines below, kept in one
# place so the models are always compared under the same settings.
SUMMARY_LENGTH_KWARGS = {"max_length": 100, "min_length": 50}

# Load each tokenizer from the same checkpoint variable as its model so the two
# cannot drift apart when a checkpoint id is edited.
bart_tokenizer = BartTokenizer.from_pretrained(bart_large)
# NOTE(review): the T5 tokenizer is still taken from the base checkpoint rather
# than from `flan_t5_base`; confirm whether the fine-tuned repo ships its own.
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
pegasus_tokenizer = AutoTokenizer.from_pretrained(pegasus)

bart_summ = pipeline(
    "summarization",
    model=bart_large,
    tokenizer=bart_tokenizer,
    **SUMMARY_LENGTH_KWARGS,
)
t5_summ = pipeline(
    "summarization",
    model=flan_t5_base,
    tokenizer=t5_tokenizer,
    **SUMMARY_LENGTH_KWARGS,
)
pegasus_summ = pipeline(
    "summarization",
    model=pegasus,
    tokenizer=pegasus_tokenizer,
    **SUMMARY_LENGTH_KWARGS,
)

# Larger Flan-T5 summarizer, loaded in bfloat16; generation settings are passed
# at call time instead of being fixed here.
t5_summ_3b = pipeline(
    "summarization",
    model="jordiclive/flan-t5-3b-summarizer",
    torch_dtype=torch.bfloat16,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
def getSummary(text, summarizer_model, max_chunk_length=500):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is cut into chunks of at most ``max_chunk_length`` characters,
    each chunk is passed to ``summarizer_model`` on its own, and the
    per-chunk summaries are joined with single spaces.

    Chunks are cut at whitespace so that a word is not split across two
    chunks. A single word longer than ``max_chunk_length`` is still broken up
    so the length bound always holds. Leading and trailing whitespace of each
    chunk is dropped.

    Args:
        text: Document to summarize.
        summarizer_model: Callable that takes a string and returns a sequence
            whose first item is a mapping with a "summary_text" entry.
        max_chunk_length: Maximum chunk size in characters; must be positive.

    Returns:
        The chunk summaries joined by spaces, or "" when ``text`` contains no
        non-whitespace characters.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive.
    """
    # Function-scope import keeps this cell self-contained.
    import textwrap

    if max_chunk_length <= 0:
        raise ValueError(
            f"max_chunk_length must be positive, got {max_chunk_length!r}"
        )

    # break_on_hyphens=False restricts break points to whitespace only.
    text_chunks = textwrap.wrap(
        text, width=max_chunk_length, break_on_hyphens=False
    )

    # Generate a summary for each chunk
    chunk_summaries = [
        summarizer_model(chunk)[0]["summary_text"] for chunk in text_chunks
    ]

    # Concatenate the chunk summaries to get the final summary
    return " ".join(chunk_summaries)

In [10]:
def getSummary(text, summarizer_model):
    """Summarize the whole of ``text`` with a single summarizer call.

    NOTE(review): an earlier cell defines a chunking
    ``getSummary(text, summarizer_model, max_chunk_length=500)``. Whichever
    cell is executed last wins, so with top-to-bottom execution this
    definition is the one in effect.

    Args:
        text: Document to summarize.
        summarizer_model: Callable accepting the text plus generation keyword
            arguments and returning a sequence whose first item is a mapping
            with a "summary_text" entry.

    Returns:
        The "summary_text" value of the first result.
    """
    outputs = summarizer_model(
        text,
        max_length=142,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        # Whole articles are passed in one go; truncating over-long inputs
        # keeps them within the model's input limit instead of failing.
        truncation=True,
    )
    return outputs[0]["summary_text"]

In [11]:
# Sample news article used as the input document for the single-article demo
# (passed to getSummary together with the Flan-T5 base pipeline).
text = """When most presidents in the past have sent a message leading into the new year, they ve sent messages of hope, unity, and a better tomorrow. Clearly, President-elect Donald Trump is not trying to be like any former leader.Trump s New Year s Eve message, sent via his medium of choice   Twitter   went something like this: Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love! Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don't know what to do. Love!  Donald J. Trump (@realDonaldTrump) December 31, 2016This narcissistic message was clearly not about the nation or bringing us to together in any sort of way, but instead throws over half the nation under the bus.However, if one were to redact most of Trump s tweets and just leave the intro and the close, it would sound something like this: Happy New Year to all. Love! Redacted, Trump sounds just like Obama! ? pic.twitter.com/h9W5JbqE98  Sarah Wood (@SarahWoodwriter) December 31, 2016Which, quite honestly, sounds like someone who wants to unite the nation and move forward towards a better tomorrow. It actually sounds remarkably like the president we ve had for the past eight years, President Barack Obama.If only Trump could contain his narcissistic animosity towards those who don t like him or agree with him. However, his fragile ego doesn t seem to make that possible thus his constant Twitter fits of rage against anyone who dare speak against him.Redacted Trump could actually be a good leader. Unfortunately, he doesn t seem to know how to edit himself for the greater good. Hopefully that will change.Featured Photo by Chip Somodevilla, Twitter/Getty Images
"""
# Reference summary of the sample article above.
# NOTE(review): not read by any other cell visible in this notebook —
# presumably intended for a ROUGE comparison; confirm or remove.
reference_summary = """President-elect Donald Trump's New Year's Eve message on Twitter was criticized for being narcissistic and not promoting unity or hope. If redacted, Trump's message would sound more like President Barack Obama, aiming to unite the nation and move towards a better tomorrow. However, Trump's fragile ego makes it difficult for him to edit himself for the greater good.
"""

In [23]:
# Run the Flan-T5 base pipeline on the sample article and print its summary
# under a short heading.
print("Flan T5 Base summarization: \n")
t5_summarized = getSummary(text=text, summarizer_model=t5_summ)
print(t5_summarized)

Flan T5 Base summarization: 

Donald Trump’s New Year s Eve message is a narcissistic one, and he’s not trying to be like any former leader


In [27]:
# Summarize every article with the BART pipeline, one row at a time, and keep
# the results in a new "bart_summ" column.
df["bart_summ"] = [getSummary(article, bart_summ) for article in df["articles"]]

Your max_length is set to 100, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 100, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 100, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 100, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your

In [None]:
# Summarize every article with the Flan-T5 base pipeline, one row at a time,
# and keep the results in a new "t5_summ" column.
df["t5_summ"] = [getSummary(article, t5_summ) for article in df["articles"]]

In [20]:
# Reuse the sample article defined in the earlier `text` cell instead of keeping
# a second pasted copy of the same literal, so both demos are guaranteed to
# summarize the same document.
raw_document = text

# Instruction prefix placed in front of the article for the 3B Flan-T5 summarizer.
prompt = "Produce an article summary of the following news article:"

results = t5_summ_3b(
    f"{prompt} {raw_document}",
    num_beams=5,
    min_length=5,
    no_repeat_ngram_size=3,
    truncation=True,
    max_length=512,
)
# `results` is a list of dicts, each carrying a "summary_text" entry.
print(results)

Your max_length is set to 512, but your input_length is only 456. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=228)


[{'summary_text': "President-elect Donald Trump sent a narcissistic message on New Year's Eve . Redacted, Trump sounds just like President Barack Obama . Trump's message was clearly not about the nation or bringing us to together ."}]


In [29]:
# Display the frame; the output below now includes the "bart_summ" column.
df

Unnamed: 0,articles,summaries,bart_summ
0,Faith schools citizenship warningSchools must ...,"Mr Bell said Muslim, Jewish and Evangelical Ch...",Independent faith schools were singled out by ...
1,Leaders meet over Turkish EU bidTony Blair has...,Mr Blair is an enthusiastic proponent of talks...,Blair is an enthusiastic proponent of talks to...
2,Guantanamo pair's passport banThe government h...,"The men's solicitor, Louise Christian, has rai...",Martin Mubanga and Feroz Abbasi will not be al...
3,Terror powers expose 'tyranny'The Lord Chancel...,"Mr Forsyth links ""Islamic fundamentalism"" to t...",Lord Falconer insists that the proposals do no...
4,Gurkhas to help tsunami victimsBritain has off...,Britain has offered to send a company of 120 G...,The deployment would involve troops from the 2...
...,...,...,...
1795,London hope over ChepkemeiLondon Marathon orga...,London Marathon organisers are hoping that ban...,Susan Chepkemei has been suspended from all co...
1796,Wilkinson return 'unlikely'Jonny Wilkinson loo...,"Despite not playing for England, Wilkinson is ...",England's World Cup-winning fly-half said last...
1797,Klinsmann issues Lehmann warningGermany coach ...,"Klinsmann added: ""If he is not playing regular...",Jens Lehmann is understudy to Oliver Kahn in t...
1798,Tomlinson stays focused on EuropeLong jumper C...,"The Birmingham athlete, who clocked a season's...",Long jumper Chris Tomlinson has cut his schedu...


In [None]:
# Score the Flan-T5 summaries against the reference summaries with BERTScore.
from bert_score import score

# NOTE(review): this needs the "t5_summ" column written by the T5 summarization
# cell, which shows no execution count in this notebook — confirm it has run.
candidates = df["t5_summ"].tolist()
references = df["summaries"].tolist()

# One batched call loads the scoring model once instead of once per row;
# verbose=True supplies the progress display the per-row tqdm loop used to give.
precision, recall, f1 = score(candidates, references, lang="en", verbose=True)

# Whole-column assignment replaces the chained `df[col][i] = ...` writes, which
# go to a temporary object under pandas copy-on-write and never reach `df`.
# Each "bert_score" entry is a (precision, recall, f1) tuple of floats.
df["bert_score"] = list(zip(precision.tolist(), recall.tolist(), f1.tolist()))
# Scalar F1 column so the scores can be aggregated or sorted directly.
df["bert_f1"] = f1.tolist()

df