# Environment Installation

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install einops
!pip install rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# Import Libraries

In [2]:
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    BartTokenizer,
)


import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import nltk
import evaluate
from nltk.tokenize import sent_tokenize


nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shindy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Models

In [3]:
flan_t5_base = "ybagoury/flan-t5-base-tldr_news"
bart_large = "facebook/bart-large-cnn"
pegasus = "google/pegasus-large"

bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
pegasus_tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")

bart_summ = pipeline(
    "summarization",
    model=bart_large,
    tokenizer=bart_tokenizer,
    max_length=100,
    min_length=50,
)
t5_summ = pipeline(
    "summarization",
    model=flan_t5_base,
    tokenizer=t5_tokenizer,
    max_length=100,
    min_length=50,
)
pegasus_summ = pipeline(
    "summarization",
    model=pegasus,
    tokenizer=pegasus_tokenizer,
    max_length=100,
    min_length=50,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# getSummary()

In [89]:
def getSummary(text, summarizer_model, max_chunk_length=500):
    # input_ids = tokenizer.encode(text, return_tensors="pt")

    # # Generate Summary Text Ids
    # summary_text_ids = model_summarize.generate(
    #     input_ids=input_ids,
    #     bos_token_id=model_summarize.config.bos_token_id,
    #     eos_token_id=model_summarize.config.eos_token_id,
    #     length_penalty=2.0,
    #     max_length=142,
    #     min_length=30,
    #     num_beams=4,
    # )

    # text_summarized=summary_text_ids[0]
    # print(tokenizer.decode(text_summarized, skip_special_tokens=True))
    text_chunks = [
        text[i : i + max_chunk_length] for i in range(0, len(text), max_chunk_length)
    ]

    # Generate summaries for each chunk
    chunk_summaries = [
        summarizer_model(chunk)[0]["summary_text"] for chunk in text_chunks
    ]

    # Concatenate the chunk summaries to get the final summary
    text_summarized = " ".join(chunk_summaries)

    return text_summarized

In [5]:
text = """When most presidents in the past have sent a message leading into the new year, they ve sent messages of hope, unity, and a better tomorrow. Clearly, President-elect Donald Trump is not trying to be like any former leader.Trump s New Year s Eve message, sent via his medium of choice   Twitter   went something like this: Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love! Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don't know what to do. Love!  Donald J. Trump (@realDonaldTrump) December 31, 2016This narcissistic message was clearly not about the nation or bringing us to together in any sort of way, but instead throws over half the nation under the bus.However, if one were to redact most of Trump s tweets and just leave the intro and the close, it would sound something like this: Happy New Year to all. Love! Redacted, Trump sounds just like Obama! ? pic.twitter.com/h9W5JbqE98  Sarah Wood (@SarahWoodwriter) December 31, 2016Which, quite honestly, sounds like someone who wants to unite the nation and move forward towards a better tomorrow. It actually sounds remarkably like the president we ve had for the past eight years, President Barack Obama.If only Trump could contain his narcissistic animosity towards those who don t like him or agree with him. However, his fragile ego doesn t seem to make that possible thus his constant Twitter fits of rage against anyone who dare speak against him.Redacted Trump could actually be a good leader. Unfortunately, he doesn t seem to know how to edit himself for the greater good. Hopefully that will change.Featured Photo by Chip Somodevilla, Twitter/Getty Images
"""
reference_summary = """President-elect Donald Trump's New Year's Eve message on Twitter was criticized for being narcissistic and not promoting unity or hope. If redacted, Trump's message would sound more like President Barack Obama, aiming to unite the nation and move towards a better tomorrow. However, Trump's fragile ego makes it difficult for him to edit himself for the greater good.
"""

In [6]:
print("Flan T5 Base summarization: \n")
t5_summarized = getSummary(text, t5_summ)
print(t5_summarized)

Flan T5 summarization: 

Donald Trump’s New Year s Eve message is a narcissistic one, and it’s not about the nation or bringing us together in any sort of way, but instead throws over half the nation under the bus


In [7]:
print("Bart summarization: \n")
bart_summarized = getSummary(text, bart_summ)
print(bart_summarized)

Bart summarization: 

Trump's New Year s Eve message was clearly not about the nation or bringing us to together in any sort of way, but instead throws over half the nation under the bus. If one were to redact most of Trump s tweets and just leave the intro and the close, it would sound something like this: Happy New Year to all. Love!


In [8]:
print("Pegasus summarization: \n")
pegasus_summarized = getSummary(text, pegasus_summ)
print(pegasus_summarized)

Pegasus summarization: 

Clearly, President-elect Donald Trump is not trying to be like any former leader.Trump s New Year s Eve message, sent via his medium of choice Twitter went something like this: Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Trump (@realDonaldTrump) December 31, 2016This narcissistic message was clearly not about the nation or bringing us to together in any sort of way


In [9]:
rouge_score = evaluate.load("rouge")

In [10]:
rouge_score.compute(predictions=[t5_summarized], references=[text], use_stemmer=True)

{'rouge1': 0.21652421652421652,
 'rouge2': 0.16045845272206302,
 'rougeL': 0.18233618233618235,
 'rougeLsum': 0.18233618233618235}

In [20]:
df = pd.read_csv(
    "D:/DEBI/News Understanding/FAKE-REAL/DATA/pushpdeepfake_news_combined/train.csv"
)
df

Unnamed: 0,text,label
0,"Wall Street raises targets on Netflix, citing ...",1
1,Trump picks conservative loyalists for top sec...,1
2,Factbox: U.S.-Pakistan ties falter as Afghanis...,1
3,"Brexit talks put back a week, EU expects May s...",1
4,U.S. House tax chief says Trump border tax not...,1
...,...,...
19995,Republicans Despondent After Final Benghazi R...,0
19996,Fox News ‘Expert’ Will Now Be A Prison Expert...,0
19997,Seth Meyers Takes No Prisoners In GLORIOUS Sm...,0
19998,ALABAMA: MIDDLE CLASS WHITE WOMAN Rejected For...,0


# Sample data

In [21]:
# take a sample of 100 rows
df = df.sample(100)
df

Unnamed: 0,text,label
17659,Florida GOP’s Anti-Abortion Law Struck Down I...,0
17063,Basketball Great LeBron James Delivers STINGI...,0
17162,Democrat Hilariously Mocks Paul Ryan During H...,0
12442,Philippines suspends trade with North Korea to...,1
18916,Watch The Moment A White Teacher Gets SCHOOLE...,0
...,...,...
16905,TAKE THE POLL: Tell Us Who You Think Will Be T...,0
7876,Exclusive: Philippines defied experts' advice ...,1
11051,U.S. bans travel to North Korea from September...,1
2558,Trump Is ACTIVELY Working To RIP Families Awa...,0


# T5

In [22]:
# apply getSummary function to each row of the dataframe
df["t5_summary"] = df["text"].apply(getSummary, args=(t5_summ,))

Your max_length is set to 100, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 100, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 100, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


In [23]:
# evaluate the summaries using rouge and store the scores in a new column
df["rouge"] = df.apply(
    lambda row: rouge_score.compute(
        predictions=[row["summary"]], references=[row["text"]], use_stemmer=True
    ),
    axis=1,
)

In [40]:
df["t5_rouge1"] = df["rouge"].apply(lambda x: x["rouge1"])
df["t5_rouge2"] = df["rouge"].apply(lambda x: x["rouge2"])
df["t5_rougeL"] = df["rouge"].apply(lambda x: x["rougeL"])
df["t5_rougeLsum"] = df["rouge"].apply(lambda x: x["rougeLsum"])

df.drop(columns=["rouge"], inplace=True)

# Bart

In [84]:
df["Bart_summary"] = df["text"].apply(getSummary, args=(bart_summ,))

Your max_length is set to 100, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 100, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 100, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 100, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your

In [85]:
df["rouge"] = df.apply(
    lambda row: rouge_score.compute(
        predictions=[row["Bart_summary"]], references=[row["text"]], use_stemmer=True
    ),
    axis=1,
)

In [86]:
df["bart_rouge1"] = df["rouge"].apply(lambda x: x["rouge1"])
df["bart_rouge2"] = df["rouge"].apply(lambda x: x["rouge2"])
df["bart_rougeL"] = df["rouge"].apply(lambda x: x["rougeL"])
df["bart_rougeLsum"] = df["rouge"].apply(lambda x: x["rougeLsum"])

df.drop(columns=["rouge"], inplace=True)

# Pegasus

In [90]:
df["Peagsus_summary"] = df["text"].apply(getSummary, args=(pegasus_summ,))

Your max_length is set to 100, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 100, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 100, but your input_length is only 93. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)
Your max_length is set to 100, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your

In [91]:
df["rouge"] = df.apply(
    lambda row: rouge_score.compute(
        predictions=[row["Peagsus_summary"]], references=[row["text"]], use_stemmer=True
    ),
    axis=1,
)

In [92]:
df["pegasus_rouge1"] = df["rouge"].apply(lambda x: x["rouge1"])
df["pegasus_rouge2"] = df["rouge"].apply(lambda x: x["rouge2"])
df["pegasus_rougeL"] = df["rouge"].apply(lambda x: x["rougeL"])
df["pegasus_rougeLsum"] = df["rouge"].apply(lambda x: x["rougeLsum"])

df.drop(columns=["rouge"], inplace=True)

In [93]:
df

Unnamed: 0,text,label,t5_summary,t5_rouge1,t5_rouge2,t5_rougeL,t5_rougeLsum,Bart_summary,bart_rouge1,bart_rouge2,bart_rougeL,bart_rougeLsum,Peagsus_summary,pegasus_rouge1,pegasus_rouge2,pegasus_rougeL,pegasus_rougeLsum
17659,Florida GOP’s Anti-Abortion Law Struck Down I...,0,Florida GOP’s Anti-Abortion Law Struck Down In...,0.153846,0.106667,0.137931,0.137931,"The bill, HB1411, was signed into law by Flori...",0.740072,0.710145,0.595668,0.595668,Florida GOP’s Anti-Abortion Law Struck Down In...,0.739903,0.713128,0.726979,0.726979
17063,Basketball Great LeBron James Delivers STINGI...,0,LeBron James Delivers STINGING Ohio Blow To Tr...,0.105495,0.052980,0.079121,0.079121,Basketball Great LeBron James Delivers STINGIN...,0.651090,0.584375,0.532710,0.532710,Basketball Great LeBron James Delivers STINGIN...,0.738686,0.711567,0.727007,0.727007
17162,Democrat Hilariously Mocks Paul Ryan During H...,0,Democrat Hilariously Mocks Paul Ryan During Ho...,0.214112,0.210269,0.214112,0.214112,"On Wednesday, the Democratic party staged a po...",0.691910,0.614853,0.547332,0.547332,Democrat Hilariously Mocks Paul Ryan During Ho...,0.608048,0.591928,0.608048,0.608048
12442,Philippines suspends trade with North Korea to...,1,Manila suspends trade relations with North Kor...,0.255102,0.235897,0.239796,0.239796,Philippines suspends trade with North Korea to...,0.708720,0.629423,0.586271,0.586271,resolutionMANILA (Reuters) - The Philippines h...,0.752000,0.696629,0.710400,0.710400
18916,Watch The Moment A White Teacher Gets SCHOOLE...,0,Watch The Moment A White Teacher Gets SCHOOLED...,0.196078,0.192118,0.196078,0.196078,A high school student turned the tables and ga...,0.658363,0.600000,0.587189,0.587189,Watch The Moment A White Teacher Gets SCHOOLED...,0.852778,0.821727,0.836111,0.836111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16905,TAKE THE POLL: Tell Us Who You Think Will Be T...,0,TAKE THE POLL: Tell Us Who You Think Will Be T...,0.065703,0.052701,0.060447,0.060447,Indiana Gov. Mike Pence would bring constituti...,0.619048,0.568807,0.578755,0.578755,Who do you think would be his best choice?TAKE...,0.738095,0.703578,0.731293,0.731293
7876,Exclusive: Philippines defied experts' advice ...,1,The Philippines Defied Experts' Advice in Tryi...,0.030391,0.020290,0.027496,0.027496,"In January 2016, the Philippines announced it ...",0.653789,0.609817,0.631005,0.631005,Exclusive: Philippines defied experts' advice ...,0.809239,0.758562,0.749358,0.749358
11051,U.S. bans travel to North Korea from September...,1,U.S. bans travel to North Korea from Sept. 1 a...,0.186364,0.173516,0.168182,0.168182,U.S. bans travel to North Korea from September...,0.660348,0.564184,0.568720,0.568720,"bans travel to North Korea from September 1, s...",0.766716,0.730253,0.745914,0.745914
2558,Trump Is ACTIVELY Working To RIP Families Awa...,0,Trump Is ACTIVELY Working To RIP Families Away...,0.084257,0.057906,0.070953,0.070953,Trump Is ACTIVELY Working To RIP Families Away...,0.672489,0.586861,0.602620,0.602620,Trump Is ACTIVELY Working To RIP Families Away...,0.722892,0.681879,0.698795,0.698795


In [94]:
df.to_csv("D:/DEBI/News Understanding/Summarization/Data/df.csv")