In [22]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
import evaluate
import nltk

# Ensure that the NLTK sentence tokenizer is available
# (nltk.sent_tokenize is used further down to split generated summaries into
# sentences).
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

# Load the T5 model and tokenizer
# Both are module-level globals read by generate_t5_summary().
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the PEGASUS model and tokenizer
# Both are module-level globals read by generate_pegasus_summary().
pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

# Load the ROUGE metric
# Module-level global read by calculate_rouge_scores().
rouge = evaluate.load("rouge")

# Function to generate a summary using T5
def generate_t5_summary(text):
    """Summarize ``text`` with the module-level T5 model and tokenizer.

    The text is prefixed with the "summarize: " task prompt, tokenized
    (truncated to at most 512 tokens) and decoded with deterministic beam
    search.

    Args:
        text: The document to summarize, as a string.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    # All decoding knobs in one place; they are forwarded verbatim to
    # ``generate``.
    generation_settings = {
        "max_length": 150,          # upper bound on generated length
        "min_length": 80,           # lower bound on generated length
        "num_beams": 25,            # beam-search width
        "length_penalty": 1.0,      # neutral length penalty
        "no_repeat_ngram_size": 2,  # ngram-repetition constraint size
        "do_sample": False,         # no sampling: plain beam search
        "early_stopping": True,
    }

    prompt = "summarize: " + text
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    output_ids = t5_model.generate(input_ids, **generation_settings)

    # Only the first returned sequence is used.
    return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Function to generate a summary using PEGASUS
def generate_pegasus_summary(text):
    """Summarize ``text`` with the module-level PEGASUS model and tokenizer.

    The text is tokenized (truncated to at most 512 tokens) and decoded with
    deterministic beam search. Only the ``input_ids`` from the tokenizer
    output are handed to ``generate``.

    Args:
        text: The document to summarize, as a string.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    # All decoding knobs in one place; they are forwarded verbatim to
    # ``generate``.
    generation_settings = {
        "max_length": 150,          # upper bound on generated length
        "min_length": 80,           # lower bound on generated length
        "num_beams": 25,            # beam-search width
        "length_penalty": 1.2,
        "no_repeat_ngram_size": 2,  # ngram-repetition constraint size
        "do_sample": False,         # no sampling: plain beam search
        "early_stopping": True,
    }

    encoded = pegasus_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512,
    )
    output_ids = pegasus_model.generate(encoded['input_ids'], **generation_settings)

    # Only the first returned sequence is used.
    return pegasus_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Helper: pick the leading share of a sentence list without ever rounding a
# non-empty list down to nothing.
def _leading_sentences(sentences, weight):
    """Return the first ``weight`` fraction of ``sentences``.

    The count is ``int(len(sentences) * weight)``, but at least one sentence
    is kept whenever ``sentences`` is non-empty and ``weight`` is positive.
    Without that floor, short summaries (e.g. two sentences at weight 0.4, or
    one sentence at weight 0.6) would truncate to zero sentences.

    Args:
        sentences: List of sentence strings.
        weight: Fraction of the sentences to keep. Values <= 0 keep nothing;
            values >= 1 keep everything.

    Returns:
        A new list holding the selected leading sentences.
    """
    if not sentences or weight <= 0:
        return []
    keep = max(1, int(len(sentences) * weight))
    return sentences[:keep]


# Function to generate a combined summary from the leading sentences of the
# T5 and PEGASUS summaries
def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
    """Build one summary from the T5 and PEGASUS summaries of ``text``.

    Each model's summary is split into sentences with ``nltk.sent_tokenize``.
    The leading ``weight_t5`` fraction of the T5 sentences is taken first,
    followed by the leading ``weight_pegasus`` fraction of the PEGASUS
    sentences, and the selection is joined with single spaces. No reordering
    or n-gram analysis is performed.

    A model with a positive weight always contributes at least one sentence
    when it produced any, so the result is not silently empty for short
    summaries.

    Args:
        text: The document to summarize, as a string.
        weight_t5: Fraction of the T5 summary's sentences to keep.
        weight_pegasus: Fraction of the PEGASUS summary's sentences to keep.

    Returns:
        The combined summary string (empty only if neither model contributed
        a sentence).
    """
    t5_summary = generate_t5_summary(text)
    pegasus_summary = generate_pegasus_summary(text)

    # Tokenize summaries into sentences
    t5_sentences = nltk.sent_tokenize(t5_summary)
    pegasus_sentences = nltk.sent_tokenize(pegasus_summary)

    # T5 sentences come first, then PEGASUS sentences.
    combined_sentences = [
        *_leading_sentences(t5_sentences, weight_t5),
        *_leading_sentences(pegasus_sentences, weight_pegasus),
    ]

    return " ".join(combined_sentences)

# Function to calculate ROUGE scores
def calculate_rouge_scores(generated_summary, reference_summary):
    """Score one generated summary against one reference with ROUGE.

    Uses the module-level ``rouge`` metric object.

    Args:
        generated_summary: The candidate summary string.
        reference_summary: The reference summary string.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", holding the
        metric's 'rouge1', 'rouge2' and 'rougeL' values respectively.
    """
    raw_scores = rouge.compute(
        predictions=[generated_summary],
        references=[reference_summary],
    )
    # (display label, key in the metric's result) pairs, in output order.
    label_to_key = (
        ("ROUGE-1", 'rouge1'),
        ("ROUGE-2", 'rouge2'),
        ("ROUGE-L", 'rougeL'),
    )
    return {label: raw_scores[key] for label, key in label_to_key}

# Provided article text
# Sample input document passed to generate_weighted_combined_summary() below.
# The triple-quoted literal begins and ends with a newline.
text_to_summarize = """
Two men across the Greater Toronto Area are speaking out after they said they lost thousands of dollars on vacation in Mexico.

"They said it's my word versus their word. So it's my word versus a scammer's word," Adam Attard, of Mississauga, told CTV News Toronto. Attard was vacationing with his girlfriend near Cancun, Mexico, in early July.

Attard recalled an individual telling him he was at the wrong terminal when they arrived at the airport for their flight home, so they took a short ride in a shuttle to reach the correct one. Before he left the shuttle, Attard said the driver locked the doors and demanded payment by credit card.

"We couldn't get out of the van. There were no visible latches or locks to open the door. The [shuttle] driver said, 'You are not leaving until you pay the $3,'" said Attard.

When Attard was told the payment didn't go through, the driver then said he accepted cash. However, Attard said his credit card was immediately charged $3,142.

Attard said he contacted the Royal Bank of Canada's Visa customer care line and was initially told he would be refunded the money, but later, he was told he wouldn't be.

"After I was told I would be covered, I was told because I punched in my PIN and did not get a receipt, they would not refund me anything," said Attard.

Glenn Egan of Toronto also travelled to Mexico in March of this year. He was visiting Mexico City with his family when they decided to take a taxi back from a museum to his hotel.

Egan said the taxi ride should have cost about $15, but he was charged $2,300 on his credit card.

According to Egan, the driver also demanded he pay with a credit card and then claimed the charges didn't go through, so he accepted cash. A minute later, Egan's bank notified him he had been scammed.

"I stepped out of the taxi and immediately got a text from RBC saying $2,300 had been charged to my Visa," said Egan.

Egan said he'd contacted Visa right away to dispute the charge, but after four months of trying, he was told he would not be given a refund.

"At the end, they said I didn't get a receipt and without one, I can't dispute the charges," said Egan. "They say with Visa you're protected against fraud and the fact they won't step up is infuriating. It's not 20 bucks, it's $2,300."

An RBC spokesperson told CTV News Toronto that it reviews each report of potential fraud on a case-by-case basis and urges its customers to take precautions when receiving or transferring funds.

"While we cannot comment on the specifics of this situation, we can advise that we take this matter seriously and are working with our client directly throughout the process to keep them informed," the spokesperson said.

"Scams are increasingly sophisticated, and we work closely with industry associations, government and law enforcement to prevent, detect and investigate fraud, including when it happens in other jurisdictions."

Not long after Egan reached out to CTV News, he was told he would receive a full refund of his $2,300. Attard was also refunded his $3,142.

To avoid being caught in a fake taxi scam, make sure you're in a licenced cab or shuttle and book through a trusted source, like a hotel or tour company. You should also ask in advance if you can pay in cash and how much the charge will be.
"""

# Reference summary
# Hand-written target summary; the combined summary is scored against it with
# calculate_rouge_scores() below.
reference_summary = """
Two men from the Greater Toronto Area were scammed during their vacations in Mexico, losing thousands of dollars. Adam Attard and Glenn Egan were both charged large sums by drivers who then claimed the charges didn't go through, leading to disputes with their banks. Eventually, both men were refunded after their cases were highlighted. Travelers are advised to use licensed transport and confirm payment methods to avoid such scams.
"""

# Summarize the sample article with the two-model pipeline (weights spelled
# out explicitly; they match the function defaults).
combined_summary = generate_weighted_combined_summary(
    text_to_summarize,
    weight_t5=0.4,
    weight_pegasus=0.6,
)

# Score the result against the hand-written reference.
rouge_scores = calculate_rouge_scores(combined_summary, reference_summary)

# Report: the summary first, then one line per ROUGE variant to 4 decimals.
print("Combined Summary:", combined_summary, "\nROUGE Scores:", sep="\n")
for key, score in rouge_scores.items():
    print(f"{key}: {score:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Combined Summary:
two men across the Greater Toronto Area say they lost thousands of dollars on vacation in Mexico. "I stepped out of the taxi and immediately got a text from RBC saying $2,300 had been charged to my Visa," said Glenn Egan, who travelled to Mexico with his family in March of this year.

ROUGE Scores:
ROUGE-1: 0.3387
ROUGE-2: 0.1311
ROUGE-L: 0.2419


In [30]:
pip install flask gradio transformers evaluate nltk torch





In [42]:
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
import evaluate
import nltk

# Ensure that the NLTK sentence tokenizer is available
# (nltk.sent_tokenize is used further down to split generated summaries into
# sentences).
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

# Load the T5 model and tokenizer
# Both are module-level globals read by generate_t5_summary().
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the PEGASUS model and tokenizer
# Both are module-level globals read by generate_pegasus_summary().
pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

# Load the ROUGE metric
# NOTE(review): `rouge` is not referenced by any other code in this cell;
# it looks like a leftover from the evaluation cell -- confirm before removing.
rouge = evaluate.load("rouge")

# Function to generate a summary using T5
def generate_t5_summary(text):
    """Summarize ``text`` with the module-level T5 model and tokenizer.

    The text is prefixed with the "summarize: " task prompt, tokenized
    (truncated to at most 512 tokens) and decoded with deterministic beam
    search.

    Args:
        text: The document to summarize, as a string.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    # All decoding knobs in one place; they are forwarded verbatim to
    # ``generate``.
    generation_settings = {
        "max_length": 150,          # upper bound on generated length
        "min_length": 80,           # lower bound on generated length
        "num_beams": 25,            # beam-search width
        "length_penalty": 1.0,      # neutral length penalty
        "no_repeat_ngram_size": 2,  # ngram-repetition constraint size
        "do_sample": False,         # no sampling: plain beam search
        "early_stopping": True,
    }

    prompt = "summarize: " + text
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    output_ids = t5_model.generate(input_ids, **generation_settings)

    # Only the first returned sequence is used.
    return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Function to generate a summary using PEGASUS
def generate_pegasus_summary(text):
    """Summarize ``text`` with the module-level PEGASUS model and tokenizer.

    The text is tokenized (truncated to at most 512 tokens) and decoded with
    deterministic beam search. Only the ``input_ids`` from the tokenizer
    output are handed to ``generate``.

    Args:
        text: The document to summarize, as a string.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    # All decoding knobs in one place; they are forwarded verbatim to
    # ``generate``.
    generation_settings = {
        "max_length": 150,          # upper bound on generated length
        "min_length": 80,           # lower bound on generated length
        "num_beams": 20,            # beam-search width
        "length_penalty": 1.2,
        "no_repeat_ngram_size": 2,  # ngram-repetition constraint size
        "do_sample": False,         # no sampling: plain beam search
        "early_stopping": True,
    }

    encoded = pegasus_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512,
    )
    output_ids = pegasus_model.generate(encoded['input_ids'], **generation_settings)

    # Only the first returned sequence is used.
    return pegasus_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Helper: pick the leading share of a sentence list without ever rounding a
# non-empty list down to nothing.
def _leading_sentences(sentences, weight):
    """Return the first ``weight`` fraction of ``sentences``.

    The count is ``int(len(sentences) * weight)``, but at least one sentence
    is kept whenever ``sentences`` is non-empty and ``weight`` is positive.
    Without that floor, short summaries (e.g. two sentences at weight 0.4, or
    one sentence at weight 0.6) would truncate to zero sentences.

    Args:
        sentences: List of sentence strings.
        weight: Fraction of the sentences to keep. Values <= 0 keep nothing;
            values >= 1 keep everything.

    Returns:
        A new list holding the selected leading sentences.
    """
    if not sentences or weight <= 0:
        return []
    keep = max(1, int(len(sentences) * weight))
    return sentences[:keep]


# Function to generate a combined summary from the leading sentences of the
# T5 and PEGASUS summaries
def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
    """Build one summary from the T5 and PEGASUS summaries of ``text``.

    Each model's summary is split into sentences with ``nltk.sent_tokenize``.
    The leading ``weight_t5`` fraction of the T5 sentences is taken first,
    followed by the leading ``weight_pegasus`` fraction of the PEGASUS
    sentences, and the selection is joined with single spaces. No reordering
    or n-gram analysis is performed.

    A model with a positive weight always contributes at least one sentence
    when it produced any, so the result is not silently empty for short
    summaries.

    Args:
        text: The document to summarize, as a string.
        weight_t5: Fraction of the T5 summary's sentences to keep.
        weight_pegasus: Fraction of the PEGASUS summary's sentences to keep.

    Returns:
        The combined summary string (empty only if neither model contributed
        a sentence).
    """
    t5_summary = generate_t5_summary(text)
    pegasus_summary = generate_pegasus_summary(text)

    # Tokenize summaries into sentences
    t5_sentences = nltk.sent_tokenize(t5_summary)
    pegasus_sentences = nltk.sent_tokenize(pegasus_summary)

    # T5 sentences come first, then PEGASUS sentences.
    combined_sentences = [
        *_leading_sentences(t5_sentences, weight_t5),
        *_leading_sentences(pegasus_sentences, weight_pegasus),
    ]

    return " ".join(combined_sentences)

# Build the web UI: one input text box feeding
# generate_weighted_combined_summary and one output text box for its result.
# Only the text is wired up, so the function's weight parameters keep their
# defaults (0.4 for T5, 0.6 for PEGASUS).
iface = gr.Interface(
    title="Text Summarizer with T5 and PEGASUS",
    description="Enter a text to generate its summary using a combined T5 and PEGASUS model.",
    fn=generate_weighted_combined_summary,
    inputs="textbox",
    outputs="textbox",
)

# Start serving the interface.
iface.launch()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


