# CNN-DailyMail News Text Summarization

In [94]:
import pandas as pd
import sacrebleu
from IPython.display import display
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer


In [95]:
# Read the articles and their reference highlights from the local CSV file,
# then leave the frame as the cell's last expression so it is rendered.
df = pd.read_csv(filepath_or_buffer="test.csv")
df

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...,...
11485,ed8674cc15b29a87d8df8de1efee353d71122272,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...
11486,2f58d1a99e9c47914e4b1c31613e3a041cd9011e,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...
11487,411f6d57825161c3a037b4742baccd6cd227c0c3,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...
11488,b5683ef8342056b17b068e0d59bdbe87e3fe44ea,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


### Initialize the summarization model and tokenizer

In [96]:
# Initialize the tokenizer and model.
# Both are loaded from one shared checkpoint name so they cannot drift apart
# if the checkpoint is swapped for another one later.
MODEL_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [97]:
# Function to generate summaries
def generate_summary(text, max_input_length=1024, max_length=130, min_length=30,
                     num_beams=4, length_penalty=2.0):
    """Summarize one article with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Article text to summarize.
        max_input_length: Maximum number of input tokens; the tokenizer
            truncates anything longer.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.
        length_penalty: ``length_penalty`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    inputs = tokenizer(
        text,
        return_tensors="tf",
        max_length=max_input_length,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        # Forward the mask produced by the tokenizer instead of leaving
        # generate() to infer it from the input ids.
        attention_mask=inputs.get("attention_mask"),
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    # Only one text is encoded per call, so the batch holds a single sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

### Function to calculate BLEU score

In [98]:
# Function to calculate BLEU score using sacrebleu
def calculate_bleu(reference, candidate):
    """Score a single candidate summary against a single reference summary.

    The two strings are wrapped into the one-hypothesis / one-reference-stream
    lists that ``sacrebleu.raw_corpus_bleu`` is called with, and the ``score``
    attribute of the returned object is handed back.

    NOTE(review): ``raw_corpus_bleu`` is presumably the untokenized variant of
    corpus BLEU; confirm against the installed sacrebleu version.
    """
    hypotheses = [candidate]
    reference_streams = [[reference]]
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, reference_streams)
    return bleu.score

Process only the first 100 rows

In [99]:
# Restrict processing to the first 100 rows of df
df_sample = df.iloc[:100]

A list to collect the results

In [100]:
# Accumulator for the per-article result records
results = list()

### Generate summaries and calculate BLEU scores

In [101]:
# Generate summaries and calculate BLEU scores.
# Start from an empty list every time this cell runs, so re-executing the cell
# does not append a second copy of every row to `results`.
results = []

for index, row in df_sample.iterrows():
    article = row.get('article', '')
    reference_summary = row.get('highlights', '')

    # Only the two model/metric calls are guarded; building the record below
    # cannot fail, so it stays outside the try block.
    try:
        generated_summary = generate_summary(article)
        bleu_score = calculate_bleu(reference_summary, generated_summary)
    except Exception as e:
        # Best-effort run: report the failing row and move on to the next one.
        print(f"Error processing row {index}: {e}")
        continue

    results.append({
        'Article ID': row.get('id', ''),
        'Generated Summary': generated_summary,
        'Reference Summary': reference_summary,
        'BLEU Score': bleu_score
    })

In [102]:
# Turn the collected records into a DataFrame (one row per record in `results`)
results_df = pd.DataFrame(data=results)

In [103]:
# Display the results in a nicely formatted table.
# `display` is imported in the notebook's import cell at the top.
display(results_df)

Unnamed: 0,Article ID,Generated Summary,Reference Summary,BLEU Score
0,92c514c913c0bdfe25341af9fd72b29db544099b,U.S consumer advisory group set up by Departme...,Experts question if packed out planes are put...,5.726137
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,"Rahul Kumar, 17, clambered over enclosure fenc...",Drunk teenage boy climbed into lion enclosure ...,18.814762
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,1.986163
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Fiorentina goalkeeper Neto is wanted by a numb...,Fiorentina goalkeeper Neto has been linked wit...,10.583910
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,"The former Olympian and reality TV star, 65, w...","Tell-all interview with the reality TV star, 6...",13.730144
...,...,...,...,...
95,64ee7c9eb9f1efbb7da0ce80498434c623615b84,Barcelona face Paris Saint-Germain in the Cham...,Zlatan Ibrahimovic will line up against former...,2.814418
96,5cf4682cd03238d5867027ce9492b626cd1ed011,"Jameela Jamil spent £3,000 on having all her a...","Jameela Jamil, 29, is convinced dental work tr...",1.464945
97,3815d19af18ff22be6ad6095722d7367bb7271af,"Christopher Bridger, 25, attacked three women ...","Christopher Bridger, 25, attacked three women ...",34.760361
98,fb207604ffa7e8371c622840445825db8993d4d2,Paris Saint-Germain face Nice in Ligue 1 on Sa...,Paris Saint-Germain captain Thiago Silva suffe...,2.356906


In [107]:
# Mean of the 'BLEU Score' column over all rows in results_df
results_df['BLEU Score'].mean()

9.938888510300457