**Cell 1 :Installing the basic libraries.**

In [None]:
# Install the third-party packages for this notebook; --quiet suppresses pip's progress output.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
# NOTE(review): several packages below (e.g. fastai, timm, peft, flask, fastapi, gradio,
# streamlit) are not imported anywhere in the visible notebook — confirm they are needed.
!pip install transformers datasets accelerate evaluate --quiet
!pip install wandb --quiet
!pip install rouge_score --quiet
!pip install torch torchvision torchaudio --quiet
!pip install sentencepiece --quiet
!pip install fastai timm peft --quiet
!pip install numpy pandas matplotlib tqdm scipy flask fastapi gradio streamlit --quiet

**Cell 2 : Import necessary libraries**


In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import numpy as np
import pandas as pd


In [None]:
# Upgrade (-U) datasets, fsspec and huggingface_hub to their latest releases.
# NOTE(review): presumably required for load_dataset in the next cell to work — confirm,
# and confirm whether a runtime restart is needed after the upgrade.
!pip install -U datasets fsspec huggingface_hub --quiet

**Cell 3 :Load the CNN/DailyMail Dataset**

In [None]:
from datasets import load_dataset

# Fetch CNN/DailyMail (config 3.0.0), keeping only the first 500 rows
# of the train split.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:500]",
)
print(f"Dataset contains {len(dataset)} samples.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset contains 500 samples.


**Cell 4: Check for long articles to see whether they need special handling**

---
The BART model has a maximum token limit of 1024 for summarization.
Articles longer than 1024 tokens need to be processed differently (either truncated or chunked). That is why we need to know whether the dataset contains long articles, and how many.


In [None]:


tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


def count_tokens(text):
    """Return the number of token ids the tokenizer emits for ``text`` (no truncation)."""
    return len(tokenizer.encode(text, truncation=False))


# Token count of every article, in dataset order
article_lengths = list(map(count_tokens, dataset["article"]))

print(f"Average Length: {np.mean(article_lengths)} tokens")
print(f"Maximum Length: {np.max(article_lengths)} tokens")
# Summing booleans counts the articles that exceed the 1024-token threshold
print(f"Number of Articles > 1024 tokens: {sum(n > 1024 for n in article_lengths)}")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Average Length: 756.882 tokens
Maximum Length: 2225 tokens
Number of Articles > 1024 tokens: 108


**Cell 5 :sentence-based chunking, tokenization, and padding of each article**

---
Cell 5 is responsible for preparing each article so it can be used with the BART model. It starts by breaking the article into individual sentences, then groups those sentences into chunks—each no longer than 1024 tokens, which is the maximum that BART can handle at once. If a chunk is shorter than 1024 tokens, it is padded with filler (padding) tokens to reach the full length. Alongside this, an attention mask is created, marking real tokens with 1s and padding with 0s. In the end, the cell produces two outputs: a list of the tokenized chunks and a corresponding list of attention masks. This ensures everything is properly formatted for input into the model.










In [None]:
import nltk


def sentence_based_chunking(article, max_length=1024):
    """
    Split an article into sentence-aligned chunks ready to feed to BART.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily, in order,
    into chunks. Every returned chunk has exactly ``max_length`` ids laid out
    as ``<s> tokens... </s> <pad>...`` and comes with an attention mask of the
    same length (1 = real token, 0 = padding).

    Relies on the module-level ``tokenizer`` created earlier in the notebook.

    Args:
    - article (str): The full article text.
    - max_length (int): Length of every returned chunk, special tokens and
      padding included. Must be at least 3.

    Returns:
    - chunks (list of lists): Token-id chunks padded to max_length.
    - attention_masks (list of lists): Attention masks for each chunk.
      Both lists are empty when the article yields no tokens.

    Raises:
    - ValueError: If max_length leaves no room for content tokens.
    """
    # Two positions of every chunk are reserved for <s> and </s>.
    capacity = max_length - 2
    if capacity < 1:
        raise ValueError(f"max_length must be at least 3, got {max_length}")

    try:
        sentences = nltk.sent_tokenize(article)
    except LookupError:
        # The Punkt sentence models are not bundled with nltk; fetch them on
        # first use so the notebook also works on a fresh kernel.
        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        sentences = nltk.sent_tokenize(article)

    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    # Pad with the tokenizer's real pad id; id 0 is <s> in BART's vocabulary.
    pad_id = tokenizer.pad_token_id

    chunks = []
    attention_masks = []
    current_chunk = []  # content tokens of the chunk being built (no specials)

    def flush():
        """Wrap the pending tokens in <s>/</s>, pad, store them, and reset."""
        real_tokens = [bos_id] + current_chunk + [eos_id]
        padding_length = max_length - len(real_tokens)
        chunks.append(real_tokens + [pad_id] * padding_length)
        attention_masks.append([1] * len(real_tokens) + [0] * padding_length)
        current_chunk.clear()

    for sentence in sentences:
        # sent_tokenize strips the whitespace between sentences. Re-insert a
        # leading space for every sentence that continues a chunk, otherwise
        # the byte-level BPE glues it to the previous one ("Park.Dida's").
        text = " " + sentence if current_chunk else sentence
        tokenized_sentence = tokenizer.encode(text, add_special_tokens=False)

        if current_chunk and len(current_chunk) + len(tokenized_sentence) > capacity:
            flush()
            # The sentence now opens a new chunk, so encode it without the space.
            tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=False)

        # A single sentence longer than a whole chunk is cut into
        # capacity-sized pieces (current_chunk is always empty here).
        while len(tokenized_sentence) > capacity:
            current_chunk.extend(tokenized_sentence[:capacity])
            flush()
            tokenized_sentence = tokenized_sentence[capacity:]

        current_chunk.extend(tokenized_sentence)

    # Emit whatever is left; nothing is emitted for an empty article.
    if current_chunk:
        flush()

    return chunks, attention_masks


**Cell 6:Full Dataset Chunking and DataLoader Initialization**

---
Cell 6 takes all the articles in the dataset and prepares them for the model by using the chunking function from Cell 5. It goes through each article, breaks it into chunks of up to 1024 tokens, and creates the corresponding attention masks. These chunks and masks are stored in lists, which are then converted into PyTorch tensors for better performance. The tensors are packed into a TensorDataset, and a DataLoader is set up with a batch size of 2. This setup allows the data to be processed in small batches, helping the model run more efficiently and use memory more effectively during summarization.



















In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Flat lists holding every chunk / attention mask across all articles,
# in article order (chunks of one article are contiguous).
all_chunks = []
all_attention_masks = []

# Apply the sentence-based chunking to every article
for article in dataset['article']:
    article_chunks, article_masks = sentence_based_chunking(article)
    all_chunks.extend(article_chunks)
    all_attention_masks.extend(article_masks)

# Convert the lists to integer tensors of shape (num_chunks, max_length)
input_ids = torch.tensor(all_chunks, dtype=torch.long)
attention_masks = torch.tensor(all_attention_masks, dtype=torch.long)

# Use a dedicated name instead of rebinding `dataset`: overwriting the
# Hugging Face dataset here would make this cell fail when re-run, because
# `dataset['article']` above would then index a TensorDataset.
chunk_dataset = TensorDataset(input_ids, attention_masks)

# Initialize the DataLoader (order is preserved: no shuffling)
BATCH_SIZE = 2
dataloader = DataLoader(chunk_dataset, batch_size=BATCH_SIZE)

# Show information; fetch the first batch once and reuse it for both prints
first_input_ids, first_attention_mask = next(iter(dataloader))
print(f"Total Chunks: {len(all_chunks)}")
print(f"Total Batches: {len(dataloader)}")
print(f"First Batch Input IDs Shape: {first_input_ids.shape}")
print(f"First Batch Attention Mask Shape: {first_attention_mask.shape}")

Total Chunks: 610
Total Batches: 305
First Batch Input IDs Shape: torch.Size([2, 1024])
First Batch Attention Mask Shape: torch.Size([2, 1024])


**Cell 7: Display the Tokenized Data**

In [None]:
# Peek at chunk 0: its token ids followed by the matching attention mask
first_chunk, first_mask = all_chunks[0], all_attention_masks[0]
print("Tokenized Article:\n", first_chunk)
print("\nAttention Mask:\n", first_mask)


Tokenized Article:
 [574, 4524, 6, 1156, 36, 1251, 43, 480, 3268, 10997, 999, 3028, 7312, 20152, 3077, 899, 7, 10, 431, 984, 844, 153, 1358, 4006, 4, 134, 153, 43, 13016, 25, 37, 4072, 504, 15, 302, 6, 53, 37, 9838, 5, 418, 351, 75, 2471, 10, 8921, 15, 123, 4, 18322, 7312, 20152, 25, 3268, 10997, 11, 22, 29345, 10997, 8, 5, 9729, 9, 5, 5524, 113, 598, 5, 10208, 9, 20445, 6730, 1952, 198, 5, 232, 6, 5, 664, 2701, 161, 37, 34, 117, 708, 7, 856, 3961, 1334, 39, 1055, 409, 15, 1769, 1677, 6, 4076, 8, 6794, 1799, 4, 113, 100, 218, 75, 563, 7, 28, 65, 9, 167, 82, 54, 6, 25, 1010, 25, 51, 1004, 504, 6, 6017, 907, 1235, 10, 2232, 1612, 512, 2783, 50, 402, 1122, 60, 37, 174, 41, 2059, 33242, 656, 42, 353, 4, 113, 100, 218, 75, 206, 38, 581, 28, 1605, 31879, 4, 113, 133, 383, 38, 101, 2159, 32, 383, 14, 701, 59, 158, 2697, 480, 2799, 8, 32570, 8, 37206, 72, 3750, 504, 6, 7312, 20152, 40, 28, 441, 7, 23104, 11, 10, 10297, 6, 907, 10, 4076, 11, 10, 8881, 50, 192, 5, 8444, 822, 22, 40534, 523, 35, 

**Cell 8: Load the Pre-trained BART Model**

In [None]:
from transformers import BartForConditionalGeneration
from tqdm import tqdm
import time

import torch

# Use the GPU when one is available, otherwise fall back to the CPU instead
# of crashing on a hard-coded 'cuda' device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model and move it to the selected device
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
model.to(device)

# One generated summary per chunk, in dataloader (= chunk) order
generated_summaries = []

# Monotonic timer for the whole generation loop
start_time = time.perf_counter()


print(" Generating summaries...")
for batch in tqdm(dataloader):
    input_ids, attention_mask = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Beam-search generation for the whole batch
    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=4,
        max_length=350,
        min_length=30,
        early_stopping=True,
        no_repeat_ngram_size=2
    )
    # Decode every sequence of the batch back to text, dropping special tokens
    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    generated_summaries.extend(summaries)
end_time = time.perf_counter()
elapsed = end_time - start_time

print(f"\n Summarization complete. Total summaries generated: {len(generated_summaries)}")
print(f" Total time taken: {elapsed:.2f} seconds")


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

 Generating summaries...


100%|██████████| 305/305 [05:25<00:00,  1.07s/it]


 Summarization complete. Total summaries generated: 610
 Total time taken: 325.57 seconds





**Cell 9: Display Summaries and original article**

---
Cell 9 is responsible for displaying a few random summaries generated by the BART model, alongside their original article chunks. It selects 3 random samples from the generated_summaries list and the corresponding all_chunks list. The original chunk is decoded back into readable text using the tokenizer, and it is printed along with its generated summary. This allows for
quick visual inspection to evaluate the quality and accuracy of the summarization process. The separator lines (---) are added to make it easier to distinguish between different samples.

The output helps us understand:

If the summaries are accurate and well-formed.

If important information is preserved in the summarization.

If there are any obvious issues like incomplete sentences or repetition.


In [None]:

import random

# Number of samples to display
num_samples = 3

# Draw distinct indices: random.randint could pick the same chunk more than
# once and raises on an empty list. min() keeps this valid when fewer than
# num_samples summaries exist.
sample_indices = random.sample(
    range(len(generated_summaries)),
    k=min(num_samples, len(generated_summaries)),
)

print(" Displaying a few random summaries:")
for idx in sample_indices:

    # Show the original chunk, decoded back to text
    print(f"\n **Original Article (Chunk {idx}):**")
    original_text = tokenizer.decode(all_chunks[idx], skip_special_tokens=True)
    print(original_text[:700], "...")  # the first 700 characters are enough for a visual check

    # Show the generated summary for the same chunk
    print(f"\n **Generated Summary (Chunk {idx}):**")
    print(generated_summaries[idx])
    print("-" * 100)


 Displaying a few random summaries:

 **Original Article (Chunk 107):**
NYON, Switzerland -- Celtic have been fined $50,800 by UEFA and AC Milan's Dida has been banned for two matches after the incident which saw a pitch-invading supporter approach the Brazilian goalkeeper in last week's Champions League match at Celtic Park.Dida's theatrical over-reaction has resulted in UEFA suspending him for two matches.The incident occurred when the Scottish side beat Milan 2-1 in Glasgow.A fan ran onto the field in the 90th minute, soon after the home side scored their winning goal, and made what appeared to be minimal contact with Dida.The Milan goalkeeper turned to chase the supporter before dropping to the ground.He was carried off the field on a stretcher and replaced. ...

 **Generated Summary (Chunk 107):**
Celtic have been fined $50,800 by UEFA and AC Milan's Dida has been banned for two matches. The incident occurred when the Scottish side beat Milan 2-1 in Glasgow. A fan ran onto the fie

**Cell 10: Merging Chunked Summaries Back into Full Article Summaries**

---

This code starts by creating a mapping that links each chunk back to its original article. Then, it goes through all 500 articles, collects the summaries of their chunks, and joins them together into one complete summary per article. These merged summaries are saved in a list called combined_summaries. At the end, it checks that there are exactly 500 combined summaries—one for each article—to confirm that everything was successfully reconstructed.

In [None]:
from collections import defaultdict

# Maps article index -> list of indices into generated_summaries
article_mapping = defaultdict(list)

# Load the original dataset to recover the article order
original_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:500]")
original_articles = original_dataset['article']

# Running index of the next chunk; chunks were generated article by article,
# so re-chunking in the same order reproduces the same numbering.
chunk_counter = 0

for idx, article in enumerate(original_articles):
    chunks, _ = sentence_based_chunking(article)
    num_chunks = len(chunks)
    article_mapping[idx].extend(range(chunk_counter, chunk_counter + num_chunks))
    chunk_counter += num_chunks

# If the counts differ, the mapping no longer lines up with the summaries
# (e.g. the chunking changed since generation) and merging would mix articles.
assert chunk_counter == len(generated_summaries), (
    f"Rebuilt {chunk_counter} chunks but have {len(generated_summaries)} summaries"
)

# Merge the chunk summaries of each article, in chunk order. Iterate over the
# articles actually loaded rather than a hard-coded count.
combined_summaries = [
    " ".join(generated_summaries[j] for j in article_mapping[i])
    for i in range(len(original_articles))
]


print(f"Number of combined summaries after merging: {len(combined_summaries)}")


Number of combined summaries after merging: 500


**Cell 11: ROUGE Evaluation of Combined Summaries**

---
This code evaluates the quality of the combined summaries by comparing them to their corresponding highlights from the dataset using the ROUGE metric, which measures word and phrase overlap.

In [None]:

from rouge_score import rouge_scorer

# Initialize the ROUGE scorer (stemming enabled)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Load the original dataset to get the reference highlights
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:500]")
reference_summaries = dataset['highlights']

# zip() below would silently drop unmatched items, so insist on a 1:1 pairing
assert len(reference_summaries) == len(combined_summaries), (
    f"{len(reference_summaries)} references vs {len(combined_summaries)} generated summaries"
)

# Score each generated summary against its reference (target first, prediction second)
print("Evaluating generated summaries against reference highlights...")
rouge_scores = [
    scorer.score(reference, generated)
    for reference, generated in zip(reference_summaries, combined_summaries)
]

# Collect the F-measure of every metric, then report the averages
rouge1 = [s['rouge1'].fmeasure for s in rouge_scores]
rouge2 = [s['rouge2'].fmeasure for s in rouge_scores]
rougeL = [s['rougeL'].fmeasure for s in rouge_scores]

print("\n ROUGE Evaluation Results....")
print(f"  ROUGE-1 (Unigram Overlap): {np.mean(rouge1):.4f}")
print(f"  ROUGE-2 (Bigram Overlap):  {np.mean(rouge2):.4f}")
print(f"  ROUGE-L (Longest Match):   {np.mean(rougeL):.4f}")



Evaluating generated summaries against reference highlights...

 ROUGE Evaluation Results....
  ROUGE-1 (Unigram Overlap): 0.3948
  ROUGE-2 (Bigram Overlap):  0.1798
  ROUGE-L (Longest Match):   0.2803


**Cell 12: Save the Model**

In [None]:
import os
import shutil
from transformers import BartForConditionalGeneration, BartTokenizer

# Directory that receives the model weights and tokenizer files
model_dir = "./bart_summarization_model"

# exist_ok avoids the separate exists() check and is safe to re-run
os.makedirs(model_dir, exist_ok=True)

# Save model and tokenizer side by side so they can be reloaded together
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)




('./bart_summarization_model/tokenizer_config.json',
 './bart_summarization_model/special_tokens_map.json',
 './bart_summarization_model/vocab.json',
 './bart_summarization_model/merges.txt',
 './bart_summarization_model/added_tokens.json')

**Download the files**

In [None]:
# Pack the saved model directory into "<model_dir>.zip"; make_archive returns
# the path of the archive it wrote.
zip_path = shutil.make_archive(base_name=model_dir, format='zip', root_dir=model_dir)

# Colab-only helper that hands the archive to the browser as a download
from google.colab import files
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>