In [1]:
# import libraries
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import spacy
# spacy.cli.download("en_core_web_sm")
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
if torch.cuda.is_available():
    print("GPU is enabled and available!")
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available.")

GPU is enabled and available!
GPU Name: Tesla T4


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# load the model and tokenizer
name_model = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(name_model)
model = BartForConditionalGeneration.from_pretrained(name_model)
model = model.to(device)
nlp = spacy.load("en_core_web_sm")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [5]:
# Example text to summarize
text_to_summarize = """
In recent years, the rapid advancement of artificial intelligence (AI) and machine learning has transformed numerous fields, including healthcare, finance, and transportation. One of the most significant breakthroughs has been the development of deep learning models that can process vast amounts of data to uncover patterns and make predictions with impressive accuracy. For instance, in the healthcare sector, AI is used to assist in diagnosing diseases, predicting patient outcomes, and personalizing treatment plans. In a recent study published in Nature Medicine, researchers demonstrated that a deep learning model could accurately diagnose skin cancer with performance comparable to that of experienced dermatologists.
The paper also discusses the ethical and privacy challenges that accompany the widespread adoption of AI. For example, ensuring the security of patient data in healthcare applications remains a critical issue. Moreover, the potential for bias in AI algorithms raises concerns, as models trained on biased data may produce discriminatory outcomes, disproportionately affecting marginalized communities. The authors suggest that as AI continues to evolve, it will be essential to establish robust ethical frameworks to guide its development and deployment across different sectors.
Additionally, the research explores the future potential of AI in understanding and simulating human cognition. By modeling complex neural networks, scientists hope to gain insights into how the human brain processes information and makes decisions. This area of research, known as cognitive computing, holds promise not only for enhancing AI systems but also for advancing our understanding of human intelligence. In summary, while AI presents immense opportunities for innovation, its growth must be tempered with careful consideration of ethical implications and regulatory measures to ensure its responsible use in society.
"""

In [None]:
# create function "preprocessing"
def preprocessing_func(text):
    # convert text to lowercase
    lower_case = text.lower()
    # remove special characters (ie. "?/,)
    remove_special_char = re.sub(r"[\",?;]", "", lower_case)
    # return processed_text
    return remove_special_char

In [7]:
# call function preprocessing
processed_text = preprocessing_func(text = text_to_summarize)
print(processed_text)


in recent years the rapid advancement of artificial intelligence (ai) and machine learning has transformed numerous fields including healthcare finance and transportation. one of the most significant breakthroughs has been the development of deep learning models that can process vast amounts of data to uncover patterns and make predictions with impressive accuracy. for instance in the healthcare sector ai is used to assist in diagnosing diseases predicting patient outcomes and personalizing treatment plans. in a recent study published in nature medicine researchers demonstrated that a deep learning model could accurately diagnose skin cancer with performance comparable to that of experienced dermatologists.
the paper also discusses the ethical and privacy challenges that accompany the widespread adoption of ai. for example ensuring the security of patient data in healthcare applications remains a critical issue. moreover the potential for bias in ai algorithms raises concerns as model

In [8]:
# function to generate summary of text
def text_summarization(text):
    # using for loop to convert the text to number "Encoder", get the encoder to summarize and finally decode embedding
    # encode form text to number
    inputs = tokenizer(text, return_tensors = "pt", truncation = True, padding = True).to(device)

    # generate summary for each chunk
    summary_id = model.generate(
        inputs["input_ids"],
        min_length = 100,
        max_length = 200,
        num_beams = 6,   # using beam search to more accurate result of summary
        early_stopping = True
    )

    # decode and store each summary
    outputs = tokenizer.decode(summary_id[0], skip_special_tokens = True)

    # return summaries list
    return outputs

In [9]:
# call text_summarization function
summary_result = text_summarization(processed_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
# load the model and tokenizer
name_model_2 = "google/pegasus-large"
tokenizer_2 = PegasusTokenizer.from_pretrained(name_model)
model_2 = PegasusForConditionalGeneration.from_pretrained(name_model)
model_2 = model_2.to(device)

In [11]:
# function to generate summary of text
def text_summarization_2(text):
    # using for loop to convert the text to number "Encoder", get the encoder to summarize and finally decode embedding
    # encode form text to number
    inputs = tokenizer_2(text, return_tensors = "pt", truncation = True, padding = True).to(device)

    # generate summary for each chunk
    summary_id = model_2.generate(
        inputs["input_ids"],
        min_length = 100,
        max_length = 200,
        num_beams = 6,   # using beam search to more accurate result of summary
        early_stopping = True
    )

    # decode and store each summary
    outputs = tokenizer_2.decode(summary_id[0], skip_special_tokens = True)

    # return summaries list
    return outputs

In [12]:
summary_result_2 = text_summarization_2(processed_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
# load the model and tokenizer
name_model_3 = "sshleifer/distilbart-cnn-12-6"
tokenizer_3 = BartTokenizer.from_pretrained(name_model)
model_3 = BartForConditionalGeneration.from_pretrained(name_model)
model_3 = model_3.to(device)

In [14]:
# function to generate summary of text
def text_summarization_3(text):
    # using for loop to convert the text to number "Encoder", get the encoder to summarize and finally decode embedding
    # encode form text to number
    inputs = tokenizer_3(text, return_tensors = "pt", truncation = True, padding = True).to(device)

    # generate summary for each chunk
    summary_id = model_3.generate(
        inputs["input_ids"],
        min_length = 100,
        max_length = 200,
        num_beams = 6,   # using beam search to more accurate result of summary
        early_stopping = True
    )

    # decode and store each summary
    outputs = tokenizer_3.decode(summary_id[0], skip_special_tokens = True)

    # return summaries list
    return outputs

In [15]:
summary_result_3 = text_summarization_3(processed_text)
# print orginal text
print(f"Orginal Text :\n {text_to_summarize.strip()}\n")
# print summary text facebook/bart-large-cnn
print(f"Summary 'facebook/bart-large-cnn' Text :\n {summary_result}\n")
# print summary text google/pegasus-large
print(f"Summary 'google/pegasus-large' Text :\n {summary_result_2}\n")
# print summary text sshleifer/distilbart-cnn-12-6
print(f"Summary 'sshleifer/distilbart-cnn-12-6' Text :\n {summary_result_3}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Orginal Text :
 In recent years, the rapid advancement of artificial intelligence (AI) and machine learning has transformed numerous fields, including healthcare, finance, and transportation. One of the most significant breakthroughs has been the development of deep learning models that can process vast amounts of data to uncover patterns and make predictions with impressive accuracy. For instance, in the healthcare sector, AI is used to assist in diagnosing diseases, predicting patient outcomes, and personalizing treatment plans. In a recent study published in Nature Medicine, researchers demonstrated that a deep learning model could accurately diagnose skin cancer with performance comparable to that of experienced dermatologists.
The paper also discusses the ethical and privacy challenges that accompany the widespread adoption of AI. For example, ensuring the security of patient data in healthcare applications remains a critical issue. Moreover, the potential for bias in AI algorithm

In [26]:
# Load PEGASUS model and tokenizer
model_name_4 = "google/pegasus-large"
tokenizer_4 = PegasusTokenizer.from_pretrained(model_name_4)
model_4 = PegasusForConditionalGeneration.from_pretrained(model_name_4)
model_4 = model_4.to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def split_text(text, max_length=128):
    """Splits text into segments that fit within the model's token limit."""
    words = text.split()
    segments = []
    current_segment = []

    for word in words:
        current_segment.append(word)
        if len(tokenizer(" ".join(current_segment)).input_ids) >= max_length:
            segments.append(" ".join(current_segment))
            current_segment = []

    if current_segment:  # Add any remaining words
        segments.append(" ".join(current_segment))

    return segments

def summarize_text(text):
    """Generates a summary for a given text segment."""
    inputs = tokenizer_4(text, return_tensors="pt", truncation=True).to(device)
    summary_ids = model_4.generate(inputs.input_ids,min_length = 100, max_length=200, num_beams=5, early_stopping=True)
    summary = tokenizer_4.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [28]:
def hierarchical_summarization(text):
    """Applies segmentation and hierarchical summarization."""
    # Step 1: Segment the large text
    segments = split_text(text)

    # Step 2: Summarize each segment
    segment_summaries = [summarize_text(segment) for segment in segments]
    for i in range(len(segment_summaries)):
        print(f"Segment {i+1}: {segment_summaries[i]}\n")

    # Step 3: Combine segment summaries into a final summary
    combined_summary = " ".join(segment_summaries)
    final_summary = summarize_text(combined_summary)

    return final_summary

In [29]:
# Sample text (use a large document like a research paper or book chapter here)

# Generate the hierarchical summary
summary = hierarchical_summarization(processed_text)
print("Final Summary:\n", summary)

Segment 1: in recent years the rapid advancement of artificial intelligence (ai) and machine learning has transformed numerous fields including healthcare finance and transportation. in a recent study published in nature medicine researchers demonstrated that a deep learning model could accurately diagnose skin cancer with performance comparable to that of experienced dermatologists., the paper also discusses the ethical and privacy challenges that accompany the widespread adoption  Copyright (c) The Vancouver Sun E-mail this Article Print this Article Share this Article The paper also discusses the ethical and privacy challenges that accompany the widespread adoption

Segment 2: moreover the potential for bias in ai algorithms raises concerns as models trained on biased data may produce discriminatory outcomes disproportionately affecting marginalized communities. additionally the research explores the future potential of ai in understanding and simulating human cognition. the authors