# 0. Imports

In [None]:
!pip install sumy PyPDF2 textstat transformers sentencepiece pysummarization

In [41]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import transformers
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
#import sentencepiece

import textstat
import nltk
import PyPDF2
import numpy

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Automated Extractive Summarization

1.1. Using sumy

In [3]:
# URL of the PDF document
pdf_url = "/content/Bolt.pdf"

# Extract text from the PDF
pdf_text = ""
with open(pdf_url, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        pdf_text += page.extract_text()

# Create a parser for the extracted ToS text
parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

# Use LSA (Latent Semantic Analysis) for summarization
lsa_summarizer = LsaSummarizer()
lsa_summary = lsa_summarizer(parser.document, sentences_count=10)  # Change the number of sentences as needed

# Use LexRank for summarization
lex_rank_summarizer = LexRankSummarizer()
lex_rank_summary = lex_rank_summarizer(parser.document, sentences_count=10)  # Change the number of sentences as needed

# Use TextRank for summarization
text_rank_summarizer = TextRankSummarizer()
text_rank_summary = text_rank_summarizer(parser.document, sentences_count=10)  # Change the number of sentences as needed

# Print summaries
print("LSA Summary:")
for sentence in lsa_summary:
    print(sentence)

print("\nLexRank Summary:")
for sentence in lex_rank_summary:
    print(sentence)

print("\nTextRank Summary:")
for sentence in text_rank_summary:
    print(sentence)


LSA Summary:
The list of Bolt group companies and partners is available at https://bolt.eu/cities/ In order to use Bolt app you must agree to the terms and conditions that are set out below: 1.
Transport services are provided by drivers under a contract (with you) for the carriage of passengers.
Drivers provide transport services on an independent basis (either in person or via a company) as economic and professional service providers.
These service providers may charge you additional fees when processing payments in connection with the Bolt in-App Payment.
Bolt is not responsible for any such fees and disclaims all liability in this regard.
The resolution of disputes related to Bolt in-App Payment also takes place through us.
Inquiries submitted by e-mail or Bolt App will receive a response within one business day.
Sometimes driver may decide to cancel your request, please note that Bolt is not responsible for such situations.
Amendments to the General Terms and Conditions 8.1 If any 

In [4]:
# Convert summaries to strings
lsa_summary_text = ' '.join(map(str, lsa_summary))
lex_rank_summary_text = ' '.join(map(str, lex_rank_summary))
text_rank_summary_text = ' '.join(map(str, text_rank_summary))

# Content-based evaluation
original_sentences = pdf_text.split('.')
lsa_summary_sentences = lsa_summary_text.split('.')
lex_rank_summary_sentences = lex_rank_summary_text.split('.')
text_rank_summary_sentences = text_rank_summary_text.split('.')

# Calculate overlap between original and summary sentences
def calculate_overlap(summary_sentences):
    overlap_count = sum(1 for sentence in summary_sentences if sentence in original_sentences)
    overlap_percentage = (overlap_count / len(original_sentences)) * 100
    return overlap_percentage

print("Overlap with Original Text (LSA): {:.2f}%".format(calculate_overlap(lsa_summary_sentences)))
print("Overlap with Original Text (LexRank): {:.2f}%".format(calculate_overlap(lex_rank_summary_sentences)))
print("Overlap with Original Text (TextRank): {:.2f}%".format(calculate_overlap(text_rank_summary_sentences)))

Overlap with Original Text (LSA): 0.59%
Overlap with Original Text (LexRank): 2.37%
Overlap with Original Text (TextRank): 7.69%


In [5]:
# Calculate readability scores for the original text
flesch_reading_original = textstat.flesch_reading_ease(pdf_text)
flesch_kincaid_original = textstat.flesch_kincaid_grade(pdf_text)

print("Readability Scores for Original Text:")
print(f"Flesch Reading Ease: {flesch_reading_original}")
print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_original}\n")

# Calculate readability scores for the generated summaries (LSA, LexRank, TextRank)
summaries = {
    "LSA Summary": lsa_summary_text,
    "LexRank Summary": lex_rank_summary_text,
    "TextRank Summary": text_rank_summary_text
}

for summary_name, summary_text in summaries.items():
    flesch_reading = textstat.flesch_reading_ease(summary_text)
    flesch_kincaid = textstat.flesch_kincaid_grade(summary_text)

    print(f"Readability Scores for {summary_name}:")
    print(f"Flesch Reading Ease: {flesch_reading}")
    print(f"Flesch-Kincaid Grade Level: {flesch_kincaid}\n")

Readability Scores for Original Text:
Flesch Reading Ease: 49.35
Flesch-Kincaid Grade Level: 11.8

Readability Scores for LSA Summary:
Flesch Reading Ease: 54.32
Flesch-Kincaid Grade Level: 9.9

Readability Scores for LexRank Summary:
Flesch Reading Ease: 55.47
Flesch-Kincaid Grade Level: 11.5

Readability Scores for TextRank Summary:
Flesch Reading Ease: 49.28
Flesch-Kincaid Grade Level: 13.9



# 2. Automated Abstractive Summarization

In [12]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)



inputs = tokenizer.encode("summarize: " + text,
return_tensors='pt',
max_length=512,
truncation=True)


summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [25]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)
        text = ''
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text

# Function to generate abstractive summaries using BART model
def generate_bart_summary(input_text, max_length=150):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    # Tokenize and preprocess the input text
    inputs = tokenizer([input_text], max_length=1024, return_tensors='pt', truncation=True)

    # Generate abstractive summary
    summary_ids = model.generate(inputs['input_ids'], max_length=max_length, num_beams=4, length_penalty=2.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return generated_summary

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_url)

# Generate abstractive summary using BART model
generated_summary = generate_bart_summary(pdf_text, max_length=1000)

# Print the generated summary
print("Generated Abstractive Summary:")
print(generated_summary)

Generated Abstractive Summary:
Bolt app connects passengers with drivers to help them move around cities more efficiently. Terms and conditions apply to and governing the usage of the Bolt app. This website uses cookies to ensure that we offer you the best experience while browsing our website. Check our Cookie Declaration for more information.


In [33]:
# Function to generate abstractive summaries using T5 model
def generate_t5_summary(input_text, max_length=150):
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Tokenize and preprocess the input text
    inputs = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate abstractive summary
    summary_ids = model.generate(inputs, max_length=max_length, num_beams=4, length_penalty=2.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return generated_summary

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_url)

# Generate abstractive summary using T5 model
generated_summary = generate_t5_summary(pdf_text, max_length=300)

# Print the generated summary
print("Generated Abstractive Summary:")
print(generated_summary)

ImportError: ignored

In [36]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)
        text = ''
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text

# Function to generate abstractive summaries using Pegasus model
def generate_pegasus_summary(input_text, max_length=150):
    tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
    model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

    # Tokenize and preprocess the input text
    inputs = tokenizer([input_text], return_tensors='pt', max_length=1024, truncation=True)

    # Generate abstractive summary
    summary_ids = model.generate(inputs['input_ids'], max_length=max_length, num_beams=4, length_penalty=2.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return generated_summary

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_url)

# Generate abstractive summary using Pegasus model
generated_summary = generate_pegasus_summary(pdf_text, max_length=250)  # Adjust max_length as needed

# Print the generated summary
print("Generated Abstractive Summary:")
print(generated_summary)

ImportError: ignored

# 3. Automated Hybrid Summarization

In [40]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)
        text = ''
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text

# Function for extractive summarization using pysummarization
def extractive_summarization(text):
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [".", "\n"]
    abstractable_doc = TopNRankAbstractor()

    result_dict = auto_abstractor.summarize(text, abstractable_doc)
    summary = result_dict['summarize_result']

    return summary

# Function for abstractive summarization using BART model
def abstractive_summarization(text, max_length=150):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, num_beams=4, length_penalty=2.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return generated_summary

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_url)

# Perform extractive summarization
extractive_summary = extractive_summarization(pdf_text)

# Perform abstractive summarization on the extractive summary
abstractive_summary = abstractive_summarization(extractive_summary, max_length=300)

# Print the hybrid summary
print("Generated Hybrid Summary:")
print(abstractive_summary)

TypeError: ignored