<a href="https://colab.research.google.com/github/AshwathiE/nlp/blob/main/Text_summaraisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>






Extractive Summarisation

In [None]:
#library installation
!pip install PyPDF2
!apt install urllib

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package urllib[0m


In [None]:
import sys
import math
import bs4 as bs
import urllib.request
import re
import PyPDF2
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
import textwrap

# Download the WordNet corpus that WordNetLemmatizer relies on.
nltk.download('wordnet')

# Initializing variables
# spaCy English pipeline: used further down for sentence splitting and for its stop-word list.
nlp = spacy.load('en_core_web_sm')
# Lemmatizer applied to each kept token when word frequencies are counted.
lemmatizer = WordNetLemmatizer()

# Define functions for Reading Input Text
def file_text(filepath):
    """Read a UTF-8 text file and return its contents on a single line.

    Every newline character in the file is replaced by one space.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return ' '.join(contents.split("\n"))

def pdf_reader(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    The per-page texts are concatenated in page order with no separator
    between them.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        document = PyPDF2.PdfReader(pdf_stream)
        page_texts = [page.extract_text() for page in document.pages]
        return ''.join(page_texts)

def wiki_text(url):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch (the caller passes a Wikipedia URL).

    Returns:
        The text of all paragraphs concatenated without a separator, with
        bracketed numeric markers such as ``[12]`` removed.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read, instead of being left open.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    parsed_article = bs.BeautifulSoup(html, 'lxml')
    article_text = ''.join(p.text for p in parsed_article.find_all('p'))
    # Drop citation markers like "[3]" that Wikipedia embeds in paragraph text.
    return re.sub(r'\[[0-9]*\]', '', article_text)

#  Getting Text
# int() raises ValueError on a non-numeric answer; map that case to an invalid
# choice so the user gets the same "Wrong Input" message instead of a traceback.
try:
    input_text_type = int(input("Select one way of inputting your text: \n1. Type your Text (or Copy-Paste)\n2. Load from .txt file\n3. Load from .pdf file\n4. From Wikipedia Page URL\n\n"))
except ValueError:
    input_text_type = None

# Load the text to summarise from the source the user picked.
if input_text_type == 1:
    original_text = input("Enter your text: \n\n")
elif input_text_type == 2:
    txt_path = input("Enter file path: ")
    original_text = file_text(txt_path)
elif input_text_type == 3:
    file_path = input("Enter file path: ")
    original_text = pdf_reader(file_path)
elif input_text_type == 4:
    wiki_url = input("Enter Wikipedia URL to load Article: ")
    original_text = wiki_text(wiki_url)
else:
    print("Sorry! Wrong Input, Try Again.")
    sys.exit()

#  Define functions to create Tf-Idf Matrix
# Run the spaCy pipeline over the input and collect its sentence spans.
text = nlp(original_text)
sentences = list(text.sents)
# Number of sentences found in the whole input text.
total_sentences = len(sentences)

def frequency_matrix(sentences):
    """Count lemma frequencies for each sentence.

    Tokens are kept only when their text is alphanumeric and their lower-cased
    form is not a spaCy stop word; kept tokens are lower-cased and lemmatized
    before counting.

    Returns:
        A dict mapping the first 15 characters of each sentence's text to a
        ``{lemma: count}`` dict. Sentences that share the same first 15
        characters share one key, so the later one replaces the earlier one.
    """
    ignored = nlp.Defaults.stop_words
    table_by_sentence = {}

    for sentence in sentences:
        counts = {}
        for token in sentence:
            lowered = token.text.lower()
            if not token.text.isalnum() or lowered in ignored:
                continue
            lemma = lemmatizer.lemmatize(lowered)
            counts[lemma] = counts.get(lemma, 0) + 1
        table_by_sentence[sentence.text[:15]] = counts

    return table_by_sentence

def tf_matrix(freq_matrix):
    """Turn per-sentence word counts into term-frequency values.

    Each count is divided by the number of distinct words recorded for that
    sentence. The result has the same keys as *freq_matrix*.
    """
    tf_by_sentence = {}
    for sent_key, counts in freq_matrix.items():
        distinct_words = len(counts)
        tf_by_sentence[sent_key] = {
            term: occurrences / distinct_words
            for term, occurrences in counts.items()
        }
    return tf_by_sentence

def idf_matrix(freq_matrix, sent_per_words, total_sentences):
    """Compute an inverse-document-frequency value for every word of every sentence.

    For each word the value is ``log10(total_sentences / sent_per_words[word])``,
    where *sent_per_words* gives the number of sentences containing the word.
    The result mirrors the key structure of *freq_matrix*.
    """
    idf_by_sentence = {}
    for sent_key, counts in freq_matrix.items():
        weights = {}
        for term in counts:
            containing = float(sent_per_words[term])
            weights[term] = math.log10(total_sentences / containing)
        idf_by_sentence[sent_key] = weights
    return idf_by_sentence

def score_sentences(tf_idf_matrix):
    """Score each sentence as the mean of its words' TF-IDF values.

    A sentence with no scored words gets a score of 0.
    """
    scores = {}
    for sent_key, weights in tf_idf_matrix.items():
        if weights:
            scores[sent_key] = sum(weights.values()) / len(weights)
        else:
            scores[sent_key] = 0
    return scores

def average_score(sentence_score):
    """Return the mean of all sentence scores, or 0 when there are none."""
    if not sentence_score:
        return 0
    total = sum(sentence_score.values())
    return total / len(sentence_score)

def create_summary(sentences, sentence_score, threshold):
    """Join the sentences whose score reaches *threshold* into one string.

    A sentence's score is looked up by the first 15 characters of its text;
    sentences without an entry in *sentence_score* are left out. The selected
    sentences keep their original order and are separated by single spaces.
    """
    selected = []
    for sentence in sentences:
        key = sentence.text[:15]
        if key not in sentence_score:
            continue
        if sentence_score[key] >= threshold:
            selected.append(sentence.text)
    return ' '.join(selected)

#  Generate summary
if total_sentences > 100:  # Limit processing for long texts
    sentences = sentences[:100]  # Process only the first 100 sentences

freq_matrix = frequency_matrix(sentences)

# Results get their own names so the tf_matrix()/idf_matrix() functions are not
# overwritten by their return values (which would break re-running this section).
tf_scores = tf_matrix(freq_matrix)

# Number of sentence tables containing each word, counted in a single pass.
sent_per_words = {}
for f_table in freq_matrix.values():
    for word in f_table:
        sent_per_words[word] = sent_per_words.get(word, 0) + 1

# IDF must use the number of sentences actually processed: after truncation the
# word counts above only cover `sentences`, not the whole document.
idf_scores = idf_matrix(freq_matrix, sent_per_words, len(sentences))

# TF-IDF of a word in a sentence is the product of its TF and IDF values.
tf_idf_matrix = {
    sent: {word: tf_value * idf_scores[sent][word] for word, tf_value in tf_table.items()}
    for sent, tf_table in tf_scores.items()
}
sentence_scores = score_sentences(tf_idf_matrix)
threshold = average_score(sentence_scores)
# Keep only sentences scoring at least 1.3 times the average sentence score.
summary = create_summary(sentences, sentence_scores, 1.3 * threshold)

#  output text
def justify_text(text, width=80):
    """Wrap *text* to *width* columns and fully justify it.

    Every wrapped line except the last is padded with extra spaces between
    its words until it is ``width`` characters long; when the padding does not
    divide evenly, the leftmost gaps receive one extra space. The final line
    and any single-word line are left as wrapped, so a short closing line is
    not stretched across the whole width.

    Args:
        text: The text to format.
        width: Target line width in characters.

    Returns:
        The formatted lines joined with newline characters ("" for empty text).
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    last_index = len(wrapped_lines) - 1
    justified_lines = []

    for index, line in enumerate(wrapped_lines):
        words = line.split()
        gap_count = len(words) - 1
        # A one-word line has no gap to widen; the last line stays left-aligned.
        if gap_count < 1 or index == last_index:
            justified_lines.append(line)
            continue

        spaces_needed = width - sum(len(word) for word in words)
        base_gap, wider_gaps = divmod(spaces_needed, gap_count)
        padded = [
            word + ' ' * (base_gap + (1 if position < wider_gaps else 0))
            for position, word in enumerate(words[:-1])
        ]
        justified_lines.append(''.join(padded) + words[-1])

    return '\n'.join(justified_lines)

justified_summary = justify_text(summary)

# Print the summary as one string: passing it as a separate print() argument
# inserts a separator space in front of it, pushing its first line out of alignment.
print(f"\n\n Summarised Passage:\n\n{justified_summary}\n\n")
print("Total words in original article =", len(original_text.split()))
print("Total words in summarized article =", len(justified_summary.split()))


[nltk_data] Downloading package wordnet to /root/nltk_data...


Select one way of inputting your text: 
1. Type your Text (or Copy-Paste)
2. Load from .txt file
3. Load from .pdf file
4. From Wikipedia Page URL

4
Enter Wikipedia URL to load Article: https://en.wikipedia.org/wiki/Chennai


 Summarised Passage:

 Chennai  (/ˈtʃɛnaɪ/  ⓘ;  Tamil: It is located on the Coromandel Coast of the Bay
of  Bengal.  Historically, the region was part of the Chola, Pandya, Pallava and
Vijayanagara  kingdoms  during  various eras. The city was officially renamed as
Chennai  in  1996.  It  is a major film production centre and home to the Tamil-
language  film  industry.  The region was part of Tondaimandalam which was ruled
by  the  Early  Cholas  in the 2nd century CE by subduing Kurumbas, the original
inhabitants  of  the  region.  In  1612,  the  Dutch established themselves near
Pulicat,  north of Chennai. The region was then formerly a fishing village known
as  "Madraspatnam". They resisted a French siege attempt in 1759. Chennai's soil
is  mostly clay, shal

Abstractive Summarisation

In [None]:
!pip install transformers



In [None]:
import sys
import math
import bs4 as bs
import urllib.request
import re
import PyPDF2
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
from transformers import pipeline
import textwrap


# Download the WordNet corpus that WordNetLemmatizer relies on.
nltk.download('wordnet')

# Initialize the model for summarization
# Name the checkpoint and revision explicitly (the same ones the library picks by
# default) so results stay reproducible and the "No model was supplied" warning goes away.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

# Initialize NLP tools
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()

def file_text(filepath):
    """Return the contents of a UTF-8 text file with each newline replaced by a space."""
    with open(filepath, 'r', encoding='utf-8') as source:
        raw = source.read()
    flattened = raw.replace("\n", ' ')
    return flattened

def pdf_reader(pdf_path):
    """Extract text from a PDF, optionally limited to a page range chosen interactively.

    Prints the page count, then asks whether the whole document should be
    read. Answering "n" prompts for zero-based start and end page numbers
    (both inclusive). An invalid page number prints a message and exits the
    program via ``sys.exit()``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of the selected pages concatenated without a separator.
    """
    with open(pdf_path, 'rb') as pdfFileObject:
        reader = PyPDF2.PdfReader(pdfFileObject)
        count = len(reader.pages)
        print("\nTotal Pages in pdf =", count)

        if input("Do you want to read entire pdf? [Y]/N: ").strip().lower() == 'n':
            try:
                start_page = int(input("Enter start page number (Indexing starts from 0): "))
                end_page = int(input(f"Enter end page number (Less than {count}): "))
            except ValueError:
                # A non-numeric answer is reported like any other invalid page.
                print("\nInvalid page number given")
                sys.exit()

            if start_page < 0 or start_page >= count:
                print("\nInvalid Start page given")
                sys.exit()
            if end_page < 0 or end_page >= count:
                print("\nInvalid End page given")
                sys.exit()
            # An inverted range would otherwise produce an empty text silently.
            if start_page > end_page:
                print("\nStart page must not be greater than End page")
                sys.exit()
        else:
            start_page, end_page = 0, count - 1

        return ''.join(reader.pages[i].extract_text() for i in range(start_page, end_page + 1))

def wiki_text(url):
    """Fetch the page at *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch (the caller passes a Wikipedia URL).

    Returns:
        All paragraph texts concatenated without a separator, with bracketed
        numeric markers such as ``[12]`` removed.
    """
    # The with-block closes the HTTP response once the body is read, so the
    # underlying connection is not left open.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    parsed_article = bs.BeautifulSoup(html, 'lxml')
    article_text = ''.join(p.text for p in parsed_article.find_all('p'))
    # Remove citation markers like "[3]" from the paragraph text.
    return re.sub(r'\[[0-9]*\]', '', article_text)

# Getting Text
# A non-numeric answer makes int() raise ValueError; treat it as an invalid
# choice so the "Wrong Input" message is shown instead of a traceback.
try:
    input_text_type = int(input("Select one way of inputting your text: \n1. Type your Text (or Copy-Paste)\n2. Load from .txt file\n3. Load from .pdf file\n4. From Wikipedia Page URL\n\n"))
except ValueError:
    input_text_type = None

# Load the text to summarise from the source the user picked.
if input_text_type == 1:
    text = input("Enter your text: \n\n")
elif input_text_type == 2:
    txt_path = input("Enter file path: ")
    text = file_text(txt_path)
elif input_text_type == 3:
    file_path = input("Enter file path: ")
    text = pdf_reader(file_path)
elif input_text_type == 4:
    wiki_url = input("Enter Wikipedia URL to load Article: ")
    text = wiki_text(wiki_url)
else:
    print("Sorry! Wrong Input, Try Again.")
    sys.exit()

# function to Generate Abstractive Summary
def generate_summary(text, max_chunk_size=250):
    """Summarise *text* with the module-level ``summarizer`` pipeline.

    Text of at most *max_chunk_size* whitespace-separated words is summarised
    in one call. Longer text is split on ". ", the fragments are packed into
    chunks of at most *max_chunk_size* words (a single longer fragment forms a
    chunk of its own), each chunk is summarised separately, and the partial
    summaries are joined with spaces.

    Args:
        text: The text to summarise.
        max_chunk_size: Maximum number of words sent to the model per call.

    Returns:
        The summary text, or "" when *text* is empty or only whitespace.
    """
    # Nothing to summarise: avoid sending an empty prompt to the model.
    if not text.strip():
        return ""

    if len(text.split()) <= max_chunk_size:
        return _summarize_chunk(text)

    fragments = text.split('. ')
    last_index = len(fragments) - 1
    summaries = []
    current_chunk = ""

    for index, fragment in enumerate(fragments):
        if not fragment.strip():
            continue
        # Flush only a non-empty chunk; otherwise an oversized first fragment
        # would trigger a model call on an empty string.
        if current_chunk and len(current_chunk.split()) + len(fragment.split()) > max_chunk_size:
            summaries.append(_summarize_chunk(current_chunk))
            current_chunk = ""
        # Restore the ". " removed by split(); the final fragment never had one.
        current_chunk += fragment if index == last_index else fragment + '. '

    if current_chunk:
        summaries.append(_summarize_chunk(current_chunk))

    return ' '.join(summaries)

def _summarize_chunk(chunk):
    """Run the summarization pipeline on one chunk and return its summary text."""
    result = summarizer(chunk, max_length=80, min_length=20, do_sample=False)
    return result[0]['summary_text']

# Produce the abstractive summary of the text loaded above.
summary = generate_summary(text)

#  output text
def justify_text(text, width=80):
    """Wrap *text* to *width* columns and fully justify it.

    All wrapped lines but the last are widened to ``width`` characters by
    adding spaces between words; spaces that cannot be shared out evenly go
    to the leftmost gaps. The last line and single-word lines are kept as
    wrapped, so a short final line is not spread over the full width.

    Args:
        text: The text to format.
        width: Target line width in characters.

    Returns:
        The formatted lines joined with newline characters ("" for empty text).
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    final_line = len(wrapped_lines) - 1
    justified_lines = []

    for line_number, line in enumerate(wrapped_lines):
        words = line.split()
        gaps = len(words) - 1
        # Leave the closing line ragged; a lone word cannot be spread out.
        if line_number == final_line or gaps < 1:
            justified_lines.append(line)
            continue

        spaces_needed = width - sum(len(word) for word in words)
        space_between_words, extra_spaces = divmod(spaces_needed, gaps)
        pieces = []
        for i, word in enumerate(words[:-1]):
            pieces.append(word)
            pieces.append(' ' * (space_between_words + (1 if i < extra_spaces else 0)))
        pieces.append(words[-1])
        justified_lines.append(''.join(pieces))

    return '\n'.join(justified_lines)

justified_summary = justify_text(summary)

# Emit the summary inside a single string: as a separate print() argument it
# would be preceded by a separator space, shifting its first line by one column.
print(f"\n Summarised Passage:\n\n{justified_summary}\n\n")
print("Total words in original article =", len(text.split()))
print("Total words in summarized article =", len(justified_summary.split()))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Select one way of inputting your text: 
1. Type your Text (or Copy-Paste)
2. Load from .txt file
3. Load from .pdf file
4. From Wikipedia Page URL

4
Enter Wikipedia URL to load Article: https://en.wikipedia.org/wiki/Chennai


Your max_length is set to 80, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)



 Summarised Passage:

 Chennai  is  the  capital and largest city of Tamil Nadu, the southernmost state
of  India  .  It  is  located on the Coromandel Coast of the Bay of Bengal . The
Greater Chennai Corporation is the oldest municipal corporation in India and the
second  oldest  in  the  world  after London . The region was part of the Chola,
Pandya,  Pallava  and  Vijayanagara . The name Chennai was derived from the name
of Chennappa Nayaka, a Nayak ruler who served as a general under Venkata Raya of
the  Vijayanagara  Empire  from whom the British East India Company acquired the
town  in  1639  .  Chennai  is  a major centre for medical tourism and is termed
"India's  health  capital"  The  name  Madras  is of native origin, and has been
shown to have been in use before the British established a presence in India . A
land  grant  was  given  to  the Chennakesava Perumal Temple in Chennapatanam in
1646,  which  some  scholars  argue to be the first use of the name . The region
was 