<a href="https://colab.research.google.com/github/Anderson-Andre-P/Summary-Generator-With-Python-and-NLTK/blob/main/SummarizerAlgorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install PyPDF2
!pip install reportlab
!pip install rouge

from google.colab import files
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
import heapq
import re
import io
from reportlab.pdfgen import canvas
import textwrap
from PyPDF2 import PdfReader
from rouge import Rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [25]:
class Summarizer:
    def __init__(self, pdf_file_path, language='english', summary_quality=5):
        nltk.download('punkt')
        nltk.download('stopwords')
        self.pdf_file_path = pdf_file_path
        self.language = language
        self.summary_quality = summary_quality
        self.pdf_text = self.extract_text_from_pdf()
        self.stop_words = set(stopwords.words(self.language))

    def extract_text_from_pdf(self):
        pdf_text = ""
        pdf_reader = PdfReader(self.pdf_file_path)
        for page in pdf_reader.pages:
            pdf_text += page.extract_text()
        return ' '.join(pdf_text.split())

    def preprocess_text(self, text):
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'[0-9]', ' ', text)
        return re.sub(r' +', ' ', text)

    def tokenize_and_filter_words(self, text):
        words = word_tokenize(text.lower())
        return [word for word in words if word not in self.stop_words and word.isalpha()]

    def stem_words(self, words):
        ps = PorterStemmer()
        return [ps.stem(word) for word in words]

    def calculate_word_frequency(self, words):
        return nltk.FreqDist(words)

    def calculate_sentence_scores(self, sentences, word_freq):
        scores = {}
        for sentence in sentences:
            for word in word_tokenize(sentence.lower()):
                if word in self.words_without_stopwords:
                    if sentence not in scores:
                        scores[sentence] = word_freq[word]
                    else:
                        scores[sentence] += word_freq[word]
        return scores

    def summarize(self):
        self.pdf_text = self.preprocess_text(self.pdf_text)
        self.words_without_stopwords = self.tokenize_and_filter_words(self.pdf_text)
        self.stemmed_words = self.stem_words(self.words_without_stopwords)
        self.word_frequency = self.calculate_word_frequency(self.stemmed_words)
        self.sentence_scores = self.calculate_sentence_scores(sent_tokenize(self.pdf_text), self.word_frequency)
        best_sentences = heapq.nlargest(self.summary_quality, self.sentence_scores, key=self.sentence_scores.get)
        summary = ' '.join(best_sentences)
        return summary

def add_line_breaks(text, characters_per_line=75):
    lines = textwrap.wrap(text, characters_per_line)
    return "\n".join(lines)

def create_pdf(summary, pdf_file_path, characters_per_line=70):
    buffer = io.BytesIO()
    pdf_canvas = canvas.Canvas(buffer)

    summary_pages = textwrap.wrap(summary, characters_per_line)

    x, y = 100, 750

    for page_index, page in enumerate(summary_pages):
        pdf_canvas.drawString(x, y, page)
        y -= 15
        if page_index < len(summary_pages) - 1 and y <= 100:
            pdf_canvas.showPage()
            y = 750

    pdf_canvas.save()

    with open("generated_summary.pdf", "wb") as output_file:
        output_file.write(buffer.getvalue())

    return "generated_summary.pdf"

# def calculate_rouge(summary):
#     rouge = Rouge()
#     scores = rouge.get_scores(summary, summary)
#     return scores[0]

if __name__ == "__main__":
    pdf_file_path = upload_file()
    if pdf_file_path:
        language = input("Enter the language for the summary (default is 'english'): ")
        summary_quality = int(input("Enter the quality of the summary (number of sentences): "))
        summarizer = Summarizer(pdf_file_path, language=language, summary_quality=summary_quality)
        summary = summarizer.summarize()
        summary_with_line_breaks = add_line_breaks(summary)
        pdf_path = create_pdf(summary_with_line_breaks, pdf_file_path)
        print("Summary: ", summary)
        print("Generated PDF: ", pdf_path)

        # rouge_scores = calculate_rouge(summary)
        # print("ROUGE Scores:")
        # print("ROUGE-1: ", rouge_scores['rouge-1']['f'])
        # print("ROUGE-2: ", rouge_scores['rouge-2']['f'])
        # print("ROUGE-L: ", rouge_scores['rouge-l']['f'])
    else:
        print("No file uploaded.")

Saving article.pdf to article.pdf
Enter the language for the summary (default is 'english'): portuguese
Enter the quality of the summary (number of sentences): 5


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Summary:  Nome Anderson André Pereira Eleutério RA Introdução Segundo Fadel e Silveira as metodologias ágeis são muito utilizadas por empresas desenvolvedoras de software que visam administrar melhor seus recursos e o tempo disponível para desenvolvimento das atividades Um dos objetivos que as organizações têm ao utilizar as metodologias ágeis é melhorar o fluxo do desenvolvimento de software alocando menos recursos na produção e aumentando a eficiência das equipes isso melhora a qualidade dos produtos desenvolvidos Os mesmos autores informam que o um encontro realizado por desenvolvedores de software identificou princípios e premissas que estão presentes no manifesto ágil As premissas podem ser representados nos seguintes tópicos Valorizar pessoas e interações acima de processos e ferramentas Priorizar o software funcional em vez de documentação detalhada Dar importância à colaboração com o cliente em vez de negociação de contratos Ser adaptável às mudanças em vez de seguir um plano i

In [None]:
# class Summarizer:
#     def __init__(self, pdf_file_path, language='english', summary_quality=5):
#         nltk.download('punkt')
#         nltk.download('stopwords')
#         self.pdf_file_path = pdf_file_path
#         self.language = language
#         self.summary_quality = summary_quality
#         self.pdf_text = self.extract_text_from_pdf()
#         self.stop_words = set(stopwords.words(self.language))


#     def extract_text_from_pdf(self):
#         pdf_text = ""
#         reader = PdfReader(self.pdf_file_path)
#         for page in reader.pages:
#             pdf_text += page.extract_text()
#         return ' '.join(pdf_text.split())

#     def preprocess_text(self, text):
#         text = re.sub(r'[^\w\s]', '', text)
#         text = re.sub(r'[0-9]', ' ', text)
#         return re.sub(r' +', ' ', text)

#     def tokenize_and_filter_words(self, text):
#         palavras = word_tokenize(text.lower())
#         return [palavra for palavra in palavras if palavra not in self.stop_words and palavra.isalpha()]

#     def stem_words(self, words):
#         ps = PorterStemmer()
#         return [ps.stem(palavra) for palavra in words]

#     def calculate_word_frequency(self, words):
#         return nltk.FreqDist(words)

#     def calculate_sentence_scores(self, sentences, word_freq):
#         scores = {}
#         for sentenca in sentences:
#             for palavra in word_tokenize(sentenca.lower()):
#                 if palavra in self.palavras_sem_stopwords:
#                     if sentenca not in scores:
#                         scores[sentenca] = word_freq[palavra]
#                     else:
#                         scores[sentenca] += word_freq[palavra]
#         return scores

#     def summarize(self):
#         self.pdf_text = self.preprocess_text(self.pdf_text)
#         self.palavras_sem_stopwords = self.tokenize_and_filter_words(self.pdf_text)
#         self.palavras_stemmed = self.stem_words(self.palavras_sem_stopwords)
#         self.frequencia_palavras = self.calculate_word_frequency(self.palavras_stemmed)
#         self.pontuacao_sentencas = self.calculate_sentence_scores(sent_tokenize(self.pdf_text), self.frequencia_palavras)
#         melhores_sentencas = heapq.nlargest(self.summary_quality, self.pontuacao_sentencas, key=self.pontuacao_sentencas.get)
#         resumo = ' '.join(melhores_sentencas)
#         return resumo


# def adicionar_quebras_de_linha(texto, caracteres_por_linha=75):
#     linhas = textwrap.wrap(texto, caracteres_por_linha)
#     return "\n".join(linhas)

# def criar_pdf(resumo, arquivo_pdf_path, caracteres_por_linha=70):
#     buffer = io.BytesIO()
#     pdf_canvas = canvas.Canvas(buffer)

#     paginas_do_resumo = textwrap.wrap(resumo, caracteres_por_linha)

#     x, y = 100, 750

#     for indice_pagina, pagina in enumerate(paginas_do_resumo):
#         pdf_canvas.drawString(x, y, pagina)
#         y -= 15  # ajusta a posição da próxima linha
#         if indice_pagina < len(paginas_do_resumo) - 1 and y <= 100:
#             pdf_canvas.showPage()
#             y = 750

#     pdf_canvas.save()

#     with open("resumo_gerado.pdf", "wb") as output_file:
#         output_file.write(buffer.getvalue())

#     return "resumo_gerado.pdf"


# def upload_file():
#     uploaded = files.upload()
#     if len(uploaded) > 0:
#         return list(uploaded.keys())[0]
#     else:
#         return None

# if __name__ == "__main__":
#     pdf_file_path = upload_file()
#     if pdf_file_path:
#         language = input("Enter the language for the summary (default is 'english'): ")
#         summary_quality = int(input("Enter the quality of the summary (number of sentences): "))
#         summarizer = Summarizer(pdf_file_path, language=language, summary_quality=summary_quality)
#         resumo = summarizer.summarize()
#         resumo_com_quebras = adicionar_quebras_de_linha(resumo)
#         pdf_path = criar_pdf(resumo_com_quebras, pdf_file_path)
#         print("Resumo: ", resumo)
#         print("PDF gerado: ", pdf_path)
#     else:
#         print("Nenhum arquivo enviado.")