In [1]:
# Pin the dependency so a fresh "Restart & Run All" installs the same release this notebook was run with.
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import PyPDF2
import os

def combine_pdf_text_to_single_file(pdf_path, output_txt_path):
    """
    Extract the text of every page of a PDF and write it to one UTF-8 .txt file.

    Each page is written as a ``--- PAGE n ---`` header line (1-based),
    followed by the page's stripped text and a blank line. Pages for which no
    text is extracted, or whose extracted text is only whitespace, get a
    placeholder line instead, so they remain visible in the output.

    Problems are reported with ``print`` rather than raised: a missing input
    path prints an error and returns early without creating the output file,
    and any exception raised while reading the PDF or writing the output is
    caught and printed.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_txt_path (str): The path for the final output .txt file.

    Returns:
        None
    """
    # Bail out early (without touching the output file) when the input is missing
    if not os.path.exists(pdf_path):
        print(f"Error: The file '{pdf_path}' was not found. Make sure it's in the same folder as the script.")
        return

    # One formatted chunk (header + body + spacing) per page
    all_pages_text = []

    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            print(f"📄 Reading {num_pages} pages from '{pdf_path}'...")

            for page_num in range(num_pages):
                # Strip BEFORE testing for emptiness: a page whose extracted
                # text is only whitespace must get the placeholder too,
                # instead of silently producing an empty page body.
                text = (pdf_reader.pages[page_num].extract_text() or "").strip()
                if not text:
                    text = "[No text could be extracted from this page]"

                # Header marks where the page begins; the trailing blank line
                # separates it from the next page.
                all_pages_text.append(f"--- PAGE {page_num + 1} ---\n{text}\n\n")

        # Join all the collected text into one string
        full_content = "".join(all_pages_text)

        # Write only after the whole PDF was read, so a read failure does not
        # leave a half-written output file behind.
        with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(full_content)

        print(f"✅ Success! All text has been saved to '{output_txt_path}'.")

    except Exception as e:
        # Deliberately best-effort: report the problem instead of raising
        print(f"An unexpected error occurred: {e}")

# --- Script entry point ---
if __name__ == "__main__":
    # Destination text file; a bare file name, so it is created relative to the current working directory
    txt_output = "Aula_5_conteudo_completo.txt"

    # Source PDF to extract text from.
    # NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
    pdf_input = "/media/danielterra/Windows-SSD/Users/danie/Documents/Documents/MESTRADO/25-2_aprendizado-por-reforco/aulas/Aula 5 - Programação Dinâmica.pdf"

    combine_pdf_text_to_single_file(pdf_path=pdf_input, output_txt_path=txt_output)

📄 Reading 26 pages from '/media/danielterra/Windows-SSD/Users/danie/Documents/Documents/MESTRADO/25-2_aprendizado-por-reforco/aulas/Aula 5 - Programação Dinâmica.pdf'...
✅ Success! All text has been saved to 'Aula_5_conteudo_completo.txt'.
