In [None]:
import fitz  # type: ignore # PyMuPDF
import re
import os

def trim_and_structure_text(raw_text):
    r"""Normalise whitespace in extracted text and set off numbered headers.

    The steps run in this order:

    1. Runs of blank lines (newlines separated only by whitespace) collapse
       to a single newline.
    2. Any remaining run of two or more whitespace characters collapses to a
       single space. ``\s`` also matches newlines, so a newline that sits
       next to other whitespace becomes a space as well.
    3. Numbered markers such as ``1.`` or ``12.`` are pushed onto a new line
       and followed by exactly one space. Digits that are part of a decimal
       or dotted number (``3.5``, ``1.1``) are left untouched.
    4. Leading and trailing whitespace is stripped from the final result.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Collapse blank lines first so step 2 only sees single newlines.
    normalized_text = re.sub(r"\n\s*\n", "\n", raw_text)
    normalized_text = re.sub(r"\s{2,}", " ", normalized_text)

    # The lookbehind/lookahead keep "3.5" or "1.1" intact: a marker must not
    # be preceded by a digit or dot, nor followed directly by a digit.
    # Swallowing the spaces/tabs after the marker avoids emitting a double
    # space when the source already had "1. Title".
    normalized_text = re.sub(
        r"(?<![\d.])(\d+\.)(?!\d)[ \t]*", r"\n\1 ", normalized_text
    )

    # Strip last: the header step can itself add a leading newline or a
    # trailing space.
    return normalized_text.strip()


def convert_pdf_to_text(pdf_path, output_dir=None):
    """Extract the text of a PDF, clean it, and write it to a ``.txt`` file.

    The output file is named after the PDF (same base name, ``.txt``
    extension). The text of every page is concatenated, each page followed
    by a newline, and then passed through ``trim_and_structure_text``.

    Args:
        pdf_path: Path to the PDF file to convert.
        output_dir: Directory to write the ``.txt`` file into. It is created
            if missing. When ``None`` (the default) the file is written
            relative to the current working directory, as before.

    Returns:
        The path of the text file that was written.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing regular file.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"No such file: '{pdf_path}'")

    # Base name of the PDF without its extension, reused for the output file.
    document_base_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Iterate the pages directly and join once instead of growing a string
    # with += on every page.
    with fitz.open(pdf_path) as doc:
        extracted_text = "".join(f"{page.get_text()}\n" for page in doc)

    cleaned_text = trim_and_structure_text(extracted_text)

    output_path = f"{document_base_name}.txt"
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_path)

    with open(output_path, "w", encoding="utf-8") as file:
        file.write(cleaned_text)

    print(f"The cleaned text has been saved to {output_path}")
    return output_path

def process_pdf_files_in_directory(folder_path):
    """Convert every PDF directly inside ``folder_path`` to a text file.

    Only the entries of ``folder_path`` itself are considered; subfolders are
    not searched. An entry counts as a PDF when its name ends in ``.pdf``,
    compared case-insensitively so that ``FILE.PDF`` is included too.
    Entries are processed in sorted name order, because ``os.listdir``
    returns them in arbitrary order and the result should be reproducible.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        A list with the path returned by ``convert_pdf_to_text`` for each
        PDF, in processing order. Empty when the folder holds no PDFs.
    """
    return [
        convert_pdf_to_text(os.path.join(folder_path, file_name))
        for file_name in sorted(os.listdir(folder_path))
        if file_name.lower().endswith(".pdf")
    ]

if __name__ == "__main__":
    # Script entry point: convert all syllabus PDFs in one folder to text.
    # NOTE: the folder is a hard-coded, machine-specific absolute path; edit
    # it before running on another machine.
    syllabus_directory = os.path.abspath("/Users/lennon/Github/rumad-v2-app-segmentation-fault/syllabuses")
    # Holds the path of the .txt file returned for each converted PDF.
    text_file_paths = process_pdf_files_in_directory(syllabus_directory)
    print(f"Generated text files: {text_file_paths}")

The cleaned text has been saved to CIIC-4998-Undergraduate-Research.txt
The cleaned text has been saved to CIIC-3075-Foundations-of-Computing.txt
The cleaned text has been saved to INSO-4151-Software-Engineering-Project-I.txt
The cleaned text has been saved to INSO-4116-Software-Design.txt
The cleaned text has been saved to CIIC-3081-Computer-Architecture-I.txt
The cleaned text has been saved to CIIC-5017-Operating-Systems-and-Network-Administration-and-Security.txt
The cleaned text has been saved to CIIC-5110-Bioinformatics-Algorithms.txt
The cleaned text has been saved to CIIC-5140-Big-Data-Analytics.txt
The cleaned text has been saved to CIIC-5019-High-Performance-Computing.txt
The cleaned text has been saved to CIIC-4030-Programming-Languages.txt
The cleaned text has been saved to INSO-4101-Introduction-to-Software-Engineering.txt
The cleaned text has been saved to CIIC-5029-Compilers-Development.txt
The cleaned text has been saved to CIIC-4025-Analysis-and-Design-of-Algorithms.txt