In [None]:
pip install PyMuPDF

In [None]:
import os
import fitz  # PyMuPDF
from google.colab import drive

# Mount Google Drive at /content/drive so the Drive paths used in this
# notebook resolve inside the Colab runtime.
drive.mount('/content/drive')

# Folder whose PDFs are rewritten in place by the loop further down
# (each one has its last page removed via remove_last_page).
folder_path = "/content/drive/My Drive/Portfolio/Novel_RAG_Project/Data/beginning_processed_pdfs/"

def remove_last_page(pdf_path):
    """Remove the last page of a PDF, rewriting the file in place.

    The trimmed document is first saved to a sibling temporary file
    (``<name>_temp<ext>``) and then moved over the original with
    ``os.replace``, so the original is only overwritten once the new file
    has been written completely.

    Args:
        pdf_path: Path to the PDF file to trim.

    Returns:
        None. Progress and errors are reported with ``print``; PDFs with a
        single page (or none) are left untouched. Exceptions are caught and
        reported rather than propagated, so one bad file does not stop a
        batch run.
    """
    # Derive the temp name from the split extension instead of
    # str.replace(".pdf", ...): replace() is case-sensitive (so "X.PDF" would
    # map onto itself) and would also rewrite ".pdf" inside directory names.
    stem, extension = os.path.splitext(pdf_path)
    temp_pdf_path = f"{stem}_temp{extension}"

    # Only clean up the temp file if this call may have created it.
    save_started = False
    try:
        # The context manager closes the document on every path, including
        # the single-page skip and any exception.
        with fitz.open(pdf_path) as doc:
            if len(doc) <= 1:
                print(f"Skipping {pdf_path}, as it only has one page.")
                return

            doc.delete_page(-1)  # Delete last page
            save_started = True
            doc.save(temp_pdf_path)

        # The document is closed at this point, so the original can be
        # replaced safely.
        os.replace(temp_pdf_path, pdf_path)
        print(f"Updated: {pdf_path}")
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        # Do not leave a partial "*_temp.pdf" behind: a later run over the
        # folder would pick it up as if it were a real input PDF.
        if save_started and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")

# Collect the PDF file names in folder_path (extension matched
# case-insensitively), then trim the last page from each one in turn.
pdf_names = [
    entry for entry in os.listdir(folder_path)
    if entry.lower().endswith(".pdf")
]
for entry in pdf_names:
    remove_last_page(os.path.join(folder_path, entry))

print("Processing complete.")

In [None]:
# Folder that process_pdfs() reads PDFs from, and folder that receives the
# extracted .txt files together with a copy of each source PDF.
source_folder = "/content/drive/My Drive/Portfolio/Novel_RAG_Project/Data/trimmed_BettyNeelsDataset_IA/"
dest_folder = "/content/drive/My Drive/Portfolio/Novel_RAG_Project/Data/text_BettyNeelsDataset/"

# Create the destination folder (and any missing parents); exist_ok=True
# makes this a no-op when the folder is already there.
os.makedirs(dest_folder, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of a PDF and normalise its whitespace.

    Text is read page by page in document order. Every line is stripped of
    leading/trailing whitespace and lines that end up empty are dropped, so
    the result contains no blank lines.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The cleaned text as a single newline-joined string (possibly empty),
        or ``None`` if the file could not be opened or read; in that case
        the error is reported with ``print``.
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract part-way through.
        with fitz.open(pdf_path) as doc:
            page_texts = [page.get_text("text") for page in doc]
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

    # Join once instead of growing a string with += per page, then drop
    # surrounding whitespace and blank lines.
    stripped_lines = (line.strip() for line in "\n".join(page_texts).splitlines())
    return "\n".join(line for line in stripped_lines if line)

def process_pdfs():
    """Extract text from every PDF in ``source_folder`` into ``dest_folder``.

    For each file in ``source_folder`` whose name ends in ``.pdf``
    (case-insensitive), the text returned by ``extract_text_from_pdf`` is
    written as UTF-8 to ``<base name>.txt`` in ``dest_folder`` and the
    original PDF is copied next to it. PDFs whose extraction fails or yields
    no text are skipped, and nothing is written or copied for them.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported here because the notebook's import cell does not import
    # shutil; without this the copy step raises NameError.
    import shutil

    pdf_suffix = ".pdf"
    for filename in os.listdir(source_folder):
        if not filename.lower().endswith(pdf_suffix):
            continue

        pdf_path = os.path.join(source_folder, filename)
        text = extract_text_from_pdf(pdf_path)
        if not text:
            # None means extraction failed and was already reported; an
            # empty string means the PDF has no extractable text.
            if text is not None:
                print(f"Skipping {pdf_path}: no text could be extracted.")
            continue

        # Swap only the trailing extension. str.replace(".pdf", ".txt") is
        # case-sensitive, so "X.PDF" kept its name and the text file was
        # then overwritten by the PDF copy below; it also rewrote ".pdf"
        # occurring in the middle of a name.
        text_filename = filename[: -len(pdf_suffix)] + ".txt"
        text_path = os.path.join(dest_folder, text_filename)

        # Save extracted text
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Processed and saved: {text_path}")

        # Copy original PDF to new folder
        shutil.copy(pdf_path, os.path.join(dest_folder, filename))

# Run the extraction over every PDF in source_folder; text files and PDF
# copies are written to dest_folder.
process_pdfs()

# Printed once process_pdfs() has returned.
print("Processing complete.")