In [1]:
import hashlib
import os
import pathlib
import subprocess
import tempfile
from io import StringIO
from pathlib import Path

import docx2txt
import fitz
import magic
from IPython.display import clear_output
from tqdm import tqdm
# from pdfminer.converter import TextConverter
# from pdfminer.layout import LAParams
# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# from pdfminer.pdfpage import PDFPage
# from pdfminer.pdfparser import PDFSyntaxError
# from pdfminer.pdfdocument import PDFEncryptionError
# import pdfplumber

ModuleNotFoundError: No module named 'docx2txt'

In [None]:
def extract_text_from_pdf_fitz(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file_path: Location of the PDF, as a ``str`` or ``pathlib.Path``.

    Returns:
        A single string made of ``page.get_text()`` for each page, with no
        separator inserted between pages. An empty document yields ``""``.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text`` (for
        example when the file is missing or cannot be parsed).
    """
    # PyMuPDF opens the document from its path, so no separate Python file
    # handle is required; the context manager closes the document on exit.
    with fitz.open(str(pdf_file_path)) as pdf:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in pdf)

In [2]:
# def extract_text_from_pdf_pdfplumber(pdf_file_path):
#     """Extracts text from a PDF file."""
#     with pdf_file_path.open('rb') as file:
#         with pdfplumber.open(file) as pdf:
#             text = ""
#             for page in pdf.pages:
#                 text += page.extract_text()
#                 clear_output(wait=True)
#                 print("Processed page", page.page_number, "for", pdf_file_path.name)
#         return text

In [3]:
# def extract_text_from_pdf_pdfminer(pdf_file_path):
#     """Extracts text from a PDF file."""
#     with pdf_file_path.open('rb') as file:
#         resource_manager = PDFResourceManager()
#         string_io = StringIO()
#         pdf_converter = TextConverter(resource_manager, string_io, laparams=LAParams())
#         page_interpreter = PDFPageInterpreter(resource_manager, pdf_converter)
#         pages = PDFPage.get_pages(file, caching=True, check_extractable=False)

#         for i, page in enumerate(pages):
#             page_interpreter.process_page(page)
#             clear_output(wait=True)
#             print("Processed page", i + 1, "for", pdf_file_path.name)
#         text = string_io.getvalue()
#         pdf_converter.close()
#         string_io.close()
#         return text

In [3]:
def repair_pdf(pdf: Path):
    """Rewrite a PDF through Ghostscript and return the path of the rewritten copy.

    The copy is written next to the input with the final suffix replaced by
    ``.repaired.pdf`` (``report.pdf`` -> ``report.repaired.pdf``).

    Args:
        pdf: Path of the PDF to rewrite.

    Returns:
        The path of the rewritten file, or ``pdf`` itself when its file name
        already ends in ``.repaired.pdf``.

    Raises:
        subprocess.CalledProcessError: If ``gs`` exits with a non-zero status.
        FileNotFoundError: If the ``gs`` executable cannot be found.
    """
    repaired_suffix = ".repaired.pdf"
    # Inspect only the file name: a substring test on the whole path would
    # also skip ordinary files that merely live under a directory whose name
    # contains "repaired", or whose own name contains it (e.g. "unrepaired.pdf").
    if pdf.name.endswith(repaired_suffix):
        return pdf

    repaired_pdf = pdf.with_suffix(repaired_suffix)
    repair_command = [
        "gs",
        "-q",
        "-dNOPAUSE",
        "-dBATCH",
        "-sDEVICE=pdfwrite",
        "-dPDFSETTINGS=/prepress",
        "-sOutputFile=" + str(repaired_pdf),
        str(pdf),
    ]
    # check=True turns a failed Ghostscript run into an exception instead of
    # silently returning a path to a missing or partial file.
    subprocess.run(repair_command, check=True)
    return repaired_pdf

In [6]:
def process_directory(root_dir: Path, progress: "tqdm"):
    """Extract text from every supported file below ``root_dir``.

    For each source file a sibling ``<stem>.parsed.txt`` is produced (the
    source's final suffix is replaced by ``.parsed.txt``). Sources whose
    output already exists are skipped, so the function can be re-run to
    resume. Sources that share a stem share one output file; whichever is
    handled first wins.

    Passes, in order:
      1. files whose libmagic MIME type starts with ``text/`` (``.html`` files
         and previously generated ``*.parsed.txt`` files are ignored),
      2. ``*.pdf`` via ``process_pdf``,
      3. ``*.docx`` via ``process_docx``,
      4. ``*.doc`` via ``process_doc``.

    Args:
        root_dir: Directory that is searched recursively.
        progress: Progress bar; ``progress.update(1)`` is called once per
            file that is actually converted.
    """
    output_suffix = '.parsed.txt'

    def parse_if_missing(source, parser):
        # The output path is derived from *this* source before the existence
        # check, so the check can never look at another file's output.
        target = source.with_suffix(output_suffix)
        if target.exists():
            return
        parser(source, target)
        progress.update(1)

    # Snapshot the binary formats before the text pass starts writing files.
    pdf_files = list(root_dir.rglob('*.pdf'))
    docx_files = list(root_dir.rglob('*.docx'))
    doc_files = list(root_dir.rglob('*.doc'))

    for file in list(root_dir.rglob('*')):
        if not file.is_file():
            continue
        # Generated outputs are themselves text files; without this guard a
        # re-run would turn a.parsed.txt into a.parsed.parsed.txt, and so on.
        if file.name.endswith(output_suffix) or file.suffix == '.html':
            continue
        # str() keeps this working with python-magic releases that only
        # accept string/bytes file names.
        mime_type = magic.from_file(str(file), mime=True)
        if mime_type.startswith("text/"):
            parse_if_missing(file, process_text)

    for file in pdf_files:
        parse_if_missing(file, process_pdf)

    for file in docx_files:
        parse_if_missing(file, process_docx)

    for file in doc_files:
        parse_if_missing(file, process_doc)

            
def process_pdf(file, parsed_file):
    """Extract the text of a PDF and write it to ``parsed_file`` as UTF-8.

    Extraction is attempted on the file as-is. If that raises, the PDF is
    rewritten with ``repair_pdf`` and extraction is retried once on the
    rewritten copy; an error from the repair or from the retry propagates.

    Args:
        file: Path of the source PDF.
        parsed_file: Path of the text file to create or overwrite.
    """
    try:
        texts = extract_text_from_pdf_fitz(file)
    except Exception:
        # Best-effort recovery for damaged PDFs: repair, then try once more.
        texts = extract_text_from_pdf_fitz(repair_pdf(file))
    # An explicit encoding keeps the write from failing on characters the
    # platform's default codec cannot represent.
    with open(parsed_file, 'w', encoding='utf-8') as f:
        f.write(texts)

def process_docx(file, parsed_file):
    """Extract the text of a ``.docx`` file and write it to ``parsed_file`` as UTF-8.

    Args:
        file: Path of the source document, handed to ``docx2txt.process``.
        parsed_file: Path of the text file to create or overwrite.
    """
    texts = docx2txt.process(file)
    # An explicit encoding keeps the write from failing on characters the
    # platform's default codec cannot represent.
    with open(parsed_file, 'w', encoding='utf-8') as f:
        f.write(texts)

def process_doc(file, parsed_file):
    """Convert a legacy ``.doc`` file to text with LibreOffice and save it as UTF-8.

    LibreOffice is asked to write its ``.txt`` conversion into a private
    temporary directory. The result is read back from there and the
    directory is removed again, whether or not reading succeeds.

    Args:
        file: ``pathlib.Path`` of the source document.
        parsed_file: Path of the text file to create or overwrite.

    Raises:
        subprocess.CalledProcessError: If ``libreoffice`` exits non-zero.
        FileNotFoundError: If ``libreoffice`` is not installed, or if it
            produced no ``<stem>.txt`` output.
    """
    # Converting next to the source would overwrite, and afterwards delete,
    # an unrelated "<stem>.txt" that happens to sit beside the document.
    with tempfile.TemporaryDirectory() as scratch_dir:
        subprocess.run(
            ['libreoffice', '--convert-to', 'txt', '--outdir', scratch_dir, str(file)],
            check=True,
        )
        converted = Path(scratch_dir) / file.with_suffix('.txt').name
        with open(converted, 'r') as f:
            texts = f.read()
    with open(parsed_file, 'w', encoding='utf-8') as f:
        f.write(texts)

def process_text(file, parsed_file):
    """Copy the decodable text of ``file`` into ``parsed_file``.

    The source is decoded with the platform's default text encoding and any
    bytes that cannot be decoded are dropped; the result is written with the
    same default encoding, replacing ``parsed_file`` if it exists.

    Args:
        file: Path of the text file to read.
        parsed_file: Path of the text file to create or overwrite.
    """
    with open(file, 'r', errors='ignore') as source:
        contents = source.read()

    with open(parsed_file, 'w') as destination:
        destination.write(contents)

In [7]:
def search_for_identical_pdfs(root_dir):
    """Report byte-identical PDFs under ``root_dir`` and delete redundant copies.

    PDFs are grouped by the SHA-1 digest of their contents. Every group with
    more than one member is printed. Within such a group, files are further
    grouped by the first path component below ``root_dir`` (a top-level
    subdirectory, or the file itself when it sits directly in ``root_dir``).
    In each of those sub-groups all copies except the last one found are
    deleted, so duplicates that live under *different* top-level entries are
    reported but kept.

    WARNING: this function deletes files from disk.

    Args:
        root_dir: Directory (``str`` or ``Path``) searched recursively for
            ``*.pdf`` files.
    """
    root_path = Path(root_dir)

    # Pass 1: digest -> every file with that content.
    pdf_hashes = {}
    for file_path in root_path.rglob('*.pdf'):
        if not file_path.is_file():
            # A directory can match the pattern too; it cannot be hashed.
            continue
        digest = hashlib.sha1()
        with file_path.open('rb') as file:
            # Hash in 1 MiB chunks so large PDFs are never held in memory whole.
            for chunk in iter(lambda: file.read(1 << 20), b''):
                digest.update(chunk)
        pdf_hashes.setdefault(digest.hexdigest(), []).append(file_path)

    # Pass 2: report each duplicate group and prune within top-level entries.
    total_deleted = 0
    for pdf_hash, pdf_files in pdf_hashes.items():
        if len(pdf_files) < 2:
            continue

        print(f"PDF with hash {pdf_hash} occurs {len(pdf_files)} times:")
        for pdf_file in pdf_files:
            print(f"  {pdf_file}")

        # Keying on the path relative to the root makes the grouping
        # independent of how many components the root itself has.
        copies_by_directory = {}
        for pdf_file in pdf_files:
            directory = pdf_file.relative_to(root_path).parts[:1]
            copies_by_directory.setdefault(directory, []).append(pdf_file)

        for copies in copies_by_directory.values():
            # Keep the last copy found and remove all earlier ones.
            for pdf_file in copies[:-1]:
                os.remove(pdf_file)
                total_deleted += 1
                print(f"  Deleted {pdf_file}")

        print()
    print(f'Total deleted: {total_deleted}')

#search_for_identical_pdfs('data')

In [8]:
# Directory that is scanned recursively for documents to convert.
root = Path('data')

# The total number of files is not known up front, so the bar just counts
# conversions. Using it as a context manager closes it when the run ends,
# including when a conversion raises.
with tqdm() as pb:
    process_directory(root, pb)

1400it [02:10, 29.25it/s]