In [1]:
import hashlib
import magic
import os
import pathlib
import spacy
from IPython.display import clear_output
from io import StringIO
from pathlib import Path
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from tqdm import tqdm

In [2]:
def extract_text_from_pdf(pdf_file_path):
    """Extracts text from a PDF file.

    Every page is run through pdfminer's ``TextConverter`` and the accumulated
    text is returned as one string. After each page the notebook cell output is
    cleared and a one-line progress message is printed.

    Args:
        pdf_file_path: ``pathlib.Path`` of the PDF (``.open`` and ``.name`` are used).

    Returns:
        The text of all pages, in page order, as a single ``str``.
    """
    with pdf_file_path.open('rb') as file:
        resource_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_converter = TextConverter(resource_manager, string_io, codec='utf-8', laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, pdf_converter)
            # check_extractable=False: pages are processed even when the PDF
            # is flagged as not allowing text extraction.
            pages = PDFPage.get_pages(file, caching=True, check_extractable=False)

            for page_number, page in enumerate(pages, start=1):
                page_interpreter.process_page(page)
                clear_output(wait=True)
                print("Processed page", page_number, "for", pdf_file_path.name)
            # Read the buffer before the ``finally`` clause closes it.
            return string_io.getvalue()
        finally:
            # Release the converter and buffer even if a page fails to parse.
            pdf_converter.close()
            string_io.close()

In [3]:
def count_text(root_dir: Path):
    """Recursively count the files under ``root_dir`` that hold extractable text.

    A file counts when its suffix is exactly ``.pdf`` or when libmagic reports
    a MIME type starting with ``text/``. Sub-directories are descended into.

    Args:
        root_dir: Directory to walk.

    Returns:
        Number of matching files in ``root_dir`` and all of its sub-directories.
    """
    total = 0
    for entry in root_dir.iterdir():
        if entry.is_dir():
            total += count_text(entry)
            continue
        # The suffix test runs first, so libmagic is only consulted for non-PDF files.
        if entry.suffix == '.pdf' or magic.from_file(entry, mime=True).startswith("text/"):
            total += 1
    return total

In [4]:
def process_file(file, progress):
    """Collect the text held by a single directory entry.

    Args:
        file: ``pathlib.Path`` of a directory or a file.
        progress: Progress bar; ``update(1)`` is called once for each PDF or
            text file that is read here.

    Returns:
        For a directory, whatever ``search_for_text`` returns for it. For a
        ``.pdf`` file or a file whose MIME type starts with ``text/``, a
        one-element list holding its text. Otherwise an empty list.
    """
    if file.is_dir():
        return search_for_text(file, progress)

    if file.suffix == '.pdf':
        pdf_text = extract_text_from_pdf(file)
        progress.update(1)
        return [pdf_text]

    mime_type = magic.from_file(file, mime=True)
    if not mime_type.startswith("text/"):
        # Neither a PDF nor a text file: contributes nothing.
        return []

    with open(file, 'r') as handle:
        contents = handle.read()
        progress.update(1)
    return [contents]

def search_for_text(root_dir: Path, progress: tqdm):
    """Gather the text of every PDF and text file beneath ``root_dir``.

    Each entry of ``root_dir`` is handed to ``process_file``, which recurses
    back into this function for sub-directories.

    Args:
        root_dir: Directory whose entries are processed.
        progress: Progress bar passed through to ``process_file``.

    Returns:
        A flat list of strings, one per file that yielded text, in the order
        the entries were visited.
    """
    texts = []
    for file in root_dir.iterdir():
        # extend() appends in place; ``texts = texts + ...`` re-copied the
        # whole accumulated list for every entry.
        texts.extend(process_file(file, progress))
    return texts

In [5]:
def delete_parsed_text_files(root_dir):
    """Remove each ``parsed_text.txt`` found in ``root_dir`` or any folder below it.

    Args:
        root_dir: Directory (string or ``Path``) to search recursively.
    """
    for stale_file in Path(root_dir).rglob('parsed_text.txt'):
        stale_file.unlink()
# delete_parsed_text_files('data')

In [6]:
# Build one parsed_text.txt per sub-folder of ``data``. Folders that already
# contain a parsed_text.txt are skipped, so an interrupted run can be resumed.
root = Path('data')
total_files = count_text(root)

# Use the bar as a context manager so it is closed even when the (long) run is
# interrupted or a file fails to parse.
with tqdm(total=total_files, smoothing=0) as pb:
    for folder in root.iterdir():
        if not folder.is_dir():
            continue
        parsed_text_file = folder.joinpath('parsed_text.txt')
        if parsed_text_file.exists():
            # Already processed: advance the bar by this folder's file count.
            pb.update(count_text(folder))
            continue

        documents = search_for_text(folder, progress=pb)
        with open(parsed_text_file, 'w') as f:
            f.write("\n".join(documents))

100%|██████████| 1514/1514 [5:16:35<00:00, 12.55s/it]

Processed page 1 for Doc3817220750 - Addendum 1.pdf


In [2]:



def extract_sentences(folder):
    """Split every ``parsed_text.txt`` under ``folder`` into sentences.

    Each file is read in full, segmented with spaCy's ``en_core_web_sm``
    pipeline in chunks of ``chunk_size`` characters, and the sentences are
    written one per line to ``sentences.txt`` in the same directory.

    Args:
        folder: Root directory (string or ``Path``) searched recursively for
            ``parsed_text.txt`` files.
    """
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 2000000
    chunk_size = 100000
    root_folder = pathlib.Path(folder)

    # List the inputs once instead of walking the tree twice (count + iterate).
    text_files = list(root_folder.rglob("parsed_text.txt"))
    for file_path in tqdm(text_files):
        with open(file_path, 'r') as f:
            parsed_text = f.read()

        sentences = []
        carry = ""  # Trailing, possibly incomplete sentence of the previous chunk.
        for start in range(0, len(parsed_text), chunk_size):
            chunk = carry + parsed_text[start:start + chunk_size]
            carry = ""
            is_last_chunk = start + chunk_size >= len(parsed_text)
            spans = list(nlp(chunk).sents)
            if not is_last_chunk and len(spans) > 1:
                # A fixed-size cut usually lands mid-sentence. Hold the final
                # span back and re-parse it together with the next chunk so the
                # sentence is not emitted as two fragments.
                tail_text = chunk[spans[-1].start_char:]
                # Only carry tails shorter than a chunk, so a document never
                # exceeds twice the chunk size (well below nlp.max_length).
                if len(tail_text) < chunk_size:
                    carry = tail_text
                    spans.pop()
            sentences.extend(span.text for span in spans)
        sentences_text = '\n'.join(sentences)

        output_file = file_path.parent / 'sentences.txt'
        with open(output_file, 'w') as f:
            f.write(sentences_text)

# Write a sentences.txt next to every parsed_text.txt found under 'data'.
extract_sentences('data')

 21%|██▏       | 18/84 [14:44<54:02, 49.13s/it]  


KeyboardInterrupt: 

In [None]:
def _sha1_of_file(file_path, chunk_size=1 << 20):
    """Return the hex SHA-1 digest of a file, read in ``chunk_size``-byte pieces."""
    digest = hashlib.sha1()
    with file_path.open('rb') as file:
        for chunk in iter(lambda: file.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def search_for_identical_pdfs(root_dir):
    """Searches for identical PDF files in a directory and its subdirectories.

    WARNING: this function deletes files. PDFs are compared by the SHA-1 of
    their content. Every set of identical PDFs is printed. Within each
    first-level entry of ``root_dir`` (e.g. ``data/<folder>``) only one copy
    of a given PDF is kept and the other copies are removed from disk.
    Identical PDFs that live in different first-level folders are reported but
    never deleted.

    Args:
        root_dir: Directory (string or ``Path``) to scan recursively.

    Returns:
        None. Findings, deletions and the total number of deleted files are
        printed.
    """
    root_path = Path(root_dir)
    total_deleted = 0

    pdf_hashes = {}
    for file_path in root_path.rglob('*.pdf'):
        # rglob matches on the name only; skip directories named ``*.pdf``.
        if not file_path.is_file():
            continue
        pdf_hashes.setdefault(_sha1_of_file(file_path), []).append(file_path)

    for pdf_hash, pdf_files in pdf_hashes.items():
        if len(pdf_files) <= 1:
            continue

        print(f"PDF with hash {pdf_hash} occurs {len(pdf_files)} times:")
        for pdf_file in pdf_files:
            print(f"  {pdf_file}")

        # Group the copies by the first path component *below root_dir*.
        # Slicing the raw path instead would put every file in one group
        # whenever root_dir is absolute or more than one level deep.
        copies_by_directory = {}
        for pdf_file in pdf_files:
            directory = pdf_file.relative_to(root_path).parts[:1]
            copies_by_directory.setdefault(directory, []).append(pdf_file)

        for copies in copies_by_directory.values():
            # Keep the last copy found in each folder and delete all others,
            # so three or more copies are fully de-duplicated in one pass.
            for pdf_file in copies[:-1]:
                os.remove(pdf_file)
                total_deleted += 1
                print(f"  Deleted {pdf_file}")

        print()
    print(f'Total deleted: {total_deleted}')

