In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [2]:
import multiprocessing as mp
import os
# import argparse
import fitz  # PyMuPDF
import pandas as pd
import time
import re

In [3]:
directory = '/kaggle/input/dataset-125-pdf-files'

In [4]:
start = time.process_time()
# CPU-bound code

#### Create a producer to add the pdf files to the shared queue

In [5]:
def producer(task_queue, pdf_directory):
    """Enqueue paths of all PDFs in the directory."""
    for root, _, files in os.walk(pdf_directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                task_queue.put(os.path.join(root, file))

#### Create the consumers to take the PDF files from the shared queue (task_queue), process them and then send the extracted text to the results queue (results_queue) 

In [6]:
def consumer(task_queue, results_queue):
    """Dequeue PDFs, process them and send extracted text to results_queue."""    
    while True:
        pdf_path = task_queue.get()
        if pdf_path is None:  # Poison pill received
            task_queue.task_done()
            results_queue.put(None)  # Send a sentinel value to the main process to signal completion
            break
        try:
            with fitz.open(pdf_path) as doc:
                text = " "
                for page in doc:
                    text += page.get_text()
                results_queue.put({
                    "filename": os.path.basename(pdf_path),
                    "text": text
                })
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
        finally:
            task_queue.task_done()  # Mark task as complete

In [7]:
def main(pdf_directory, num_consumers):
    task_queue = mp.JoinableQueue() # The shared queue (task_queue)
    results_queue = mp.Queue()  # Queue to collect results
    
    # Start producer process
    producer_proc = mp.Process(target=producer, args=(task_queue, pdf_directory))
    producer_proc.start()

    # Start consumer processes (daemonized to exit on main completion)
    consumers = []
    for _ in range(num_consumers):
        cons = mp.Process(target=consumer, args=(task_queue,results_queue))
        cons.daemon = True
        cons.start()
        consumers.append(cons)

    # Wait for producer to finish enqueuing files
    producer_proc.join()

    # Add poison pills to task queue in order to stop consumers
    for _ in range(num_consumers):
        task_queue.put(None)

    # Wait for all tasks (including poison pills) to complete
    task_queue.join()

    # Collect results and handle sentinels
    data_text = []
    sentinel_count = 0
    while sentinel_count < num_consumers:
        result = results_queue.get()
        if result is None:
            sentinel_count += 1
        else:
            data_text.append(result)

    print(f'All PDFs were processed successfully and the process time is {10}.')
    return data_text  # Now accessible in the main process

In [8]:
num_consumers = mp.cpu_count()
print(f'The total number of consumers is {num_consumers}.')
data_text = main(directory, num_consumers)

# Example: Print the first result
print(f"Processed {len(data_text)} PDFs.")

if data_text:
    print(f"First filename: {data_text[0]['filename']}")

The total number of consumers is 4.
All PDFs were processed successfully and the process time is 10.
Processed 163 PDFs.
First filename: chen22v.pdf


In [9]:
end = time.process_time()
print(f"CPU Time: {end - start:.4f} seconds")

CPU Time: 0.3566 seconds


In [10]:
x = data_text[0]
# x

In [11]:
df = pd.DataFrame(data_text)

In [12]:
df.head()

Unnamed: 0,filename,text
0,chen22v.pdf,Faster Fundamental Graph Algorithms via Learn...
1,05311655a15b75fab86956663e1819cd-Paper.pdf,Practical Bayesian Optimization of Machine\nL...
2,chap2.pdf,i\ni\ni\ni\ni\ni\ni\ni\nMathematical Engineer...
3,planlearn2012_submission_1.pdf,Combining Meta-Learning and Optimization Algo...
4,26262688.pdf,\n \n \n \n \n \nMachine Learning in \nBusin...


In [13]:
# df['text'][4]

In [14]:
# df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+', 'ANDY LANDU NGOMA', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'www.\S+|http\S+', 'ANDY LANDU NGOMA', x))

In [15]:
x = 'He was born on November 15, 2021. URL https://arxiv.org/\nabs/2110.14094.\nEsfandiari, H., Korula Smirth'

y = re.sub(r'https?://\S+|www\.\S+', '', x)
y

'He was born on November 15, 2021. URL \nabs/2110.14094.\nEsfandiari, H., Korula Smirth'

In [16]:
df['text'][4]

'  \n \n \n \n \n \nMachine Learning in \nBusiness: \nAn Introduction to the World of Data \nScience \n \n \n \n \n \n \n \n     \n \n \n     \n \nMachine Learning in \nBusiness: \nAn Introduction to the World of Data \nScience \n \nSecond Edition \n \n \n \nJohn C. Hull \n \nUniversity Professor \nJoseph L. Rotman School of Management \nUniversity of Toronto \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n     \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nSecond Printing \nCopyright © 2019, 2020 by John C. Hull \nAll Rights Reserved \nISBN: 9798644074372 \n \n     \n \n \nTo my students \n \n \n \n \n \n \n     \n \n \nvii \n \nContents \n \n \n \nPreface \n \nxi \n \n \n \nChapter 1 \nIntroduction \n1 \n \n1.1   This book and the ancillary material \n3 \n \n1.2   Types of machine learning models \n4 \n \n1.3   Validation and testing \n6 \n \n1.4   Data cleaning \n14 \n \n1.5   Bayes’ theorem \n16 \n \nSummary \n19 \n \nShort conce