Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make default generators threadsafe #126

Merged
merged 1 commit into from Feb 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 10 additions & 4 deletions pdf2image/exceptions.py
Expand Up @@ -2,20 +2,26 @@
Define exceptions specific to pdf2image
"""


class PopplerNotInstalledError(Exception):
    """Signals that poppler is not installed."""


class PDFInfoNotInstalledError(PopplerNotInstalledError):
    """Signals that pdfinfo is not installed.

    Derives from PopplerNotInstalledError, so handlers for the base class
    also catch this error.
    """


class PDFPageCountError(Exception):
    """Signals that pdfinfo was unable to retrieve the page count."""


class PDFSyntaxError(Exception):
    """Signals that a syntax error was thrown during rendering."""
28 changes: 26 additions & 2 deletions pdf2image/generators.py
Expand Up @@ -3,16 +3,40 @@
"""

import uuid
import threading


class ThreadSafeGenerator(object):
    """Wrapper around a generator that protects it against concurrent access.

    Every call to ``next()`` advances the wrapped generator while holding a
    lock, so only one thread at a time can be executing inside it.

    The wrapper is itself an iterator: it can be passed to ``next()`` and
    used anywhere an iterable is expected (``for`` loops, ``list()``, ...).
    """

    def __init__(self, gen):
        """Store the generator to protect and create the lock guarding it."""
        self.gen = gen
        self.lock = threading.Lock()

    def __iter__(self):
        """Return self so the wrapper satisfies the iterator protocol."""
        return self

    def __next__(self):
        """Return the next value of the wrapped generator under the lock.

        StopIteration raised by the wrapped generator propagates unchanged.
        """
        with self.lock:
            return next(self.gen)


def threadsafe(f):
    """Decorator to make a generator function threadsafe. Fix #125

    The decorated function forwards its arguments to ``f`` and returns the
    result wrapped in a ThreadSafeGenerator.

    The wrapper keeps the metadata of ``f`` (``__name__``, ``__doc__``, ...),
    so decorated generators remain identifiable in tracebacks and help().
    """
    # Imported locally so this block does not rely on a module-level import.
    import functools

    @functools.wraps(f)
    def g(*a, **kw):
        return ThreadSafeGenerator(f(*a, **kw))

    return g


@threadsafe
def uuid_generator():
    """Yield an endless sequence of UUID4 values rendered as strings."""
    while True:
        identifier = uuid.uuid4()
        yield str(identifier)


@threadsafe
def counter_generator(prefix="", suffix="", padding_goal=4):
"Returns a joined prefix, iteration number, and suffix"
"""Returns a joined prefix, iteration number, and suffix"""
i = 0
while True:
i += 1
Expand Down
12 changes: 4 additions & 8 deletions pdf2image/parsers.py
Expand Up @@ -8,8 +8,7 @@


def parse_buffer_to_ppm(data):
"""Parse PPM file bytes to Pillow Image
"""
"""Parse PPM file bytes to Pillow Image"""

images = []

Expand All @@ -26,8 +25,7 @@ def parse_buffer_to_ppm(data):


def parse_buffer_to_pgm(data):
"""Parse PGM file bytes to Pillow Image
"""
"""Parse PGM file bytes to Pillow Image"""

images = []

Expand All @@ -44,8 +42,7 @@ def parse_buffer_to_pgm(data):


def parse_buffer_to_jpeg(data):
"""Parse JPEG file bytes to Pillow Image
"""
"""Parse JPEG file bytes to Pillow Image"""

return [
Image.open(BytesIO(image_data + b"\xff\xd9"))
Expand All @@ -56,8 +53,7 @@ def parse_buffer_to_jpeg(data):


def parse_buffer_to_png(data):
"""Parse PNG file bytes to Pillow Image
"""
"""Parse PNG file bytes to Pillow Image"""

images = []

Expand Down
21 changes: 13 additions & 8 deletions pdf2image/pdf2image.py
Expand Up @@ -26,7 +26,7 @@
PopplerNotInstalledError,
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
PDFSyntaxError,
)

TRANSPARENT_FILE_TYPES = ["png", "tiff"]
Expand Down Expand Up @@ -99,8 +99,7 @@ def convert_from_path(
)

poppler_version = _get_poppler_version(
"pdftocairo" if use_pdfcairo else "pdftoppm",
poppler_path=poppler_path
"pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
)

if poppler_version <= 57:
Expand Down Expand Up @@ -387,7 +386,9 @@ def _get_poppler_version(command, poppler_path=None):

try:
# TODO: Make this more robust
return int(err.decode("utf8", "ignore").split('\n')[0].split(' ')[-1].split('.')[1])
return int(
err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")[1]
)
except:
# Lowest version that includes pdftocairo (2011)
return 17
Expand All @@ -409,11 +410,15 @@ def pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None):
out, err = proc.communicate()

d = {}
for field in out.decode("utf8", "ignore").split('\n'):
sf = field.split(':')
key, value = sf[0], ':'.join(sf[1:])
for field in out.decode("utf8", "ignore").split("\n"):
sf = field.split(":")
key, value = sf[0], ":".join(sf[1:])
if key != "":
d[key] = int(value.strip()) if key in PDFINFO_CONVERT_TO_INT else value.strip()
d[key] = (
int(value.strip())
if key in PDFINFO_CONVERT_TO_INT
else value.strip()
)

if "Pages" not in d:
raise ValueError
Expand Down
14 changes: 13 additions & 1 deletion tests.py
Expand Up @@ -9,7 +9,7 @@
import subprocess
from subprocess import Popen, PIPE
from tempfile import TemporaryDirectory

from multiprocessing.dummy import Pool
from memory_profiler import profile as profile_memory

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
Expand Down Expand Up @@ -1482,5 +1482,17 @@ def test_conversion_from_path_using_dir_paths_only(self):
)
)

# Test for issue #125: run convert_from_path on the same PDF from a pool of
# 10 worker threads and check that every one of the 50 conversions returns.
@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed")
def test_multithread_conversion(self):
    start_time = time.time()
    files = ["./tests/test.pdf"] * 50
    # The context manager shuts the worker threads down even when map()
    # raises, so a failing conversion does not leak the pool.
    with Pool(10) as pool:
        res = pool.map(convert_from_path, files)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(res), 50)
    print("test_multithread_conversion: {} sec".format(time.time() - start_time))


if __name__ == "__main__":
unittest.main()