/
pdf2image.py
474 lines (397 loc) · 14.4 KB
/
pdf2image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
"""
pdf2image is a light wrapper for the poppler-utils tools that can convert your
PDFs into Pillow images.
"""
import os
import platform
import tempfile
import types
import shutil
import pathlib
from subprocess import Popen, PIPE
from PIL import Image
from .generators import uuid_generator, counter_generator, ThreadSafeGenerator
from .parsers import (
parse_buffer_to_pgm,
parse_buffer_to_ppm,
parse_buffer_to_jpeg,
parse_buffer_to_png,
)
from .exceptions import (
PopplerNotInstalledError,
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError,
)
# Output formats for which the `transparent` option is honoured: the "-transp"
# flag is only added, and pdftocairo only selected, when the format is listed here.
TRANSPARENT_FILE_TYPES = ["png", "tiff"]
# pdfinfo fields whose textual value is converted to int while parsing its output.
PDFINFO_CONVERT_TO_INT = ["Pages"]
def convert_from_path(
    pdf_path,
    dpi=200,
    output_folder=None,
    first_page=None,
    last_page=None,
    fmt="ppm",
    jpegopt=None,
    thread_count=1,
    userpw=None,
    use_cropbox=False,
    strict=False,
    transparent=False,
    single_file=False,
    output_file=uuid_generator(),
    poppler_path=None,
    grayscale=False,
    size=None,
    paths_only=False,
    use_pdftocairo=False,
):
    """
    Description: Convert the pages of a PDF file into images by running
    pdftoppm or pdftocairo, splitting the page range across several processes
    Parameters:
        pdf_path -> Path to the PDF that you want to convert
        dpi -> Image quality in DPI (default 200)
        output_folder -> Write the resulting images to a folder (instead of directly in memory)
        first_page -> First page to process (values below 1 are treated as 1)
        last_page -> Last page to process before stopping (capped to the page count)
        fmt -> Output image format
        jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
        thread_count -> How many processes we are allowed to spawn (at least 1, at most one per page)
        userpw -> PDF's password
        use_cropbox -> Use cropbox instead of mediabox
        strict -> When a Syntax Error is reported, it will be raised as an Exception
        transparent -> Output with a transparent background instead of a white one.
        single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
        output_file -> What is the output filename or generator
        poppler_path -> Path to look for poppler binaries
        grayscale -> Output grayscale image(s)
        size -> Size of the resulting image(s), uses the Pillow (width, height) standard
        paths_only -> Don't load image(s), return paths instead (requires output_folder)
        use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
    Returns: A list of images (or of paths when paths_only is set); an empty
        list when first_page is past last_page
    Raises:
        PDFSyntaxError -> strict is set and a process reported "Syntax Error"
        Errors raised by pdfinfo_from_path while reading the page count propagate
    """
    if use_pdftocairo and fmt == "ppm":
        fmt = "png"

    # We make sure that if passed arguments are Path objects, they're converted to strings
    if isinstance(pdf_path, pathlib.PurePath):
        pdf_path = pdf_path.as_posix()
    if isinstance(output_folder, pathlib.PurePath):
        output_folder = output_folder.as_posix()
    if isinstance(poppler_path, pathlib.PurePath):
        poppler_path = poppler_path.as_posix()

    page_count = pdfinfo_from_path(pdf_path, userpw, poppler_path=poppler_path)["Pages"]

    # We start by getting the output format, the buffer processing function and if we need pdftocairo
    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
        fmt, grayscale
    )

    # We use pdftocairo if the format requires it OR we need a transparent output
    use_pdfcairo = (
        use_pdftocairo
        or use_pdfcairo_format
        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
    )
    binary = "pdftocairo" if use_pdfcairo else "pdftoppm"

    # jpegopt is ignored when the detected poppler version is 57 or lower
    # (presumably those releases lack the -jpegopt switch)
    if _get_poppler_version(binary, poppler_path=poppler_path) <= 57:
        jpegopt = None

    # If output_file isn't a generator, it will be turned into one
    if not isinstance(output_file, (types.GeneratorType, ThreadSafeGenerator)):
        if single_file:
            output_file = iter([output_file])
        else:
            output_file = counter_generator(output_file)

    if thread_count < 1:
        thread_count = 1

    # A first page below 1 would inflate the page count used for the split
    if first_page is None or first_page < 1:
        first_page = 1

    if last_page is None or last_page > page_count:
        last_page = page_count

    if first_page > last_page:
        return []

    # The executable and the environment are identical for every worker
    executable = _get_command_path(binary, poppler_path)
    env = os.environ.copy()
    if poppler_path is not None:
        # Add poppler path to LD_LIBRARY_PATH
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")

    auto_temp_dir = False
    if output_folder is None and use_pdfcairo:
        # The pdftocairo code path always writes to a folder, so create a throwaway one
        output_folder = tempfile.mkdtemp()
        auto_temp_dir = True

    processes = []
    try:
        # Recalculate page count based on first and last page
        page_count = last_page - first_page + 1
        if thread_count > page_count:
            thread_count = page_count

        pages_per_thread, remainder = divmod(page_count, thread_count)
        current_page = first_page

        for index in range(thread_count):
            thread_output_file = next(output_file)
            # The first `remainder` workers each take one extra page
            thread_page_count = pages_per_thread + (1 if index < remainder else 0)

            args = [executable] + _build_command(
                ["-r", str(dpi), pdf_path],
                output_folder,
                current_page,
                current_page + thread_page_count - 1,
                parsed_fmt,
                jpegopt,
                thread_output_file,
                userpw,
                use_cropbox,
                transparent,
                single_file,
                grayscale,
                size,
            )
            current_page += thread_page_count

            # Spawn the process and remember which output name it was given
            processes.append(
                (thread_output_file, Popen(args, env=env, stdout=PIPE, stderr=PIPE))
            )

        images = []
        for uid, proc in processes:
            data, err = proc.communicate()

            if b"Syntax Error" in err and strict:
                raise PDFSyntaxError(err.decode("utf8", "ignore"))

            if output_folder is not None:
                images += _load_from_output_folder(
                    output_folder,
                    uid,
                    final_extension,
                    paths_only,
                    in_memory=auto_temp_dir,
                )
            else:
                images += parse_buffer_func(data)
    finally:
        # Stop workers that are still running because an exception cut the
        # collection loop short, so nothing writes into the folder removed below
        for _, proc in processes:
            if proc.poll() is None:
                proc.kill()
                proc.communicate()
        # Remove the throwaway folder even when an exception is propagating
        if auto_temp_dir:
            shutil.rmtree(output_folder)

    return images
def convert_from_bytes(
    pdf_file,
    dpi=200,
    output_folder=None,
    first_page=None,
    last_page=None,
    fmt="ppm",
    jpegopt=None,
    thread_count=1,
    userpw=None,
    use_cropbox=False,
    strict=False,
    transparent=False,
    single_file=False,
    output_file=uuid_generator(),
    poppler_path=None,
    grayscale=False,
    size=None,
    paths_only=False,
    use_pdftocairo=False,
):
    """
    Description: Convert a PDF held in memory into images by writing it to a
    temporary file and delegating to convert_from_path; the temporary file is
    removed afterwards
    Parameters:
        pdf_file -> Bytes representing the PDF file
        dpi -> Image quality in DPI
        output_folder -> Write the resulting images to a folder (instead of directly in memory)
        first_page -> First page to process
        last_page -> Last page to process before stopping
        fmt -> Output image format
        jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
        thread_count -> How many threads we are allowed to spawn for processing
        userpw -> PDF's password
        use_cropbox -> Use cropbox instead of mediabox
        strict -> When a Syntax Error is thrown, it will be raised as an Exception
        transparent -> Output with a transparent background instead of a white one.
        single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
        output_file -> What is the output filename or generator
        poppler_path -> Path to look for poppler binaries
        grayscale -> Output grayscale image(s)
        size -> Size of the resulting image(s), uses the Pillow (width, height) standard
        paths_only -> Don't load image(s), return paths instead (requires output_folder)
        use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
    Returns: Whatever convert_from_path returns for the temporary file
    """
    # Every option is handed to convert_from_path unchanged
    forwarded = dict(
        dpi=dpi,
        output_folder=output_folder,
        first_page=first_page,
        last_page=last_page,
        fmt=fmt,
        jpegopt=jpegopt,
        thread_count=thread_count,
        userpw=userpw,
        use_cropbox=use_cropbox,
        strict=strict,
        transparent=transparent,
        single_file=single_file,
        output_file=output_file,
        poppler_path=poppler_path,
        grayscale=grayscale,
        size=size,
        paths_only=paths_only,
        use_pdftocairo=use_pdftocairo,
    )

    descriptor, scratch_path = tempfile.mkstemp()
    try:
        with open(scratch_path, "wb") as scratch:
            scratch.write(pdf_file)
            # Make sure the bytes are on disk before poppler reads the file
            scratch.flush()
            return convert_from_path(scratch.name, **forwarded)
    finally:
        # Release the descriptor returned by mkstemp, then delete the file
        os.close(descriptor)
        os.remove(scratch_path)
def _build_command(
    args,
    output_folder,
    first_page,
    last_page,
    fmt,
    jpegopt,
    output_file,
    userpw,
    use_cropbox,
    transparent,
    single_file,
    grayscale,
    size,
):
    """
    Description: Append the pdftoppm/pdftocairo options matching the request
    to `args`
    Parameters:
        args -> List holding the leading arguments; extended in place
        output_folder -> Folder receiving the output, or None to skip the output path
        first_page -> Value for -f, skipped when None
        last_page -> Value for -l, skipped when None
        fmt -> Parsed format name; added as "-<fmt>" unless it is pgm or ppm
        jpegopt -> Dict of jpeg options, only used for the jpeg/jpg formats
        output_file -> File name joined to output_folder
        userpw -> Value for -upw, skipped when None
        use_cropbox -> Adds -cropbox
        transparent -> Adds -transp when fmt is in TRANSPARENT_FILE_TYPES
        single_file -> Adds -singlefile
        grayscale -> Adds -gray
        size -> None, a number, a 1-tuple (both map to -scale-to) or a
            (width, height) tuple where None in a slot is passed as -1
    Returns: The same `args` list
    Raises: ValueError when `size` has any other type or tuple length
    """
    if use_cropbox:
        args.append("-cropbox")

    if transparent and fmt in TRANSPARENT_FILE_TYPES:
        args.append("-transp")

    if first_page is not None:
        args.extend(["-f", str(first_page)])

    if last_page is not None:
        args.extend(["-l", str(last_page)])

    # pgm/ppm get no format switch
    if fmt not in ["pgm", "ppm"]:
        args.append("-" + fmt)

    if fmt in ["jpeg", "jpg"] and jpegopt:
        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])

    if single_file:
        args.append("-singlefile")

    if output_folder is not None:
        args.append(os.path.join(output_folder, output_file))

    if userpw is not None:
        args.extend(["-upw", userpw])

    if grayscale:
        args.append("-gray")

    if size is None:
        pass
    elif isinstance(size, tuple) and len(size) == 2:
        width, height = size
        # A missing dimension is passed as -1
        args.extend(["-scale-to-x", str(-1 if width is None else int(width))])
        args.extend(["-scale-to-y", str(-1 if height is None else int(height))])
    elif isinstance(size, tuple) and len(size) == 1:
        args.extend(["-scale-to", str(int(size[0]))])
    elif isinstance(size, (int, float)):
        args.extend(["-scale-to", str(int(size))])
    else:
        # The offending value is now interpolated into the message
        raise ValueError("Size {} is not a tuple or an integer".format(size))

    return args
def _parse_format(fmt, grayscale=False):
    """
    Description: Normalise a user supplied image format
    Parameters:
        fmt -> Format name, case-insensitive, optionally starting with a dot
        grayscale -> When set, "ppm" is mapped to "pgm"
    Returns: A 4-tuple (format name, file extension, buffer parsing function
        or None, whether pdftocairo is required). Unrecognised or empty
        formats fall back to ppm.
    """
    fmt = fmt.lower()
    # startswith() instead of fmt[0] so that an empty string reaches the
    # default below rather than raising IndexError
    if fmt.startswith("."):
        fmt = fmt[1:]

    if fmt in ("jpeg", "jpg"):
        return "jpeg", "jpg", parse_buffer_to_jpeg, False
    if fmt == "png":
        return "png", "png", parse_buffer_to_png, False
    if fmt in ("tif", "tiff"):
        # No buffer parser for tiff; it is flagged as requiring pdftocairo
        return "tiff", "tif", None, True
    if fmt == "ppm" and grayscale:
        return "pgm", "pgm", parse_buffer_to_pgm, False

    # Unable to parse the format so we'll use the default
    return "ppm", "ppm", parse_buffer_to_ppm, False
def _parse_jpegopt(jpegopt):
    """
    Description: Serialise a dict of jpeg options into the comma separated
    "key=value" string passed after -jpegopt
    Parameters:
        jpegopt -> Dict of options; True becomes "y" and False becomes "n",
            any other value is formatted as-is
    Returns: The joined option string (empty for an empty dict)
    """

    def _render(value):
        # Identity checks on purpose: only the real booleans are translated
        if value is True:
            return "y"
        if value is False:
            return "n"
        return value

    return ",".join(
        "{}={}".format(name, _render(value)) for name, value in jpegopt.items()
    )
def _get_command_path(command, poppler_path=None):
    """
    Description: Build the name or path of a poppler executable
    Parameters:
        command -> Base name of the tool
        poppler_path -> Folder containing the poppler binaries, or None
    Returns: The command, with ".exe" appended on Windows and prefixed by
        poppler_path when one is given
    """
    executable = command + ".exe" if platform.system() == "Windows" else command
    if poppler_path is None:
        return executable
    return os.path.join(poppler_path, executable)
def _get_poppler_version(command, poppler_path=None):
    """
    Description: Run a poppler tool with -v and derive a comparable version
    number from the first line it writes to stderr
    Parameters:
        command -> Base name of the tool to query
        poppler_path -> Folder containing the poppler binaries, or None
    Returns: An int equal to major * 100 + minor. For 0.x releases this is
        just the minor number (0.68.0 -> 68); releases with a non-zero major
        (e.g. 20.08.0 -> 2008) compare as newer than any 0.x release.
        17 is returned when the version cannot be parsed.
    """
    command = [_get_command_path(command, poppler_path), "-v"]

    env = os.environ.copy()
    if poppler_path is not None:
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
    proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)

    _, err = proc.communicate()

    try:
        # TODO: Make this more robust
        # The version is taken as the last space separated token of the first line
        fields = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
        minor = int(fields[1])
    except (IndexError, ValueError):
        # Lowest version that includes pdftocairo (2011)
        return 17

    try:
        major = int(fields[0])
    except ValueError:
        # Keep the minor-only result if the leading component is not numeric
        major = 0

    # Returning the minor alone made date-style versions such as 20.08 look
    # older than 0.57; folding the major in keeps the ordering correct
    return major * 100 + minor
def pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None):
    """
    Description: Run pdfinfo on a file and parse its "key: value" output
    Parameters:
        pdf_path -> Path to the PDF to inspect
        userpw -> PDF's password
        poppler_path -> Path to look for poppler binaries
    Returns: A dict of the reported fields; values are stripped strings,
        except the fields in PDFINFO_CONVERT_TO_INT which are ints
    Raises:
        PDFInfoNotInstalledError -> an OSError occurred while running pdfinfo
        PDFPageCountError -> no "Pages" field was reported, or a value could
            not be converted to int
    """
    try:
        cmd = [_get_command_path("pdfinfo", poppler_path), pdf_path]
        if userpw is not None:
            cmd += ["-upw", userpw]

        # Add poppler path to LD_LIBRARY_PATH
        environment = os.environ.copy()
        if poppler_path is not None:
            environment["LD_LIBRARY_PATH"] = (
                poppler_path + ":" + environment.get("LD_LIBRARY_PATH", "")
            )

        out, err = Popen(
            cmd, env=environment, stdout=PIPE, stderr=PIPE
        ).communicate()

        info = {}
        for line in out.decode("utf8", "ignore").split("\n"):
            # Split on the first colon only; the value may itself contain colons
            key, _, raw_value = line.partition(":")
            if key == "":
                continue
            if key in PDFINFO_CONVERT_TO_INT:
                info[key] = int(raw_value.strip())
            else:
                info[key] = raw_value.strip()

        if "Pages" not in info:
            raise ValueError

        return info

    except OSError:
        raise PDFInfoNotInstalledError(
            "Unable to get page count. Is poppler installed and in PATH?"
        )
    except ValueError:
        raise PDFPageCountError(
            "Unable to get page count.\n%s" % err.decode("utf8", "ignore")
        )
def pdfinfo_from_bytes(pdf_file, userpw=None, poppler_path=None):
    """
    Description: Run pdfinfo on a PDF held in memory by writing it to a
    temporary file, which is removed afterwards
    Parameters:
        pdf_file -> Bytes representing the PDF file
        userpw -> PDF's password
        poppler_path -> Path to look for poppler binaries
    Returns: The dict produced by pdfinfo_from_path for the temporary file
    """
    descriptor, temp_filename = tempfile.mkstemp()
    try:
        # Wrap the descriptor from mkstemp instead of opening the file a
        # second time; leaving the with-block closes it before pdfinfo runs
        with os.fdopen(descriptor, "wb") as f:
            f.write(pdf_file)
        return pdfinfo_from_path(
            temp_filename, userpw=userpw, poppler_path=poppler_path
        )
    finally:
        os.remove(temp_filename)
def _load_from_output_folder(
    output_folder, output_file, ext, paths_only, in_memory=False
):
    """
    Description: Collect the files of a folder that belong to one conversion
    Parameters:
        output_folder -> Folder to scan
        output_file -> Prefix the file names must start with
        ext -> Required text after the last dot of the file name
        paths_only -> Return the matching paths instead of opening them
        in_memory -> Call load() on each opened image
    Returns: A list of paths or of opened images, ordered by file name
    """
    matching = [
        os.path.join(output_folder, name)
        for name in sorted(os.listdir(output_folder))
        if name.startswith(output_file) and name.split(".")[-1] == ext
    ]
    if paths_only:
        return matching

    loaded = []
    for path in matching:
        picture = Image.open(path)
        if in_memory:
            picture.load()
        loaded.append(picture)
    return loaded