diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index a7b606e6..d559e71c 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -1,7 +1,8 @@
import io
import logging
from pdfminer.pdfcolor import PDFColorSpace
-from typing import Any, List, Optional, Sequence, cast
+from typing import (Any, BinaryIO, Dict, Generic, List, Optional, Sequence,
+ TextIO, Tuple, TypeVar, cast)
import re
from . import utils
@@ -11,6 +12,7 @@
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
+from .layout import LTItem
from .layout import LTLayoutContainer
from .layout import LTLine
from .layout import LTPage
@@ -26,11 +28,12 @@
from .pdfinterp import PDFGraphicState, PDFResourceManager
from .pdfpage import PDFPage
from .pdftypes import PDFStream
-from .utils import Point, Matrix, Rect, PathSegment
+from .utils import AnyIO, Point, Matrix, Rect, PathSegment
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import enc
from .utils import mult_matrix
+from .image import ImageWriter
log = logging.getLogger(__name__)
@@ -161,31 +164,43 @@ def receive_layout(self, ltpage: LTPage) -> None:
class PDFPageAggregator(PDFLayoutAnalyzer):
- def __init__(self, rsrcmgr, pageno=1, laparams=None):
+ def __init__(self,
+ rsrcmgr: PDFResourceManager,
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
- self.result = None
+ self.result: Optional[LTPage] = None
return
- def receive_layout(self, ltpage):
+ def receive_layout(self, ltpage: LTPage) -> None:
self.result = ltpage
return
- def get_result(self):
+ def get_result(self) -> LTPage:
+ assert self.result is not None
return self.result
-class PDFConverter(PDFLayoutAnalyzer):
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
- laparams=None):
+# Some PDFConverter children support only binary I/O
+IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO)
+
+
+class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
+ def __init__(self,
+ rsrcmgr: PDFResourceManager,
+ outfp: IOType,
+ codec: str = 'utf-8',
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
- self.outfp = outfp
+ self.outfp: IOType = outfp
self.codec = codec
self.outfp_binary = self._is_binary_stream(self.outfp)
@staticmethod
- def _is_binary_stream(outfp):
+ def _is_binary_stream(outfp: AnyIO) -> bool:
"""Test if an stream is binary or not"""
if 'b' in getattr(outfp, 'mode', ''):
return True
@@ -200,11 +215,17 @@ def _is_binary_stream(outfp):
return True
-class TextConverter(PDFConverter):
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
- showpageno=False, imagewriter=None):
- PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
- laparams=laparams)
+class TextConverter(PDFConverter[AnyIO]):
+ def __init__(self,
+ rsrcmgr: PDFResourceManager,
+ outfp: AnyIO,
+ codec: str = 'utf-8',
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None,
+ showpageno: bool = False,
+ imagewriter: Optional[ImageWriter] = None):
+ super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno,
+ laparams=laparams)
self.showpageno = showpageno
self.imagewriter = imagewriter
return
@@ -247,7 +268,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
return
-class HTMLConverter(PDFConverter):
+class HTMLConverter(PDFConverter[BinaryIO]):
RECT_COLORS = {
'figure': 'yellow',
'textline': 'magenta',
@@ -262,10 +283,21 @@ class HTMLConverter(PDFConverter):
'char': 'black',
}
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
- scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
- pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
- text_colors=None):
+ def __init__(self,
+ rsrcmgr: PDFResourceManager,
+ outfp: BinaryIO,
+ codec: str = 'utf-8',
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None,
+ scale: float = 1,
+ fontscale: float = 1.0,
+ layoutmode: str = 'normal',
+ showpageno: bool = True,
+ pagemargin: int = 50,
+ imagewriter: Optional[ImageWriter] = None,
+ debug: int = 0,
+ rect_colors: Optional[Dict[str, str]] = None,
+ text_colors: Optional[Dict[str, str]] = None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
if text_colors is None:
@@ -285,18 +317,17 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
self.rect_colors.update(self.RECT_COLORS)
self.text_colors.update(self.TEXT_COLORS)
self._yoffset = self.pagemargin
- self._font = None
- self._fontstack = []
+ self._font: Optional[Tuple[str, float]] = None
+ self._fontstack: List[Tuple[str, float]] = []
self.write_header()
return
- def write(self, text):
- if self.codec:
- text = text.encode(self.codec)
- self.outfp.write(text)
+ def write(self, text: str) -> None:
+ textb = text.encode(self.codec)
+ self.outfp.write(textb)
return
- def write_header(self):
+ def write_header(self) -> None:
self.write('
\n')
if self.codec:
s = '{}'.format(i, i)
for i in range(1, self.pageno)]
s = 'Page: %s
\n' % \
@@ -316,7 +347,7 @@ def write_footer(self):
self.write('\n')
return
- def write_text(self, text):
+ def write_text(self, text: str) -> None:
self.write(enc(text))
return
@@ -371,14 +402,14 @@ def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
self.write(s)
return
- def end_div(self, color):
+ def end_div(self, color: Any) -> None:
if self._font is not None:
self.write('')
self._font = self._fontstack.pop()
self.write('')
return
- def put_text(self, text, fontname, fontsize):
+ def put_text(self, text: str, fontname: str, fontsize: float) -> None:
font = (fontname, fontsize)
if font != self._font:
if self._font is not None:
@@ -392,7 +423,7 @@ def put_text(self, text, fontname, fontsize):
self.write_text(text)
return
- def put_newline(self):
+ def put_newline(self) -> None:
self.write('
')
return
@@ -468,31 +499,43 @@ def render(item):
self._yoffset += self.pagemargin
return
- def close(self):
+ def close(self) -> None:
self.write_footer()
return
-class XMLConverter(PDFConverter):
+class XMLConverter(PDFConverter[AnyIO]):
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
- imagewriter=None, stripcontrol=False):
+ def __init__(self,
+ rsrcmgr: PDFResourceManager,
+ outfp: AnyIO,
+ codec: str = 'utf-8',
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None,
+ imagewriter: Optional[ImageWriter] = None,
+ stripcontrol: bool = False):
+ """Set up the XML converter and immediately emit the XML header.
+
+ :param rsrcmgr: forwarded to PDFConverter.__init__.
+ :param outfp: output stream, either binary or text.
+ :param codec: encoding applied by write(); must be set when outfp is
+ binary and must be empty when outfp is text. Note that the default
+ ('utf-8') is therefore only valid together with a binary outfp.
+ :param pageno: forwarded to PDFConverter.__init__.
+ :param laparams: forwarded to PDFConverter.__init__.
+ :param imagewriter: if given, receive_layout() exports LTImage items
+ through it.
+ :param stripcontrol: if true, write_text() removes the characters
+ matched by CONTROL before writing.
+ :raises ValueError: if codec and the kind of outfp do not agree.
+ """
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
+
+ # write() assumes a codec for binary I/O, or no codec for text I/O.
+ # The two mismatches are reported separately so the message names the
+ # combination that was actually passed in.
+ if self.outfp_binary and not self.codec:
+ raise ValueError("Codec is required for a binary I/O output")
+ if not self.outfp_binary and self.codec:
+ raise ValueError(
+ "Codec must not be specified for a text I/O output")
+
self.imagewriter = imagewriter
self.stripcontrol = stripcontrol
self.write_header()
return
- def write(self, text):
+ def write(self, text: str) -> None:
if self.codec:
- text = text.encode(self.codec)
- self.outfp.write(text)
+ cast(BinaryIO, self.outfp).write(text.encode(self.codec))
+ else:
+ cast(TextIO, self.outfp).write(text)
return
- def write_header(self):
+ def write_header(self) -> None:
if self.codec:
self.write('\n' % self.codec)
else:
@@ -500,18 +543,18 @@ def write_header(self):
self.write('\n')
return
- def write_footer(self):
+ def write_footer(self) -> None:
self.write('\n')
return
- def write_text(self, text):
+ def write_text(self, text: str) -> None:
if self.stripcontrol:
text = self.CONTROL.sub('', text)
self.write(enc(text))
return
- def receive_layout(self, ltpage):
- def show_group(item):
+ def receive_layout(self, ltpage: LTPage) -> None:
+ def show_group(item: LTItem) -> None:
if isinstance(item, LTTextBox):
self.write('\n' %
(item.index, bbox2str(item.bbox)))
@@ -522,7 +565,7 @@ def show_group(item):
self.write('\n')
return
- def render(item):
+ def render(item: LTItem) -> None:
if isinstance(item, LTPage):
s = '\n' % \
(item.pageid, bbox2str(item.bbox), item.rotate)
@@ -581,7 +624,8 @@ def render(item):
self.write('%s\n' % item.get_text())
elif isinstance(item, LTImage):
if self.imagewriter is not None:
- name = self.imagewriter.export_image(item)
+ name = self.imagewriter \
+ .export_image(item) # type: ignore[no-untyped-call]
self.write('\n' %
(enc(name), item.width, item.height))
else:
@@ -593,6 +637,6 @@ def render(item):
render(ltpage)
return
- def close(self):
+ def close(self) -> None:
self.write_footer()
return
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
index 33f661c0..5c5921b1 100644
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@@ -3,22 +3,34 @@
import logging
import sys
from io import StringIO
+from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import XMLConverter, HTMLConverter, TextConverter, \
PDFPageAggregator
from .image import ImageWriter
-from .layout import LAParams
-from .pdfdevice import TagExtractor
+from .layout import LAParams, LTPage
+from .pdfdevice import PDFDevice, TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
-from .utils import open_filename
-
-
-def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
- laparams=None, maxpages=0, page_numbers=None,
- password="", scale=1.0, rotation=0, layoutmode='normal',
- output_dir=None, strip_control=False, debug=False,
- disable_caching=False, **kwargs):
+from .utils import open_filename, FileOrName, AnyIO
+
+
+def extract_text_to_fp(inf: BinaryIO,
+ outfp: AnyIO,
+ output_type: str = 'text',
+ codec: str = 'utf-8',
+ laparams: Optional[LAParams] = None,
+ maxpages: int = 0,
+ page_numbers: Optional[Container[int]] = None,
+ password: str = "",
+ scale: float = 1.0,
+ rotation: int = 0,
+ layoutmode: str = 'normal',
+ output_dir: Optional[str] = None,
+ strip_control: bool = False,
+ debug: bool = False,
+ disable_caching: bool = False,
+ **kwargs: Any) -> None:
"""Parses text from inf-file and writes to outfp file-like object.
Takes loads of optional arguments but the defaults are somewhat sane.
@@ -56,7 +68,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
imagewriter = ImageWriter(output_dir)
rsrcmgr = PDFResourceManager(caching=not disable_caching)
- device = None
+ device: Optional[PDFDevice] = None
if output_type != 'text' and outfp == sys.stdout:
outfp = sys.stdout.buffer
@@ -71,18 +83,21 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
stripcontrol=strip_control)
elif output_type == 'html':
- device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
- layoutmode=layoutmode, laparams=laparams,
- imagewriter=imagewriter)
+ # Binary I/O is required, but we have no good way to test it here.
+ device = HTMLConverter(rsrcmgr, cast(BinaryIO, outfp), codec=codec,
+ scale=scale, layoutmode=layoutmode,
+ laparams=laparams, imagewriter=imagewriter)
elif output_type == 'tag':
- device = TagExtractor(rsrcmgr, outfp, codec=codec)
+ # Binary I/O is required, but we have no good way to test it here.
+ device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
else:
msg = f"Output type can be text, html, xml or tag but is " \
f"{output_type}"
raise ValueError(msg)
+ assert device is not None
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf,
page_numbers,
@@ -95,8 +110,13 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
device.close()
-def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
- caching=True, codec='utf-8', laparams=None):
+def extract_text(pdf_file: FileOrName,
+ password: str = '',
+ page_numbers: Optional[Container[int]] = None,
+ maxpages: int = 0,
+ caching: bool = True,
+ codec: str = 'utf-8',
+ laparams: Optional[LAParams] = None) -> str:
"""Parse and return the text contained in a PDF file.
:param pdf_file: Either a file path or a file-like object for the PDF file
@@ -114,6 +134,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
laparams = LAParams()
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
+ fp = cast(BinaryIO, fp) # we opened in binary mode
rsrcmgr = PDFResourceManager(caching=caching)
device = TextConverter(rsrcmgr, output_string, codec=codec,
laparams=laparams)
@@ -131,8 +152,12 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
return output_string.getvalue()
-def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
- caching=True, laparams=None):
+def extract_pages(pdf_file: FileOrName,
+ password: str = '',
+ page_numbers: Optional[Container[int]] = None,
+ maxpages: int = 0,
+ caching: bool = True,
+ laparams: Optional[LAParams] = None) -> Iterator[LTPage]:
"""Extract and yield LTPage objects
:param pdf_file: Either a file path or a file-like object for the PDF file
@@ -149,6 +174,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
laparams = LAParams()
with open_filename(pdf_file, "rb") as fp:
+ fp = cast(BinaryIO, fp) # we opened in binary mode
resource_manager = PDFResourceManager(caching=caching)
device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device)
diff --git a/pdfminer/image.py b/pdfminer/image.py
index b0cc0171..eb087e07 100644
--- a/pdfminer/image.py
+++ b/pdfminer/image.py
@@ -63,7 +63,7 @@ class ImageWriter:
Supports various image types: JPEG, JBIG2 and bitmaps
"""
- def __init__(self, outdir):
+ def __init__(self, outdir: str):
self.outdir = outdir
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index c3d229c2..22b9dd13 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -4,8 +4,9 @@
import io
import pathlib
import struct
-from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
- Set, Tuple, TypeVar, Union, TYPE_CHECKING, cast)
+from typing import (Any, BinaryIO, Callable, Dict, Iterable, Iterator, List,
+ Optional, Set, TextIO, Tuple, TypeVar, Union,
+ TYPE_CHECKING, cast)
from html import escape
if TYPE_CHECKING:
@@ -18,25 +19,29 @@
INF = (1 << 31) - 1
+FileOrName = Union[pathlib.PurePath, str, io.IOBase]
+AnyIO = Union[TextIO, BinaryIO]
+
+
class open_filename(object):
"""
Context manager that allows opening a filename
(str or pathlib.PurePath type is supported) and closes it on exit,
(just like `open`), but does nothing for file-like objects.
"""
- def __init__(self, filename, *args, **kwargs):
+ def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any):
if isinstance(filename, pathlib.PurePath):
filename = str(filename)
if isinstance(filename, str):
- self.file_handler = open(filename, *args, **kwargs)
+ self.file_handler: AnyIO = open(filename, *args, **kwargs)
self.closing = True
elif isinstance(filename, io.IOBase):
- self.file_handler = filename
+ self.file_handler = cast(AnyIO, filename)
self.closing = False
else:
raise TypeError('Unsupported input type: %s' % type(filename))
- def __enter__(self):
+ def __enter__(self) -> AnyIO:
return self.file_handler
def __exit__(self, exc_type, exc_val, exc_tb):