diff --git a/pdfminer/converter.py b/pdfminer/converter.py index a7b606e6..d559e71c 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,7 +1,8 @@ import io import logging from pdfminer.pdfcolor import PDFColorSpace -from typing import Any, List, Optional, Sequence, cast +from typing import (Any, BinaryIO, Dict, Generic, List, Optional, Sequence, + TextIO, Tuple, TypeVar, cast) import re from . import utils @@ -11,6 +12,7 @@ from .layout import LTCurve from .layout import LTFigure from .layout import LTImage +from .layout import LTItem from .layout import LTLayoutContainer from .layout import LTLine from .layout import LTPage @@ -26,11 +28,12 @@ from .pdfinterp import PDFGraphicState, PDFResourceManager from .pdfpage import PDFPage from .pdftypes import PDFStream -from .utils import Point, Matrix, Rect, PathSegment +from .utils import AnyIO, Point, Matrix, Rect, PathSegment from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc from .utils import mult_matrix +from .image import ImageWriter log = logging.getLogger(__name__) @@ -161,31 +164,43 @@ def receive_layout(self, ltpage: LTPage) -> None: class PDFPageAggregator(PDFLayoutAnalyzer): - def __init__(self, rsrcmgr, pageno=1, laparams=None): + def __init__(self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None): PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.result = None + self.result: Optional[LTPage] = None return - def receive_layout(self, ltpage): + def receive_layout(self, ltpage: LTPage) -> None: self.result = ltpage return - def get_result(self): + def get_result(self) -> LTPage: + assert self.result is not None return self.result -class PDFConverter(PDFLayoutAnalyzer): - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, - laparams=None): +# Some PDFConverter children support only binary I/O +IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO) + + +class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]): + def __init__(self, + rsrcmgr: PDFResourceManager, + outfp: IOType, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None): PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.outfp = outfp + self.outfp: IOType = outfp self.codec = codec self.outfp_binary = self._is_binary_stream(self.outfp) @staticmethod - def _is_binary_stream(outfp): + def _is_binary_stream(outfp: AnyIO) -> bool: """Test if an stream is binary or not""" if 'b' in getattr(outfp, 'mode', ''): return True @@ -200,11 +215,17 @@ def _is_binary_stream(outfp): return True -class TextConverter(PDFConverter): - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - showpageno=False, imagewriter=None): - PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, - laparams=laparams) +class TextConverter(PDFConverter[AnyIO]): + def __init__(self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + showpageno: bool = False, + imagewriter: Optional[ImageWriter] = None): + super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, + laparams=laparams) self.showpageno = showpageno self.imagewriter = imagewriter return @@ -247,7 +268,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): return -class HTMLConverter(PDFConverter): +class HTMLConverter(PDFConverter[BinaryIO]): RECT_COLORS = { 'figure': 'yellow', 'textline': 'magenta', @@ -262,10 +283,21 @@ class HTMLConverter(PDFConverter): 'char': 'black', } - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, - pagemargin=50, imagewriter=None, debug=0, rect_colors=None, - text_colors=None): + def __init__(self, + rsrcmgr: PDFResourceManager, + outfp: BinaryIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + scale: float = 1, + fontscale: float = 1.0, + layoutmode: str = 'normal', + showpageno: bool = True, + pagemargin: int = 50, + imagewriter: Optional[ImageWriter] = None, + debug: int = 0, + rect_colors: Optional[Dict[str, str]] = None, + text_colors: Optional[Dict[str, str]] = None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) if text_colors is None: @@ -285,18 +317,17 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, self.rect_colors.update(self.RECT_COLORS) self.text_colors.update(self.TEXT_COLORS) self._yoffset = self.pagemargin - self._font = None - self._fontstack = [] + self._font: Optional[Tuple[str, float]] = None + self._fontstack: List[Tuple[str, float]] = [] self.write_header() return - def write(self, text): - if self.codec: - text = text.encode(self.codec) - self.outfp.write(text) + def write(self, text: str) -> None: + textb = text.encode(self.codec) + self.outfp.write(textb) return - def write_header(self): + def write_header(self) -> None: self.write('\n') if self.codec: s = '{}'.format(i, i) for i in range(1, self.pageno)] s = '
Page: %s
\n' % \ @@ -316,7 +347,7 @@ def write_footer(self): self.write('\n') return - def write_text(self, text): + def write_text(self, text: str) -> None: self.write(enc(text)) return @@ -371,14 +402,14 @@ def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False): self.write(s) return - def end_div(self, color): + def end_div(self, color: Any) -> None: if self._font is not None: self.write('') self._font = self._fontstack.pop() self.write('') return - def put_text(self, text, fontname, fontsize): + def put_text(self, text: str, fontname: str, fontsize: float) -> None: font = (fontname, fontsize) if font != self._font: if self._font is not None: @@ -392,7 +423,7 @@ def put_text(self, text, fontname, fontsize): self.write_text(text) return - def put_newline(self): + def put_newline(self) -> None: self.write('
') return @@ -468,31 +499,43 @@ def render(item): self._yoffset += self.pagemargin return - def close(self): + def close(self) -> None: self.write_footer() return -class XMLConverter(PDFConverter): +class XMLConverter(PDFConverter[AnyIO]): CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]') - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - imagewriter=None, stripcontrol=False): + def __init__(self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + imagewriter: Optional[ImageWriter] = None, + stripcontrol: bool = False): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + + # write() assumes a codec for binary I/O, or no codec for text I/O. + if self.outfp_binary == (not self.codec): + raise ValueError("Codec is required for a binary I/O output") + self.imagewriter = imagewriter self.stripcontrol = stripcontrol self.write_header() return - def write(self, text): + def write(self, text: str) -> None: if self.codec: - text = text.encode(self.codec) - self.outfp.write(text) + cast(BinaryIO, self.outfp).write(text.encode(self.codec)) + else: + cast(TextIO, self.outfp).write(text) return - def write_header(self): + def write_header(self) -> None: if self.codec: self.write('\n' % self.codec) else: @@ -500,18 +543,18 @@ def write_header(self): self.write('\n') return - def write_footer(self): + def write_footer(self) -> None: self.write('\n') return - def write_text(self, text): + def write_text(self, text: str) -> None: if self.stripcontrol: text = self.CONTROL.sub('', text) self.write(enc(text)) return - def receive_layout(self, ltpage): - def show_group(item): + def receive_layout(self, ltpage: LTPage) -> None: + def show_group(item: LTItem) -> None: if isinstance(item, LTTextBox): self.write('\n' % (item.index, bbox2str(item.bbox))) @@ -522,7 +565,7 @@ def show_group(item): self.write('\n') return - def render(item): + def render(item: LTItem) -> None: if isinstance(item, LTPage): s = '\n' % \ (item.pageid, bbox2str(item.bbox), item.rotate) @@ -581,7 +624,8 @@ def render(item): self.write('%s\n' % item.get_text()) elif isinstance(item, LTImage): if self.imagewriter is not None: - name = self.imagewriter.export_image(item) + name = self.imagewriter \ + .export_image(item) # type: ignore[no-untyped-call] self.write('\n' % (enc(name), item.width, item.height)) else: @@ -593,6 +637,6 @@ def render(item): render(ltpage) return - def close(self): + def close(self) -> None: self.write_footer() return diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 33f661c0..5c5921b1 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -3,22 +3,34 @@ import logging import sys from io import StringIO +from typing import Any, BinaryIO, Container, Iterator, Optional, cast from .converter import XMLConverter, HTMLConverter, TextConverter, \ PDFPageAggregator from .image import ImageWriter -from .layout import LAParams -from .pdfdevice import TagExtractor +from .layout import LAParams, LTPage +from .pdfdevice import PDFDevice, TagExtractor from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfpage import PDFPage -from .utils import open_filename - - -def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', - laparams=None, maxpages=0, page_numbers=None, - password="", scale=1.0, rotation=0, layoutmode='normal', - output_dir=None, strip_control=False, debug=False, - disable_caching=False, **kwargs): +from .utils import open_filename, FileOrName, AnyIO + + +def extract_text_to_fp(inf: BinaryIO, + outfp: AnyIO, + output_type: str = 'text', + codec: str = 'utf-8', + laparams: Optional[LAParams] = None, + maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", + scale: float = 1.0, + rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, + strip_control: bool = False, + debug: bool = False, + disable_caching: bool = False, + **kwargs: Any) -> None: """Parses text from inf-file and writes to outfp file-like object. Takes loads of optional arguments but the defaults are somewhat sane. @@ -56,7 +68,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', imagewriter = ImageWriter(output_dir) rsrcmgr = PDFResourceManager(caching=not disable_caching) - device = None + device: Optional[PDFDevice] = None if output_type != 'text' and outfp == sys.stdout: outfp = sys.stdout.buffer @@ -71,18 +83,21 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', stripcontrol=strip_control) elif output_type == 'html': - device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, - layoutmode=layoutmode, laparams=laparams, - imagewriter=imagewriter) + # Binary I/O is required, but we have no good way to test it here. + device = HTMLConverter(rsrcmgr, cast(BinaryIO, outfp), codec=codec, + scale=scale, layoutmode=layoutmode, + laparams=laparams, imagewriter=imagewriter) elif output_type == 'tag': - device = TagExtractor(rsrcmgr, outfp, codec=codec) + # Binary I/O is required, but we have no good way to test it here. + device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) else: msg = f"Output type can be text, html, xml or tag but is " \ f"{output_type}" raise ValueError(msg) + assert device is not None interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(inf, page_numbers, @@ -95,8 +110,13 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', device.close() -def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, - caching=True, codec='utf-8', laparams=None): +def extract_text(pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + codec: str = 'utf-8', + laparams: Optional[LAParams] = None) -> str: """Parse and return the text contained in a PDF file. :param pdf_file: Either a file path or a file-like object for the PDF file @@ -114,6 +134,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, laparams = LAParams() with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: + fp = cast(BinaryIO, fp) # we opened in binary mode rsrcmgr = PDFResourceManager(caching=caching) device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) @@ -131,8 +152,12 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, return output_string.getvalue() -def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, - caching=True, laparams=None): +def extract_pages(pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + laparams: Optional[LAParams] = None) -> Iterator[LTPage]: """Extract and yield LTPage objects :param pdf_file: Either a file path or a file-like object for the PDF file @@ -149,6 +174,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, laparams = LAParams() with open_filename(pdf_file, "rb") as fp: + fp = cast(BinaryIO, fp) # we opened in binary mode resource_manager = PDFResourceManager(caching=caching) device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) diff --git a/pdfminer/image.py b/pdfminer/image.py index b0cc0171..eb087e07 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -63,7 +63,7 @@ class ImageWriter: Supports various image types: JPEG, JBIG2 and bitmaps """ - def __init__(self, outdir): + def __init__(self, outdir: str): self.outdir = outdir if not os.path.exists(self.outdir): os.makedirs(self.outdir) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index c3d229c2..22b9dd13 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -4,8 +4,9 @@ import io import pathlib import struct -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, - Set, Tuple, TypeVar, Union, TYPE_CHECKING, cast) +from typing import (Any, BinaryIO, Callable, Dict, Iterable, Iterator, List, + Optional, Set, TextIO, Tuple, TypeVar, Union, + TYPE_CHECKING, cast) from html import escape if TYPE_CHECKING: @@ -18,25 +19,29 @@ INF = (1 << 31) - 1 +FileOrName = Union[pathlib.PurePath, str, io.IOBase] +AnyIO = Union[TextIO, BinaryIO] + + class open_filename(object): """ Context manager that allows opening a filename (str or pathlib.PurePath type is supported) and closes it on exit, (just like `open`), but does nothing for file-like objects. """ - def __init__(self, filename, *args, **kwargs): + def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any): if isinstance(filename, pathlib.PurePath): filename = str(filename) if isinstance(filename, str): - self.file_handler = open(filename, *args, **kwargs) + self.file_handler: AnyIO = open(filename, *args, **kwargs) self.closing = True elif isinstance(filename, io.IOBase): - self.file_handler = filename + self.file_handler = cast(AnyIO, filename) self.closing = False else: raise TypeError('Unsupported input type: %s' % type(filename)) - def __enter__(self): + def __enter__(self) -> AnyIO: return self.file_handler def __exit__(self, exc_type, exc_val, exc_tb):