From 0e6871c16abb29df2868ab145b4ce451b4b6c777 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Fri, 20 Aug 2021 16:54:46 -0700 Subject: [PATCH] general progress on annotations * finish utils * annotate more of pdfinterp, pdfdevice * document reason for # type: ignore comments * fix cyclic imports * satisfy flake8 --- pdfminer/converter.py | 28 ++++++++++------ pdfminer/layout.py | 45 +++++++++++++++++-------- pdfminer/pdfcolor.py | 5 +-- pdfminer/pdfdevice.py | 76 ++++++++++++++++++++++++++++--------------- pdfminer/pdfinterp.py | 70 ++++++++++++++++++++++++--------------- pdfminer/pdfpage.py | 11 ++++--- pdfminer/pdfparser.py | 3 +- pdfminer/psparser.py | 4 +-- pdfminer/utils.py | 55 ++++++++++++++++++------------- 9 files changed, 187 insertions(+), 110 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index f5b44716..ec1735bf 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,17 +1,18 @@ import io import logging -from pdfminer.pdftypes import PDFStream +from pdfminer.pdfcolor import PDFColorSpace from typing import List -from pdfminer.pdfpage import PDFPage import re import sys from . import utils -from .layout import LTChar, LTLayoutContainer +from .layout import LAParams +from .layout import LTChar from .layout import LTContainer from .layout import LTCurve from .layout import LTFigure from .layout import LTImage +from .layout import LTLayoutContainer from .layout import LTLine from .layout import LTPage from .layout import LTRect @@ -21,9 +22,13 @@ from .layout import LTTextGroup from .layout import LTTextLine from .pdfdevice import PDFTextDevice -from .pdffont import PDFFont, PDFUnicodeNotDefined -from .pdfinterp import PDFResourceManager -from .utils import Matrix, Rect, apply_matrix_pt +from .pdffont import PDFFont +from .pdffont import PDFUnicodeNotDefined +from .pdfinterp import PDFGraphicState, PDFResourceManager +from .pdfpage import PDFPage +from .pdftypes import PDFStream +from .utils import Matrix, Rect +from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc from .utils import mult_matrix @@ -35,7 +40,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): cur_item: LTLayoutContainer ctm: Matrix - def __init__(self, rsrcmgr: PDFResourceManager, pageno=1, laparams=None): + def __init__(self, rsrcmgr: PDFResourceManager, pageno: int = 1, + laparams: LAParams = None): PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno self.laparams = laparams @@ -79,7 +85,8 @@ def render_image(self, name: str, stream: PDFStream) -> None: self.cur_item.add(item) return - def paint_path(self, gstate, stroke, fill, evenodd, path): + def paint_path(self, gstate: PDFGraphicState, stroke, fill, evenodd, path + ) -> None: """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) @@ -130,8 +137,9 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): gstate.scolor, gstate.ncolor) self.cur_item.add(curve) - def render_char(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise, cid, ncs, - graphicstate): + def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float, + scaling: float, rise: float, cid: int, ncs: PDFColorSpace, + graphicstate: PDFGraphicState) -> float: try: text = font.to_unichr(cid) assert isinstance(text, str), str(type(text)) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 490a8400..634ba43a 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,9 +1,11 @@ import heapq import logging -from pdfminer.pdftypes import PDFStream -from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Sequence, Set, Tuple, TypeVar +from typing import (Any, Dict, Generic, Iterable, Iterator, List, Optional, + Sequence, Set, Tuple, TypeVar, cast) -from .utils import INF, Matrix, Rect +from .utils import INF +from .utils import Matrix +from .utils import Rect from .utils import Plane from .utils import apply_matrix_pt from .utils import bbox2str @@ -11,6 +13,10 @@ from .utils import get_bound from .utils import matrix2str from .utils import uniq +from .pdfcolor import PDFColorSpace +from .pdftypes import PDFStream +from .pdfinterp import PDFGraphicState +from .pdffont import PDFFont logger = logging.getLogger(__name__) @@ -282,8 +288,9 @@ def get_text(self) -> str: class LTChar(LTComponent, LTText): """Actual letter in the text as a Unicode string.""" - def __init__(self, matrix: Matrix, font, fontsize, scaling, rise, - text: str, textwidth, textdisp, ncs, graphicstate): + def __init__(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise, + text: str, textwidth, textdisp, ncs: PDFColorSpace, + graphicstate: PDFGraphicState): LTText.__init__(self) self._text = text self.matrix = matrix @@ -335,8 +342,10 @@ def is_compatible(self, obj: Any) -> bool: """Returns True if two characters can coexist in the same line.""" return True + LTContainerElement = TypeVar('LTContainerElement', LTItem, LTComponent) + class LTContainer(LTComponent, Generic[LTContainerElement]): """Object that can be extended and analyzed""" @@ -430,7 +439,8 @@ def add(self, obj: LTComponent) -> None: LTTextLine.add(self, obj) return - def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLineHorizontal"]: + def find_neighbors(self, plane: Plane, ratio: float + ) -> List["LTTextLineHorizontal"]: """ Finds neighboring LTTextLineHorizontals in the plane. @@ -486,7 +496,8 @@ def add(self, obj: LTComponent) -> None: LTTextLine.add(self, obj) return - def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLineVertical"]: + def find_neighbors(self, plane: Plane, ratio: float + ) -> List["LTTextLineVertical"]: """ Finds neighboring LTTextLineVerticals in the plane. @@ -600,7 +611,8 @@ def __init__(self, bbox: Rect): return # group_objects: group text object to textlines. - def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent]) -> Iterator[LTTextLine]: + def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent] + ) -> Iterator[LTTextLine]: obj0 = None line = None for obj1 in objs: @@ -670,11 +682,13 @@ def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent]) -> Iter obj0 = obj1 if line is None: line = LTTextLineHorizontal(laparams.word_margin) - line.add(obj0) # type: ignore + assert obj0 is not None + line.add(obj0) yield line return - def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine]) -> Iterator[LTTextBox]: + def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine] + ) -> Iterator[LTTextBox]: """Group neighboring lines to textboxes""" plane = Plane(self.bbox) plane.extend(lines) @@ -705,7 +719,8 @@ def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine]) -> It yield box return - def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox]) -> List[LTTextGroup]: + def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox] + ) -> List[LTTextGroup]: """Group textboxes hierarchically. Get pair-wise distances, via dist func defined below, and then merge @@ -724,7 +739,7 @@ def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox]) -> Lis :return: a list that has only one element, the final top level group. """ - def dist(obj1:LTComponent, obj2:LTComponent) -> float: + def dist(obj1: LTComponent, obj2: LTComponent) -> float: """A distance function between two TextBoxes. Consider the bounding rectangle for obj1 and obj2. @@ -752,7 +767,8 @@ def isany(obj1: LTComponent, obj2: LTComponent) -> Set[LTComponent]: objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) - dists: List[Tuple[bool, float, int, int, LTTextContainer, LTTextContainer]] = [] + dists: List[Tuple[bool, float, int, int, LTTextContainer, + LTTextContainer]] = [] for i in range(len(boxes)): box1 = boxes[i] for j in range(i+1, len(boxes)): @@ -817,7 +833,8 @@ def getkey(box): group.analyze(laparams) assigner.run(group) textboxes.sort(key=lambda box: box.index) - self._objs = textboxes + otherobjs + empties + self._objs = (cast(List[LTComponent], textboxes) + otherobjs + + cast(List[LTComponent], empties)) return diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index ff28d54e..f6aa442d 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,4 +1,5 @@ import collections +from typing import Dict from .psparser import LIT @@ -9,7 +10,7 @@ class PDFColorSpace: - def __init__(self, name, ncomponents): + def __init__(self, name: str, ncomponents: int): self.name = name self.ncomponents = ncomponents return @@ -19,7 +20,7 @@ def __repr__(self): (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = collections.OrderedDict() +PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict() for (name, n) in [ ('DeviceGray', 1), # default value first diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 20e62efa..e800d555 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,18 +1,24 @@ -from pdfminer.pdftypes import PDFStream -from typing import List, Any, Optional +from typing import (Any, IO, Iterable, List, Optional, Sequence, Tuple, + TYPE_CHECKING) from . import utils -from .utils import Matrix, Rect -from .psparser import PSObject +from .utils import Matrix, Point, Rect +from .pdfcolor import PDFColorSpace +from .pdffont import PDFFont from .pdffont import PDFUnicodeNotDefined from .pdfpage import PDFPage -from .pdfinterp import PDFPageInterpreter, PDFResourceManager +from .pdftypes import PDFStream + +if TYPE_CHECKING: + from .pdfinterp import PDFGraphicState + from .pdfinterp import PDFResourceManager + from .pdfinterp import PDFTextState class PDFDevice: """Translate the output of PDFPageInterpreter to the output that is needed """ - def __init__(self, rsrcmgr: PDFResourceManager): + def __init__(self, rsrcmgr: "PDFResourceManager"): self.rsrcmgr = rsrcmgr self.ctm: Optional[Matrix] = None return @@ -26,7 +32,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): self.close() - def close(self): + def close(self) -> None: return def set_ctm(self, ctm: Matrix) -> None: @@ -54,19 +60,26 @@ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: def end_figure(self, name: str) -> None: return - def paint_path(self, graphicstate, stroke, fill, evenodd, path) -> None: + def paint_path(self, graphicstate: "PDFGraphicState", stroke: bool, + fill: bool, evenodd: bool, + path: Sequence[Tuple[str, float, float]]) -> None: return def render_image(self, name: str, stream: PDFStream) -> None: return - def render_string(self, textstate, seq, ncs, graphicstate) -> None: + def render_string(self, textstate: "PDFTextState", seq: Iterable, + ncs: PDFColorSpace, graphicstate: "PDFGraphicState" + ) -> None: return class PDFTextDevice(PDFDevice): - def render_string(self, textstate, seq, ncs, graphicstate): + def render_string(self, textstate: "PDFTextState", seq: Iterable, + ncs: PDFColorSpace, graphicstate: "PDFGraphicState" + ) -> None: + assert self.ctm is not None matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize @@ -74,6 +87,7 @@ def render_string(self, textstate, seq, ncs, graphicstate): charspace = textstate.charspace * scaling wordspace = textstate.wordspace * scaling rise = textstate.rise + assert font is not None if font.is_multibyte(): wordspace = 0 dxscale = .001 * fontsize * scaling @@ -89,9 +103,12 @@ def render_string(self, textstate, seq, ncs, graphicstate): graphicstate) return - def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): + def render_string_horizontal(self, seq: Iterable, matrix: Matrix, + pos: Point, font: PDFFont, fontsize: float, + scaling: float, charspace: float, + wordspace: float, rise: float, dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState") -> Point: (x, y) = pos needcharspace = False for obj in seq: @@ -110,9 +127,11 @@ def render_string_horizontal(self, seq, matrix, pos, needcharspace = True return (x, y) - def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): + def render_string_vertical(self, seq: Iterable, matrix: Matrix, pos: Point, + font: PDFFont, fontsize: float, scaling: float, + charspace: float, wordspace: float, rise: float, + dxscale: float, ncs: PDFColorSpace, + graphicstate: "PDFGraphicState") -> Point: (x, y) = pos needcharspace = False for obj in seq: @@ -131,23 +150,28 @@ def render_string_vertical(self, seq, matrix, pos, needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, - graphicstate): + def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float, + scaling: float, rise: float, cid: int, ncs: PDFColorSpace, + graphicstate: "PDFGraphicState") -> float: return 0 class TagExtractor(PDFDevice): - def __init__(self, rsrcmgr: PDFResourceManager, outfp, codec='utf-8'): + def __init__(self, rsrcmgr: "PDFResourceManager", outfp: IO, + codec: str = 'utf-8'): PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec self.pageno = 0 - self._stack: List[PSObject] = [] + self._stack: List[Any] = [] return - def render_string(self, textstate, seq, ncs, graphicstate): + def render_string(self, textstate: "PDFTextState", seq: Iterable, + ncs: PDFColorSpace, graphicstate: "PDFGraphicState" + ) -> None: font = textstate.font + assert font is not None text = '' for obj in seq: if isinstance(obj, str): @@ -165,18 +189,18 @@ def render_string(self, textstate, seq, ncs, graphicstate): self.outfp.write(utils.enc(text)) return - def begin_page(self, page, ctm): + def begin_page(self, page: PDFPage, ctm: Matrix) -> None: output = '' %\ (self.pageno, utils.bbox2str(page.mediabox), page.rotate) self.outfp.write(utils.make_compat_bytes(output)) return - def end_page(self, page): + def end_page(self, page: PDFPage) -> None: self.outfp.write(utils.make_compat_bytes('\n')) self.pageno += 1 return - def begin_tag(self, tag: Any, props=None): + def begin_tag(self, tag: Any, props=None) -> None: s = '' if isinstance(props, dict): s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v))) @@ -186,14 +210,14 @@ def begin_tag(self, tag: Any, props=None): self._stack.append(tag) return - def end_tag(self): + def end_tag(self) -> None: assert self._stack, str(self.pageno) tag = self._stack.pop(-1) out_s = '' % utils.enc(tag.name) self.outfp.write(utils.make_compat_bytes(out_s)) return - def do_tag(self, tag: Any, props=None): + def do_tag(self, tag: Any, props=None) -> None: self.begin_tag(tag, props) self._stack.pop(-1) return diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 078914bf..f4e7cb20 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,11 +1,11 @@ import re import logging -from typing import Any, Dict, List, Sequence, Tuple +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple from io import BytesIO from .cmapdb import CMapDB from .cmapdb import CMap from .cmapdb import CMapBase -from .psparser import PSLiteral, PSParserToken +from .psparser import PSParserToken from .psparser import PSTypeError from .psparser import PSEOF from .psparser import PSKeyword @@ -32,7 +32,8 @@ from .pdffont import PDFCIDFont from .pdfcolor import PDFColorSpace from .pdfcolor import PREDEFINED_COLORSPACE -from .utils import Matrix, choplist +from .utils import Matrix, Point +from .utils import choplist from .utils import mult_matrix from .utils import MATRIX_IDENTITY @@ -56,16 +57,18 @@ class PDFInterpreterError(PDFException): class PDFTextState: + matrix: Matrix + linematrix: Point def __init__(self): - self.font = None - self.fontsize = 0 - self.charspace = 0 - self.wordspace = 0 - self.scaling = 100 - self.leading = 0 - self.render = 0 - self.rise = 0 + self.font: Optional[PDFFont] = None + self.fontsize: float = 0 + self.charspace: float = 0 + self.wordspace: float = 0 + self.scaling: float = 100 + self.leading: float = 0 + self.render: float = 0 + self.rise: float = 0 self.reset() # self.matrix is set # self.linematrix is set @@ -102,7 +105,7 @@ def reset(self) -> None: class PDFGraphicState: def __init__(self): - self.linewidth = 0 + self.linewidth: int = 0 self.linecap = None self.linejoin = None self.miterlimit = None @@ -220,6 +223,9 @@ class PDFContentParser(PSStackParser): def __init__(self, streams: List[Any]): self.streams = streams self.istream = 0 + # PSStackParser.__init__(fp=None) is safe only because we've overloaded + # all the methods that would attempt to access self.fp without first + # calling self.fillfp(). PSStackParser.__init__(self, None) # type: ignore return @@ -251,7 +257,8 @@ def fillbuf(self) -> None: self.charpos = 0 return - def get_inline_data(self, pos: int, target: bytes = b'EI') -> Tuple[int, bytes]: + def get_inline_data(self, pos: int, target: bytes = b'EI' + ) -> Tuple[int, bytes]: self.seek(pos) i = 0 data = b'' @@ -324,19 +331,19 @@ def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice): self.device = device return - def dup(self): + def dup(self) -> "PDFPageInterpreter": return self.__class__(self.rsrcmgr, self.device) - def init_resources(self, resources): + def init_resources(self, resources) -> None: """Prepare the fonts and XObjects listed in the Resource attribute.""" self.resources = resources self.fontmap = {} self.xobjmap = {} - self.csmap = PREDEFINED_COLORSPACE.copy() + self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() if not resources: return - def get_colorspace(spec): + def get_colorspace(spec) -> Optional[PDFColorSpace]: if isinstance(spec, list): name = literal_name(spec[0]) else: @@ -360,7 +367,9 @@ def get_colorspace(spec): self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid, spec) in dict_value(v).items(): - self.csmap[csid] = get_colorspace(resolve1(spec)) + colorspace = get_colorspace(resolve1(spec)) + if colorspace is not None: + self.csmap[csid] = colorspace elif k == 'ProcSet': self.rsrcmgr.get_procset(list_value(v)) elif k == 'XObject': @@ -370,7 +379,8 @@ def get_colorspace(spec): def init_state(self, ctm: Matrix) -> None: """Initialize the text and graphic states for rendering a page.""" - self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] # stack for graphical states. + # gstack: stack for graphical states. + self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] self.ctm = ctm self.device.set_ctm(self.ctm) self.textstate = PDFTextState() @@ -379,7 +389,8 @@ def init_state(self, ctm: Matrix) -> None: # argstack: stack for command arguments. self.argstack: List[Any] = [] # set some global states. - self.scs = self.ncs = None + self.scs: Optional[PDFColorSpace] = None + self.ncs: Optional[PDFColorSpace] = None if self.csmap: self.scs = self.ncs = next(iter(self.csmap.values())) return @@ -395,26 +406,29 @@ def pop(self, n: int) -> Any: self.argstack = self.argstack[:-n] return x - def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: + def get_current_state(self) -> Tuple[Matrix, PDFTextState, + PDFGraphicState]: return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - def set_current_state(self, state: Tuple[Matrix, PDFTextState, PDFGraphicState]) -> None: + def set_current_state(self, state: Tuple[Matrix, PDFTextState, + PDFGraphicState]) -> None: (self.ctm, self.textstate, self.graphicstate) = state self.device.set_ctm(self.ctm) return - def do_q(self): + def do_q(self) -> None: """Save graphics state""" self.gstack.append(self.get_current_state()) return - def do_Q(self): + def do_Q(self) -> None: """Restore graphics state""" if self.gstack: self.set_current_state(self.gstack.pop()) return - def do_cm(self, a1, b1, c1, d1, e1, f1): + def do_cm(self, a1: float, b1: float, c1: float, d1: float, e1: float, + f1: float) -> None: """Concatenate matrix to current transformation matrix""" self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm) self.device.set_ctm(self.ctm) @@ -799,12 +813,13 @@ def do_T_a(self): self.textstate.linematrix = (0, 0) return - def do_TJ(self, seq): + def do_TJ(self, seq: Iterable): """Show text, allowing individual glyph positioning""" if self.textstate.font is None: if settings.STRICT: raise PDFInterpreterError('No font specified!') return + assert self.ncs is not None self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) return @@ -902,7 +917,8 @@ def process_page(self, page: PDFPage) -> None: self.device.end_page(page) return - def render_contents(self, resources, streams, ctm: Matrix = MATRIX_IDENTITY): + def render_contents(self, resources, streams: Sequence, + ctm: Matrix = MATRIX_IDENTITY) -> None: """Render the content streams. This method may be called recursively. diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index ae6d3399..b5b89a53 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Iterator +from typing import Any, BinaryIO, Container, Iterator, List, Optional import warnings from . import settings from .psparser import LIT @@ -67,7 +67,7 @@ def __init__(self, doc: PDFDocument, pageid: Any, attrs): contents = [] if not isinstance(contents, list): contents = [contents] - self.contents = contents + self.contents: List = contents return def __repr__(self) -> str: @@ -120,9 +120,10 @@ def search(obj, parent): return @classmethod - def get_pages(cls, fp, - pagenos=None, maxpages: int = 0, password='', - caching=True, check_extractable=False) -> Iterator["PDFPage"]: + def get_pages(cls, fp: BinaryIO, + pagenos: Optional[Container[int]] = None, maxpages: int = 0, + password: str = '', caching: bool = True, + check_extractable: bool = False) -> Iterator["PDFPage"]: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index ee64c2ee..18c04272 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,5 +1,6 @@ import logging from io import BytesIO +from typing import BinaryIO from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF @@ -35,7 +36,7 @@ class PDFParser(PSStackParser): """ - def __init__(self, fp): + def __init__(self, fp: BinaryIO): PSStackParser.__init__(self, fp) self.doc = None self.fallback = False diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index d419329d..caed19df 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -120,7 +120,7 @@ def intern(self, name: PSLiteral.NameType) -> _SymbolT: KEYWORD_DICT_END = KWD(b'>>') -def literal_name(x: PSLiteral) -> PSLiteral.NameType: +def literal_name(x: Any) -> Any: if not isinstance(x, PSLiteral): if settings.STRICT: raise PSTypeError('Literal required: {!r}'.format(x)) @@ -136,7 +136,7 @@ def literal_name(x: PSLiteral) -> PSLiteral.NameType: return name -def keyword_name(x: PSKeyword) -> str: +def keyword_name(x: Any) -> Any: if not isinstance(x, PSKeyword): if settings.STRICT: raise PSTypeError('Keyword required: %r' % x) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index d33493d4..807ce11b 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -4,9 +4,12 @@ import io import pathlib import struct -from typing import Any, AnyStr, Dict, Iterable, Iterator, List, Set, Tuple +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, + Set, Tuple, TypeVar, Union, TYPE_CHECKING) from html import escape -from .layout import LTComponent + +if TYPE_CHECKING: + from .layout import LTComponent import chardet # For str encoding detection @@ -42,13 +45,13 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False -def make_compat_bytes(in_str): +def make_compat_bytes(in_str: str) -> bytes: "Converts to bytes, encoding to unicode." assert isinstance(in_str, str), str(type(in_str)) return in_str.encode() -def make_compat_str(in_str): +def make_compat_str(in_str: Union[bytes, str]) -> str: """Converts to string, guessing encoding.""" assert isinstance(in_str, (bytes, str)), str(type(in_str)) if isinstance(in_str, bytes): @@ -57,7 +60,7 @@ def make_compat_str(in_str): return in_str -def shorten_str(s, size): +def shorten_str(s: str, size: int) -> str: if size < 7: return s[:size] if len(s) > size: @@ -67,8 +70,8 @@ def shorten_str(s, size): return s -def compatible_encode_method(bytesorstring, encoding='utf-8', - erraction='ignore'): +def compatible_encode_method(bytesorstring: Union[bytes, str], + encoding='utf-8', erraction='ignore') -> str: """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either. @@ -79,7 +82,8 @@ def compatible_encode_method(bytesorstring, encoding='utf-8', return bytesorstring.decode(encoding, erraction) -def apply_png_predictor(pred, colors, columns, bitspercomponent, data): +def apply_png_predictor(pred: Any, colors: int, columns: int, + bitspercomponent: int, data: bytes) -> bytes: if bitspercomponent != 8: # unsupported raise ValueError("Unsupported `bitspercomponent': %d" % @@ -164,7 +168,10 @@ def isnumber(x: Any) -> bool: return isinstance(x, (int, float)) -def uniq(objs: Iterable[Any]) -> Iterator[Any]: +_T = TypeVar('_T') + + +def uniq(objs: Iterable[_T]) -> Iterator[_T]: """Eliminates duplicated elements.""" done = set() for obj in objs: @@ -175,7 +182,8 @@ def uniq(objs: Iterable[Any]) -> Iterator[Any]: return -def fsplit(pred, objs): +def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T] + ) -> Tuple[List[_T], List[_T]]: """Split a list into two classes according to the predicate.""" t = [] f = [] @@ -187,7 +195,7 @@ def fsplit(pred, objs): return t, f -def drange(v0, v1, d): +def drange(v0: float, v1: float, d: int) -> range: """Returns a discrete range.""" return range(int(v0) // d, int(v1 + d) // d) @@ -204,7 +212,8 @@ def get_bound(pts: Iterable[Point]) -> Rect: return x0, y0, x1, y1 -def pick(seq, func, maxobj=None): +def pick(seq: Iterable[_T], func: Callable[[_T], float], + maxobj: Optional[_T] = None) -> Optional[_T]: """Picks the object obj where func(obj) has the highest value.""" maxscore = None for obj in seq: @@ -214,7 +223,7 @@ def pick(seq, func, maxobj=None): return maxobj -def choplist(n, seq): +def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]: """Groups every n elements of the list.""" r = [] for x in seq: @@ -288,7 +297,7 @@ def decode_text(s: bytes) -> str: return ''.join(PDFDocEncoding[c] for c in s) -def enc(x): +def enc(x: str) -> str: """Encodes a string for SGML/XML/HTML""" if isinstance(x, bytes): return '' @@ -306,7 +315,7 @@ def matrix2str(m: Matrix) -> str: .format(a, b, c, d, e, f) -def vecBetweenBoxes(obj1: LTComponent, obj2: LTComponent) -> Point: +def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point: """A distance function between two TextBoxes. Consider the bounding rectangle for obj1 and obj2. @@ -341,9 +350,9 @@ class Plane: """ def __init__(self, bbox: Rect, gridsize: int = 50): - self._seq: List[LTComponent] = [] # preserve the object order. - self._objs: Set[LTComponent] = set() - self._grid: Dict[Point, List[LTComponent]] = {} + self._seq: List["LTComponent"] = [] # preserve the object order. + self._objs: Set["LTComponent"] = set() + self._grid: Dict[Point, List["LTComponent"]] = {} self.gridsize = gridsize (self.x0, self.y0, self.x1, self.y1) = bbox @@ -371,15 +380,15 @@ def _getrange(self, bbox: Rect) -> Iterator[Point]: for grid_x in drange(x0, x1, self.gridsize): yield (grid_x, grid_y) - def extend(self, objs: Iterable[LTComponent]) -> None: + def extend(self, objs: Iterable["LTComponent"]) -> None: for obj in objs: self.add(obj) - def add(self, obj: LTComponent) -> None: + def add(self, obj: "LTComponent") -> None: """place an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): if k not in self._grid: - r: List[LTComponent] = [] + r: List["LTComponent"] = [] self._grid[k] = r else: r = self._grid[k] @@ -387,7 +396,7 @@ def add(self, obj: LTComponent) -> None: self._seq.append(obj) self._objs.add(obj) - def remove(self, obj: LTComponent) -> None: + def remove(self, obj: "LTComponent") -> None: """displace an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): try: @@ -396,7 +405,7 @@ def remove(self, obj: LTComponent) -> None: pass self._objs.remove(obj) - def find(self, bbox: Rect) -> Iterator[LTComponent]: + def find(self, bbox: Rect) -> Iterator["LTComponent"]: """finds objects that are in a certain area.""" (x0, y0, x1, y1) = bbox done = set()