Skip to content

Commit

Permalink
general progress on annotations
Browse files: browse the repository at this point in the history
 * finish utils
 * annotate more of pdfinterp, pdfdevice
 * document reason for # type: ignore comments
 * fix cyclic imports
 * satisfy flake8
  • Loading branch information
0xabu committed Aug 20, 2021
1 parent 17d59f4 commit 0e6871c
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 110 deletions.
28 changes: 18 additions & 10 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import io
import logging
from pdfminer.pdftypes import PDFStream
from pdfminer.pdfcolor import PDFColorSpace
from typing import List
from pdfminer.pdfpage import PDFPage
import re
import sys

from . import utils
from .layout import LTChar, LTLayoutContainer
from .layout import LAParams
from .layout import LTChar
from .layout import LTContainer
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTLayoutContainer
from .layout import LTLine
from .layout import LTPage
from .layout import LTRect
Expand All @@ -21,9 +22,13 @@
from .layout import LTTextGroup
from .layout import LTTextLine
from .pdfdevice import PDFTextDevice
from .pdffont import PDFFont, PDFUnicodeNotDefined
from .pdfinterp import PDFResourceManager
from .utils import Matrix, Rect, apply_matrix_pt
from .pdffont import PDFFont
from .pdffont import PDFUnicodeNotDefined
from .pdfinterp import PDFGraphicState, PDFResourceManager
from .pdfpage import PDFPage
from .pdftypes import PDFStream
from .utils import Matrix, Rect
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import enc
from .utils import mult_matrix
Expand All @@ -35,7 +40,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
cur_item: LTLayoutContainer
ctm: Matrix

def __init__(self, rsrcmgr: PDFResourceManager, pageno=1, laparams=None):
def __init__(self, rsrcmgr: PDFResourceManager, pageno: int = 1,
laparams: LAParams = None):
PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno
self.laparams = laparams
Expand Down Expand Up @@ -79,7 +85,8 @@ def render_image(self, name: str, stream: PDFStream) -> None:
self.cur_item.add(item)
return

def paint_path(self, gstate, stroke, fill, evenodd, path):
def paint_path(self, gstate: PDFGraphicState, stroke, fill, evenodd, path
) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path)

Expand Down Expand Up @@ -130,8 +137,9 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
gstate.scolor, gstate.ncolor)
self.cur_item.add(curve)

def render_char(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise, cid, ncs,
graphicstate):
def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float,
scaling: float, rise: float, cid: int, ncs: PDFColorSpace,
graphicstate: PDFGraphicState) -> float:
try:
text = font.to_unichr(cid)
assert isinstance(text, str), str(type(text))
Expand Down
45 changes: 31 additions & 14 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
import heapq
import logging
from pdfminer.pdftypes import PDFStream
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Sequence, Set, Tuple, TypeVar
from typing import (Any, Dict, Generic, Iterable, Iterator, List, Optional,
Sequence, Set, Tuple, TypeVar, cast)

from .utils import INF, Matrix, Rect
from .utils import INF
from .utils import Matrix
from .utils import Rect
from .utils import Plane
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import fsplit
from .utils import get_bound
from .utils import matrix2str
from .utils import uniq
from .pdfcolor import PDFColorSpace
from .pdftypes import PDFStream
from .pdfinterp import PDFGraphicState
from .pdffont import PDFFont

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -282,8 +288,9 @@ def get_text(self) -> str:
class LTChar(LTComponent, LTText):
"""Actual letter in the text as a Unicode string."""

def __init__(self, matrix: Matrix, font, fontsize, scaling, rise,
text: str, textwidth, textdisp, ncs, graphicstate):
def __init__(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise,
text: str, textwidth, textdisp, ncs: PDFColorSpace,
graphicstate: PDFGraphicState):
LTText.__init__(self)
self._text = text
self.matrix = matrix
Expand Down Expand Up @@ -335,8 +342,10 @@ def is_compatible(self, obj: Any) -> bool:
"""Returns True if two characters can coexist in the same line."""
return True


LTContainerElement = TypeVar('LTContainerElement', LTItem, LTComponent)


class LTContainer(LTComponent, Generic[LTContainerElement]):
"""Object that can be extended and analyzed"""

Expand Down Expand Up @@ -430,7 +439,8 @@ def add(self, obj: LTComponent) -> None:
LTTextLine.add(self, obj)
return

def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLineHorizontal"]:
def find_neighbors(self, plane: Plane, ratio: float
) -> List["LTTextLineHorizontal"]:
"""
Finds neighboring LTTextLineHorizontals in the plane.
Expand Down Expand Up @@ -486,7 +496,8 @@ def add(self, obj: LTComponent) -> None:
LTTextLine.add(self, obj)
return

def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLineVertical"]:
def find_neighbors(self, plane: Plane, ratio: float
) -> List["LTTextLineVertical"]:
"""
Finds neighboring LTTextLineVerticals in the plane.
Expand Down Expand Up @@ -600,7 +611,8 @@ def __init__(self, bbox: Rect):
return

# group_objects: group text object to textlines.
def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent]) -> Iterator[LTTextLine]:
def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent]
) -> Iterator[LTTextLine]:
obj0 = None
line = None
for obj1 in objs:
Expand Down Expand Up @@ -670,11 +682,13 @@ def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent]) -> Iter
obj0 = obj1
if line is None:
line = LTTextLineHorizontal(laparams.word_margin)
line.add(obj0) # type: ignore
assert obj0 is not None
line.add(obj0)
yield line
return

def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine]) -> Iterator[LTTextBox]:
def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine]
) -> Iterator[LTTextBox]:
"""Group neighboring lines to textboxes"""
plane = Plane(self.bbox)
plane.extend(lines)
Expand Down Expand Up @@ -705,7 +719,8 @@ def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine]) -> It
yield box
return

def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox]) -> List[LTTextGroup]:
def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox]
) -> List[LTTextGroup]:
"""Group textboxes hierarchically.
Get pair-wise distances, via dist func defined below, and then merge
Expand All @@ -724,7 +739,7 @@ def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox]) -> Lis
:return: a list that has only one element, the final top level group.
"""

def dist(obj1:LTComponent, obj2:LTComponent) -> float:
def dist(obj1: LTComponent, obj2: LTComponent) -> float:
"""A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2.
Expand Down Expand Up @@ -752,7 +767,8 @@ def isany(obj1: LTComponent, obj2: LTComponent) -> Set[LTComponent]:
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))

dists: List[Tuple[bool, float, int, int, LTTextContainer, LTTextContainer]] = []
dists: List[Tuple[bool, float, int, int, LTTextContainer,
LTTextContainer]] = []
for i in range(len(boxes)):
box1 = boxes[i]
for j in range(i+1, len(boxes)):
Expand Down Expand Up @@ -817,7 +833,8 @@ def getkey(box):
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box: box.index)
self._objs = textboxes + otherobjs + empties
self._objs = (cast(List[LTComponent], textboxes) + otherobjs
+ cast(List[LTComponent], empties))
return


Expand Down
5 changes: 3 additions & 2 deletions pdfminer/pdfcolor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import collections
from typing import Dict
from .psparser import LIT


Expand All @@ -9,7 +10,7 @@

class PDFColorSpace:

def __init__(self, name, ncomponents):
def __init__(self, name: str, ncomponents: int):
self.name = name
self.ncomponents = ncomponents
return
Expand All @@ -19,7 +20,7 @@ def __repr__(self):
(self.name, self.ncomponents)


PREDEFINED_COLORSPACE = collections.OrderedDict()
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()

for (name, n) in [
('DeviceGray', 1), # default value first
Expand Down
Loading

0 comments on commit 0e6871c

Please sign in to comment.