Skip to content

Commit

Permalink
annotate high_level.py and the immediately-reachable internal APIs (mostly converters)
Browse files Browse the repository at this point in the history
  • Loading branch information
0xabu committed Sep 3, 2021
1 parent cc49051 commit 5401276
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 74 deletions.
140 changes: 92 additions & 48 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import io
import logging
from pdfminer.pdfcolor import PDFColorSpace
from typing import Any, List, Optional, Sequence, cast
from typing import (Any, BinaryIO, Dict, Generic, List, Optional, Sequence,
TextIO, Tuple, TypeVar, cast)
import re

from . import utils
Expand All @@ -11,6 +12,7 @@
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTItem
from .layout import LTLayoutContainer
from .layout import LTLine
from .layout import LTPage
Expand All @@ -26,11 +28,12 @@
from .pdfinterp import PDFGraphicState, PDFResourceManager
from .pdfpage import PDFPage
from .pdftypes import PDFStream
from .utils import Point, Matrix, Rect, PathSegment
from .utils import AnyIO, Point, Matrix, Rect, PathSegment
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import enc
from .utils import mult_matrix
from .image import ImageWriter

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -161,31 +164,43 @@ def receive_layout(self, ltpage: LTPage) -> None:


class PDFPageAggregator(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, pageno=1, laparams=None):
def __init__(self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
self.result = None
self.result: Optional[LTPage] = None
return

def receive_layout(self, ltpage):
def receive_layout(self, ltpage: LTPage) -> None:
self.result = ltpage
return

def get_result(self):
def get_result(self) -> LTPage:
assert self.result is not None
return self.result


class PDFConverter(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None):
# Some PDFConverter children support only binary I/O
IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO)


class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
def __init__(self,
rsrcmgr: PDFResourceManager,
outfp: IOType,
codec: str = 'utf-8',
pageno: int = 1,
laparams: Optional[LAParams] = None):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
self.outfp = outfp
self.outfp: IOType = outfp
self.codec = codec
self.outfp_binary = self._is_binary_stream(self.outfp)

@staticmethod
def _is_binary_stream(outfp):
def _is_binary_stream(outfp: AnyIO) -> bool:
"""Test if a stream is binary or not"""
if 'b' in getattr(outfp, 'mode', ''):
return True
Expand All @@ -200,11 +215,17 @@ def _is_binary_stream(outfp):
return True


class TextConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
class TextConverter(PDFConverter[AnyIO]):
def __init__(self,
rsrcmgr: PDFResourceManager,
outfp: AnyIO,
codec: str = 'utf-8',
pageno: int = 1,
laparams: Optional[LAParams] = None,
showpageno: bool = False,
imagewriter: Optional[ImageWriter] = None):
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
self.showpageno = showpageno
self.imagewriter = imagewriter
return
Expand Down Expand Up @@ -247,7 +268,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
return


class HTMLConverter(PDFConverter):
class HTMLConverter(PDFConverter[BinaryIO]):
RECT_COLORS = {
'figure': 'yellow',
'textline': 'magenta',
Expand All @@ -262,10 +283,21 @@ class HTMLConverter(PDFConverter):
'char': 'black',
}

def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
text_colors=None):
def __init__(self,
rsrcmgr: PDFResourceManager,
outfp: BinaryIO,
codec: str = 'utf-8',
pageno: int = 1,
laparams: Optional[LAParams] = None,
scale: float = 1,
fontscale: float = 1.0,
layoutmode: str = 'normal',
showpageno: bool = True,
pagemargin: int = 50,
imagewriter: Optional[ImageWriter] = None,
debug: int = 0,
rect_colors: Optional[Dict[str, str]] = None,
text_colors: Optional[Dict[str, str]] = None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
if text_colors is None:
Expand All @@ -285,18 +317,17 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
self.rect_colors.update(self.RECT_COLORS)
self.text_colors.update(self.TEXT_COLORS)
self._yoffset = self.pagemargin
self._font = None
self._fontstack = []
self._font: Optional[Tuple[str, float]] = None
self._fontstack: List[Tuple[str, float]] = []
self.write_header()
return

def write(self, text):
if self.codec:
text = text.encode(self.codec)
self.outfp.write(text)
def write(self, text: str) -> None:
textb = text.encode(self.codec)
self.outfp.write(textb)
return

def write_header(self):
def write_header(self) -> None:
self.write('<html><head>\n')
if self.codec:
s = '<meta http-equiv="Content-Type" content="text/html; ' \
Expand All @@ -307,7 +338,7 @@ def write_header(self):
self.write('</head><body>\n')
return

def write_footer(self):
def write_footer(self) -> None:
page_links = ['<a href="#{}">{}</a>'.format(i, i)
for i in range(1, self.pageno)]
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
Expand All @@ -316,7 +347,7 @@ def write_footer(self):
self.write('</body></html>\n')
return

def write_text(self, text):
def write_text(self, text: str) -> None:
self.write(enc(text))
return

Expand Down Expand Up @@ -371,14 +402,14 @@ def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
self.write(s)
return

def end_div(self, color):
def end_div(self, color: Any) -> None:
if self._font is not None:
self.write('</span>')
self._font = self._fontstack.pop()
self.write('</div>')
return

def put_text(self, text, fontname, fontsize):
def put_text(self, text: str, fontname: str, fontsize: float) -> None:
font = (fontname, fontsize)
if font != self._font:
if self._font is not None:
Expand All @@ -392,7 +423,7 @@ def put_text(self, text, fontname, fontsize):
self.write_text(text)
return

def put_newline(self):
def put_newline(self) -> None:
self.write('<br>')
return

Expand Down Expand Up @@ -468,50 +499,62 @@ def render(item):
self._yoffset += self.pagemargin
return

def close(self):
def close(self) -> None:
self.write_footer()
return


class XMLConverter(PDFConverter):
class XMLConverter(PDFConverter[AnyIO]):

CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
imagewriter=None, stripcontrol=False):
def __init__(self,
rsrcmgr: PDFResourceManager,
outfp: AnyIO,
codec: str = 'utf-8',
pageno: int = 1,
laparams: Optional[LAParams] = None,
imagewriter: Optional[ImageWriter] = None,
stripcontrol: bool = False):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)

# write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec):
raise ValueError("Codec is required for a binary I/O output")

self.imagewriter = imagewriter
self.stripcontrol = stripcontrol
self.write_header()
return

def write(self, text):
def write(self, text: str) -> None:
if self.codec:
text = text.encode(self.codec)
self.outfp.write(text)
cast(BinaryIO, self.outfp).write(text.encode(self.codec))
else:
cast(TextIO, self.outfp).write(text)
return

def write_header(self):
def write_header(self) -> None:
if self.codec:
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
else:
self.write('<?xml version="1.0" ?>\n')
self.write('<pages>\n')
return

def write_footer(self):
def write_footer(self) -> None:
self.write('</pages>\n')
return

def write_text(self, text):
def write_text(self, text: str) -> None:
if self.stripcontrol:
text = self.CONTROL.sub('', text)
self.write(enc(text))
return

def receive_layout(self, ltpage):
def show_group(item):
def receive_layout(self, ltpage: LTPage) -> None:
def show_group(item: LTItem) -> None:
if isinstance(item, LTTextBox):
self.write('<textbox id="%d" bbox="%s" />\n' %
(item.index, bbox2str(item.bbox)))
Expand All @@ -522,7 +565,7 @@ def show_group(item):
self.write('</textgroup>\n')
return

def render(item):
def render(item: LTItem) -> None:
if isinstance(item, LTPage):
s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
(item.pageid, bbox2str(item.bbox), item.rotate)
Expand Down Expand Up @@ -581,7 +624,8 @@ def render(item):
self.write('<text>%s</text>\n' % item.get_text())
elif isinstance(item, LTImage):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
name = self.imagewriter \
.export_image(item) # type: ignore[no-untyped-call]
self.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name), item.width, item.height))
else:
Expand All @@ -593,6 +637,6 @@ def render(item):
render(ltpage)
return

def close(self):
def close(self) -> None:
self.write_footer()
return

0 comments on commit 5401276

Please sign in to comment.