Skip to content

Commit

Permalink
fix text output from HTMLConverter
Browse files: browse the repository at this point in the history
  • Loading branch information
0xabu committed Sep 3, 2021
1 parent 5401276 commit da03afe
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
15 changes: 11 additions & 4 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
return


class HTMLConverter(PDFConverter[BinaryIO]):
class HTMLConverter(PDFConverter[AnyIO]):
RECT_COLORS = {
'figure': 'yellow',
'textline': 'magenta',
Expand All @@ -285,7 +285,7 @@ class HTMLConverter(PDFConverter[BinaryIO]):

def __init__(self,
rsrcmgr: PDFResourceManager,
outfp: BinaryIO,
outfp: AnyIO,
codec: str = 'utf-8',
pageno: int = 1,
laparams: Optional[LAParams] = None,
Expand All @@ -300,6 +300,11 @@ def __init__(self,
text_colors: Optional[Dict[str, str]] = None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)

# write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec):
raise ValueError("Codec is required for a binary I/O output")

if text_colors is None:
text_colors = {'char': 'black'}
if rect_colors is None:
Expand All @@ -323,8 +328,10 @@ def __init__(self,
return

def write(self, text: str) -> None:
textb = text.encode(self.codec)
self.outfp.write(textb)
if self.codec:
cast(BinaryIO, self.outfp).write(text.encode(self.codec))
else:
cast(TextIO, self.outfp).write(text)
return

def write_header(self) -> None:
Expand Down
7 changes: 3 additions & 4 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,9 @@ def extract_text_to_fp(inf: BinaryIO,
stripcontrol=strip_control)

elif output_type == 'html':
# Binary I/O is required, but we have no good way to test it here.
device = HTMLConverter(rsrcmgr, cast(BinaryIO, outfp), codec=codec,
scale=scale, layoutmode=layoutmode,
laparams=laparams, imagewriter=imagewriter)
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)

elif output_type == 'tag':
# Binary I/O is required, but we have no good way to test it here.
Expand Down

0 comments on commit da03afe

Please sign in to comment.