HTMLConverter: accept text as well as binary output streams

What this patch does:
  * Retypes HTMLConverter and its outfp parameter from BinaryIO to AnyIO.
  * Validates the outfp/codec pairing in HTMLConverter.__init__ and raises
    ValueError on a mismatch, with one message per direction.
  * Makes write() encode and emit bytes when a codec is set, and emit the str
    unchanged when it is not.
  * Passes outfp from extract_text_to_fp() to HTMLConverter without casting
    it to BinaryIO.

Review notes:
  * The check was previously written as
    "if self.outfp_binary == (not self.codec)" with the single message
    "Codec is required for a binary I/O output". That condition is also true
    for a text stream combined with a codec (codec defaults to 'utf-8'), so
    the message was misleading in that case. It is now two explicit checks.
  * self.outfp_binary is not defined anywhere in this patch; presumably it is
    set by PDFConverter.__init__ as a bool. Confirm before merging.
  * Leading whitespace and blank context lines were rebuilt from the hunk
    line counts and the Python structure. Verify them against the target
    tree, or apply with "git apply --ignore-whitespace".
  * The blob IDs on the converter.py index line predate the reworded check.

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index d559e71c..8e627167 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -268,7 +268,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
         return
 
 
-class HTMLConverter(PDFConverter[BinaryIO]):
+class HTMLConverter(PDFConverter[AnyIO]):
     RECT_COLORS = {
         'figure': 'yellow',
         'textline': 'magenta',
@@ -285,7 +285,7 @@ class HTMLConverter(PDFConverter[BinaryIO]):
 
     def __init__(self,
                  rsrcmgr: PDFResourceManager,
-                 outfp: BinaryIO,
+                 outfp: AnyIO,
                  codec: str = 'utf-8',
                  pageno: int = 1,
                  laparams: Optional[LAParams] = None,
@@ -300,6 +300,14 @@ def __init__(self,
                  text_colors: Optional[Dict[str, str]] = None):
         PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                               laparams=laparams)
+
+        # write() assumes a codec for binary I/O, or no codec for text I/O.
+        if self.outfp_binary and not self.codec:
+            raise ValueError("Codec is required for a binary I/O output")
+        if not self.outfp_binary and self.codec:
+            raise ValueError(
+                "Codec must not be specified for a text I/O output")
+
         if text_colors is None:
             text_colors = {'char': 'black'}
         if rect_colors is None:
@@ -323,8 +331,10 @@ def __init__(self,
         return
 
     def write(self, text: str) -> None:
-        textb = text.encode(self.codec)
-        self.outfp.write(textb)
+        if self.codec:
+            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
+        else:
+            cast(TextIO, self.outfp).write(text)
         return
 
     def write_header(self) -> None:
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
index 5c5921b1..06b30645 100644
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@@ -83,10 +83,9 @@ def extract_text_to_fp(inf: BinaryIO,
                               stripcontrol=strip_control)
 
     elif output_type == 'html':
-        # Binary I/O is required, but we have no good way to test it here.
-        device = HTMLConverter(rsrcmgr, cast(BinaryIO, outfp), codec=codec,
-                               scale=scale, layoutmode=layoutmode,
-                               laparams=laparams, imagewriter=imagewriter)
+        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
+                               layoutmode=layoutmode, laparams=laparams,
+                               imagewriter=imagewriter)
 
     elif output_type == 'tag':
         # Binary I/O is required, but we have no good way to test it here.