From 0d40b7c03a8028dc44acd3f457eac71abd681827 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Sat, 4 Sep 2021 22:31:33 -0700 Subject: [PATCH] annotate pdf2txt.py --- tools/pdf2txt.py | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index c7515e6f..f3655320 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -4,9 +4,11 @@ import argparse import logging import sys +from typing import Any, Container, Iterable, List, Literal, Optional, Union import pdfminer.high_level -import pdfminer.layout +from pdfminer.layout import LAParams +from pdfminer.utils import AnyIO logging.basicConfig() @@ -15,24 +17,33 @@ (".xml", "xml"), (".tag", "tag")) +FloatOrDisabled = Union[float, Literal["disabled"]] -def float_or_disabled(x): + +def float_or_disabled(x: str) -> FloatOrDisabled: if x.lower().strip() == "disabled": - return x + return "disabled" try: - x = float(x) + return float(x) except ValueError: raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) -def extract_text(files=[], outfile='-', - no_laparams=False, all_texts=None, detect_vertical=None, - word_margin=None, char_margin=None, line_margin=None, - boxes_flow=None, output_type='text', codec='utf-8', - strip_control=False, maxpages=0, page_numbers=None, - password="", scale=1.0, rotation=0, layoutmode='normal', - output_dir=None, debug=False, disable_caching=False, - **kwargs): +def extract_text(files: Iterable[str] = [], outfile: str = '-', + no_laparams: bool = False, all_texts: Optional[bool] = None, + detect_vertical: Optional[bool] = None, + word_margin: Optional[float] = None, + char_margin: Optional[float] = None, + line_margin: Optional[float] = None, + boxes_flow: Optional[FloatOrDisabled] = None, + output_type: str = 'text', codec: str = 'utf-8', + strip_control: bool = False, maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", scale: float = 1.0, rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, debug: bool = False, + disable_caching: bool = False, + **kwargs: Any) -> AnyIO: if not files: raise ValueError("Must provide files to work upon!") @@ -40,7 +51,7 @@ def extract_text(files=[], outfile='-', # create an LAParams object and # populate with given args. Otherwise, set it to None. if not no_laparams: - laparams = pdfminer.layout.LAParams() + laparams: Optional[LAParams] = LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) @@ -55,8 +66,8 @@ def extract_text(files=[], outfile='-', output_type = alttype if outfile == "-": - outfp = sys.stdout - if outfp.encoding is not None: + outfp: AnyIO = sys.stdout + if sys.stdout.encoding is not None: codec = 'utf-8' else: outfp = open(outfile, "wb") @@ -67,7 +78,7 @@ def extract_text(files=[], outfile='-', return outfp -def maketheparser(): +def maketheparser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="+", @@ -180,7 +191,7 @@ def maketheparser(): # main -def main(args=None): +def main(args: Optional[List[str]] = None) -> int: P = maketheparser() A = P.parse_args(args=args) @@ -201,4 +212,4 @@ def main(args=None): if __name__ == '__main__': - sys.exit(main()) # type: ignore[no-untyped-call] + sys.exit(main())