From cc490513f8f17a7adc0bcbab2e0e86f37e832300 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Thu, 2 Sep 2021 17:04:35 -0700 Subject: [PATCH] * expand and improve annotations in cmap, encryption/decompression and fonts * disallow untyped calls; this way, we have a core set of typed code that can grow over time (just not for ccitt, because there's a ton of work lurking there) * expand "typing: none" comments to suppress a specific error code --- mypy.ini | 4 +- pdfminer/arcfour.py | 7 +- pdfminer/ascii85.py | 6 +- pdfminer/ccitt.py | 7 +- pdfminer/cmapdb.py | 90 ++++++++++--------- pdfminer/encodingdb.py | 6 +- pdfminer/image.py | 2 +- pdfminer/layout.py | 11 +-- pdfminer/lzw.py | 25 +++--- pdfminer/pdfdocument.py | 194 +++++++++++++++++++++++----------------- pdfminer/pdffont.py | 55 +++++++----- pdfminer/pdfinterp.py | 33 ++++--- pdfminer/pdfpage.py | 6 +- pdfminer/pdfparser.py | 9 +- pdfminer/pdftypes.py | 84 ++++++++++------- pdfminer/psparser.py | 2 +- pdfminer/runlength.py | 2 +- pdfminer/utils.py | 2 +- 18 files changed, 313 insertions(+), 232 deletions(-) diff --git a/mypy.ini b/mypy.ini index 8953a1a3..72aacb79 100644 --- a/mypy.ini +++ b/mypy.ini @@ -2,7 +2,7 @@ warn_unused_configs = True disallow_any_generics = True disallow_subclassing_any = True -#disallow_untyped_calls = True +disallow_untyped_calls = True #disallow_untyped_defs = True disallow_incomplete_defs = True #check_untyped_defs = True @@ -17,3 +17,5 @@ strict_equality = True [mypy-cryptography.hazmat.*] ignore_missing_imports = True +[mypy-pdfminer.ccitt] +disallow_untyped_calls = False diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index e40b0804..5967a1af 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -5,9 +5,12 @@ """ +from typing import Sequence + + class Arcfour: - def __init__(self, key): + def __init__(self, key: Sequence[int]): # because Py3 range is not indexable s = [i for i in range(256)] j = 0 @@ -19,7 +22,7 @@ def __init__(self, key): (self.i, self.j) = (0, 0) return - def process(self, data): + def process(self, data: bytes) -> bytes: (i, j) = (self.i, self.j) s = self.s r = b'' diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index cde3f908..7c7c757f 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -9,7 +9,7 @@ # ascii85decode(data) -def ascii85decode(data): +def ascii85decode(data: bytes) -> bytes: """ In ASCII85 encoding, every four bytes are encoded with five ASCII letters, using 85 different types of characters (as 256**4 < 85**5). @@ -47,7 +47,7 @@ def ascii85decode(data): trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) -def asciihexdecode(data): +def asciihexdecode(data: bytes) -> bytes: """ ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the @@ -57,7 +57,7 @@ def asciihexdecode(data): the EOD marker after reading an odd number of hexadecimal digits, it will behave as if a 0 followed the last digit. """ - def decode(x): + def decode(x: bytes) -> bytes: i = int(x, 16) return bytes((i,)) diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index 8ae123b5..e45e8252 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -13,6 +13,7 @@ import sys import array +from typing import Any, Dict def get_bytes(data): @@ -541,7 +542,7 @@ def output_line(self, y, bits): return -def ccittfaxdecode(data, params): +def ccittfaxdecode(data: bytes, params: Dict[str, Any]) -> bytes: K = params.get('K') cols = params.get('Columns') bytealign = params.get('EncodedByteAlign') @@ -551,7 +552,7 @@ def ccittfaxdecode(data, params): else: raise ValueError(K) parser.feedbytes(data) - return parser.close() + return parser.close() # type: ignore[no-any-return] # test @@ -562,7 +563,7 @@ def main(argv): class Parser(CCITTG4Parser): def __init__(self, width, bytealign=False): - import pygame # type: ignore + import pygame # type: ignore[import] CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.img = pygame.Surface((self.width, 1000)) return diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index e99ceb93..232d9a6a 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -16,10 +16,12 @@ import pickle as pickle import struct import logging -from typing import Any, Dict, List +from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Optional, + TextIO, Tuple, Union) from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF +from .psparser import PSKeyword from .psparser import PSLiteral from .psparser import literal_name from .psparser import KWD @@ -39,44 +41,48 @@ class CMapBase: debug = 0 - def __init__(self, **kwargs): + def __init__(self, **kwargs: Union[str, int]): self.attrs = kwargs.copy() return - def is_vertical(self): + def is_vertical(self) -> bool: return self.attrs.get('WMode', 0) != 0 - def set_attr(self, k, v): + def set_attr(self, k: str, v: Any) -> None: self.attrs[k] = v return - def add_code2cid(self, code, cid): + def add_code2cid(self, code: str, cid: int) -> None: return - def add_cid2unichr(self, cid, code): + def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int] + ) -> None: return - def use_cmap(self, cmap): + def use_cmap(self, cmap: "CMapBase") -> None: return + def decode(self, code: bytes) -> Iterable[int]: + raise NotImplementedError + class CMap(CMapBase): - def __init__(self, **kwargs): + def __init__(self, **kwargs: Union[str, int]): CMapBase.__init__(self, **kwargs) - self.code2cid = {} + self.code2cid: Dict[int, Any] = {} return def __repr__(self): return '' % self.attrs.get('CMapName') - def use_cmap(self, cmap): + def use_cmap(self, cmap: CMapBase) -> None: assert isinstance(cmap, CMap), str(type(cmap)) - def copy(dst, src): + def copy(dst: Dict[Any, Any], src: Dict[Any, Any]) -> None: for (k, v) in src.items(): if isinstance(v, dict): - d = {} + d: Dict[Any, Any] = {} dst[k] = d copy(d, v) else: @@ -84,7 +90,7 @@ def copy(dst, src): copy(self.code2cid, cmap.code2cid) return - def decode(self, code): + def decode(self, code: bytes) -> Iterator[int]: log.debug('decode: %r, %r', self, code) d = self.code2cid for i in iter(code): @@ -97,7 +103,9 @@ def decode(self, code): d = self.code2cid return - def dump(self, out=sys.stdout, code2cid=None, code=None): + def dump(self, out: TextIO = sys.stdout, + code2cid: Optional[Dict[int, Any]] = None, + code: Tuple[int, ...] = ()) -> None: if code2cid is None: code2cid = self.code2cid code = () @@ -112,7 +120,7 @@ def dump(self, out=sys.stdout, code2cid=None, code=None): class IdentityCMap(CMapBase): - def decode(self, code): + def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code)//2 if n: return struct.unpack('>%dH' % n, code) @@ -122,7 +130,7 @@ def decode(self, code): class IdentityCMapByte(IdentityCMap): - def decode(self, code): + def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) if n: return struct.unpack('>%dB' % n, code) @@ -132,19 +140,19 @@ def decode(self, code): class UnicodeMap(CMapBase): - def __init__(self, **kwargs): + def __init__(self, **kwargs: Union[str, int]): CMapBase.__init__(self, **kwargs) - self.cid2unichr = {} + self.cid2unichr: Dict[int, str] = {} return def __repr__(self): return '' % self.attrs.get('CMapName') - def get_unichr(self, cid): + def get_unichr(self, cid: int) -> str: log.debug('get_unichr: %r, %r', self, cid) return self.cid2unichr[cid] - def dump(self, out=sys.stdout): + def dump(self, out: TextIO = sys.stdout) -> None: for (k, v) in sorted(self.cid2unichr.items()): out.write('cid %d = unicode %r\n' % (k, v)) return @@ -152,29 +160,31 @@ def dump(self, out=sys.stdout): class FileCMap(CMap): - def add_code2cid(self, code, cid): + def add_code2cid(self, code: str, cid: int) -> None: assert isinstance(code, str) and isinstance(cid, int),\ str((type(code), type(cid))) d = self.code2cid for c in code[:-1]: - c = ord(c) - if c in d: - d = d[c] + ci = ord(c) + if ci in d: + d = d[ci] else: - t = {} - d[c] = t + t: Dict[int, Any] = {} + d[ci] = t d = t - c = ord(code[-1]) - d[c] = cid + ci = ord(code[-1]) + d[ci] = cid return class FileUnicodeMap(UnicodeMap): - def add_cid2unichr(self, cid, code): + def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int] + ) -> None: assert isinstance(cid, int), str(type(cid)) if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. + assert isinstance(code.name, str) self.cid2unichr[cid] = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. @@ -188,8 +198,8 @@ def add_cid2unichr(self, cid, code): class PyCMap(CMap): - def __init__(self, name, module): - CMap.__init__(self, CMapName=name) + def __init__(self, name: str, module: Any): + super().__init__(CMapName=name) self.code2cid = module.CODE2CID if module.IS_VERTICAL: self.attrs['WMode'] = 1 @@ -198,8 +208,8 @@ def __init__(self, name, module): class PyUnicodeMap(UnicodeMap): - def __init__(self, name, module, vertical): - UnicodeMap.__init__(self, CMapName=name) + def __init__(self, name: str, module: Any, vertical: bool): + super().__init__(CMapName=name) if vertical: self.cid2unichr = module.CID2UNICHR_V self.attrs['WMode'] = 1 @@ -264,17 +274,16 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: return cls._umap_cache[name][vertical] -# int here means that we're not extending PSStackParser with additional types. -class CMapParser(PSStackParser[int]): +class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap, fp): + def __init__(self, cmap: CMapBase, fp: BinaryIO): PSStackParser.__init__(self, fp) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True return - def run(self): + def run(self) -> None: try: self.nextobject() except PSEOF: @@ -298,7 +307,7 @@ def run(self): KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange') KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_BEGINCMAP: self._in_cmap = True self.popall() @@ -382,6 +391,7 @@ def do_keyword(self, pos, token): for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) else: + assert isinstance(code, bytes) var = code[-4:] base = nunpack(var) prefix = code[:-4] @@ -412,7 +422,7 @@ def do_keyword(self, pos, token): return -def main(argv): +def main(argv: List[str]) -> None: args = argv[1:] for fname in args: fp = open(fname, 'rb') @@ -424,4 +434,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + main(sys.argv) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 58998a90..da51f702 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -10,7 +10,7 @@ log = logging.getLogger(__name__) -def name2unicode(name): +def name2unicode(name: str) -> str: """Converts Adobe glyph names to Unicode numbers. In contrast to the specification, this raises a KeyError instead of return @@ -32,7 +32,7 @@ def name2unicode(name): else: if name in glyphname2unicode: - return glyphname2unicode.get(name) + return glyphname2unicode[name] elif name.startswith('uni'): name_without_uni = name.strip('uni') @@ -59,7 +59,7 @@ def name2unicode(name): 'it does not match specification' % name) -def raise_key_error_for_invalid_unicode(unicode_digit): +def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None: """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 diff --git a/pdfminer/image.py b/pdfminer/image.py index e825e83e..b0cc0171 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -81,7 +81,7 @@ def export_image(self, image): if ext == '.jpg': raw_data = image.stream.get_rawdata() if LITERAL_DEVICE_CMYK in image.colorspace: - from PIL import Image # type: ignore + from PIL import Image # type: ignore[import] from PIL import ImageChops ifp = BytesIO(raw_data) i = Image.open(ifp) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index f1c5652e..8bc51a45 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -298,8 +298,8 @@ class LTChar(LTComponent, LTText): def __init__(self, matrix: Matrix, font: PDFFont, fontsize: float, scaling: float, rise: float, text: str, textwidth: float, - textdisp: Point, ncs: PDFColorSpace, - graphicstate: PDFGraphicState): + textdisp: Union[float, Tuple[Optional[float], float]], + ncs: PDFColorSpace, graphicstate: PDFGraphicState): LTText.__init__(self) self._text = text self.matrix = matrix @@ -310,6 +310,7 @@ def __init__(self, matrix: Matrix, font: PDFFont, fontsize: float, # compute the boundary rectangle. if font.is_vertical(): # vertical + assert isinstance(textdisp, tuple) (vx, vy) = textdisp if vx is None: vx = fontsize * 0.5 @@ -385,7 +386,7 @@ def analyze(self, laparams: LAParams) -> None: class LTExpandableContainer(LTContainer[LTItemT]): - def __init__(self): + def __init__(self) -> None: LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) return @@ -399,7 +400,7 @@ def add(self, obj: LTComponent) -> None: # type: ignore[override] class LTTextContainer(LTExpandableContainer[LTItemT], LTText): - def __init__(self): + def __init__(self) -> None: LTText.__init__(self) LTExpandableContainer.__init__(self) return @@ -569,7 +570,7 @@ class LTTextBox(LTTextContainer[LTTextLine]): of LTTextLine objects. """ - def __init__(self): + def __init__(self) -> None: LTTextContainer.__init__(self) self.index: int = -1 return diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index f0ed8a87..4e5e6df9 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -1,5 +1,6 @@ from io import BytesIO import logging +from typing import BinaryIO, Iterator, List, Optional, cast logger = logging.getLogger(__name__) @@ -11,16 +12,17 @@ class CorruptDataError(Exception): class LZWDecoder: - def __init__(self, fp): + def __init__(self, fp: BinaryIO): self.fp = fp self.buff = 0 self.bpos = 8 self.nbits = 9 - self.table = None - self.prevbuf = None + # NB: self.table stores None only in indices 256 and 257 + self.table: Optional[List[Optional[bytes]]] = None + self.prevbuf: Optional[bytes] = None return - def readbits(self, bits): + def readbits(self, bits: int) -> int: v = 0 while 1: # the number of remaining bits we can get from the current buffer. @@ -45,7 +47,7 @@ def readbits(self, bits): self.bpos = 0 return v - def feed(self, code): + def feed(self, code: int) -> bytes: x = b'' if code == 256: self.table = [bytes((c,)) for c in range(256)] # 0-255 @@ -56,14 +58,16 @@ def feed(self, code): elif code == 257: pass elif not self.prevbuf: - x = self.prevbuf = self.table[code] + assert self.table is not None + x = self.prevbuf = cast(bytes, self.table[code]) # assume not None else: + assert self.table is not None if code < len(self.table): - x = self.table[code] + x = cast(bytes, self.table[code]) # assume not None self.table.append(self.prevbuf+x[:1]) elif code == len(self.table): self.table.append(self.prevbuf+self.prevbuf[:1]) - x = self.table[code] + x = cast(bytes, self.table[code]) else: raise CorruptDataError table_length = len(self.table) @@ -76,7 +80,7 @@ def feed(self, code): self.prevbuf = x return x - def run(self): + def run(self) -> Iterator[bytes]: while 1: try: code = self.readbits(self.nbits) @@ -88,12 +92,13 @@ def run(self): # just ignore corrupt data and stop yielding there break yield x + assert self.table is not None logger.debug('nbits=%d, code=%d, output=%r, table=%r' % (self.nbits, code, x, self.table[258:])) return -def lzwdecode(data): +def lzwdecode(data: bytes) -> bytes: fp = BytesIO(data) s = LZWDecoder(fp).run() return b''.join(s) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index f0d0f4f3..2512b932 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -2,17 +2,18 @@ import re import struct from hashlib import sha256, md5 -from typing import Iterable +from typing import (Any, Dict, Iterable, Iterator, KeysView, List, Optional, + Sequence, Tuple, Type, cast) from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes from . import settings from .arcfour import Arcfour -from .pdfparser import PDFSyntaxError, PDFStreamParser -from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \ +from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser +from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream,\ PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \ - dict_value, stream_value + uint_value, dict_value, stream_value from .psparser import PSEOF, literal_name, LIT, KWD from .utils import choplist, nunpack, decode_text @@ -66,31 +67,33 @@ def __init__(self, *args): class PDFBaseXRef: - - def get_trailer(self): + def get_trailer(self) -> Dict[Any, Any]: raise NotImplementedError - def get_objids(self): + def get_objids(self) -> Iterable[int]: return [] # Must return # (strmid, index, genno) # or (None, pos, genno) - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: raise KeyError(objid) + def load(self, parser: PDFParser) -> None: + raise NotImplementedError + class PDFXRef(PDFBaseXRef): - def __init__(self): - self.offsets = {} - self.trailer = {} + def __init__(self) -> None: + self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {} + self.trailer: Dict[Any, Any] = {} return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.offsets.keys()) - def load(self, parser): + def load(self, parser: PDFParser) -> None: while True: try: (pos, line) = parser.nextline() @@ -124,15 +127,15 @@ def load(self, parser): error_msg = 'Invalid XRef format: {!r}, line={!r}'\ .format(parser, line) raise PDFNoValidXRef(error_msg) - (pos, genno, use) = f - if use != b'n': + (pos_b, genno_b, use_b) = f + if use_b != b'n': continue - self.offsets[objid] = (None, int(pos), int(genno)) + self.offsets[objid] = (None, int(pos_b), int(genno_b)) log.info('xref objects: %r', self.offsets) self.load_trailer(parser) return - def load_trailer(self, parser): + def load_trailer(self, parser: PDFParser) -> None: try: (_, kwd) = parser.nexttoken() assert kwd is KWD(b'trailer'), str(kwd) @@ -146,13 +149,13 @@ def load_trailer(self, parser): log.debug('trailer=%r', self.trailer) return - def get_trailer(self): + def get_trailer(self) -> Dict[Any, Any]: return self.trailer - def get_objids(self): + def get_objids(self) -> KeysView[int]: return self.offsets.keys() - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: try: return self.offsets[objid] except KeyError: @@ -166,25 +169,25 @@ def __repr__(self): PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') - def load(self, parser): + def load(self, parser: PDFParser) -> None: parser.seek(0) while 1: try: - (pos, line) = parser.nextline() + (pos, line_bytes) = parser.nextline() except PSEOF: break - if line.startswith(b'trailer'): + if line_bytes.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) log.info('trailer: %r', self.trailer) break - line = line.decode('latin-1') # default pdf encoding + line = line_bytes.decode('latin-1') # default pdf encoding m = self.PDFOBJ_CUE.match(line) if not m: continue - (objid, genno) = m.groups() - objid = int(objid) - genno = int(genno) + (objid_s, genno_s) = m.groups() + objid = int(objid_s) + genno = int(genno_s) self.offsets[objid] = (None, pos, genno) # expand ObjStm. parser.seek(pos) @@ -199,7 +202,7 @@ def load(self, parser): raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 parser1 = PDFStreamParser(stream.get_data()) - objs = [] + objs: List[Any] = [] try: while 1: (_, obj) = parser1.nextobject() @@ -215,17 +218,19 @@ def load(self, parser): class PDFXRefStream(PDFBaseXRef): - def __init__(self): - self.data = None - self.entlen = None - self.fl1 = self.fl2 = self.fl3 = None - self.ranges = [] + def __init__(self) -> None: + self.data: Optional[bytes] = None + self.entlen: Optional[int] = None + self.fl1: Optional[int] = None + self.fl2: Optional[int] = None + self.fl3: Optional[int] = None + self.ranges: List[Tuple[Any, Any]] = [] return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.ranges) - def load(self, parser): + def load(self, parser: PDFParser) -> None: (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() @@ -237,8 +242,10 @@ def load(self, parser): index_array = stream.get('Index', (0, size)) if len(index_array) % 2 != 0: raise PDFSyntaxError('Invalid index number') - self.ranges.extend(choplist(2, index_array)) + self.ranges.extend(cast(Tuple[Any, Any], choplist(2, index_array))) (self.fl1, self.fl2, self.fl3) = stream['W'] + assert (self.fl1 is not None and self.fl2 is not None + and self.fl3 is not None) self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs @@ -250,9 +257,11 @@ def load(self, parser): def get_trailer(self): return self.trailer - def get_objids(self): + def get_objids(self) -> Iterator[int]: for (start, nobjs) in self.ranges: for i in range(nobjs): + assert self.entlen is not None + assert self.data is not None offset = self.entlen * i ent = self.data[offset:offset+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -260,7 +269,7 @@ def get_objids(self): yield start+i return - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: index = 0 for (start, nobjs) in self.ranges: if start <= objid and objid < start+nobjs: @@ -270,6 +279,10 @@ def get_pos(self, objid): index += nobjs else: raise KeyError(objid) + assert self.entlen is not None + assert self.data is not None + assert (self.fl1 is not None and self.fl2 is not None + and self.fl3 is not None) offset = self.entlen * index ent = self.data[offset:offset+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -288,16 +301,17 @@ class PDFStandardSecurityHandler: PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') - supported_revisions: Iterable[int] = (2, 3) + supported_revisions: Tuple[int, ...] = (2, 3) - def __init__(self, docid, param, password=''): + def __init__(self, docid: Sequence[bytes], param: Dict[str, Any], + password: str = ''): self.docid = docid self.param = param self.password = password self.init() return - def init(self): + def init(self) -> None: self.init_params() if self.r not in self.supported_revisions: error_msg = 'Unsupported revision: param=%r' % self.param @@ -305,7 +319,7 @@ def init(self): self.init_key() return - def init_params(self): + def init_params(self) -> None: self.v = int_value(self.param.get('V', 0)) self.r = int_value(self.param['R']) self.p = uint_value(self.param['P'], 32) @@ -314,22 +328,22 @@ def init_params(self): self.length = int_value(self.param.get('Length', 40)) return - def init_key(self): + def init_key(self) -> None: self.key = self.authenticate(self.password) if self.key is None: raise PDFPasswordIncorrect return - def is_printable(self): + def is_printable(self) -> bool: return bool(self.p & 4) - def is_modifiable(self): + def is_modifiable(self) -> bool: return bool(self.p & 8) - def is_extractable(self): + def is_extractable(self) -> bool: return bool(self.p & 16) - def compute_u(self, key): + def compute_u(self, key: bytes) -> bytes: if self.r == 2: # Algorithm 3.4 return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2 @@ -344,7 +358,7 @@ def compute_u(self, key): result += result # 6 return result - def compute_encryption_key(self, password): + def compute_encryption_key(self, password: bytes) -> bytes: # Algorithm 3.2 password = (password + self.PASSWORD_PADDING)[:32] # 1 hash = md5(password) # 2 @@ -353,7 +367,7 @@ def compute_encryption_key(self, password): hash.update(struct.pack('= 4: - if not self.encrypt_metadata: + if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata: hash.update(b'\xff\xff\xff\xff') result = hash.digest() n = 5 @@ -363,28 +377,28 @@ def compute_encryption_key(self, password): result = md5(result[:n]).digest() return result[:n] - def authenticate(self, password): - password = password.encode("latin1") - key = self.authenticate_user_password(password) + def authenticate(self, password: str) -> Optional[bytes]: + password_bytes = password.encode("latin1") + key = self.authenticate_user_password(password_bytes) if key is None: - key = self.authenticate_owner_password(password) + key = self.authenticate_owner_password(password_bytes) return key - def authenticate_user_password(self, password): + def authenticate_user_password(self, password: bytes) -> Optional[bytes]: key = self.compute_encryption_key(password) if self.verify_encryption_key(key): return key else: return None - def verify_encryption_key(self, key): + def verify_encryption_key(self, key: bytes) -> bool: # Algorithm 3.6 u = self.compute_u(key) if self.r == 2: return u == self.u return u[:16] == self.u[:16] - def authenticate_owner_password(self, password): + def authenticate_owner_password(self, password: bytes) -> Optional[bytes]: # Algorithm 3.7 password = (password + self.PASSWORD_PADDING)[:32] hash = md5(password) @@ -404,12 +418,14 @@ def authenticate_owner_password(self, password): user_password = Arcfour(k).decrypt(user_password) return self.authenticate_user_password(user_password) - def decrypt(self, objid, genno, data, attrs=None): + def decrypt(self, objid: int, genno: int, data: bytes, + attrs: Optional[Dict[str, Any]] = None) -> bytes: return self.decrypt_rc4(objid, genno, data) - def decrypt_rc4(self, objid, genno, data): + def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes: + assert self.key is not None key = self.key + struct.pack(' None: + assert self.encryption is not None (docid, param) = self.encryption if literal_name(param.get('Filter')) != 'Standard': raise PDFEncryptionError('Unknown filter: param=%r' % param) @@ -624,15 +642,17 @@ def _initialize_password(self, password=''): self.is_printable = handler.is_printable() self.is_modifiable = handler.is_modifiable() self.is_extractable = handler.is_extractable() + assert self._parser is not None self._parser.fallback = False # need to read streams with exact length return - def _getobj_objstm(self, stream, index, objid): + def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> Any: if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] else: (objs, n) = self._get_objects(stream) if self.caching: + assert stream.objid is not None self._parsed_objs[stream.objid] = (objs, n) i = n*2+index try: @@ -641,7 +661,7 @@ def _getobj_objstm(self, stream, index, objid): raise PDFSyntaxError('index too big: %r' % index) return obj - def _get_objects(self, stream): + def _get_objects(self, stream: PDFStream) -> Tuple[List[Any], Any]: if stream.get('Type') is not LITERAL_OBJSTM: if settings.STRICT: raise PDFSyntaxError('Not a stream object: %r' % stream) @@ -662,7 +682,8 @@ def _get_objects(self, stream): pass return (objs, n) - def _getobj_parse(self, pos, objid): + def _getobj_parse(self, pos: int, objid: Any) -> Any: + assert self._parser is not None self._parser.seek(pos) (_, objid1) = self._parser.nexttoken() # objid (_, genno) = self._parser.nexttoken() # genno @@ -690,7 +711,7 @@ def _getobj_parse(self, pos, objid): return obj # can raise PDFObjectNotFound - def getobj(self, objid): + def getobj(self, objid: int) -> Any: """Get object from PDF :raises PDFException if PDFDocument is not initialized @@ -729,11 +750,14 @@ def getobj(self, objid): self._cached_objs[objid] = (obj, genno) return obj - def get_outlines(self): + OutlineType = Tuple[Any, Any, Any, Any, Any] + + def get_outlines(self) -> Iterator[OutlineType]: if 'Outlines' not in self.catalog: raise PDFNoOutlines - def search(entry, level): + def search(entry: Any, level: int + ) -> Iterator[PDFDocument.OutlineType]: entry = dict_value(entry) if 'Title' in entry: if 'A' in entry or 'Dest' in entry: @@ -749,7 +773,7 @@ def search(entry, level): return return search(self.catalog['Outlines'], 0) - def lookup_name(self, cat, key): + def lookup_name(self, cat: str, key: str) -> Any: try: names = dict_value(self.catalog['Names']) except (PDFTypeError, KeyError): @@ -757,14 +781,14 @@ def lookup_name(self, cat, key): # may raise KeyError d0 = dict_value(names[cat]) - def lookup(d): + def lookup(d: Dict[str, Any]) -> Any: if 'Limits' in d: (k1, k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) - names = dict(choplist(2, objs)) + names = dict(cast(Tuple[Any, Any], choplist(2, objs))) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): @@ -774,7 +798,7 @@ def lookup(d): raise KeyError((cat, key)) return lookup(d0) - def get_dest(self, name): + def get_dest(self, name: str) -> Any: try: # PDF-1.2 or later obj = self.lookup_name('Dests', name) @@ -789,7 +813,7 @@ def get_dest(self, name): return obj # find_xref - def find_xref(self, parser): + def find_xref(self, parser: PDFParser) -> int: """Internal function used to locate the first XRef.""" # search the last xref table by scanning the file backwards. prev = None @@ -803,10 +827,12 @@ def find_xref(self, parser): else: raise PDFNoValidXRef('Unexpected EOF') log.info('xref found: pos=%r', prev) + assert prev is not None return int(prev) # read xref table - def read_xref_from(self, parser, start, xrefs): + def read_xref_from(self, parser: PDFParser, start: int, + xrefs: List[PDFBaseXRef]) -> None: """Reads XRefs from the given location.""" parser.seek(start) parser.reset() @@ -819,7 +845,7 @@ def read_xref_from(self, parser, start, xrefs): # XRefStream: PDF-1.5 parser.seek(pos) parser.reset() - xref = PDFXRefStream() + xref: PDFBaseXRef = PDFXRefStream() xref.load(parser) else: if token is parser.KEYWORD_XREF: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 19bcca0b..df1634f9 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -2,9 +2,11 @@ import struct import sys from io import BytesIO +from typing import Any, Dict, Iterable, Optional, Tuple, Union from . import settings from .cmapdb import CMap +from .cmapdb import CMapBase from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap @@ -25,7 +27,7 @@ from .psparser import PSLiteral from .psparser import PSStackParser from .psparser import literal_name -from .utils import apply_matrix_norm +from .utils import Rect, apply_matrix_norm from .utils import choplist from .utils import isnumber from .utils import nunpack @@ -484,9 +486,10 @@ class PDFUnicodeNotDefined(PDFFontError): class PDFFont: - def __init__(self, descriptor, widths, default_width=None): + def __init__(self, descriptor: Dict[str, Any], widths: Dict[int, float], + default_width: Optional[float] = None): self.descriptor = descriptor - self.widths = resolve_all(widths) + self.widths: Dict[int, float] = resolve_all(widths) self.fontname = resolve1(descriptor.get('FontName', 'unknown')) if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) @@ -499,8 +502,8 @@ def __init__(self, descriptor, widths, default_width=None): else: self.default_width = default_width self.leading = num_value(descriptor.get('Leading', 0)) - self.bbox = list_value(resolve_all(descriptor.get('FontBBox', - (0, 0, 0, 0)))) + self.bbox: Rect = list_value( # type: ignore[assignment] + resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0)))) self.hscale = self.vscale = .001 # PDF RM 9.8.1 specifies /Descent should always be a negative number. @@ -514,48 +517,52 @@ def __init__(self, descriptor, widths, default_width=None): def __repr__(self): return '' - def is_vertical(self): + def is_vertical(self) -> bool: return False - def is_multibyte(self): + def is_multibyte(self) -> bool: return False - def decode(self, bytes): + def decode(self, bytes: bytes) -> Iterable[int]: return bytearray(bytes) # map(ord, bytes) - def get_ascent(self): + def get_ascent(self) -> float: """Ascent above the baseline, in text space units""" return self.ascent * self.vscale - def get_descent(self): + def get_descent(self) -> float: """Descent below the baseline, in text space units; always negative""" return self.descent * self.vscale - def get_width(self): + def get_width(self) -> float: w = self.bbox[2]-self.bbox[0] if w == 0: w = -self.default_width return w * self.hscale - def get_height(self): + def get_height(self) -> float: h = self.bbox[3]-self.bbox[1] if h == 0: h = self.ascent - self.descent return h * self.vscale - def char_width(self, cid): + def char_width(self, cid: int) -> float: try: return self.widths[cid] * self.hscale except KeyError: try: - return self.widths[self.to_unichr(cid)] * self.hscale + # Type confusion: this appears to be a relic from Python 2 + return (self.widths[self.to_unichr(cid)] # type: ignore[index] + * self.hscale) except (KeyError, PDFUnicodeNotDefined): return self.default_width * self.hscale - def char_disp(self, cid): + def char_disp(self, cid: int + ) -> Union[float, Tuple[Optional[float], float]]: + "Returns an integer for horizontal fonts, a tuple for vertical fonts." return 0 - def string_width(self, s): + def string_width(self, s: bytes) -> float: return sum(self.char_width(cid) for cid in self.decode(s)) def to_unichr(self, cid: int) -> str: @@ -673,7 +680,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): cid_ordering = resolve1( self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1") self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering) - self.cmap = self.get_cmap_from_spec(spec, strict) + self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -723,7 +730,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - def get_cmap_from_spec(self, spec, strict): + def get_cmap_from_spec(self, spec: Dict[str, Any], strict: bool + ) -> CMapBase: """Get cmap from font specification For certain PDFs, Encoding Type isn't mentioned as an attribute of @@ -742,7 +750,7 @@ def get_cmap_from_spec(self, spec, strict): return CMap() @staticmethod - def _get_cmap_name(spec, strict): + def _get_cmap_name(spec: Dict[str, Any], strict: bool) -> str: """Get cmap name from font specification""" cmap_name = 'unknown' # default value @@ -756,15 +764,14 @@ def _get_cmap_name(spec, strict): if strict: raise PDFFontError('Encoding is unspecified') - if type(cmap_name) is PDFStream: + if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] if 'CMapName' in cmap_name: cmap_name = cmap_name.get('CMapName').name else: if strict: raise PDFFontError('CMapName unspecified for encoding') - cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name) - return cmap_name + return IDENTITY_ENCODER.get(cmap_name, cmap_name) def __repr__(self): return ''\ @@ -776,7 +783,7 @@ def is_vertical(self): def is_multibyte(self): return True - def decode(self, bytes): + def decode(self, bytes: bytes) -> Iterable[int]: return self.cmap.decode(bytes) def char_disp(self, cid): @@ -802,4 +809,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 6ce48530..92baf0b6 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -33,7 +33,7 @@ from .pdffont import PDFCIDFont from .pdfcolor import PDFColorSpace from .pdfcolor import PREDEFINED_COLORSPACE -from .utils import Matrix, Point, PathSegment +from .utils import Matrix, Point, PathSegment, Rect from .utils import choplist from .utils import mult_matrix from .utils import MATRIX_IDENTITY @@ -61,7 +61,7 @@ class PDFTextState: matrix: Matrix linematrix: Point - def __init__(self): + def __init__(self) -> None: self.font: Optional[PDFFont] = None self.fontsize: float = 0 self.charspace: float = 0 @@ -111,7 +111,7 @@ def reset(self) -> None: class PDFGraphicState: - def __init__(self): + def __init__(self) -> None: self.linewidth: float = 0 self.linecap = None self.linejoin = None @@ -197,16 +197,19 @@ def get_font(self, objid: Any, spec: Mapping[str, Any]) -> PDFFont: subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font - font = PDFType1Font(self, spec) + font = PDFType1Font( + self, spec) # type: ignore[no-untyped-call] elif subtype == 'TrueType': # TrueType Font - font = PDFTrueTypeFont(self, spec) + font = PDFTrueTypeFont( + self, spec) # type: ignore[no-untyped-call] elif subtype == 'Type3': # Type3 Font - font = PDFType3Font(self, spec) + font = PDFType3Font( + self, spec) # type: ignore[no-untyped-call] elif subtype in ('CIDFontType0', 'CIDFontType2'): # CID Font - font = PDFCIDFont(self, spec) + font = PDFCIDFont(self, spec) # type: ignore[no-untyped-call] elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) @@ -219,7 +222,8 @@ def get_font(self, objid: Any, spec: Mapping[str, Any]) -> PDFFont: else: if settings.STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) - font = PDFType1Font(self, spec) # this is so wrong! + font = PDFType1Font( # this is so wrong! + self, spec) # type: ignore[no-untyped-call] if objid and self.caching: self._cached_fonts[objid] = font return font @@ -227,13 +231,13 @@ def get_font(self, objid: Any, spec: Mapping[str, Any]) -> PDFFont: class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): - def __init__(self, streams: List[Any]): + def __init__(self, streams: Sequence[Any]): self.streams = streams self.istream = 0 # PSStackParser.__init__(fp=None) is safe only because we've overloaded # all the methods that would attempt to access self.fp without first # calling self.fillfp(). - PSStackParser.__init__(self, None) # type: ignore + PSStackParser.__init__(self, None) # type: ignore[arg-type] return def fillfp(self) -> None: @@ -260,7 +264,7 @@ def fillbuf(self) -> None: self.buf = self.fp.read(self.BUFSIZ) if self.buf: break - self.fp = None # type: ignore + self.fp = None # type: ignore[assignment] self.charpos = 0 return @@ -887,8 +891,9 @@ def do_Do(self, xobjid: Any) -> None: subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() - bbox = list_value(xobj['BBox']) - matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) + bbox: Rect = list_value(xobj['BBox']) # type: ignore[assignment] + matrix: Matrix = list_value(xobj.get( + 'Matrix', MATRIX_IDENTITY)) # type: ignore[assignment] # According to PDF reference 1.7 section 4.9.1, XObjects in # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. @@ -939,7 +944,7 @@ def render_contents(self, resources: Any, streams: Sequence[Any], self.execute(list_value(streams)) return - def execute(self, streams: List[Any]) -> None: + def execute(self, streams: Sequence[Any]) -> None: try: parser = PDFContentParser(streams) except PSEOF: diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index bfcd013b..168e112f 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,5 +1,6 @@ import logging -from typing import Any, BinaryIO, Container, Iterator, List, Optional +from typing import (Any, BinaryIO, Container, Dict, Iterator, List, Optional, + Tuple) import warnings from . import settings from .psparser import LIT @@ -78,7 +79,8 @@ def __repr__(self) -> str: @classmethod def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: - def search(obj, parent): + def search(obj: Any, parent: Dict[str, Any] + ) -> Iterator[Tuple[int, Dict[Any, Any]]]: if isinstance(obj, int): objid = obj tree = dict_value(document.getobj(objid)).copy() diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index b585ef05..f7cfeb17 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -77,7 +77,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if len(self.curstack) >= 2: try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) # type: ignore + (objid, genno) = ( + int(objid), int(genno)) # type: ignore[arg-type] + assert self.doc is not None obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: @@ -89,7 +91,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: objlen = 0 if not self.fallback: try: - objlen = int_value(dic['Length']) # type: ignore + objlen = int_value(dic['Length']) except KeyError: if settings.STRICT: raise PDFSyntaxError('/Length is undefined: %r' % dic) @@ -159,7 +161,8 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: # reference to indirect object try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) # type: ignore + (objid, genno) = ( + int(objid), int(genno)) # type: ignore[arg-type] obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 14c729b8..09adfa02 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -1,5 +1,7 @@ import zlib import logging +from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Protocol, + Union, List, Tuple, cast) from .lzw import lzwdecode from .ascii85 import ascii85decode from .ascii85 import asciihexdecode @@ -10,7 +12,9 @@ from .psparser import LIT from . import settings from .utils import apply_png_predictor -from .utils import isnumber + +if TYPE_CHECKING: + from .pdfdocument import PDFDocument log = logging.getLogger(__name__) @@ -28,6 +32,12 @@ LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) +class DecipherCallable(Protocol): + def __call__(self, objid: int, genno: int, data: bytes, + attrs: Optional[Dict[str, Any]] = None) -> bytes: + raise NotImplementedError + + class PDFObject(PSObject): pass @@ -54,7 +64,7 @@ class PDFNotImplementedError(PDFException): class PDFObjRef(PDFObject): - def __init__(self, doc, objid, _): + def __init__(self, doc: Optional["PDFDocument"], objid: int, _: Any): if objid == 0: if settings.STRICT: raise PDFValueError('PDF object id cannot be 0.') @@ -62,17 +72,18 @@ def __init__(self, doc, objid, _): self.objid = objid return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.objid) - def resolve(self, default=None): + def resolve(self, default: Any = None) -> Any: + assert self.doc is not None try: return self.doc.getobj(self.objid) except PDFObjectNotFound: return default -def resolve1(x, default=None): +def resolve1(x: Any, default: Any = None) -> Any: """Resolves an object. If this is an array or dictionary, it may still contains @@ -83,7 +94,7 @@ def resolve1(x, default=None): return x -def resolve_all(x, default=None): +def resolve_all(x: Any, default: Any = None) -> Any: """Recursively resolves the given object and all the internals. Make sure there is no indirect reference within the nested object. @@ -99,7 +110,8 @@ def resolve_all(x, default=None): return x -def decipher_all(decipher, objid, genno, x): +def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: Any + ) -> Any: """Recursively deciphers the given object. """ if isinstance(x, bytes): @@ -112,7 +124,7 @@ def decipher_all(decipher, objid, genno, x): return x -def int_value(x): +def int_value(x: Any) -> int: x = resolve1(x) if not isinstance(x, int): if settings.STRICT: @@ -121,7 +133,7 @@ def int_value(x): return x -def float_value(x): +def float_value(x: Any) -> float: x = resolve1(x) if not isinstance(x, float): if settings.STRICT: @@ -130,34 +142,34 @@ def float_value(x): return x -def num_value(x): +def num_value(x: Any) -> float: x = resolve1(x) - if not isnumber(x): + if not isinstance(x, (int, float)): # == utils.isnumber(x) if settings.STRICT: raise PDFTypeError('Int or Float required: %r' % x) return 0 return x -def uint_value(x, n_bits): +def uint_value(x: Any, n_bits: int) -> int: """Resolve number and interpret it as a two's-complement unsigned number""" - x = int_value(x) - if x > 0: - return x + xi = int_value(x) + if xi > 0: + return xi else: - return x + 2**n_bits + return xi + cast(int, 2**n_bits) -def str_value(x): +def str_value(x: Any) -> bytes: x = resolve1(x) if not isinstance(x, bytes): if settings.STRICT: raise PDFTypeError('String required: %r' % x) - return '' + return b'' return x -def list_value(x): +def list_value(x: Any) -> Union[List[Any], Tuple[Any, ...]]: x = resolve1(x) if not isinstance(x, (list, tuple)): if settings.STRICT: @@ -166,7 +178,7 @@ def list_value(x): return x -def dict_value(x): +def dict_value(x: Any) -> Dict[Any, Any]: x = resolve1(x) if not isinstance(x, dict): if settings.STRICT: @@ -176,7 +188,7 @@ def dict_value(x): return x -def stream_value(x): +def stream_value(x: Any) -> "PDFStream": x = resolve1(x) if not isinstance(x, PDFStream): if settings.STRICT: @@ -187,22 +199,23 @@ def stream_value(x): class PDFStream(PDFObject): - def __init__(self, attrs, rawdata, decipher=None): + def __init__(self, attrs: Dict[str, Any], rawdata: bytes, decipher: + Optional[DecipherCallable] = None): assert isinstance(attrs, dict), str(type(attrs)) self.attrs = attrs - self.rawdata = rawdata + self.rawdata: Optional[bytes] = rawdata self.decipher = decipher - self.data = None - self.objid = None - self.genno = None + self.data: Optional[bytes] = None + self.objid: Optional[int] = None + self.genno: Optional[int] = None return - def set_objid(self, objid, genno): + def set_objid(self, objid: int, genno: int) -> None: self.objid = objid self.genno = genno return - def __repr__(self): + def __repr__(self) -> str: if self.data is None: assert self.rawdata is not None return '' % \ @@ -218,16 +231,16 @@ def __contains__(self, name): def __getitem__(self, name): return self.attrs[name] - def get(self, name, default=None): + def get(self, name: str, default: Any = None) -> Any: return self.attrs.get(name, default) - def get_any(self, names, default=None): + def get_any(self, names: Iterable[str], default: Any = None) -> Any: for name in names: if name in self.attrs: return self.attrs[name] return default - def get_filters(self): + def get_filters(self) -> List[Tuple[Any, Any]]: filters = self.get_any(('F', 'Filter')) params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) if not filters: @@ -248,12 +261,14 @@ def get_filters(self): # return list solves https://github.com/pdfminer/pdfminer.six/issues/15 return list(zip(_filters, params)) - def decode(self): + def decode(self) -> None: assert self.data is None \ and self.rawdata is not None, str((self.data, self.rawdata)) data = self.rawdata if self.decipher: # Handle encryption + assert self.objid is not None + assert self.genno is not None data = self.decipher(self.objid, self.genno, data, self.attrs) filters = self.get_filters() if not filters: @@ -314,10 +329,11 @@ def decode(self): self.rawdata = None return - def get_data(self): + def get_data(self) -> bytes: if self.data is None: self.decode() + assert self.data is not None return self.data - def get_rawdata(self): + def get_rawdata(self) -> Optional[bytes]: return self.rawdata diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 3ac72e57..81192425 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -103,7 +103,7 @@ def intern(self, name: PSLiteral.NameType) -> _SymbolT: else: # Type confusion issue: PSKeyword always takes bytes as name # PSLiteral uses either str or bytes - lit = self.klass(name) # type: ignore + lit = self.klass(name) # type: ignore[arg-type] self.dict[name] = lit return lit diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py index f8ea228d..b79e18e6 100644 --- a/pdfminer/runlength.py +++ b/pdfminer/runlength.py @@ -6,7 +6,7 @@ # -def rldecode(data): +def rldecode(data: bytes) -> bytes: """ RunLength decoder (Adobe version) implementation based on PDF Reference version 1.4 section 3.3.4: diff --git a/pdfminer/utils.py b/pdfminer/utils.py index edf5c6ee..c3d229c2 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -83,7 +83,7 @@ def compatible_encode_method(bytesorstring: Union[bytes, str], return bytesorstring.decode(encoding, erraction) -def paeth_predictor(left, above, upper_left): +def paeth_predictor(left: int, above: int, upper_left: int) -> int: # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html # Initial estimate p = left + above - upper_left