* expand and improve annotations in cmap, encryption/decompression and fonts
* disallow untyped calls; this way, we have a core set of typed code that can grow over time (just not for ccitt, because there's a ton of work lurking there)
* expand "# type: ignore" comments to suppress a specific error code
0xabu · Sep 3, 2021 · cc49051 · cc49051
1 parent 92df54b
commit cc49051
Show file tree

Hide file tree

Showing 18 changed files with 313 additions and 232 deletions.
diff --git a/mypy.ini b/mypy.ini
@@ -2,7 +2,7 @@
 warn_unused_configs = True
 disallow_any_generics = True
 disallow_subclassing_any = True
-#disallow_untyped_calls = True
+disallow_untyped_calls = True
 #disallow_untyped_defs = True
 disallow_incomplete_defs = True
 #check_untyped_defs = True
@@ -17,3 +17,5 @@ strict_equality = True
 [mypy-cryptography.hazmat.*]
 ignore_missing_imports = True
 
+[mypy-pdfminer.ccitt]
+disallow_untyped_calls = False
diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py
@@ -5,9 +5,12 @@
 """
 
 
+from typing import Sequence
+
+
 class Arcfour:
 
-    def __init__(self, key):
+    def __init__(self, key: Sequence[int]):
         # because Py3 range is not indexable
         s = [i for i in range(256)]
         j = 0
@@ -19,7 +22,7 @@ def __init__(self, key):
         (self.i, self.j) = (0, 0)
         return
 
-    def process(self, data):
+    def process(self, data: bytes) -> bytes:
         (i, j) = (self.i, self.j)
         s = self.s
         r = b''

diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py
@@ -9,7 +9,7 @@
 
 
 # ascii85decode(data)
-def ascii85decode(data):
+def ascii85decode(data: bytes) -> bytes:
     """
     In ASCII85 encoding, every four bytes are encoded with five ASCII
     letters, using 85 different types of characters (as 256**4 < 85**5).
@@ -47,7 +47,7 @@ def ascii85decode(data):
 trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
 
 
-def asciihexdecode(data):
+def asciihexdecode(data: bytes) -> bytes:
     """
     ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
     For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
@@ -57,7 +57,7 @@ def asciihexdecode(data):
     the EOD marker after reading an odd number of hexadecimal digits, it
     will behave as if a 0 followed the last digit.
     """
-    def decode(x):
+    def decode(x: bytes) -> bytes:
         i = int(x, 16)
         return bytes((i,))
 

diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py
@@ -13,6 +13,7 @@
 
 import sys
 import array
+from typing import Any, Dict
 
 
 def get_bytes(data):
@@ -541,7 +542,7 @@ def output_line(self, y, bits):
         return
 
 
-def ccittfaxdecode(data, params):
+def ccittfaxdecode(data: bytes, params: Dict[str, Any]) -> bytes:
     K = params.get('K')
     cols = params.get('Columns')
     bytealign = params.get('EncodedByteAlign')
@@ -551,7 +552,7 @@ def ccittfaxdecode(data, params):
     else:
         raise ValueError(K)
     parser.feedbytes(data)
-    return parser.close()
+    return parser.close()  # type: ignore[no-any-return]
 
 
 # test
@@ -562,7 +563,7 @@ def main(argv):
 
     class Parser(CCITTG4Parser):
         def __init__(self, width, bytealign=False):
-            import pygame  # type: ignore
+            import pygame  # type: ignore[import]
             CCITTG4Parser.__init__(self, width, bytealign=bytealign)
             self.img = pygame.Surface((self.width, 1000))
             return

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -16,10 +16,12 @@
 import pickle as pickle
 import struct
 import logging
-from typing import Any, Dict, List
+from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Optional,
+                    TextIO, Tuple, Union)
 from .psparser import PSStackParser
 from .psparser import PSSyntaxError
 from .psparser import PSEOF
+from .psparser import PSKeyword
 from .psparser import PSLiteral
 from .psparser import literal_name
 from .psparser import KWD
@@ -39,52 +41,56 @@ class CMapBase:
 
     debug = 0
 
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Union[str, int]):
         self.attrs = kwargs.copy()
         return
 
-    def is_vertical(self):
+    def is_vertical(self) -> bool:
         return self.attrs.get('WMode', 0) != 0
 
-    def set_attr(self, k, v):
+    def set_attr(self, k: str, v: Any) -> None:
         self.attrs[k] = v
         return
 
-    def add_code2cid(self, code, cid):
+    def add_code2cid(self, code: str, cid: int) -> None:
         return
 
-    def add_cid2unichr(self, cid, code):
+    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
+                       ) -> None:
         return
 
-    def use_cmap(self, cmap):
+    def use_cmap(self, cmap: "CMapBase") -> None:
         return
 
+    def decode(self, code: bytes) -> Iterable[int]:
+        raise NotImplementedError
+
 
 class CMap(CMapBase):
 
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Union[str, int]):
         CMapBase.__init__(self, **kwargs)
-        self.code2cid = {}
+        self.code2cid: Dict[int, Any] = {}
         return
 
     def __repr__(self):
         return '<CMap: %s>' % self.attrs.get('CMapName')
 
-    def use_cmap(self, cmap):
+    def use_cmap(self, cmap: CMapBase) -> None:
         assert isinstance(cmap, CMap), str(type(cmap))
 
-        def copy(dst, src):
+        def copy(dst: Dict[Any, Any], src: Dict[Any, Any]) -> None:
             for (k, v) in src.items():
                 if isinstance(v, dict):
-                    d = {}
+                    d: Dict[Any, Any] = {}
                     dst[k] = d
                     copy(d, v)
                 else:
                     dst[k] = v
         copy(self.code2cid, cmap.code2cid)
         return
 
-    def decode(self, code):
+    def decode(self, code: bytes) -> Iterator[int]:
         log.debug('decode: %r, %r', self, code)
         d = self.code2cid
         for i in iter(code):
@@ -97,7 +103,9 @@ def decode(self, code):
                 d = self.code2cid
         return
 
-    def dump(self, out=sys.stdout, code2cid=None, code=None):
+    def dump(self, out: TextIO = sys.stdout,
+             code2cid: Optional[Dict[int, Any]] = None,
+             code: Tuple[int, ...] = ()) -> None:
         if code2cid is None:
             code2cid = self.code2cid
             code = ()
@@ -112,7 +120,7 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
 
 class IdentityCMap(CMapBase):
 
-    def decode(self, code):
+    def decode(self, code: bytes) -> Tuple[int, ...]:
         n = len(code)//2
         if n:
             return struct.unpack('>%dH' % n, code)
@@ -122,7 +130,7 @@ def decode(self, code):
 
 class IdentityCMapByte(IdentityCMap):
 
-    def decode(self, code):
+    def decode(self, code: bytes) -> Tuple[int, ...]:
         n = len(code)
         if n:
             return struct.unpack('>%dB' % n, code)
@@ -132,49 +140,51 @@ def decode(self, code):
 
 class UnicodeMap(CMapBase):
 
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Union[str, int]):
         CMapBase.__init__(self, **kwargs)
-        self.cid2unichr = {}
+        self.cid2unichr: Dict[int, str] = {}
         return
 
     def __repr__(self):
         return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
 
-    def get_unichr(self, cid):
+    def get_unichr(self, cid: int) -> str:
         log.debug('get_unichr: %r, %r', self, cid)
         return self.cid2unichr[cid]
 
-    def dump(self, out=sys.stdout):
+    def dump(self, out: TextIO = sys.stdout) -> None:
         for (k, v) in sorted(self.cid2unichr.items()):
             out.write('cid %d = unicode %r\n' % (k, v))
         return
 
 
 class FileCMap(CMap):
 
-    def add_code2cid(self, code, cid):
+    def add_code2cid(self, code: str, cid: int) -> None:
         assert isinstance(code, str) and isinstance(cid, int),\
             str((type(code), type(cid)))
         d = self.code2cid
         for c in code[:-1]:
-            c = ord(c)
-            if c in d:
-                d = d[c]
+            ci = ord(c)
+            if ci in d:
+                d = d[ci]
             else:
-                t = {}
-                d[c] = t
+                t: Dict[int, Any] = {}
+                d[ci] = t
                 d = t
-        c = ord(code[-1])
-        d[c] = cid
+        ci = ord(code[-1])
+        d[ci] = cid
         return
 
 
 class FileUnicodeMap(UnicodeMap):
 
-    def add_cid2unichr(self, cid, code):
+    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
+                       ) -> None:
         assert isinstance(cid, int), str(type(cid))
         if isinstance(code, PSLiteral):
             # Interpret as an Adobe glyph name.
+            assert isinstance(code.name, str)
             self.cid2unichr[cid] = name2unicode(code.name)
         elif isinstance(code, bytes):
             # Interpret as UTF-16BE.
@@ -188,8 +198,8 @@ def add_cid2unichr(self, cid, code):
 
 class PyCMap(CMap):
 
-    def __init__(self, name, module):
-        CMap.__init__(self, CMapName=name)
+    def __init__(self, name: str, module: Any):
+        super().__init__(CMapName=name)
         self.code2cid = module.CODE2CID
         if module.IS_VERTICAL:
             self.attrs['WMode'] = 1
@@ -198,8 +208,8 @@ def __init__(self, name, module):
 
 class PyUnicodeMap(UnicodeMap):
 
-    def __init__(self, name, module, vertical):
-        UnicodeMap.__init__(self, CMapName=name)
+    def __init__(self, name: str, module: Any, vertical: bool):
+        super().__init__(CMapName=name)
         if vertical:
             self.cid2unichr = module.CID2UNICHR_V
             self.attrs['WMode'] = 1
@@ -264,17 +274,16 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
         return cls._umap_cache[name][vertical]
 
 
-# int here means that we're not extending PSStackParser with additional types.
-class CMapParser(PSStackParser[int]):
+class CMapParser(PSStackParser[PSKeyword]):
 
-    def __init__(self, cmap, fp):
+    def __init__(self, cmap: CMapBase, fp: BinaryIO):
         PSStackParser.__init__(self, fp)
         self.cmap = cmap
         # some ToUnicode maps don't have "begincmap" keyword.
         self._in_cmap = True
         return
 
-    def run(self):
+    def run(self) -> None:
         try:
             self.nextobject()
         except PSEOF:
@@ -298,7 +307,7 @@ def run(self):
     KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
     KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
 
-    def do_keyword(self, pos, token):
+    def do_keyword(self, pos: int, token: PSKeyword) -> None:
         if token is self.KEYWORD_BEGINCMAP:
             self._in_cmap = True
             self.popall()
@@ -382,6 +391,7 @@ def do_keyword(self, pos, token):
                     for i in range(e1-s1+1):
                         self.cmap.add_cid2unichr(s1+i, code[i])
                 else:
+                    assert isinstance(code, bytes)
                     var = code[-4:]
                     base = nunpack(var)
                     prefix = code[:-4]
@@ -412,7 +422,7 @@ def do_keyword(self, pos, token):
         return
 
 
-def main(argv):
+def main(argv: List[str]) -> None:
     args = argv[1:]
     for fname in args:
         fp = open(fname, 'rb')
@@ -424,4 +434,4 @@ def main(argv):
 
 
 if __name__ == '__main__':
-    sys.exit(main(sys.argv))
+    main(sys.argv)
diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
@@ -10,7 +10,7 @@
 log = logging.getLogger(__name__)
 
 
-def name2unicode(name):
+def name2unicode(name: str) -> str:
     """Converts Adobe glyph names to Unicode numbers.
 
     In contrast to the specification, this raises a KeyError instead of return
@@ -32,7 +32,7 @@ def name2unicode(name):
 
     else:
         if name in glyphname2unicode:
-            return glyphname2unicode.get(name)
+            return glyphname2unicode[name]
 
         elif name.startswith('uni'):
             name_without_uni = name.strip('uni')
@@ -59,7 +59,7 @@ def name2unicode(name):
                    'it does not match specification' % name)
 
 
-def raise_key_error_for_invalid_unicode(unicode_digit):
+def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
     """Unicode values should not be in the range D800 through DFFF because
     that is used for surrogate pairs in UTF-16
 

diff --git a/pdfminer/image.py b/pdfminer/image.py
@@ -81,7 +81,7 @@ def export_image(self, image):
         if ext == '.jpg':
             raw_data = image.stream.get_rawdata()
             if LITERAL_DEVICE_CMYK in image.colorspace:
-                from PIL import Image  # type: ignore
+                from PIL import Image  # type: ignore[import]
                 from PIL import ImageChops
                 ifp = BytesIO(raw_data)
                 i = Image.open(ifp)