Skip to content

Commit

Permalink
* expand and improve annotations in cmap, encryption/decompression and fonts
Browse files Browse the repository at this point in the history

 * disallow untyped calls; this way, we have a core set of
   typed code that can grow over time
   (just not for ccitt, because there's a ton of work lurking there)
 * expand "type: ignore" comments to suppress a specific error code
  • Loading branch information
0xabu committed Sep 3, 2021
1 parent 92df54b commit cc49051
Show file tree
Hide file tree
Showing 18 changed files with 313 additions and 232 deletions.
4 changes: 3 additions & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
warn_unused_configs = True
disallow_any_generics = True
disallow_subclassing_any = True
#disallow_untyped_calls = True
disallow_untyped_calls = True
#disallow_untyped_defs = True
disallow_incomplete_defs = True
#check_untyped_defs = True
Expand All @@ -17,3 +17,5 @@ strict_equality = True
[mypy-cryptography.hazmat.*]
ignore_missing_imports = True

[mypy-pdfminer.ccitt]
disallow_untyped_calls = False
7 changes: 5 additions & 2 deletions pdfminer/arcfour.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
"""


from typing import Sequence


class Arcfour:

def __init__(self, key):
def __init__(self, key: Sequence[int]):
# because Py3 range is not indexable
s = [i for i in range(256)]
j = 0
Expand All @@ -19,7 +22,7 @@ def __init__(self, key):
(self.i, self.j) = (0, 0)
return

def process(self, data):
def process(self, data: bytes) -> bytes:
(i, j) = (self.i, self.j)
s = self.s
r = b''
Expand Down
6 changes: 3 additions & 3 deletions pdfminer/ascii85.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


# ascii85decode(data)
def ascii85decode(data):
def ascii85decode(data: bytes) -> bytes:
"""
In ASCII85 encoding, every four bytes are encoded with five ASCII
letters, using 85 different types of characters (as 256**4 < 85**5).
Expand Down Expand Up @@ -47,7 +47,7 @@ def ascii85decode(data):
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
def asciihexdecode(data: bytes) -> bytes:
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
Expand All @@ -57,7 +57,7 @@ def asciihexdecode(data):
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
"""
def decode(x):
def decode(x: bytes) -> bytes:
i = int(x, 16)
return bytes((i,))

Expand Down
7 changes: 4 additions & 3 deletions pdfminer/ccitt.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import sys
import array
from typing import Any, Dict


def get_bytes(data):
Expand Down Expand Up @@ -541,7 +542,7 @@ def output_line(self, y, bits):
return


def ccittfaxdecode(data, params):
def ccittfaxdecode(data: bytes, params: Dict[str, Any]) -> bytes:
K = params.get('K')
cols = params.get('Columns')
bytealign = params.get('EncodedByteAlign')
Expand All @@ -551,7 +552,7 @@ def ccittfaxdecode(data, params):
else:
raise ValueError(K)
parser.feedbytes(data)
return parser.close()
return parser.close() # type: ignore[no-any-return]


# test
Expand All @@ -562,7 +563,7 @@ def main(argv):

class Parser(CCITTG4Parser):
def __init__(self, width, bytealign=False):
import pygame # type: ignore
import pygame # type: ignore[import]
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width, 1000))
return
Expand Down
90 changes: 50 additions & 40 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
import pickle as pickle
import struct
import logging
from typing import Any, Dict, List
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Optional,
TextIO, Tuple, Union)
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import PSKeyword
from .psparser import PSLiteral
from .psparser import literal_name
from .psparser import KWD
Expand All @@ -39,52 +41,56 @@ class CMapBase:

debug = 0

def __init__(self, **kwargs):
def __init__(self, **kwargs: Union[str, int]):
self.attrs = kwargs.copy()
return

def is_vertical(self):
def is_vertical(self) -> bool:
return self.attrs.get('WMode', 0) != 0

def set_attr(self, k, v):
def set_attr(self, k: str, v: Any) -> None:
self.attrs[k] = v
return

def add_code2cid(self, code, cid):
def add_code2cid(self, code: str, cid: int) -> None:
return

def add_cid2unichr(self, cid, code):
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
) -> None:
return

def use_cmap(self, cmap):
def use_cmap(self, cmap: "CMapBase") -> None:
return

def decode(self, code: bytes) -> Iterable[int]:
raise NotImplementedError


class CMap(CMapBase):

def __init__(self, **kwargs):
def __init__(self, **kwargs: Union[str, int]):
CMapBase.__init__(self, **kwargs)
self.code2cid = {}
self.code2cid: Dict[int, Any] = {}
return

def __repr__(self):
return '<CMap: %s>' % self.attrs.get('CMapName')

def use_cmap(self, cmap):
def use_cmap(self, cmap: CMapBase) -> None:
assert isinstance(cmap, CMap), str(type(cmap))

def copy(dst, src):
def copy(dst: Dict[Any, Any], src: Dict[Any, Any]) -> None:
for (k, v) in src.items():
if isinstance(v, dict):
d = {}
d: Dict[Any, Any] = {}
dst[k] = d
copy(d, v)
else:
dst[k] = v
copy(self.code2cid, cmap.code2cid)
return

def decode(self, code):
def decode(self, code: bytes) -> Iterator[int]:
log.debug('decode: %r, %r', self, code)
d = self.code2cid
for i in iter(code):
Expand All @@ -97,7 +103,9 @@ def decode(self, code):
d = self.code2cid
return

def dump(self, out=sys.stdout, code2cid=None, code=None):
def dump(self, out: TextIO = sys.stdout,
code2cid: Optional[Dict[int, Any]] = None,
code: Tuple[int, ...] = ()) -> None:
if code2cid is None:
code2cid = self.code2cid
code = ()
Expand All @@ -112,7 +120,7 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):

class IdentityCMap(CMapBase):

def decode(self, code):
def decode(self, code: bytes) -> Tuple[int, ...]:
n = len(code)//2
if n:
return struct.unpack('>%dH' % n, code)
Expand All @@ -122,7 +130,7 @@ def decode(self, code):

class IdentityCMapByte(IdentityCMap):

def decode(self, code):
def decode(self, code: bytes) -> Tuple[int, ...]:
n = len(code)
if n:
return struct.unpack('>%dB' % n, code)
Expand All @@ -132,49 +140,51 @@ def decode(self, code):

class UnicodeMap(CMapBase):

def __init__(self, **kwargs):
def __init__(self, **kwargs: Union[str, int]):
CMapBase.__init__(self, **kwargs)
self.cid2unichr = {}
self.cid2unichr: Dict[int, str] = {}
return

def __repr__(self):
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')

def get_unichr(self, cid):
def get_unichr(self, cid: int) -> str:
log.debug('get_unichr: %r, %r', self, cid)
return self.cid2unichr[cid]

def dump(self, out=sys.stdout):
def dump(self, out: TextIO = sys.stdout) -> None:
for (k, v) in sorted(self.cid2unichr.items()):
out.write('cid %d = unicode %r\n' % (k, v))
return


class FileCMap(CMap):

def add_code2cid(self, code, cid):
def add_code2cid(self, code: str, cid: int) -> None:
assert isinstance(code, str) and isinstance(cid, int),\
str((type(code), type(cid)))
d = self.code2cid
for c in code[:-1]:
c = ord(c)
if c in d:
d = d[c]
ci = ord(c)
if ci in d:
d = d[ci]
else:
t = {}
d[c] = t
t: Dict[int, Any] = {}
d[ci] = t
d = t
c = ord(code[-1])
d[c] = cid
ci = ord(code[-1])
d[ci] = cid
return


class FileUnicodeMap(UnicodeMap):

def add_cid2unichr(self, cid, code):
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
) -> None:
assert isinstance(cid, int), str(type(cid))
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
assert isinstance(code.name, str)
self.cid2unichr[cid] = name2unicode(code.name)
elif isinstance(code, bytes):
# Interpret as UTF-16BE.
Expand All @@ -188,8 +198,8 @@ def add_cid2unichr(self, cid, code):

class PyCMap(CMap):

def __init__(self, name, module):
CMap.__init__(self, CMapName=name)
def __init__(self, name: str, module: Any):
super().__init__(CMapName=name)
self.code2cid = module.CODE2CID
if module.IS_VERTICAL:
self.attrs['WMode'] = 1
Expand All @@ -198,8 +208,8 @@ def __init__(self, name, module):

class PyUnicodeMap(UnicodeMap):

def __init__(self, name, module, vertical):
UnicodeMap.__init__(self, CMapName=name)
def __init__(self, name: str, module: Any, vertical: bool):
super().__init__(CMapName=name)
if vertical:
self.cid2unichr = module.CID2UNICHR_V
self.attrs['WMode'] = 1
Expand Down Expand Up @@ -264,17 +274,16 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
return cls._umap_cache[name][vertical]


# int here means that we're not extending PSStackParser with additional types.
class CMapParser(PSStackParser[int]):
class CMapParser(PSStackParser[PSKeyword]):

def __init__(self, cmap, fp):
def __init__(self, cmap: CMapBase, fp: BinaryIO):
PSStackParser.__init__(self, fp)
self.cmap = cmap
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
return

def run(self):
def run(self) -> None:
try:
self.nextobject()
except PSEOF:
Expand All @@ -298,7 +307,7 @@ def run(self):
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')

def do_keyword(self, pos, token):
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BEGINCMAP:
self._in_cmap = True
self.popall()
Expand Down Expand Up @@ -382,6 +391,7 @@ def do_keyword(self, pos, token):
for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i])
else:
assert isinstance(code, bytes)
var = code[-4:]
base = nunpack(var)
prefix = code[:-4]
Expand Down Expand Up @@ -412,7 +422,7 @@ def do_keyword(self, pos, token):
return


def main(argv):
def main(argv: List[str]) -> None:
args = argv[1:]
for fname in args:
fp = open(fname, 'rb')
Expand All @@ -424,4 +434,4 @@ def main(argv):


if __name__ == '__main__':
sys.exit(main(sys.argv))
main(sys.argv)
6 changes: 3 additions & 3 deletions pdfminer/encodingdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
log = logging.getLogger(__name__)


def name2unicode(name):
def name2unicode(name: str) -> str:
"""Converts Adobe glyph names to Unicode numbers.
In contrast to the specification, this raises a KeyError instead of return
Expand All @@ -32,7 +32,7 @@ def name2unicode(name):

else:
if name in glyphname2unicode:
return glyphname2unicode.get(name)
return glyphname2unicode[name]

elif name.startswith('uni'):
name_without_uni = name.strip('uni')
Expand All @@ -59,7 +59,7 @@ def name2unicode(name):
'it does not match specification' % name)


def raise_key_error_for_invalid_unicode(unicode_digit):
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
"""Unicode values should not be in the range D800 through DFFF because
that is used for surrogate pairs in UTF-16
Expand Down
2 changes: 1 addition & 1 deletion pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def export_image(self, image):
if ext == '.jpg':
raw_data = image.stream.get_rawdata()
if LITERAL_DEVICE_CMYK in image.colorspace:
from PIL import Image # type: ignore
from PIL import Image # type: ignore[import]
from PIL import ImageChops
ifp = BytesIO(raw_data)
i = Image.open(ifp)
Expand Down

0 comments on commit cc49051

Please sign in to comment.