Skip to content

Commit

Permalink
Merge branch 'develop' into mypy (and fixed types)
Browse files Browse the repository at this point in the history
  • Loading branch information
0xabu committed Sep 7, 2021
2 parents eaab3c6 + c3e3499 commit fa229f7
Show file tree
Hide file tree
Showing 7 changed files with 223 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]

### Added
- Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614))
- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
- Type annotations ([#661](https://github.com/pdfminer/pdfminer.six/pull/661))

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,8 @@ Contributing
------------

Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md).

Acknowledgement
---------------

This repository includes code from `pyHanko`; the original license is included [here](/docs/licenses/LICENSE.pyHanko).
23 changes: 23 additions & 0 deletions docs/licenses/LICENSE.pyHanko
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
This package contains various elements based on code from the pyHanko project, of which we reproduce the license below.

MIT License

Copyright (c) 2020 Matthias Valvekens

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
95 changes: 95 additions & 0 deletions pdfminer/_saslprep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2016-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some changes copyright 2021-present Matthias Valvekens,
# licensed under the license of the pyHanko project (see LICENSE file).


"""An implementation of RFC4013 SASLprep."""

__all__ = ['saslprep']

import stringprep
from typing import Callable, Tuple
import unicodedata

# RFC4013 section 2.3 prohibited output.
_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
    # A strict reading of RFC 4013 requires table c12 here, but
    # characters from it are mapped to SPACE in the Map step. Can
    # normalization reintroduce them somehow?
    stringprep.in_table_c12,
    stringprep.in_table_c21_c22,
    stringprep.in_table_c3,
    stringprep.in_table_c4,
    stringprep.in_table_c5,
    stringprep.in_table_c6,
    stringprep.in_table_c7,
    stringprep.in_table_c8,
    stringprep.in_table_c9,
)


def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
    """An implementation of RFC4013 SASLprep.

    :param data:
        The string to SASLprep.
    :param prohibit_unassigned_code_points:
        RFC 3454 and RFCs for various SASL mechanisms distinguish between
        `queries` (unassigned code points allowed) and
        `stored strings` (unassigned code points prohibited). Defaults
        to ``True`` (unassigned code points are prohibited).
    :return: The SASLprep'ed version of `data`. An input that is empty, or
        that becomes empty once the "mapped to nothing" characters are
        removed, yields the empty string.
    :raises ValueError: If the prepared string contains a prohibited
        character or fails the bidirectional check.
    """
    if prohibit_unassigned_code_points:
        prohibited = _PROHIBITED + (stringprep.in_table_a1,)
    else:
        prohibited = _PROHIBITED

    # RFC3454 section 2, step 1 - Map
    # RFC4013 section 2.1 mappings
    # Map Non-ASCII space characters to SPACE (U+0020). Map
    # commonly mapped to nothing characters to, well, nothing.
    in_table_c12 = stringprep.in_table_c12
    in_table_b1 = stringprep.in_table_b1
    data = "".join(
        "\u0020" if in_table_c12(elt) else elt
        for elt in data if not in_table_b1(elt))

    # RFC3454 section 2, step 2 - Normalize
    # RFC4013 section 2.2 normalization
    data = unicodedata.ucd_3_2_0.normalize('NFKC', data)

    # Nothing left to check: the bidi test below indexes data[0] and
    # data[-1], which would raise IndexError on an empty string.
    if not data:
        return data

    in_table_d1 = stringprep.in_table_d1
    if in_table_d1(data[0]):
        if not in_table_d1(data[-1]):
            # RFC3454, Section 6, #3. If a string contains any
            # RandALCat character, the first and last characters
            # MUST be RandALCat characters.
            raise ValueError("SASLprep: failed bidirectional check")
        # RFC3454, Section 6, #2. If a string contains any RandALCat
        # character, it MUST NOT contain any LCat character.
        prohibited = prohibited + (stringprep.in_table_d2,)
    else:
        # RFC3454, Section 6, #3. Following the logic of #3, if
        # the first character is not a RandALCat, no other character
        # can be either.
        prohibited = prohibited + (in_table_d1,)

    # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
    for char in data:
        if any(in_table(char) for in_table in prohibited):
            raise ValueError(
                "SASLprep: failed prohibited character check")

    return data
111 changes: 93 additions & 18 deletions pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import re
import struct
from hashlib import sha256, md5
from hashlib import sha256, md5, sha384, sha512
from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List,
Optional, Sequence, Tuple, Type, Union, cast)

Expand Down Expand Up @@ -443,7 +443,7 @@ def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:

class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):

supported_revisions = (4,)
supported_revisions: Tuple[int, ...] = (4,)

def init_params(self) -> None:
super().init_params()
Expand Down Expand Up @@ -515,7 +515,7 @@ def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:

class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):

supported_revisions = (5,)
supported_revisions = (5, 6)

def init_params(self) -> None:
super().init_params()
Expand All @@ -540,29 +540,104 @@ def get_cfm(
return None

def authenticate(self, password: str) -> Optional[bytes]:
    """Try ``password`` as the owner password, then as the user password.

    The password is normalized with ``_normalize_password`` and hashed with
    ``_password_hash``, so the same code path serves every revision this
    handler supports. When a validation hash matches, a second hash taken
    with the corresponding key salt is used as the AES key (CBC mode,
    all-zero IV) to decrypt ``self.oe`` (owner) or ``self.ue`` (user).

    :param password: The candidate password.
    :return: The decrypted bytes of ``self.oe`` or ``self.ue`` on a match,
        or ``None`` when the password matches neither hash.
    """
    password_b = self._normalize_password(password)

    # Owner check: the owner hashes additionally mix in self.u.
    validation = self._password_hash(
        password_b, self.o_validation_salt, self.u)
    if validation == self.o_hash:
        key = self._password_hash(password_b, self.o_key_salt, self.u)
        cipher = Cipher(algorithms.AES(key),
                        modes.CBC(b'\0' * 16),
                        backend=default_backend())  # type: ignore
        return cipher.decryptor().update(self.oe)  # type: ignore

    # User check: same scheme without the extra vector.
    validation = self._password_hash(password_b, self.u_validation_salt)
    if validation == self.u_hash:
        key = self._password_hash(password_b, self.u_key_salt)
        cipher = Cipher(algorithms.AES(key),
                        modes.CBC(b'\0' * 16),
                        backend=default_backend())  # type: ignore
        return cipher.decryptor().update(self.ue)  # type: ignore
    return None

def _normalize_password(self, password: str) -> bytes:
if self.r == 6:
# saslprep expects non-empty strings, apparently
if not password:
return b''
from ._saslprep import saslprep
password = saslprep(password)
return password.encode('utf-8')[:127]

def _password_hash(
self,
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
) -> bytes:
"""
Compute password hash depending on revision number
"""
if self.r == 5:
return self._r5_password(password, salt, vector)
return self._r6_password(password, salt[0:8], vector)

def _r5_password(
self,
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
) -> bytes:
"""
Compute the password for revision 5
"""
hash = sha256(password)
hash.update(salt)
if vector is not None:
hash.update(vector)
return hash.digest()

def _r6_password(
    self,
    password: bytes,
    salt: bytes,
    vector: Optional[bytes] = None
) -> bytes:
    """
    Compute the iterated password hash used for revision 6.

    Starts from SHA-256 over password, salt and optional vector, then
    repeatedly AES-CBC-encrypts 64 copies of (password + k + vector) keyed
    by the current k and re-hashes the ciphertext with SHA-256, SHA-384 or
    SHA-512. Returns the first 32 bytes of the final k.
    """
    seed = sha256(password)
    seed.update(salt)
    if vector is not None:
        seed.update(vector)
    k = seed.digest()

    suffix = vector or b''
    digest_choices = (sha256, sha384, sha512)
    rounds_done = 0
    final_byte = 0
    while True:
        # Stop once at least 64 rounds have run and the last ciphertext
        # byte no longer exceeds (round count - 32).
        if rounds_done >= 64 and final_byte <= rounds_done - 32:
            break
        block = (password + k + suffix) * 64
        e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=block)
        # the first 16 bytes of e, interpreted as an unsigned integer
        # mod 3, select the hash for this round
        chosen = digest_choices[self._bytes_mod_3(e[:16])]
        k = chosen(e).digest()
        final_byte = e[-1]
        rounds_done += 1
    return k[:32]

@staticmethod
def _bytes_mod_3(input_bytes: bytes) -> int:
# 256 is 1 mod 3, so we can just sum 'em
return sum(b % 3 for b in input_bytes) % 3

def _aes_cbc_encrypt(
    self,
    key: bytes,
    iv: bytes,
    data: bytes
) -> bytes:
    """
    Encrypt ``data`` with AES in CBC mode using ``key`` and ``iv``.

    No padding is applied here, so ``data`` is expected to already be a
    multiple of the AES block size.
    """
    # Pass the backend explicitly, like the other Cipher(...) calls in
    # this class; cryptography releases that still require the argument
    # would otherwise raise TypeError.
    cipher = Cipher(algorithms.AES(key), modes.CBC(iv),
                    backend=default_backend())  # type: ignore
    encryptor = cipher.encryptor()  # type: ignore
    return encryptor.update(data) + encryptor.finalize()  # type: ignore

def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
initialization_vector = data[:16]
ciphertext = data[16:]
Expand Down
Binary file added samples/encryption/aes-256-r6.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ def test_encryption_aes256(self):
def test_encryption_aes256m(self):
    """Call ``run`` on the aes-256-m sample with password option ``foo``."""
    sample = 'encryption/aes-256-m.pdf'
    options = '-P foo'
    run(sample, options)

def test_encryption_aes256_r6_user(self):
    """Call ``run`` on the aes-256-r6 sample with ``-P usersecret``."""
    sample = 'encryption/aes-256-r6.pdf'
    options = '-P usersecret'
    run(sample, options)

def test_encryption_aes256_r6_owner(self):
    """Call ``run`` on the aes-256-r6 sample with ``-P ownersecret``."""
    sample = 'encryption/aes-256-r6.pdf'
    options = '-P ownersecret'
    run(sample, options)

def test_encryption_base(self):
    """Call ``run`` on the base encryption sample with ``-P foo``."""
    sample = 'encryption/base.pdf'
    options = '-P foo'
    run(sample, options)

Expand Down

0 comments on commit fa229f7

Please sign in to comment.