From c3e3499a6bd6014ff2e1bd9586ddbb20097ca525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Cohen?= Date: Mon, 6 Sep 2021 22:00:23 +0200 Subject: [PATCH] Add support for ISO 32000-2 AES256 encryption (#614) * feat: Add support for ISO 32000-2 AES256 encryption * feat: Applies review suggestions --- CHANGELOG.md | 1 + README.md | 5 ++ docs/licenses/LICENSE.pyHanko | 23 ++++++++ pdfminer/_saslprep.py | 94 ++++++++++++++++++++++++++++++ pdfminer/pdfdocument.py | 89 ++++++++++++++++++++++------ samples/encryption/aes-256-r6.pdf | Bin 0 -> 1403 bytes tests/test_tools_pdf2txt.py | 6 ++ 7 files changed, 201 insertions(+), 17 deletions(-) create mode 100644 docs/licenses/LICENSE.pyHanko create mode 100644 pdfminer/_saslprep.py create mode 100644 samples/encryption/aes-256-r6.pdf diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cb40ad1..919e06d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] ### Added +- Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614)) - Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537)) ### Fixed diff --git a/README.md b/README.md index 7b4ff501..2a652af8 100644 --- a/README.md +++ b/README.md @@ -51,3 +51,8 @@ Contributing ------------ Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). + +Acknowledgement +--------------- + +This repository includes code from `pyHanko` ; the original license has been included [here](/docs/licenses/LICENSE.pyHanko). \ No newline at end of file diff --git a/docs/licenses/LICENSE.pyHanko b/docs/licenses/LICENSE.pyHanko new file mode 100644 index 00000000..b0e3a006 --- /dev/null +++ b/docs/licenses/LICENSE.pyHanko @@ -0,0 +1,23 @@ +This package contains various elements based on code from the pyHanko project, of which we reproduce the license below. + +MIT License + +Copyright (c) 2020 Matthias Valvekens + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/pdfminer/_saslprep.py b/pdfminer/_saslprep.py new file mode 100644 index 00000000..067a077f --- /dev/null +++ b/pdfminer/_saslprep.py @@ -0,0 +1,94 @@ +# Copyright 2016-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some changes copyright 2021-present Matthias Valvekens, +# licensed under the license of the pyHanko project (see LICENSE file). + + +"""An implementation of RFC4013 SASLprep.""" + +__all__ = ['saslprep'] + +import stringprep +import unicodedata + +# RFC4013 section 2.3 prohibited output. +_PROHIBITED = ( + # A strict reading of RFC 4013 requires table c12 here, but + # characters from it are mapped to SPACE in the Map step. Can + # normalization reintroduce them somehow? + stringprep.in_table_c12, + stringprep.in_table_c21_c22, + stringprep.in_table_c3, + stringprep.in_table_c4, + stringprep.in_table_c5, + stringprep.in_table_c6, + stringprep.in_table_c7, + stringprep.in_table_c8, + stringprep.in_table_c9) + + +def saslprep(data: str, prohibit_unassigned_code_points=True) -> str: + """An implementation of RFC4013 SASLprep. + :param data: + The string to SASLprep. + :param prohibit_unassigned_code_points: + RFC 3454 and RFCs for various SASL mechanisms distinguish between + `queries` (unassigned code points allowed) and + `stored strings` (unassigned code points prohibited). Defaults + to ``True`` (unassigned code points are prohibited). + :return: The SASLprep'ed version of `data`. + """ + if prohibit_unassigned_code_points: + prohibited = _PROHIBITED + (stringprep.in_table_a1,) + else: + prohibited = _PROHIBITED + + # RFC3454 section 2, step 1 - Map + # RFC4013 section 2.1 mappings + # Map Non-ASCII space characters to SPACE (U+0020). Map + # commonly mapped to nothing characters to, well, nothing. + in_table_c12 = stringprep.in_table_c12 + in_table_b1 = stringprep.in_table_b1 + data = "".join( + ["\u0020" if in_table_c12(elt) else elt + for elt in data if not in_table_b1(elt)]) + + # RFC3454 section 2, step 2 - Normalize + # RFC4013 section 2.2 normalization + data = unicodedata.ucd_3_2_0.normalize('NFKC', data) + + in_table_d1 = stringprep.in_table_d1 + if in_table_d1(data[0]): + if not in_table_d1(data[-1]): + # RFC3454, Section 6, #3. If a string contains any + # RandALCat character, the first and last characters + # MUST be RandALCat characters. + raise ValueError("SASLprep: failed bidirectional check") + # RFC3454, Section 6, #2. If a string contains any RandALCat + # character, it MUST NOT contain any LCat character. + prohibited = prohibited + (stringprep.in_table_d2,) + else: + # RFC3454, Section 6, #3. Following the logic of #3, if + # the first character is not a RandALCat, no other character + # can be either. + prohibited = prohibited + (in_table_d1,) + + # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi + for char in data: + if any(in_table(char) for in_table in prohibited): + raise ValueError( + "SASLprep: failed prohibited character check") + + return data diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 8233281d..6a576f57 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -1,7 +1,7 @@ import logging import re import struct -from hashlib import sha256, md5 +from hashlib import sha256, md5, sha384, sha512 from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes @@ -477,7 +477,7 @@ def decrypt_aes128(self, objid, genno, data): class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4): - supported_revisions = (5,) + supported_revisions = (5, 6) def init_params(self): super().init_params() @@ -499,29 +499,84 @@ def get_cfm(self, name): return None def authenticate(self, password): - password = password.encode('utf-8')[:127] - hash = sha256(password) - hash.update(self.o_validation_salt) - hash.update(self.u) - if hash.digest() == self.o_hash: - hash = sha256(password) - hash.update(self.o_key_salt) - hash.update(self.u) - cipher = Cipher(algorithms.AES(hash.digest()), + password = self._normalize_password(password) + hash = self._password_hash(password, self.o_validation_salt, self.u) + if hash == self.o_hash: + hash = self._password_hash(password, self.o_key_salt, self.u) + cipher = Cipher(algorithms.AES(hash), modes.CBC(b'\0' * 16), backend=default_backend()) return cipher.decryptor().update(self.oe) - hash = sha256(password) - hash.update(self.u_validation_salt) - if hash.digest() == self.u_hash: - hash = sha256(password) - hash.update(self.u_key_salt) - cipher = Cipher(algorithms.AES(hash.digest()), + hash = self._password_hash(password, self.u_validation_salt) + if hash == self.u_hash: + hash = self._password_hash(password, self.u_key_salt) + cipher = Cipher(algorithms.AES(hash), modes.CBC(b'\0' * 16), backend=default_backend()) return cipher.decryptor().update(self.ue) return None + def _normalize_password(self, password): + if self.r == 6: + # saslprep expects non-empty strings, apparently + if not password: + return b'' + from ._saslprep import saslprep + password = saslprep(password) + return password.encode('utf-8')[:127] + + def _password_hash(self, password, salt, vector=None): + """ + Compute password hash depending on revision number + """ + if self.r == 5: + return self._r5_password(password, salt, vector) + return self._r6_password(password, salt[0:8], vector) + + def _r5_password(self, password, salt, vector): + """ + Compute the password for revision 5 + """ + hash = sha256(password) + hash.update(salt) + if vector is not None: + hash.update(vector) + return hash.digest() + + def _r6_password(self, password, salt, vector): + """ + Compute the password for revision 6 + """ + initial_hash = sha256(password) + initial_hash.update(salt) + if vector is not None: + initial_hash.update(vector) + k = initial_hash.digest() + hashes = (sha256, sha384, sha512) + round_no = last_byte_val = 0 + while round_no < 64 or last_byte_val > round_no - 32: + k1 = (password + k + (vector or b'')) * 64 + e = self._aes_cbc_encrypt( + key=k[:16], iv=k[16:32], data=k1 + ) + # compute the first 16 bytes of e, + # interpreted as an unsigned integer mod 3 + next_hash = hashes[self._bytes_mod_3(e[:16])] + k = next_hash(e).digest() + last_byte_val = e[len(e) - 1] + round_no += 1 + return k[:32] + + @staticmethod + def _bytes_mod_3(input_bytes): + # 256 is 1 mod 3, so we can just sum 'em + return sum(b % 3 for b in input_bytes) % 3 + + def _aes_cbc_encrypt(self, key, iv, data): + cipher = Cipher(algorithms.AES(key), modes.CBC(iv)) + encryptor = cipher.encryptor() + return encryptor.update(data) + encryptor.finalize() + def decrypt_aes256(self, objid, genno, data): initialization_vector = data[:16] ciphertext = data[16:] diff --git a/samples/encryption/aes-256-r6.pdf b/samples/encryption/aes-256-r6.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3fbf6f510d8c4ffada6d16ba0b8805f05f5ec76f GIT binary patch literal 1403 zcmZWpOK4nG7#1Q0Zxq{$4lHYQBkDEMg4Gabpy4R`KHB7z!R zC@y?%l)7*s2x3JM?Z!o;;KG#%;?k8}+Pz@?&rD*clshoz+;h(Vf9HGL`L*Tl0(}e@ z^ZTFefBoYZ1wj~9?!}_jD#|mv+YyyZwy|5)W>K!$%{W2~HuQ>iyNJV(H`18{YjqXI zKGvvZ0_G_Z4hY@-f@r(#PUzAJLc-SCPlq)gx?TKm!7BM^wV?;QIEjM82L$?;qm#iZcnclMO z#Zfh`JzP8U(XC)*G9Z(`DL?sbQLc|&GodTL1PCYWC{Fg5XZnLUTIf{+JIwv3KfpR{ zpwS44jD7}Ko*x1`9fzCE1*CCaA2oHfgJR?U@3sc#KL6v(UD^I?@4;UlPxoE2?N|Q( z{nXL-|Ga$h8}##sFWmjl{?TvWz54oH_ZCK4_H~3!W?LEw>CFrM+UtD-{xzRv8X&h_NXNW%$r>*nE1S*|~$#ZV8#!{-7 zm}gQ78<_JtL3t-M0bz&;hLOxotOsCwrxi7jRRJO^88l9k1R|YJ5gV#*3aXkKCnw}- zh#o;2(VWX^PM+z-1Q|LoRCmiZaXI=b0IR*Kg4|AmP@Wv73XbJa9BP2FBVl@S8LGBG zZwJx?5d;iix%0q;0YfB3q&Btu?*s5%UB z`lTR%LFJ_bA8<00yPb>v{A+vHuOIvL(ZwT2?)~kKE6?qn|Mv}c?BOdPoj$(((A(wQ e+--CC_G`BW3ZiT4W~YYx6M{)GKi^sH7XJZr>zE4w literal 0 HcmV?d00001 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index b2604cc4..9fe2e364 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -96,6 +96,12 @@ def test_encryption_aes256(self): def test_encryption_aes256m(self): run('encryption/aes-256-m.pdf', '-P foo') + def test_encryption_aes256_r6_user(self): + run('encryption/aes-256-r6.pdf', '-P usersecret') + + def test_encryption_aes256_r6_owner(self): + run('encryption/aes-256-r6.pdf', '-P ownersecret') + def test_encryption_base(self): run('encryption/base.pdf', '-P foo')