Merge branch 'develop' into mypy

0xabu · Sep 2, 2021 · f72aaea · f72aaea
2 parents ff787a9 + 8ea9f10
commit f72aaea
Show file tree

Hide file tree

Showing 20 changed files with 248 additions and 140 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,5 +21,7 @@ tests/*.txt
 # python venv management tools
 Pipfile
 Pipfile.lock
+.noseids
+.vscode/
 pyproject.toml
-poetry.lock
+poetry.lock
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,11 +5,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+### Added
+- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
+
 ### Fixed
+- `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594))
+- Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
 - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
-- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
+- Fix `PermissionError` when creating temporary filepaths on windows when running tests ([#484](https://github.com/pdfminer/pdfminer.six/pull/484))
+- Fix `AttributeError` when dumping a TOC with bytes destinations ([#600](https://github.com/pdfminer/pdfminer.six/pull/600))
+- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
 - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
 - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
+- Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
+- `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
 
 ### Removed
 - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))

diff --git a/README.md b/README.md
@@ -7,15 +7,12 @@ pdfminer.six
 
 *We fathom PDF*
 
-Pdfminer.six is a community maintained fork of the original PDFMiner. It is a
-tool for extracting information from PDF documents. It focuses on getting
-and analyzing text data. Pdfminer.six extracts the text from a page directly
-from the sourcecode of the PDF. It can also be used to get the exact location, 
-font or color of the text. 
+Pdfminer.six is a community maintained fork of the original PDFMiner. It is a tool for extracting information from PDF
+documents. It focuses on getting and analyzing text data. Pdfminer.six extracts the text from a page directly from the
+sourcecode of the PDF. It can also be used to get the exact location, font or color of the text.
 
-It is built in a modular way such that each component of pdfminer.six can be
-replaced easily. You can implement your own interpreter or rendering device
-that uses the power of pdfminer.six for other purposes than text analysis. 
+It is built in a modular way such that each component of pdfminer.six can be replaced easily. You can implement your own
+interpreter or rendering device that uses the power of pdfminer.six for other purposes than text analysis.
 
 Check out the full documentation on
 [Read the Docs](https://pdfminersix.readthedocs.io).
@@ -24,31 +21,31 @@ Check out the full documentation on
 Features
 --------
 
- * Written entirely in Python.
- * Parse, analyze, and convert PDF documents.
- * PDF-1.7 specification support. (well, almost).
- * CJK languages and vertical writing scripts support.
- * Various font types (Type1, TrueType, Type3, and CID) support.
- * Support for extracting images (JPG, JBIG2 and Bitmaps).
- * Support for RC4 and AES encryption.
- * Support for AcroForm interactive form extraction.
- * Table of contents extraction.
- * Tagged contents extraction.
- * Automatic layout analysis.
-
+* Written entirely in Python.
+* Parse, analyze, and convert PDF documents.
+* PDF-1.7 specification support. (well, almost).
+* CJK languages and vertical writing scripts support.
+* Various font types (Type1, TrueType, Type3, and CID) support.
+* Support for extracting images (JPG, JBIG2, Bitmaps).
+* Support for various compressions (ASCIIHexDecode, ASCII85Decode, LZWDecode, FlateDecode, RunLengthDecode,
+  CCITTFaxDecode)
+* Support for RC4 and AES encryption.
+* Support for AcroForm interactive form extraction.
+* Table of contents extraction.
+* Tagged contents extraction.
+* Automatic layout analysis.
 
 How to use
 ----------
 
- * Install Python 3.6 or newer.
- * Install
-
-    `pip install pdfminer.six`
+* Install Python 3.6 or newer.
+* Install
 
- * Use command-line interface to extract text from pdf:
+  `pip install pdfminer.six`
 
-    `python pdf2txt.py samples/simple1.pdf`
+* Use command-line interface to extract text from pdf:
 
+  `python pdf2txt.py samples/simple1.pdf`
 
 Contributing
 ------------

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -22,8 +22,8 @@ Check out the source on `github <https://github.com/pdfminer/pdfminer.six>`_.
 Content
 =======
 
-This documentation is organized into four sections (according to the `Divio
-documentation system <https://documentation.divio.com>`_). The
+This documentation is organized into four sections (according to the `Diátaxis
+documentation framework <https://diataxis.fr>`_). The
 :ref:`tutorial` section helps you setup and use pdfminer.six for the first
 time. Read this section if this is your first time working with pdfminer.six.
 The :ref:`howto` offers specific recipes for solving common problems.

diff --git a/docs/source/topic/converting_pdf_to_text.rst b/docs/source/topic/converting_pdf_to_text.rst
@@ -4,14 +4,14 @@ Converting a PDF file to text
 *****************************
 
 Most PDF files look like they contain well structured text. But the reality is
-that a PDF file does not contain anything that resembles a paragraphs,
+that a PDF file does not contain anything that resembles paragraphs,
 sentences or even words. When it comes to text, a PDF file is only aware of
 the characters and their placement.
 
-This makes extracting meaningful pieces of text from PDF's files difficult.
+This makes extracting meaningful pieces of text from PDF files difficult.
 The characters that compose a paragraph are no different from those that
 compose the table, the page footer or the description of a figure. Unlike
-other documents formats, like a `.txt` file or a word document, the PDF format
+other document formats, like a `.txt` file or a word document, the PDF format
 does not contain a stream of text.
 
 A PDF document consists of a collection of objects that together describe
@@ -20,7 +20,6 @@ interactive elements and higher-level application data. A PDF file contains
 the objects making up a PDF document along with associated structural
 information, all represented as a single self-contained sequence of bytes. [1]_
 
-
 .. _topic_pdf_to_text_layout:
 
 Layout analysis algorithm
@@ -30,18 +29,17 @@ PDFMiner attempts to reconstruct some of those structures by using heuristics
 on the positioning of characters. This works well for sentences and
 paragraphs because meaningful groups of nearby characters can be made.
 
-The layout analysis consist of three different stages: it groups characters
+The layout analysis consists of three different stages: it groups characters
 into words and lines, then it groups lines into boxes and finally it groups
 textboxes hierarchically. These stages are discussed in the following
-sections.  The resulting output of the layout analysis is an ordered hierarchy
+sections. The resulting output of the layout analysis is an ordered hierarchy
 of layout objects on a PDF page.
 
 .. figure:: ../_static/layout_analysis_output.png
     :align: center
 
     The output of the layout analysis is a hierarchy of layout objects.
 
-
 The output of the layout analysis heavily depends on a couple of parameters.
 All these parameters are part of the :ref:`api_laparams` class.
 
@@ -50,16 +48,15 @@ Grouping characters into words and lines
 
 The first step in going from characters to text is to group characters in a
 meaningful way. Each character has an x-coordinate and a y-coordinate for its
-bottom-left corner and upper-right corner, i.e. its bounding box. Pdfminer
-.six uses these bounding boxes to decide which characters belong together.
+bottom-left corner and upper-right corner, i.e. its bounding box. Pdfminer.six 
+uses these bounding boxes to decide which characters belong together.
 
 Characters that are both horizontally and vertically close are grouped onto
 one line. How close they should be is determined by the `char_margin`
 (M in figure) and the `line_overlap` (not in figure) parameter. The horizontal
-*distance* between the bounding boxes of two characters should be smaller that
+*distance* between the bounding boxes of two characters should be smaller than
 the `char_margin` and the vertical *overlap* between the bounding boxes should
-be smaller the the `line_overlap`.
-
+be smaller than the `line_overlap`.
 
 .. raw:: html
     :file: ../_static/layout_analysis.html
@@ -71,14 +68,14 @@ relative to the minimum height of either one of the bounding boxes.
 
 Spaces need to be inserted between characters because the PDF format has no
 notion of the space character. A space is inserted if the characters are
-further apart that the `word_margin` (W in the figure). The `word_margin` is
+further apart than the `word_margin` (W in the figure). The `word_margin` is
 relative to the maximum width or height of the new character. Having a smaller
 `word_margin` creates smaller words. Note that the `word_margin` should at
 least be smaller than the `char_margin` otherwise none of the characters will
 be separated by a space.
 
-The result of this stage is a list of lines. Each line consists a list of
-characters. These characters either original `LTChar` characters that
+The result of this stage is a list of lines. Each line consists of a list of
+characters. These characters are either original `LTChar` characters that
 originate from the PDF file, or inserted `LTAnno` characters that
 represent spaces between words or newlines at the end of each line.
 
@@ -94,20 +91,20 @@ Lines that are both horizontally overlapping and vertically close are grouped.
 How vertically close the lines should be is determined by the `line_margin`.
 This margin is specified relative to the height of the bounding box. Lines
 are close if the gap between the tops (see L :sub:`1` in the figure) and bottoms
-(see L :sub:`2`) in the figure) of the bounding boxes are closer together
+(see L :sub:`2` in the figure) of the bounding boxes is closer together
 than the absolute line margin, i.e. the `line_margin` multiplied by the
 height of the bounding box.
 
 .. raw:: html
     :file: ../_static/layout_analysis_group_lines.html
 
-The result of this stage is a list of text boxes. Each box consist of a list
+The result of this stage is a list of text boxes. Each box consists of a list
 of lines.
 
 Grouping textboxes hierarchically
 ---------------------------------
 
-the last step is to group the text boxes in a meaningful way. This step
+The last step is to group the text boxes in a meaningful way. This step
 repeatedly merges the two text boxes that are closest to each other.
 
 The closeness of bounding boxes is computed as the area that is between the
@@ -118,7 +115,6 @@ boxes of the individual lines.
 .. raw:: html
     :file: ../_static/layout_analysis_group_boxes.html
 
-
 Working with rotated characters
 ===============================
 

diff --git a/docs/source/tutorial/highlevel.rst b/docs/source/tutorial/highlevel.rst
@@ -1,8 +1,3 @@
-.. testsetup::
-
-    import sys
-    from pdfminer.high_level import extract_text_to_fp, extract_text
-
 .. _tutorial_highlevel:
 
 Extract text from a PDF using Python
@@ -15,6 +10,7 @@ The most simple way to extract text from a PDF is to use
 
 .. doctest::
 
+    >>> from pdfminer.high_level import extract_text
     >>> text = extract_text('samples/simple1.pdf')
     >>> print(repr(text))
     'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\x0c'
@@ -42,10 +38,8 @@ To read text from a PDF and print it on the command line:
 
 .. doctest::
 
-    >>> if sys.version_info > (3, 0):
-    ...     from io import StringIO
-    ... else:
-    ...     from io import BytesIO as StringIO
+    >>> from io import StringIO
+    >>> from pdfminer.high_level import extract_text_to_fp
     >>> output_string = StringIO()
     >>> with open('samples/simple1.pdf', 'rb') as fin:
     ...     extract_text_to_fp(fin, output_string)
@@ -56,10 +50,8 @@ Or to convert it to html and use layout analysis:
 
 .. doctest::
 
-    >>> if sys.version_info > (3, 0):
-    ...     from io import StringIO
-    ... else:
-    ...     from io import BytesIO as StringIO
+    >>> from io import StringIO
+    >>> from pdfminer.high_level import extract_text_to_fp
     >>> from pdfminer.layout import LAParams
     >>> output_string = StringIO()
     >>> with open('samples/simple1.pdf', 'rb') as fin:

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -340,7 +340,7 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDCIDRANGE:
             objs = [obj for (__, obj) in self.popall()]
             for (s, e, cid) in choplist(3, objs):
-                if (not isinstance(s, str) or not isinstance(e, str) or
+                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                    not isinstance(cid, int) or len(s) != len(e)):
                     continue
                 sprefix = s[:-4]
@@ -354,7 +354,7 @@ def do_keyword(self, pos, token):
                 vlen = len(svar)
                 for i in range(e1-s1+1):
                     x = sprefix+struct.pack('>L', s1+i)[-vlen:]
-                    self.cmap.add_code2cid(x, cid+i)
+                    self.cmap.add_cid2unichr(cid+i, x)
             return
 
         if token is self.KEYWORD_BEGINCIDCHAR:
@@ -363,8 +363,8 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDCIDCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
-                if isinstance(code, str) and isinstance(cid, str):
-                    self.cmap.add_code2cid(code, nunpack(cid))
+                if isinstance(code, bytes) and isinstance(cid, int):
+                    self.cmap.add_cid2unichr(cid, code)
             return
 
         if token is self.KEYWORD_BEGINBFRANGE:

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -3,7 +3,6 @@
 from pdfminer.pdfcolor import PDFColorSpace
 from typing import Any, List, Optional, Sequence, cast
 import re
-import sys
 
 from . import utils
 from .layout import LAParams
@@ -294,8 +293,6 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
     def write(self, text):
         if self.codec:
             text = text.encode(self.codec)
-        if sys.version_info < (3, 0):
-            text = str(text)
         self.outfp.write(text)
         return
 

diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -56,25 +56,33 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
         imagewriter = ImageWriter(output_dir)
 
     rsrcmgr = PDFResourceManager(caching=not disable_caching)
+    device = None
+
+    if output_type != 'text' and outfp == sys.stdout:
+        outfp = sys.stdout.buffer
 
     if output_type == 'text':
         device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                imagewriter=imagewriter)
 
-    if outfp == sys.stdout:
-        outfp = sys.stdout.buffer
-
-    if output_type == 'xml':
+    elif output_type == 'xml':
         device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter,
                               stripcontrol=strip_control)
+
     elif output_type == 'html':
         device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                layoutmode=layoutmode, laparams=laparams,
                                imagewriter=imagewriter)
+
     elif output_type == 'tag':
         device = TagExtractor(rsrcmgr, outfp, codec=codec)
 
+    else:
+        msg = f"Output type can be text, html, xml or tag but is " \
+              f"{output_type}"
+        raise ValueError(msg)
+
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     for page in PDFPage.get_pages(inf,
                                   page_numbers,