In [3]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile


"""
Module that extract text from MS XML Word document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'


def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)

In [4]:
txt = get_docx_text("../../resources/Categories.docx")

In [6]:
len(txt)

23061

In [25]:
import pymupdf
import re

theme_pattern = re.compile(r"^\d\.\s")
subtopic_pattern = re.compile(r"^\d\.\d+\s")

path = "../../resources/Categories.pdf"


themes = []
with pymupdf.open(path) as pdf:
    pages = pdf[0:13]
    for page in pages:
        for block in page.get_text("dict", flags=11)["blocks"]:
            theme = {}
            begin_sentences = 0
            sentences = []
            for line in block["lines"]:
                if begin_sentences == 1:
                    sentences.append(" ".join([span["text"].strip() for span in line["spans"]]))
                first_elem = line["spans"][0]["text"].strip()
                if theme_pattern.match(first_elem):
                    if begin_sentences == 1:
                        theme["sentences"] = sentences[:-1]
                        sentences = []
                    begin_sentences = 0
                    theme["topic"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif subtopic_pattern.match(first_elem):
                    if begin_sentences == 1:
                        theme["sentences"] = sentences[:-1]
                        sentences = []
                    begin_sentences = 0
                    theme["subtopic"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif first_elem.startswith("Scope"):
                    theme["scope"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif first_elem.startswith("Sample"):
                    begin_sentences = 1
            themes.append(theme)
                
                


"""             stripped = line.strip("• ").strip("◦ ").strip()
            if theme_pattern.match(stripped):
                print(f"Theme: {stripped}")
            elif subtopic_pattern.match(stripped):
                print(f"Subtopic: {stripped}")
            elif stripped.startswith("Scope:"):
                print(f"Scope: {stripped}")
            elif stripped.startswith("Sample"):
                pass
            else:
                print(f"Question: {stripped}") """

'             stripped = line.strip("• ").strip("◦ ").strip()\n            if theme_pattern.match(stripped):\n                print(f"Theme: {stripped}")\n            elif subtopic_pattern.match(stripped):\n                print(f"Subtopic: {stripped}")\n            elif stripped.startswith("Scope:"):\n                print(f"Scope: {stripped}")\n            elif stripped.startswith("Sample"):\n                pass\n            else:\n                print(f"Question: {stripped}") '

In [26]:
themes

[{'topic': '1. Filing requirements and formalities'},
 {'subtopic': '1.1 Minimum requirements for a filing date'},
 {'scope': 'Scope : assessing the mandatory elements (e.g., description, claims) and'},
 {},
 {},
 {},
 {},
 {'subtopic': '1.2 Filing methods and locations'},
 {'scope': 'Scope : addressing allowed filing modes (fax, online, physical) and relevant'},
 {},
 {},
 {},
 {'subtopic': '1.3 Formality examination'},
 {'scope': 'Scope : checking if formal requirements (payment of fees, form usage,'},
 {},
 {},
 {},
 {'topic': '2. Priority claims and right of priority'},
 {'subtopic': '2.1 Substantive requirements for priority'},
 {'scope': 'Scope : validity criteria, same invention requirement, earliest date, and'},
 {},
 {},
 {},
 {'subtopic': '2.2 Time limits and restoration'},
 {'scope': 'Scope : one-year priority period, procedures for requesting restoration,'},
 {},
 {},
 {},
 {'subtopic': '2.3 Multiple priorities and partial priority'},
 {'scope': 'Scope : handling claims tha