In [3]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile


"""
Module that extract text from MS XML Word document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'


def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)

In [4]:
txt = get_docx_text("../../resources/Categories.docx")

In [6]:
len(txt)

23061

In [3]:
import pymupdf
import re

theme_pattern = re.compile(r"^\d\.\s")
subtopic_pattern = re.compile(r"^\d\.\d+\s")

path = "../../resources/Categories.pdf"


themes = []
with pymupdf.open(path) as pdf:
    pages = pdf[0:13]
    for page in pages:
        for block in page.get_text("dict", flags=11)["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    print(span["text"])

            print("---------- EOB -----------")

    for page in pages:
        theme = {}
        begin_sentences = 0
        sentences = []
        for block in page.get_text("dict", flags=11)["blocks"]:
            for line in block["lines"]:
                if begin_sentences == 1:
                    sentences.append(" ".join([span["text"].strip() for span in line["spans"]]))
                first_elem = line["spans"][0]["text"].strip()
                if theme_pattern.match(first_elem):
                    if begin_sentences == 1:
                        theme["sentences"] = sentences[:-1]
                        sentences = []
                    begin_sentences = 0
                    theme["topic"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif subtopic_pattern.match(first_elem):
                    if begin_sentences == 1:
                        theme["sentences"] = sentences[:-1]
                        sentences = []
                    begin_sentences = 0
                    theme["subtopic"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif first_elem.startswith("Scope"):
                    theme["scope"] = " ".join([span["text"].strip() for span in line["spans"]])
                elif first_elem.startswith("Sample"):
                    begin_sentences = 1
            themes.append(theme)
                
                


"""             stripped = line.strip("• ").strip("◦ ").strip()
            if theme_pattern.match(stripped):
                print(f"Theme: {stripped}")
            elif subtopic_pattern.match(stripped):
                print(f"Subtopic: {stripped}")
            elif stripped.startswith("Scope:"):
                print(f"Scope: {stripped}")
            elif stripped.startswith("Sample"):
                pass
            else:
                print(f"Question: {stripped}") """

1. Filing requirements and formalities
---------- EOB -----------
1.1 Minimum requirements for a filing date
---------- EOB -----------

Scope
: assessing the mandatory elements (e.g., description, claims) and 
their submission modes.
---------- EOB -----------

 Sample sentences: 
---------- EOB -----------
o
What documents are required on the filing date for the epo to 
accord a filing date?
---------- EOB -----------
o
How is the filing date affected if claims are missing initially?
---------- EOB -----------
o
If no reference to any drawing is included, can missing drawings be 
submitted later without a change in the filing date?
---------- EOB -----------
1.2 Filing methods and locations
---------- EOB -----------

Scope
: addressing allowed filing modes (fax, online, physical) and relevant 
epo sites.
---------- EOB -----------

 Sample sentences: 
---------- EOB -----------
o
Which epo offices can receive an ep application filed via fax?
---------- EOB -----------
o
How do t

'             stripped = line.strip("• ").strip("◦ ").strip()\n            if theme_pattern.match(stripped):\n                print(f"Theme: {stripped}")\n            elif subtopic_pattern.match(stripped):\n                print(f"Subtopic: {stripped}")\n            elif stripped.startswith("Scope:"):\n                print(f"Scope: {stripped}")\n            elif stripped.startswith("Sample"):\n                pass\n            else:\n                print(f"Question: {stripped}") '

In [40]:
import pandas as pd
import json
with open("../../resources/questions/epo_topics_structure.json", 'r') as js:
    themes = json.load(js)
df = pd.DataFrame.from_dict(themes)
theme = df.sample(1)["topic"].item().lower()

In [41]:
theme

'3. divisional applications'

In [42]:
maps = pd.read_json("../../resources/questions/table0.json")
maps[maps["Category"].str.lower() == theme]

Unnamed: 0,Question,Category,Subcategory
5,Q6,3. Divisional Applications,3.1 Filing Requirements


In [19]:
import pandas as pd
from docx.api import Document

document = Document("../../resources/Categories.docx")
tables = document.tables
for k, table in enumerate(tables):
    data = []

    keys = None
    for i, row in enumerate(table.rows):
        text = (cell.text for cell in row.cells)

        if i == 0:
            keys = tuple(text)
            continue
        row_data = dict(zip(keys, text))
        data.append(row_data)
    print (data)

    df = pd.DataFrame(data)
    df.to_json(f"../../resources/questions/table{k}.json")

[{'Question': 'Q1', 'Category': '11. Entitlement and Transfers', 'Subcategory': '11.1 Entitlement Disputes'}, {'Question': 'Q2', 'Category': '9. Opposition and Appeals', 'Subcategory': '9.1/9.2 (Grounds/Procedure)'}, {'Question': 'Q3', 'Category': '9. Opposition and Appeals', 'Subcategory': '9.3 Appeal Proceedings'}, {'Question': 'Q4', 'Category': '11. Entitlement and Transfers', 'Subcategory': '11.2 Transfers and Assignments'}, {'Question': 'Q5', 'Category': '5. Languages and Translations', 'Subcategory': '5.1 Language of Filing / Procedural Language'}, {'Question': 'Q6', 'Category': '3. Divisional Applications', 'Subcategory': '3.1 Filing Requirements'}, {'Question': 'Q7', 'Category': '10. Substantive Patent Law', 'Subcategory': '10.3 Special Forms (Medical Use)'}, {'Question': 'Q8', 'Category': '7. PCT Procedure and Entry into EP Phase', 'Subcategory': '7.1 International Filing and Search'}, {'Question': 'Q9', 'Category': '1. Filing Requirements and Formalities', 'Subcategory': '1.1