In [3]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile


"""
Module that extracts text from MS XML Word documents (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

# Clark-notation tag names ("{namespace}tag") as ElementTree reports them for
# WordprocessingML paragraph (w:p) and text (w:t) elements.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'


def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.

    The function reads ``word/document.xml`` from the archive, concatenates
    the text nodes found under each paragraph element and skips paragraphs
    that contain no text.

    :param path: anything ``zipfile.ZipFile`` accepts (a path or a file object).
    :return: the paragraphs separated by one blank line; an empty string when
        no paragraph contains text.
    :raises KeyError: if the archive has no ``word/document.xml`` member
        (raised by ``ZipFile.read``).
    """
    # The context manager closes the archive even when read() raises, which a
    # manual close() after the read would not.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        # Nodes with empty or missing text are ignored.
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)

In [4]:
# Plain text of the categories document; get_docx_text separates paragraphs with a blank line.
txt = get_docx_text("../../resources/Categories.docx")

In [6]:
len(txt)

23061

In [3]:
import pymupdf
import re

# Headings are numbered "N. title" (theme) and "N.M title" (subtopic). The
# category tables used elsewhere in this notebook go up to 11, so the number
# may have more than one digit.
theme_pattern = re.compile(r"^\d+\.\s")
subtopic_pattern = re.compile(r"^\d+\.\d+\s")

path = "../../resources/Categories.pdf"


def _line_text(line):
    """Return the stripped text of every span of a line dict, joined by spaces."""
    return " ".join(span["text"].strip() for span in line["spans"])


def _collect_themes(lines):
    """
    Group line dicts into one record per subtopic.

    Each line is classified by the stripped text of its first span:
    a theme heading, a subtopic heading, a line starting with "Scope", a line
    starting with "Sample" (which switches sentence collection on), or, while
    collection is on, a sentence line.

    :param lines: iterable of dicts with a "spans" list whose items have "text".
    :return: list of dicts with the keys "topic", "subtopic", "scope" and
        "sentences"; a key is only present when the matching line was seen.
    """
    records = []
    topic = None
    record = {}
    collecting = False

    def flush(rec):
        # A record holding nothing but the inherited topic carries no data.
        if any(key != "topic" for key in rec):
            records.append(rec)

    for line in lines:
        if not line["spans"]:
            continue
        first_elem = line["spans"][0]["text"].strip()
        text = _line_text(line)
        if theme_pattern.match(first_elem):
            # A new theme closes whatever subtopic was being filled.
            flush(record)
            topic = text
            record = {"topic": text}
            collecting = False
        elif subtopic_pattern.match(first_elem):
            # Start a fresh dict per subtopic so earlier records are never
            # overwritten; the current theme is repeated on each of them.
            flush(record)
            record = {"subtopic": text} if topic is None else {"topic": topic, "subtopic": text}
            collecting = False
        elif collecting:
            if text:
                record.setdefault("sentences", []).append(text)
        elif first_elem.startswith("Scope"):
            record["scope"] = text
        elif first_elem.startswith("Sample"):
            collecting = True

    # The last subtopic has no following heading to trigger its flush.
    flush(record)
    return records


with pymupdf.open(path) as pdf:
    # Extract the text dicts once; state is then carried across page
    # boundaries so a subtopic continuing on the next page stays in one record.
    blocks = [
        block
        for page in pdf[0:13]
        for block in page.get_text("dict", flags=11)["blocks"]
    ]

# Raw dump of every span, one marker per block, to inspect the layout.
for block in blocks:
    for line in block.get("lines", []):
        for span in line["spans"]:
            print(span["text"])

    print("---------- EOB -----------")

themes = _collect_themes(
    line for block in blocks for line in block.get("lines", [])
)
                
                


"""             stripped = line.strip("• ").strip("◦ ").strip()
            if theme_pattern.match(stripped):
                print(f"Theme: {stripped}")
            elif subtopic_pattern.match(stripped):
                print(f"Subtopic: {stripped}")
            elif stripped.startswith("Scope:"):
                print(f"Scope: {stripped}")
            elif stripped.startswith("Sample"):
                pass
            else:
                print(f"Question: {stripped}") """

1. Filing requirements and formalities
---------- EOB -----------
1.1 Minimum requirements for a filing date
---------- EOB -----------

Scope
: assessing the mandatory elements (e.g., description, claims) and 
their submission modes.
---------- EOB -----------

 Sample sentences: 
---------- EOB -----------
o
What documents are required on the filing date for the epo to 
accord a filing date?
---------- EOB -----------
o
How is the filing date affected if claims are missing initially?
---------- EOB -----------
o
If no reference to any drawing is included, can missing drawings be 
submitted later without a change in the filing date?
---------- EOB -----------
1.2 Filing methods and locations
---------- EOB -----------

Scope
: addressing allowed filing modes (fax, online, physical) and relevant 
epo sites.
---------- EOB -----------

 Sample sentences: 
---------- EOB -----------
o
Which epo offices can receive an ep application filed via fax?
---------- EOB -----------
o
How do t

'             stripped = line.strip("• ").strip("◦ ").strip()\n            if theme_pattern.match(stripped):\n                print(f"Theme: {stripped}")\n            elif subtopic_pattern.match(stripped):\n                print(f"Subtopic: {stripped}")\n            elif stripped.startswith("Scope:"):\n                print(f"Scope: {stripped}")\n            elif stripped.startswith("Sample"):\n                pass\n            else:\n                print(f"Question: {stripped}") '

In [65]:
import pandas as pd
import json
# Read the topic structure with an explicit encoding so the result does not
# depend on the platform's default locale encoding.
with open("../../resources/questions/epo_topics_structure.json", 'r', encoding="utf-8") as js:
    themes = json.load(js)
# The displayed value of `themes` is a list of records, so the plain
# constructor builds one row per record.
df = pd.DataFrame(themes)

In [66]:
themes

[{'topic': '1. Filing requirements and formalities',
  'subtopic': '1.1 Minimum requirements for a filing date',
  'scope': 'assessing the mandatory elements (e.g., description, claims) and their submission modes.',
  'questions': ['What documents are required on the filing date for the epo to accord a filing date?',
   'How is the filing date affected if claims are missing initially?',
   'If no reference to any drawing is included, can missing drawings be submitted later without a change in the filing date?']},
 {'topic': '1. Filing requirements and formalities',
  'subtopic': '1.2 Filing methods and locations',
  'scope': 'addressing allowed filing modes (fax, online, physical) and relevant epo sites.',
  'questions': ['Which epo offices can receive an ep application filed via fax?',
   'How do the rules differ for filing with the epo vs. National patent offices?']},
 {'topic': '1. Filing requirements and formalities',
  'subtopic': '1.3 Formality examination',
  'scope': 'checking 

In [43]:
# Question-to-category tables, kept in this order because later cells index
# them by position (mappings[0], mappings[1], mappings[2]).
table_files = (
    "../../resources/questions/table0.json",
    "../../resources/questions/table1.json",
    "../../resources/questions/table2.json",
)
mappings = [pd.read_json(table_file) for table_file in table_files]

In [99]:
import re

def extract_num(text):
    """
    Return the dotted number at the start of ``text``.

    Examples: "11.2 Transfers" gives "11.2"; "9. Opposition" gives "9" (a dot
    is only kept when digits follow it).

    :param text: a label such as a category or subcategory cell.
    :return: the leading number as a string, or None when ``text`` does not
        start with a digit or is not a string at all.
    """
    # NOTE(review): table cells may presumably be missing (None/NaN) instead of
    # text; treat those like any other label without a number rather than
    # letting re.match raise TypeError.
    if not isinstance(text, str):
        return None
    match = re.match(r"^\d+(?:\.\d+)*", text)
    return match.group() if match else None

def _index_questions(index, frame, exam, question_column, parse_number):
    """
    Add every row of ``frame`` to ``index`` as an (exam, question number) pair.

    :param index: dict of category -> subcategory -> list of pairs, updated in place.
    :param frame: DataFrame with "Category", "Subcategory" and ``question_column``.
    :param exam: file name stored as the first element of each pair.
    :param question_column: name of the column holding the question label.
    :param parse_number: callable turning a question label into an int.
    """
    for _, row in frame.iterrows():
        # NOTE(review): str() turns a missing number into the literal key
        # 'None'; kept as is because the saved JSON already uses that key.
        category = str(extract_num(row["Category"]))
        subcategory = str(extract_num(row["Subcategory"]))
        number = parse_number(row[question_column])
        index.setdefault(category, {}).setdefault(subcategory, []).append((exam, number))


def _strip_first_char(label):
    """Turn a label such as 'Q7' into the integer 7 by dropping its first character."""
    return int(label[1:])


categories = {}
# Only the first ten rows of the two EQE tables are used.
_index_questions(categories, mappings[0].iloc[0:10],
                 "EQE_2022_PreEx_final_documentLess.json", "Question", _strip_first_char)
_index_questions(categories, mappings[1].iloc[0:10],
                 "EQE_2021_PreEx_final_documentLess.json", "Question", _strip_first_char)
# The third table stores plain numbers in its "Q" column and is used in full.
_index_questions(categories, mappings[2], "EOB.json", "Q", int)

In [100]:
import json
# Persist the category index; the (exam, number) tuples are written as JSON arrays.
with open("categories2questions.json", 'w') as cs:
    cs.write(json.dumps(categories))

In [101]:
# Total number of (exam, question) entries over all subcategories.
count = sum(
    len(questions)
    for subcategories in categories.values()
    for questions in subcategories.values()
)

In [102]:
categories

{'11': {'11.1': [('EQE_2022_PreEx_final_documentLess.json', 1)],
  '11.2': [('EQE_2022_PreEx_final_documentLess.json', 4),
   ('EOB.json', 5),
   ('EOB.json', 20)]},
 '9': {'9.1': [('EQE_2022_PreEx_final_documentLess.json', 2),
   ('EQE_2021_PreEx_final_documentLess.json', 5),
   ('EOB.json', 14)],
  '9.3': [('EQE_2022_PreEx_final_documentLess.json', 3),
   ('EQE_2021_PreEx_final_documentLess.json', 8),
   ('EOB.json', 18)],
  '9.2': [('EQE_2022_PreEx_final_documentLess.json', 10),
   ('EOB.json', 15),
   ('EOB.json', 16),
   ('EOB.json', 17),
   ('EOB.json', 30),
   ('EOB.json', 31)],
  'None': [('EOB.json', 19), ('EOB.json', 22)]},
 '5': {'5.1': [('EQE_2022_PreEx_final_documentLess.json', 5),
   ('EQE_2021_PreEx_final_documentLess.json', 9)],
  '5.3': [('EOB.json', 1), ('EOB.json', 24)]},
 '3': {'3.1': [('EQE_2022_PreEx_final_documentLess.json', 6),
   ('EQE_2021_PreEx_final_documentLess.json', 3),
   ('EOB.json', 8),
   ('EOB.json', 23)]},
 '10': {'10.3': [('EQE_2022_PreEx_final_doc

In [103]:
# Sample passage used to gauge length; note that this rebinds `txt`.
txt = "Where evidence is provided that a final decision within the meaning of Article 61, paragraph 1, has been taken, the European Patent Office shall inform the applicant and any other party that the proceedings for grant shall be resumed as from the date stated in the communication, unless a new European patent application under Article 61, paragraph 1(b), has been filed for all the designated Contracting States. If the decision is in favour of the third party, the proceedings may not be resumed earlier than three months after the decision has become final, unless the third party requests the resumption."

# Rough word count: splits on single spaces only, so punctuation stays attached to the words.
len(txt.split(' '))

100

In [19]:
import pandas as pd
from docx.api import Document

# Export every table of the Word document to its own JSON file, using the
# first table row as the column names.
document = Document("../../resources/Categories.docx")
tables = document.tables
for k, table in enumerate(tables):
    cell_rows = [[cell.text for cell in row.cells] for row in table.rows]

    # An empty table has no header row; it then yields no records either.
    keys = tuple(cell_rows[0]) if cell_rows else None
    data = [dict(zip(keys, values)) for values in cell_rows[1:]]
    print (data)

    df = pd.DataFrame(data)
    df.to_json(f"../../resources/questions/table{k}.json")

[{'Question': 'Q1', 'Category': '11. Entitlement and Transfers', 'Subcategory': '11.1 Entitlement Disputes'}, {'Question': 'Q2', 'Category': '9. Opposition and Appeals', 'Subcategory': '9.1/9.2 (Grounds/Procedure)'}, {'Question': 'Q3', 'Category': '9. Opposition and Appeals', 'Subcategory': '9.3 Appeal Proceedings'}, {'Question': 'Q4', 'Category': '11. Entitlement and Transfers', 'Subcategory': '11.2 Transfers and Assignments'}, {'Question': 'Q5', 'Category': '5. Languages and Translations', 'Subcategory': '5.1 Language of Filing / Procedural Language'}, {'Question': 'Q6', 'Category': '3. Divisional Applications', 'Subcategory': '3.1 Filing Requirements'}, {'Question': 'Q7', 'Category': '10. Substantive Patent Law', 'Subcategory': '10.3 Special Forms (Medical Use)'}, {'Question': 'Q8', 'Category': '7. PCT Procedure and Entry into EP Phase', 'Subcategory': '7.1 International Filing and Search'}, {'Question': 'Q9', 'Category': '1. Filing Requirements and Formalities', 'Subcategory': '1.1

In [91]:
mappings[2]

Unnamed: 0,Q,Topic (Short),Category,Unnamed: 4,Subcategory
0,1,20% exam fee reduction (Spanish),5. Languages & Translations,,5.3 Effects of Language on Costs/Rights
1,2,Stay of proceedings & renewal fees,6. Procedural Remedies & Legal Effect,,(Stay of proceedings under Rule 14(4))
2,3,Interruption (Rule 142) & resuming an oppositi...,6. Procedural Remedies & Legal Effect,,(Interruption/resumption)
3,4,Correcting inventor in PCT vs. EPC (consent no...,7. PCT Procedure & Entry into EP Phase,,7.1 International Filing & Search
4,5,Recording sub-licence in EP Register,11. Entitlement & Transfers,,11.2 Transfers & Assignments (Licences)
5,6,Exhibition priority certificate refused if not...,1. Filing Requirements & Formalities,,1.3 Formality Examination or a special Art. 55...
6,7,Payment date of exam fee via bank transfer,"4. Fees, Payment & Time Limits",,4.2 Payment Mechanisms / 4.3 Deadlines
7,8,"Filing a divisional after refusal, still withi...",3. Divisional Applications,,3.1 Filing Requirements
8,9,Renewal fees can be paid up to 3 months early,"4. Fees, Payment & Time Limits",,4.3 Fee Deadlines
9,10,"Article 61(1)(b) new application, first renewa...","4. Fees, Payment & Time Limits",,4.3 Fee Deadlines
