# Initialization
## Imports

In [59]:
import pymupdf
from typing import Generator
# Location of the PDF opened by the cells below.
# NOTE(review): this is an absolute path on one machine; the notebook will not run elsewhere
# until it is made relative to a configurable data directory.
path = "/home/timothee/Documents/data_challenges/PatentAssist/resources/Datasets - v1/Official Legal Publications/3-en-epc-guidelines-2024-hyperlinked.pdf"

## Function definitions

In [60]:
def browse_summary(summary: list[list]) -> Generator:
    """Yield the page span of every run of "Article"/"Rule" entries in a TOC.

    Parameters
    ----------
    summary:
        Table-of-contents entries as ``[level, title, page_num]`` triples in
        document order (the main loop passes ``pdf.get_toc()``).

    Yields
    ------
    tuple
        ``(section_origin, begin_page, end_page)`` where

        * ``section_origin`` is the deepest non-empty heading recorded when
          the run is closed (``""`` if no heading was recorded at all),
        * ``begin_page`` is the page of the entry that opened the run,
          minus one,
        * ``end_page`` is the page of the first later entry whose level is
          lower than the previous entry's level.

    Notes
    -----
    A run is opened by an entry whose title starts with "article" or "rule"
    (case-insensitive) and whose level is deeper than the previous entry's.
    A run still open when the TOC ends is yielded as well; since no later
    entry exists, its ``end_page`` is the page of the last TOC entry.
    """
    # Heading titles indexed by level - 1; grown on demand for deeper TOCs.
    levels = ["", "", "", ""]
    last_level = 0
    last_page = None
    begin_section = None  # page that opened the current run, None when no run is open

    def closest_heading() -> str:
        # Deepest heading currently recorded; "" instead of an IndexError when there is none.
        return next((heading for heading in reversed(levels) if heading != ""), "")

    for level, title, page_num in summary:
        if level < last_level:
            if begin_section is not None:
                # The origin must be read before the deeper headings are cleared below.
                yield closest_heading(), begin_section - 1, page_num
                begin_section = None

            for i in range(level, len(levels)):
                levels[i] = ""

        if title.lower().startswith(("article", "rule")):
            if level > last_level:
                begin_section = page_num
        else:
            if level > len(levels):
                levels.extend([""] * (level - len(levels)))
            levels[level - 1] = title

        last_level = level
        last_page = page_num

    # Flush the run left open at the end of the TOC instead of silently dropping it.
    if begin_section is not None:
        yield closest_heading(), begin_section - 1, last_page

In [46]:
def content_ref_split(page: pymupdf.Page, hoffset: int, woffset: int) -> tuple[dict]:
    """Split a page into a side column and a main column and extract both.

    The page is cropped twice with ``page.set_cropbox`` and
    ``page.get_text("dict", flags=11)["blocks"]`` is read after each crop.
    Both crops start ``hoffset`` below the top of the page; the side column
    is ``woffset`` wide and sits on the left when ``page.number`` is odd,
    on the right otherwise.

    Returns ``(ref, content)``: the block lists of the side column and of
    the remaining column, in that order.

    NOTE(review): the page is left cropped to the content box on return, and
    ``height``/``width`` are read from ``page.rect`` before any cropping, so
    calling this twice on the same page object will not give the same boxes.
    """
    height, width = page.rect.height, page.rect.width

    def blocks_within(box):
        # Crop first, then extract: get_text only sees the cropped area.
        page.set_cropbox(box)
        return page.get_text("dict", flags=11)["blocks"]

    if page.number % 2 == 1:
        ref_box = (0, hoffset, woffset, height)
        content_box = (woffset, hoffset, width, height)
    else:
        ref_box = (width-woffset, hoffset, width, height)
        content_box = (0, hoffset, width-woffset, height)

    # Order matters: the side column is extracted first, the main column last.
    ref = blocks_within(ref_box)
    content = blocks_within(content_box)
    return ref, content


In [None]:
def extract_articles(blocks_of_interest=None):
    """Placeholder for article extraction; not implemented yet.

    Parameters
    ----------
    blocks_of_interest:
        Blocks collected for one section (the main loop passes the result of
        ``extract_blocks``). Currently ignored.

    Returns
    -------
    list
        Always an empty list, so that the main loop can call this function
        with its argument and iterate over the result without raising.

    TODO: build one dict per article from ``blocks_of_interest``; the main
    loop sets an ``"origin"`` key on each returned item.
    """
    return []

In [None]:
def manage_overlap(page_blocks, section_origin: str):
    """Keep only the blocks of a page that belong to the requested section.

    A block is treated as a section landmark when the first span of its
    first line uses the font ``"Arial-BoldMT"`` with a non-zero colour. The
    landmark label is the first two space-separated words of that span's
    stripped text, and it is compared with the first two words of
    ``section_origin``.

    Parameters
    ----------
    page_blocks:
        Blocks of one page, shaped like ``page.get_text("dict")["blocks"]``.
    section_origin:
        Heading of the section being extracted.

    Returns
    -------
    list
        * no landmark on the page: ``page_blocks`` unchanged;
        * a landmark matches the section: the blocks from that landmark up
          to, but excluding, the next landmark (or the end of the page);
        * landmarks exist but none matches: the blocks before the first one.
    """
    section_type = " ".join(section_origin.split(" ")[:2])

    # (label, block index) pairs in page order. A list keeps positions and
    # indices consistent even when two landmarks share the same label.
    landmarks = []
    for index, block in enumerate(page_blocks):
        lines = block.get("lines") or []  # tolerate blocks without text lines
        spans = lines[0].get("spans") if lines else None
        if not spans:
            continue
        first_span = spans[0]
        if first_span["font"] == "Arial-BoldMT" and first_span["color"] != 0:
            label = " ".join(first_span["text"].strip().split(" ")[:2])
            landmarks.append((label, index))

    if not landmarks:
        return page_blocks

    for position, (label, index) in enumerate(landmarks):
        if label == section_type:
            following = landmarks[position + 1:]
            # Stop at the NEXT landmark, not at the last one found on the page.
            stop = following[0][1] if following else len(page_blocks)
            return page_blocks[index:stop]

    return page_blocks[:landmarks[0][1]]
# Algorithm in pseudo-code:
# - get the name of the associated chapter
# - look in the page for separator blocks other than "article" or "rule"
# - collect the indices of the blocks lying between two separator blocks
# - if the chapter marker matches, keep everything after it and before the next marker, if any
# - otherwise, keep everything that follows, up to the next marker, if any

In [None]:
def extract_blocks(pdf: pymupdf.Document, section_origin: str, begin_section: int, end_section: int):
    """Collect the text blocks of the pages spanned by one section.

    Pages ``pdf[begin_section - 1]`` to ``pdf[end_section - 1]`` inclusive are
    read with ``get_text("dict", flags=11)``. The first and the last of those
    pages are passed through ``manage_overlap`` with ``section_origin``; the
    pages in between are kept whole.

    Returns the blocks of all visited pages as a single flat list.

    NOTE(review): ``browse_summary`` already yields its begin page minus one,
    and one more is subtracted here — confirm the intended page convention.
    """
    first_index = begin_section - 1
    last_index = end_section - 1

    collected = []
    for index in range(first_index, end_section):
        page_blocks = pdf[index].get_text("dict", flags=11)["blocks"]
        # Only the boundary pages can contain blocks of a neighbouring section.
        if index in (first_index, last_index):
            page_blocks = manage_overlap(page_blocks, section_origin)
        collected.extend(page_blocks)

    return collected

# Main loop

In [None]:
# Accumulates the articles extracted from every section of the document.
content = []
with pymupdf.open(path) as pdf:
    summary = pdf.get_toc()

    for section_of_interest in browse_summary(summary):
        section_origin, begin_section, end_section = section_of_interest
        blocks_of_interest = extract_blocks(pdf, section_origin, begin_section, end_section)
        section_articles = extract_articles(blocks_of_interest)
        for article in section_articles:
            # section_origin is a single heading string: joining it with " -> "
            # would insert the separator between every character.
            article["origin"] = section_origin
        content.extend(section_articles)

    # Print the table of contents, indented by level, with each entry's page.
    for level, title, page in summary:
        indent = "   "*(level-1)
        print(f"{indent}{title} -> {page}")
        

<class 'pymupdf.Document'>


NameError: name 'browse_summary' is not defined

In [61]:
# Exploratory cell: dump font, text and size of every span on one page,
# with markers at the end of each line (EOL) and each block (EOB).
page_num = 44  # index passed to pdf[...]
with pymupdf.open(path) as pdf:
    page = pdf[page_num]
    # NOTE(review): flags=11 is presumably TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE |
    # TEXT_MEDIABOX_CLIP (1 + 2 + 8) — confirm against the PyMuPDF documentation.
    for block in page.get_text("dict", flags=11)["blocks"]:
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"{span['font']}\n{span['text']}\n{span['size']}\n----------")
            print("----- EOL -----")
        print("----- EOB -----")

Arial
March 2024 
9.0
----------
----- EOL -----
Arial
Guidelines for Examination in the EPO 
9.0
----------
----- EOL -----
Arial
Part A – Chapter II-3 
9.0
----------
----- EOL -----
----- EOB -----
Arial,Bold
1.3 Filing of applications by other means 
11.039999961853027
----------
----- EOL -----
Arial
The filing of European patent applications by other means such as 
11.039999961853027
----------
Arial,Bold
email
11.039999961853027
----------
Arial
 is
11.039999961853027
----------
Arial,Bold
 
11.039999961853027
----------
----- EOL -----
Arial
at present not allowed (see also the notice from the EPO dated 
11.039999961853027
----------
----- EOL -----
Arial
12 September 2000, 
11.039999961853027
----------
Arial
OJ EPO 2000, 458
11.039999961853027
----------
Arial
). 
11.039999961853027
----------
----- EOL -----
----- EOB -----
Arial,Bold
1.4 Subsequent filing of documents 
11.039999961853027
----------
----- EOL -----
Arial
For the subsequent filing of documents, see 
11.039999

In [53]:
# Exploratory cell: split one page into its side column and its main column
# and print the text of every span found in each.
page_num = 43
hoffset = 70
woffset = 160
with pymupdf.open(path) as pdf:
    page = pdf[page_num]

    # Named *_blocks so the module-level `content` list filled by the main
    # loop is not overwritten when this cell runs.
    ref_blocks, content_blocks = content_ref_split(page, hoffset, woffset)
    print("REF")
    for block in ref_blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                print(span["text"])
    print("-----------")
    print("CONTENT")
    for block in content_blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                print(span["text"])

left 1316.7179173708428
right 63192.89335054171
REF
Art. 75(1) 
Rule 35(1)
 
-----------
CONTENT
Austria (AT), 
Bulgaria (BG), 
Czech 
Republic (CZ), 
Estonia (EE), 
Finland (FI), 
France (FR), 
Germany (DE), 
Iceland (IS), 
Ireland (IE), 
Luxembourg (LU), 
Monaco (MC), 
Norway (NO), 
Portugal (PT), 
Slovenia (SI), Sweden (SE) and United Kingdom (GB). For further details, 
see the latest version of the booklet "National law relating to the EPC" 
available on the EPO website (
epo.org
). 
If a faxed application is illegible or incomplete, it is to be treated as not 
having been received to the extent that it is illegible or that the attempted 
transmission failed, and the sender must be notified as soon as possible 
(see the decision of the President of the EPO dated 20 February 2019, 
OJ EPO 2019, A18)
. 
If a European patent application is filed by fax, a written confirmation is 
required only where the documents are of inferior quality. In this case, the 
EPO will invite the applican

In [55]:
# Sanity check: print the number of every page whose page.number is not
# exactly one more than that of the page iterated just before it.
with pymupdf.open(path) as pdf:
    last_page_num = -1  # so that a first page numbered 0 is not reported
    for page in pdf:
        if page.number != last_page_num + 1:
            print(page.number)
        last_page_num = page.number