In [6]:
import pymupdf

# Preprocess PCT

In [7]:
def extract_articles(blocks) -> list[dict]:
    articles = []
    article = None
    article_flag = 0
    text = ""
    for block in blocks:
        #If the block starts with superscript text, it means it's a footnote
        if len(block["lines"]) > 0 and len(block["lines"][0]["spans"]) > 0 and block["lines"][0]["spans"][0]["size"] < 8.5:
            continue
        
        for line in block["lines"]:
            for span in line["spans"]:
                if article_flag == 1:
                    if span["font"] == "TimesNewRomanPS-BoldMT":
                        if not (span["text"].startswith("Article") or span["text"].startswith("Rule")):
                            if not "title" in article:
                                article["title"] = ""
                            article["title"] += span["text"]
                    else:
                        if span["size"] > 8.5:
                            text += span["text"].rstrip("-")
                
                if span["font"] == "TimesNewRomanPS-BoldMT" and (span["text"].startswith("Article") or span["text"].startswith("Rule")):
                    article_flag = 1
                    if article:
                        article["text"] = text
                        text = ""
                        if "title" in article:
                            article["title"] = article["title"].strip()
                        articles.append(article)
                    article = {}
                    article["id"] = span["text"].strip()
    if article:
        article["text"] = text
        if "title" in article:
            article["title"] = article["title"].strip()
        articles.append(article)

    return articles

In [8]:
def manage_overlap(page_blocks, section_origin: str):
    section_type = " ".join(section_origin.split(" ")[:2])
    sections_landmarks = {}
    for i in range(len(page_blocks)):
        if len(page_blocks[i]["lines"]) > 0 and len(page_blocks[i]["lines"][0]["spans"]) > 0 and page_blocks[i]["lines"][0]["spans"][0]["text"].isupper():
            text = " ".join(page_blocks[i]["lines"][0]["spans"][0]["text"].strip().split(" ")[:2])
            sections_landmarks[text] = i
    if section_type not in sections_landmarks:
        if len(sections_landmarks) > 0:
            i_end = next(iter(sections_landmarks.values()))
            return page_blocks[:i_end]
        else:
            return page_blocks
    else:
        i_deb = sections_landmarks[section_type]
        section_crossed = False
        i_end = len(page_blocks)
        for elem in sections_landmarks:
            if section_crossed:
                i_end = sections_landmarks[elem]
            if elem == section_type:
                section_crossed = True
        return page_blocks[i_deb:i_end]
#algo en pseudo code 
#récupérer le nom du chapitre associé
#chercher dans la page les blocks de séparation autres que "article" ou "rule"
#récupérer les indices correspondant aux blocks entre deux blocs de séparation
#si la balise du chapitre correspond, on prend tout ce qu'il y a après et avant éventuelle prochaine balise
#sinon on prend tout ce qu'il y a après et jusqu'à éventuelle prochaine balise

In [12]:
path = "../resources/Datasets - v1/Official Legal Publications/2-PCT_wipo-pub-274-2024-en-patent-cooperation-treaty.pdf"

In [13]:
font_size_threshold = 8.5
with pymupdf.open(path) as pdf:
    page = pdf[215]
    
    for block in page.get_text("dict", flags=11)["blocks"][2:]:
        #If the block starts with superscript text, it means it's a footnote
        if block["lines"][0]["spans"][0]["size"] < font_size_threshold:
            continue
        
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"Font: {span['font']}\nText: {span['text']}\nSize: {span['size']}\nBbox: {span['bbox']}\ncolor: {span['color']}\n-------------------")
            print("----- EOL -----")
        print("----- EOBlock -----")

Font: TimesNewRomanPSMT
Text: period, the number of Contracting States having thus expressed their vote or 
Size: 9.894128799438477
Bbox: (137.87423706054688, 191.66366577148438, 459.9747009277344, 204.9910430908203)
color: 0
-------------------
----- EOL -----
Font: TimesNewRomanPSMT
Text: abstention attains the number of Contracting States which was lacking for 
Size: 9.894128799438477
Bbox: (137.87423706054688, 202.99246215820312, 459.9786682128906, 216.31983947753906)
color: 0
-------------------
----- EOL -----
Font: TimesNewRomanPSMT
Text: attaining the quorum in the session itself, such decisions shall take effect provided 
Size: 9.894128799438477
Bbox: (137.87423706054688, 214.32125854492188, 460.0478210449219, 227.6486358642578)
color: 0
-------------------
----- EOL -----
Font: TimesNewRomanPSMT
Text: that at the same time the required majority still obtains. 
Size: 9.894128799438477
Bbox: (137.87423706054688, 225.73907470703125, 358.44403076171875, 239.0664520263672)
color: 

In [14]:
import re
def special_format(section_origin: str) -> str:
    cleaned_text = re.sub(r'INTERNATIONAL', '', section_origin, flags=re.IGNORECASE)
    pattern = r'^[a-zA-Z]+ [IVX]+'
    match = re.search(pattern, cleaned_text)
    if match:
        return match.group()
    return section_origin

content = []
with pymupdf.open(path) as pdf:
    toc = pdf.get_toc()
    for level, title, page_num in toc:
        indent = "    " * (level - 1)  # 4 espaces par niveau de profondeur
        print(f"{indent}- {title} (Page {page_num})")
    
    levels = ["", "", "", ""]
    last_level = 0
    section_flag = 0
    for level, title, page_num in toc:
        if title[0].isdigit() or title[0] == "(" or title.startswith("The") or title.startswith("is"):
            continue
        if level < last_level:
            print("level down", section_flag)
            if section_flag == 1:
                section_flag = 0
                blocks = []
                print(begin_section, page_num)
                for pn in range(begin_section-1, page_num):
                    page = pdf[pn]
                    page_blocks = page.get_text("dict", flags=11)["blocks"]
                    section_origin = special_format(list(filter(lambda x: x != "", levels))[-1])
                    if pn == begin_section-1:
                        blocks.extend(manage_overlap(page_blocks, section_origin))
                    elif pn == page_num-1:
                        blocks.extend(manage_overlap(page_blocks, section_origin))
                    else:
                        blocks.extend(page_blocks)
                section_articles = extract_articles(blocks)
                for article in section_articles:
                    article["origin"] = " -> ".join(list(filter(lambda x: x != "", levels)))
                content.extend(section_articles)

            for i in range(level, len(levels)):
                levels[i] = ""
        

        if title.lower().startswith("article") or title.lower().startswith("rule"):
            if level > last_level:
                print("level up")
                begin_section = page_num
                section_flag = 1

        else:
            levels[level-1] = title

        last_level = level

- (E) 274 July 2024 Final.pdf (Page 1)
    - (E) Intro 274 July 2024 (Page 3)
        - CONTENTS (Page 4)
    - E_PCT Treaty (Page 5)
        - Patent Cooperation Treaty (Page 5)
        - INTRODUCTORY PROVISIONS (Page 9)
            - Article 1Establishment of a Union (Page 9)
            - Article 2Definitions (Page 9)
        - CHAPTER IINTERNATIONAL APPLICATION AND INTERNATIONAL SEARCH (Page 11)
            - Article 3The International Application (Page 11)
            - Article 4The Request (Page 11)
            - Article 5The Description (Page 12)
            - Article 6The Claims (Page 12)
            - Article 7The Drawings (Page 13)
            - Article 8Claiming Priority (Page 13)
            - Article 9The Applicant (Page 13)
            - Article 10The Receiving Office (Page 14)
            - Article 11Filing Date and Effects of the International Application (Page 14)
            - Article 12Transmittal of the International Application to the International Bureau and the I

In [15]:
import pandas as pd

df = pd.DataFrame(content)
i = 69
while i > 0:
    name = f"Rule {str(i)}"
    df_slice = df[df["id"] == name]
    if df_slice.empty:
        print(i)
        break
    i -= 1

In [17]:
df

Unnamed: 0,id,title,text,origin
0,Article 1,Establishment of a Union,(1) The States party to this Treaty (hereina...,E_PCT Treaty -> INTRODUCTORY PROVISIONS
1,Article 2,Definitions,For the purposes of this Treaty and the Regul...,E_PCT Treaty -> INTRODUCTORY PROVISIONS
2,Article 3,The International Application,(1) Applications for the protection of inven...,E_PCT Treaty -> CHAPTER IINTERNATIONAL APPLICA...
3,Article 4,The Request,(1) The request shall contain: (i) a petiti...,E_PCT Treaty -> CHAPTER IINTERNATIONAL APPLICA...
4,Article 5,The Description,The description shall disclose the invention ...,E_PCT Treaty -> CHAPTER IINTERNATIONAL APPLICA...
...,...,...,...,...
186,Rule 93,Keeping of Records and Files,93.1 The Receiving Office Each receiving Of...,E_PCT Treaty -> PART F - RULES CONCERNING SEVE...
187,Rule 93,Manner of Communication of Documents,bis 93bis.1 Communication on Request; Comm...,E_PCT Treaty -> PART F - RULES CONCERNING SEVE...
188,Rule 94,Access to Files,94.1 Access to the File Held by the Interna...,E_PCT Treaty -> PART F - RULES CONCERNING SEVE...
189,Rule 95,Information and Translations from Designated a...,95.1 Information Concerning Events at the De...,E_PCT Treaty -> PART F - RULES CONCERNING SEVE...
