In [3]:
from bs4 import BeautifulSoup
import requests
import os

In [7]:
# Download the table of contents of the EPC Implementing Regulations.
base_url = "https://www.epo.org/en/legal/epc/2020"
summary_url = "/".join([base_url, "regulations.html"])
response = requests.get(url=summary_url)
if response.ok:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the resulting
    # tree could differ between environments.
    soup = BeautifulSoup(response.content, "html.parser")
else:
    # NOTE: `soup` is left unset in this branch; cells that use it will fail.
    print("request error")

In [18]:
def extract_name(item) -> str:
    """Build a display name from the first two ``<span>`` children of ``item``.

    Args:
        item: an object exposing ``find_all("span")`` whose results expose
            ``get_text()``.

    Returns:
        The texts of at most the first two spans joined with ``" - "``;
        an empty string when there is no span at all.
    """
    # Slicing already clamps to the number of spans actually present.
    leading_spans = item.find_all("span")[:2]
    return " - ".join(span.get_text() for span in leading_spans)

In [78]:
from typing import TypedDict

class Article(TypedDict):
    """One scraped EPC article, in the shape that is later dumped to JSON."""
    book: str  # source corpus; extract_article always sets "EPC"
    part: str  # part name supplied by the caller
    chapter: str  # chapter name supplied by the caller
    title: str  # text of the page's <h1>
    text: str  # text of the "epolegal-content" block
    references: list[str]  # title attributes of the links in "site-references"


def extract_article(url: str, part: str, chapter: str) -> Article:
    """Download one EPC article page and extract its title, text and references.

    Args:
        url: address of the article page.
        part: part name stored verbatim in the result.
        chapter: chapter name stored verbatim in the result.

    Returns:
        An ``Article`` dict. When the HTTP request fails, "request error" is
        printed and the article is returned with empty title, text and
        references.

    Raises:
        AttributeError: if the page lacks one of the expected containers
            (a ``find`` in the lookup chain returned ``None``).
    """
    article = {"book": "EPC", "part": part, "chapter": chapter, "title": "", "text": "", "references": []}
    response = requests.get(url=url)
    if not response.ok:
        print("request error")
        # Call the TypedDict to build the dict; subscripting the class does
        # not return the article.
        return Article(article)
    # Explicit parser so the parse does not depend on what is installed.
    soup = BeautifulSoup(response.content, "html.parser")

    column_class = "lc-inline_column_second-edit layoutcomponent-column col-sm-12 col-md-12 col-lg-6 show-modification-body legal-text-navigation-buttons"
    body_class = "site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody"

    block = soup.find("div", class_=column_class).find("div")
    article["title"] = block.find("div").find("h1").get_text(" - ")
    article["text"] = block.find("div", class_=body_class).find("div", class_="epolegal-content").get_text()

    refs_block = soup.find("div", class_="site-references")
    if refs_block is not None:
        # Keep only links that carry a title so the list holds strings only.
        article["references"] = [ref.get("title") for ref in refs_block.find_all("a") if ref.get("title")]

    return Article(article)

In [10]:
# Walk the accordion of the summary page: every <dt> is a part, the matching
# <dd> holds that part's chapter headers and article links.
body = "site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody"
accordeon = soup.find(class_=body).find_next("div").find_next("div").find_next("dl")

content = []

for part, chapters in zip(accordeon.find_all("dt"), accordeon.find_all("dd")):
    part_name = extract_name(part)
    print(part_name)
    # Reset the variable that is actually read below, so an article listed
    # before any chapter header gets an empty chapter instead of an unbound
    # name or the last chapter of the previous part.
    chapter_name = ""
    for elem in chapters.find_all("div"):
        classes = elem.get("class") or []
        if "card-header-nolink" in classes:
            # Chapter header: remember it for the articles that follow.
            chapter_name = extract_name(elem)
            print(f"  {chapter_name}")
        if "card-header-white" in classes:
            # Article entry: its link sits in the first <a> after the next <h5>.
            article_a = elem.find_next("h5").find_next("a")
            sub_url = article_a.get("href")
            article_url = "/".join([base_url, sub_url])
            article_name = extract_name(article_a)
            print(f"    {article_name}")
            print("    ", article_url)
            print("    ----------")
            content.append(extract_article(article_url, part_name, chapter_name))

IndexError: list index out of range

In [6]:
import json
# Write the collected articles to disk as a single JSON document.
with open(file="EPC_regulations.json", mode="w") as out_file:
    json.dump(obj=content, fp=out_file)

# Guidelines

The Guidelines have to be scraped separately because the HTML structure of their pages is different.

In [1]:
import requests
from bs4 import BeautifulSoup

In [18]:
from typing import TypedDict

class Article(TypedDict):
    """One scraped Guidelines section, in the shape that is later dumped to JSON."""
    book: str  # source corpus; extract_article always sets "EPO_guidelines"
    part: str  # part name supplied by the caller
    chapter: str  # chapter name supplied by the caller
    title: str  # text of the page's <h1>
    # Text of the section keyed by the legal provision it implements (first
    # space-separated token of the margin link's title). Text from frames
    # without any margin reference is stored under the empty-string key.
    implementation: dict[str, str]


def extract_article(url: str, part: str, chapter: str) -> Article:
    """Download one Guidelines page and group its text by referenced provision.

    Each ``DOC4NET2_Frame`` of the page body contributes its text to every
    provision linked in its margin. Text is appended when a key already
    exists, so no frame is lost, and ``implementation`` is always a dict.

    Args:
        url: address of the Guidelines page.
        part: part name stored verbatim in the result.
        chapter: chapter name stored verbatim in the result.

    Returns:
        An ``Article`` dict. When the HTTP request fails, "request error" is
        printed and the article is returned with an empty title and an empty
        ``implementation``.

    Raises:
        AttributeError: if the page lacks the expected column, caption or
            body container (a ``find`` in the lookup chain returned ``None``).
    """
    article = {"book": "EPO_guidelines", "part": part, "chapter": chapter, "title": "", "implementation": {}}
    response = requests.get(url=url)
    if not response.ok:
        print("request error")
        return Article(article)
    # Explicit parser so the parse does not depend on what is installed.
    soup = BeautifulSoup(response.content, "html.parser")

    column_class = "lc-inline_column_second-edit layoutcomponent-column col-sm-12 col-md-12 col-lg-6 show-modification-body legal-text-navigation-buttons"
    region_class = "layout-builder__region js-layout-builder-region lc-inline_column_second-content-edit"
    caption_class = "lc-inline_block_908ed3e9e0298a0f3a8a96e0e0f83b8f-edit block block-layout-builder block-field-blocknodeeuropean-patent-conventionsfield-caption"
    body_class = "site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody"

    block = soup.find("div", class_=column_class).find("div", class_=region_class)
    article["title"] = block.find("div", class_=caption_class).find("h1").get_text(" - ")

    implementation = article["implementation"]
    frames = block.find("div", class_=body_class).find_all("div", class_="DOC4NET2_Frame")
    for frame in frames:
        text_block = frame.find("div", class_="DOC4NET2_Frame_Text")
        if text_block is None:
            # Frame without a text part: nothing to record.
            continue
        text = text_block.get_text()

        labels = []
        margin = frame.find("div", class_="DOC4NET2_Frame_Margin")
        if margin is not None:
            for ref in margin.find_all("a"):
                ref_title = ref.get("title")
                if ref_title:
                    labels.append(ref_title.split(" ")[0])
        # De-duplicate within the frame (order kept) so its text is not
        # appended twice to the same key; unreferenced text goes under "".
        labels = list(dict.fromkeys(labels)) or [""]

        for label in labels:
            implementation[label] = implementation.get(label, "") + text

    return Article(article)

In [19]:
# Try extract_article on a single Guidelines page, with empty part and
# chapter names; as the cell's last expression the returned dict is displayed.
url = "https://www.epo.org/en/legal/guidelines-epc/2024/a_vii_5.html"
extract_article(url, "", "")

{'book': 'EPO_guidelines',
 'part': '',
 'chapter': '',
 'title': '5. Documents filed in the wrong language\xa0 ',
 'implementation': {'Article\xa014': 'Documents making up the European patent application can only be filed in the wrong language on the occasion of its amendment, since the application can originally be filed in any language (see A‑VII,\xa01.1). In such a case, as well as if any other document is not filed in the prescribed language or any required translation is not filed in due time, the document is deemed not filed. The person who has filed the document will be notified accordingly by the EPO. Even though deemed not filed, the document concerned will become part of the file and therefore accessible to the public according to Art.\xa0128(4).\n',
  'Rule\xa03': 'In the event of failure to file a translation of the filed documentary evidence upon invitation in due time, the documents in question may be disregarded by the EPO.\nWhere submissions accompanying the performanc

In [20]:
# Download the index page of the Guidelines; `base_url` is the bare host here
# and is reused later to build the address of every section.
base_url = "https://www.epo.org"
summary_url = "/".join([base_url, "en/legal/guidelines-epc/2024/index.html"])
response = requests.get(url=summary_url)
if not response.ok:
    print("request error")
else:
    soup = BeautifulSoup(response.content, "html.parser")

In [21]:
def browse_sommaire_depth_first(sommaire: list, depth: int=0) -> list[tuple[str, str]]:
    """Flatten a nested table of contents into ``(title, url)`` pairs.

    The list is scanned left to right. A nested list is descended into
    immediately (depth-first). Any other element is taken as the first item
    of a three-item group whose first two items are the title and the url;
    the third item is skipped.

    Args:
        sommaire: nested list describing the table of contents.
        depth: current nesting level; only forwarded to recursive calls.

    Returns:
        The ``(title, url)`` pairs in document order.

    Raises:
        IndexError: if a title is the last element of its list, i.e. it has
            no url after it.
    """
    output = []
    i = 0
    while i < len(sommaire):
        entry = sommaire[i]
        if isinstance(entry, list):
            output.extend(browse_sommaire_depth_first(entry, depth + 1))
            i += 1
        else:
            # Read the group at the current position, not the first group of
            # the list, so several groups in one list are all reported.
            output.append((entry, sommaire[i + 1]))
            i += 3

    return output

In [22]:
# Drill down through the left-hand column of the index page to the <script>
# element nested inside it.
x = (
    soup.find("div", class_="lc-section lc-section-884 layoutcomponents-three-column container legal-texts lc-inline_section-edit")
    .find("div", class_="layout-builder__region js-layout-builder-region lc-inline_column_first-content-edit")
    .find("div", class_="views-element-container block block-views block-views-blockepc-left-epc-content")
    .find("script")
)

In [23]:
def parse(x: str) -> str:
    """Strip the JavaScript wrapper from a ``var oMenu = ...;`` statement.

    Args:
        x: text of the script, expected to look like ``var oMenu = <literal>;``.

    Returns:
        The text with surrounding whitespace, one trailing ``;`` (if present)
        and every occurrence of ``"var oMenu = "`` removed.
    """
    text = x.strip()
    # Only drop the final character when it really is the statement
    # terminator; trailing whitespace or a missing ";" must not eat part of
    # the literal.
    if text.endswith(";"):
        text = text[:-1]
    return text.replace("var oMenu = ", "")

# Turn the text of the script element into a Python object.
# NOTE(review): eval() executes arbitrary code, and this text comes from the
# script element `x`, which an earlier cell takes from the downloaded page.
# If the menu is a pure literal, ast.literal_eval would be a safer
# replacement - TODO confirm.
sommaire_guidelines = eval(parse(x.get_text()))

In [24]:
from tqdm import tqdm
# Fetch every Guidelines section listed in the menu.
content = []
# NOTE(review): the [1:-3] slice presumably drops menu entries that are not
# parts of the Guidelines - TODO confirm against the menu contents.
for part in tqdm(sommaire_guidelines[1:-3]):
    # part[0] is the part title, part[3] the list of its chapters.
    for chapter in part[3]:
        print(f"{part[0]} ; {chapter[0]}")
        for _title, path in browse_sommaire_depth_first(chapter):
            # Join with exactly one slash, whether or not the menu path
            # already starts with "/".
            article_url = "/".join([base_url.rstrip("/"), path.lstrip("/")])
            content.append(extract_article(article_url, part[0], chapter[0]))

  0%|          | 0/8 [00:00<?, ?it/s]

Part A – Guidelines for Formalities Examination ; Chapter I – Introduction
Part A – Guidelines for Formalities Examination ; Chapter II – Filing of applications and examination on filing


  0%|          | 0/8 [00:03<?, ?it/s]

else





AttributeError: 'dict' object has no attribute 'add'

In [54]:
import json
# Write the collected Guidelines sections to disk as a single JSON document.
with open(file="EPO_guidelines.json", mode="w") as out_file:
    json.dump(obj=content, fp=out_file)

In [17]:
# Display the collected articles for a visual check.
content

[{'book': 'EPO_guidelines',
  'part': 'Part A – Guidelines for Formalities Examination',
  'chapter': 'Chapter I – Introduction',
  'title': 'Chapter I – Introduction',
  'implementation': {}},
 {'book': 'EPO_guidelines',
  'part': 'Part A – Guidelines for Formalities Examination',
  'chapter': 'Chapter I – Introduction',
  'title': '1. Overview\xa0 ',
  'implementation': {}},
 {'book': 'EPO_guidelines',
  'part': 'Part A – Guidelines for Formalities Examination',
  'chapter': 'Chapter I – Introduction',
  'title': '2. Responsibility for formalities examination\xa0 ',
  'implementation': {'Rule\xa010': 'The matters covered in this part are intended for EPO formalities staff at all sites (The\xa0Hague, Munich and Berlin) and in particular for the Receiving Section, which is specifically responsible under the EPC for ensuring that the formal requirements for European patent applications are met. Once an application is transferred to the examining division, the latter accepts responsibili