# EPC

In [34]:
from bs4 import BeautifulSoup
import requests
import os

In [35]:
base_url = "https://www.epo.org/en/legal/epc/2020"
summary_url = "/".join([base_url, "convention.html"])
response = requests.get(url=summary_url)
if response.ok:
    soup = BeautifulSoup(response.content)
else:
    print("request error")

In [36]:
def extract_name(item) -> str:
    spans = item.find_all("span")
    i_end = 2
    if len(spans) < 2:
        i_end = len(spans)
    part_name = " - ".join([span.get_text() for span in spans[:i_end]])
    return part_name

In [38]:
from typing import TypedDict

class Article(TypedDict):
    book: str
    part: str
    chapter: str
    title: str
    text: str
    references: list[str]

def extract_article(url: str, part: str, chapter: str) -> Article:
    article = {"book": "EPC", "part": part, "chapter": chapter, "title": "", "text": "", "references": "", "url": url}
    response = requests.get(url=url)
    if response.ok:
        soup = BeautifulSoup(response.content)
    else:
        print("request error")
        return Article[article]

    block = soup.find("div", class_="lc-inline_column_second-edit layoutcomponent-column col-sm-12 col-md-12 col-lg-6 show-modification-body legal-text-navigation-buttons").find("div")
    title_block = block.find("div").find("h1")
    if title_block.find("a", class_="FootnoteRef"):
        for el in title_block.find_all("a", class_="FootnoteRef"):
            el.extract()
    article["title"] = title_block.get_text(" - ")
    article["text"] = block.find("div", class_="site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody").find("div", class_="epolegal-content").get_text()
    refs_block = soup.find("div", class_="site-references")
    if refs_block:
        refs = refs_block.find_all("a")
        article["references"] = [ref.get("title") for ref in refs]

    return Article(article)

In [39]:
body = "site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody"
accordeon = soup.find(class_=body).find_next("div").find_next("div").find_next("dl")

content = []

for part, chapters in zip(accordeon.find_all("dt"), accordeon.find_all("dd")):
    part_name = extract_name(part)
    print(part_name)
    chapter = ""
    for elem in chapters.find_all("div"):
        if elem.has_attr("class") and "card-header-nolink" in elem["class"]:
            chapter_name = extract_name(elem)
            print(f"  {chapter_name}")
        if elem.has_attr("class") and "card-header-white" in elem["class"]:
            article = elem
            article_a = article.find_next("h5").find_next("a")
            sub_url = article_a.get("href")
            article_url = "/".join([base_url, sub_url])
            article_name = extract_name(article_a)
            print(f"    {article_name}")
            print("    ", article_url)
            print("    ----------")
            content.append(extract_article(article_url, part_name, chapter_name))

Part I - General and institutional provisions
  Chapter I - General provisions
    Art. 1 - European law for the grant of patents
     https://www.epo.org/en/legal/epc/2020/a1.html
    ----------
    Art. 2 - European patent
     https://www.epo.org/en/legal/epc/2020/a2.html
    ----------
    Art. 3 - Territorial effect
     https://www.epo.org/en/legal/epc/2020/a3.html
    ----------
    Art. 4 - European Patent Organisation
     https://www.epo.org/en/legal/epc/2020/a4.html
    ----------
    Art. 4a - Conference of ministers of the Contracting States
     https://www.epo.org/en/legal/epc/2020/a4a.html
    ----------
  Chapter II - The European Patent Organisation
    Art. 5 - Legal status
     https://www.epo.org/en/legal/epc/2020/a5.html
    ----------
    Art. 6 - Headquarters
     https://www.epo.org/en/legal/epc/2020/a6.html
    ----------
    Art. 7 - Sub-offices of the European Patent Office
     https://www.epo.org/en/legal/epc/2020/a7.html
    ----------
    Art. 8 - Privil

In [40]:
content

[{'book': 'EPC',
  'part': 'Part I - General and institutional provisions',
  'chapter': 'Chapter I - General provisions',
  'title': 'Article\xa01 - European law for the grant of patents',
  'text': 'A system of law, common to the Contracting States, 2 for the grant of patents for invention is established by this Convention.\n\n\n2There are currently 3839 Contracting States: AL, AT, BE, BG, CH, CY, CZ, DE, DK, EE, ES, FI, FR, GB, GR, HR, HU, IE, IS, IT, LI, LT, LU, LV, MC, ME, MK, MT, NL, NO, PL, PT, RO, RS, SE, SI, SK, SM, TR.\n',
  'references': '',
  'url': 'https://www.epo.org/en/legal/epc/2020/a1.html'},
 {'book': 'EPC',
  'part': 'Part I - General and institutional provisions',
  'chapter': 'Chapter I - General provisions',
  'title': 'Article\xa02 - European patent',
  'text': '(1) Patents granted under this Convention shall be called European patents.\xa0\n(2) The European patent shall, in each of the Contracting States for which it is granted, have the effect of and be subjec

In [41]:
import json
with open("EPC_data.json", "w") as js:
    json.dump(content, js)

# EPC regulations

In [27]:
from bs4 import BeautifulSoup
import requests
import os

In [28]:
base_url = "https://www.epo.org/en/legal/epc/2020"
summary_url = "/".join([base_url, "regulations.html"])
response = requests.get(url=summary_url)
if response.ok:
    soup = BeautifulSoup(response.content)
else:
    print("request error")

In [29]:
def extract_name(item) -> str:
    spans = item.find_all("span")
    i_end = 2
    if len(spans) < 2:
        i_end = len(spans)
    part_name = " - ".join([span.get_text() for span in spans[:i_end]])
    return part_name

In [30]:
from typing import TypedDict

class Article(TypedDict):
    book: str
    part: str
    chapter: str
    title: str
    text: str
    references: list[str]

def extract_article(url: str, part: str, chapter: str, section: str) -> Article:
    article = {"book": "EPC", "part": part, "chapter": chapter, "section": section, "title": "", "text": "", "references": "", "url": url}
    response = requests.get(url=url)
    if response.ok:
        soup = BeautifulSoup(response.content)
    else:
        print("request error")
        return Article[article]

    block = soup.find("div", class_="lc-inline_column_second-edit layoutcomponent-column col-sm-12 col-md-12 col-lg-6 show-modification-body legal-text-navigation-buttons").find("div")
    title_block = block.find("div").find("h1")
    if title_block.find("a", class_="FootnoteRef"):
        for el in title_block.find_all("a", class_="FootnoteRef"):
            el.extract()
    article["title"] = title_block.get_text(" - ")
    article["text"] = block.find("div", class_="site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody").find("div", class_="epolegal-content").get_text()
    refs_block = soup.find("div", class_="site-references")
    if refs_block:
        refs = refs_block.find_all("a")
        article["references"] = [ref.get("title") for ref in refs]

    return Article(article)

In [31]:
body = "site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody"
accordeon = soup.find(class_=body).find_next("div").find_next("div").find_next("dl")

content = []

for part, chapters in zip(accordeon.find_all("dt"), accordeon.find_all("dd")):
    part_name = extract_name(part)
    print(part_name)
    chapter_name = ""
    section_name = ""
    for elem in chapters.find_all("div"):
        if elem.has_attr("class") and "card-header-nolink" in elem["class"]:
            if elem.find("h5").get("class") and "chapter" in elem.find("h5").get("class"):
                chapter_name = extract_name(elem)
                print(f"  {chapter_name}")
            elif elem.find("h5").get("class") and "section" in elem.find("h5").get("class"):
                section_name = extract_name(elem)
                print(f"  {section_name}")
                
        if elem.has_attr("class") and "card-header-white" in elem["class"]:
            article = elem
            article_a = article.find_next("h5").find_next("a")
            sub_url = article_a.get("href")
            article_url = "/".join([base_url, sub_url])
            article_name = extract_name(article_a)
            print(f"    {article_name}")
            print("    ", article_url)
            print("    ----------")
            content.append(extract_article(article_url, part_name, chapter_name, section_name))

Part I - Implementing Regulations to part I of the Convention
  Chapter I - General provisions
    R. 1 - Written proceedings
     https://www.epo.org/en/legal/epc/2020/r1.html
    ----------
    R. 2 - Filing of and formal requirements for documents
     https://www.epo.org/en/legal/epc/2020/r2.html
    ----------
    R. 3 - Language in written proceedings
     https://www.epo.org/en/legal/epc/2020/r3.html
    ----------
    R. 4 - Language in oral proceedings
     https://www.epo.org/en/legal/epc/2020/r4.html
    ----------
    R. 5 - Certification of translations
     https://www.epo.org/en/legal/epc/2020/r5.html
    ----------
    R. 6 - Filing of translations and reduction of fees
     https://www.epo.org/en/legal/epc/2020/r6.html
    ----------
    R. 7 - Legal authenticity of the translation of the European patent application
     https://www.epo.org/en/legal/epc/2020/r7.html
    ----------
    R. 7a - R. 7a
     https://www.epo.org/en/legal/epc/2020/r7a.html
    ----------
    

In [32]:
content

[{'book': 'EPC',
  'part': 'Part I - Implementing Regulations to part\xa0I of the Convention',
  'chapter': 'Chapter I - General provisions',
  'section': '',
  'title': 'Rule\xa01 - Written proceedings',
  'text': 'In written proceedings before the European Patent Office, the requirement to use the written form shall be satisfied if the content of the documents can be reproduced in a legible form on paper.\n\n\n1Amended by decision of the Administrative Council CA/D\xa026/23 of 14.12.2023 (OJ\xa0EPO 2024,\xa0A16), which entered into force on 01.04.2024.\n',
  'references': ['Rule\xa03 Language in written proceedings',
   'Rule\xa035 General provisions',
   'Rule\xa049 Presentation of the application documents',
   'Rule\xa050 Documents filed subsequently',
   'Rule\xa076 Form and content of the opposition',
   'Rule\xa089 Intervention of the assumed infringer',
   'Rule\xa092 Requirements of the request',
   'Rule\xa0114 Observations by third parties'],
  'url': 'https://www.epo.org/e

In [33]:
import json
with open("EPC_regulations.json", "w") as js:
    json.dump(content, js)

# Guidelines

This part has to be addressed separately as the html shape is different.

In [131]:
import requests
from bs4 import BeautifulSoup, Comment

In [186]:
from typing import TypedDict

class Article(TypedDict):
    book: str
    part: str
    chapter: str
    title: str
    text: str
    references: list[str]

def extract_article(url: str, part: str, chapter: str) -> Article:
    article = {"book": "EPO_guidelines", "part": part, "chapter": chapter, "title": "", "implementation": {}}
    response = requests.get(url=url)
    if response.ok:
        soup = BeautifulSoup(response.content)
    else:
        print("request error")
        return Article(article)

    block = soup.find("div", class_="lc-inline_column_second-edit layoutcomponent-column col-sm-12 col-md-12 col-lg-6 show-modification-body legal-text-navigation-buttons").find("div", class_="layout-builder__region js-layout-builder-region lc-inline_column_second-content-edit")
    title_block = block.find("div", class_="lc-inline_block_908ed3e9e0298a0f3a8a96e0e0f83b8f-edit block block-layout-builder block-field-blocknodeeuropean-patent-conventionsfield-caption").find("h1")
    for el in title_block.find_all('span'):
        if el.get('class') == ['Del']:
            el.extract()
    article["title"] = title_block.get_text()

    content_blocks = block.find("div", class_="site-main block block-layout-builder block-field-blocknodeguidelines-for-examination-epcbody").find_all("div", class_="DOC4NET2_Frame")
    if content_blocks:
        for content_block in content_blocks:
            text_block = content_block.find("div", class_="DOC4NET2_Frame_Text")
            for el in text_block.find_all("p"):
                if el.get("class") and "Del" in el.get("class"):
                    el.extract()
            ref_block = content_block.find("div", class_="DOC4NET2_Frame_Margin")
            if ref_block and len(ref_block.find_all("a")) > 0:
                for ref in ref_block.find_all("a"):
                    ref_title = ref.get("title").split(" ")[0]
                    if ref_title not in article["implementation"]:
                        article["implementation"][ref_title] = []
                    article["implementation"][ref_title].append(text_block.get_text())
            elif ref_block:
                if ref_block.find("p"):
                    for el in ref_block.find_all("p"):
                        if el.get("class") and "Del"in el.get("class"):
                            el.extract()
                    ref_title = ref_block.get_text()
                    if ref_title not in article["implementation"]:
                        article["implementation"][ref_title] = []
                    article["implementation"][ref_title].append(text_block.get_text())
                    
    else:
        content_block = block.find("div", class_="epolegal-content")
        for el in content_block.find_all("p"):
            if el.get('class') and "Del" in el.get("class"):
                el.extract()
        article["implementation"] = content_block.get_text()


    return Article(article)

In [187]:
url = "https://www.epo.org/en/legal/guidelines-epc/2024/a_ii_1.html"
extract_article(url, "", "")

{'book': 'EPO_guidelines',
 'part': '',
 'chapter': '',
 'title': '1. Where and how applications may be filed\xa0 ',
 'implementation': {'Rule\xa01': ['European patent applications must be filed in writing. They may be filed by means of electronic communication (see A‑II,\xa01.1) or by delivery by hand or , by postal services (see\xa0A‑II,\xa01.1) or by means of electronic communication (see\xa0A‑II,\xa01.2).\n\n'],
  'Rule\xa02': ['European patent applications must be filed in writing. They may be filed by means of electronic communication (see A‑II,\xa01.1) or by delivery by hand or , by postal services (see\xa0A‑II,\xa01.1) or by means of electronic communication (see\xa0A‑II,\xa01.2).\n\n'],
  '\n': ['\n\n\n\n']}}

In [193]:
base_url = "https://www.epo.org"
summary_url = "/".join([base_url, "en/legal/guidelines-epc/2024/index.html"])
response = requests.get(url=summary_url)
if response.ok:
    soup = BeautifulSoup(response.content, "html.parser")
else:
    print("request error")

In [194]:
def browse_sommaire_depth_first(sommaire: list, depth: int=0) -> str:
    output = []
    i = 0
    while i < len(sommaire):
        if isinstance(sommaire[i], list):
            output.extend(browse_sommaire_depth_first(sommaire[i], depth+1))
            i += 1
        else:
            output.append((sommaire[0], sommaire[1]))
            i += 3
        
    return output
    """ if elem.startswith('/'):
        address = elem
        print(" "*depth, f"address: {address}")
    elif elem != "":
        title = elem
        print(" "*depth, f"{title}") """

In [195]:
x = soup.find("div", class_="lc-section lc-section-884 layoutcomponents-three-column container legal-texts lc-inline_section-edit").find("div", class_="layout-builder__region js-layout-builder-region lc-inline_column_first-content-edit").find("div", class_="views-element-container block block-views block-views-blockepc-left-epc-content").find("script")

In [196]:
def parse(x: str):
    x = x[:-1]
    x = x.replace("var oMenu = ", "")
    return x

sommaire_guidelines = eval(parse(x.get_text()))

In [197]:
from tqdm import tqdm
content = []
for part in tqdm(sommaire_guidelines[1:-3]):
    for chapter in part[3]:
        print(f"{part[0]} ; {chapter[0]}")
        elements = browse_sommaire_depth_first(chapter)
        for guideline, url in elements:
            url = "/".join([base_url, url])
            content.append(extract_article(url, part[0], chapter[0]))

  0%|          | 0/8 [00:00<?, ?it/s]

Part A – Guidelines for Formalities Examination ; Chapter I – Introduction
Part A – Guidelines for Formalities Examination ; Chapter II – Filing of applications and examination on filing
Part A – Guidelines for Formalities Examination ; Chapter III – Examination of formal requirements
Part A – Guidelines for Formalities Examination ; Chapter IV – Special provisions
Part A – Guidelines for Formalities Examination ; Chapter V – Communications concerning formal deficiencies; amendment of application; correction of errors
Part A – Guidelines for Formalities Examination ; Chapter VI – Publication of application; request for examination and transmission of the dossier to examining division
Part A – Guidelines for Formalities Examination ; Chapter VII – Languages
Part A – Guidelines for Formalities Examination ; Chapter VIII – Common provisions
Part A – Guidelines for Formalities Examination ; Chapter IX – Drawings
Part A – Guidelines for Formalities Examination ; Chapter X – Fees
Part A – Gu

 12%|█▎        | 1/8 [03:01<21:11, 181.66s/it]

Part B – Guidelines for Search ; Chapter I – Introduction
Part B – Guidelines for Search ; Chapter II – General
Part B – Guidelines for Search ; Chapter III – Characteristics of the search
Part B – Guidelines for Search ; Chapter IV – Search procedure and strategy
Part B – Guidelines for Search ; Chapter V – Preclassification, IPC and CPC classification of European patent applications
Part B – Guidelines for Search ; Chapter VI – The state of the art at the search stage
Part B – Guidelines for Search ; Chapter VII – Unity of invention
Part B – Guidelines for Search ; Chapter VIII – Subject-matter to be excluded from the search
Part B – Guidelines for Search ; Chapter IX – Search documentation
Part B – Guidelines for Search ; Chapter X – Search report
Part B – Guidelines for Search ; Chapter XI – The search opinion


 25%|██▌       | 2/8 [04:47<13:40, 136.81s/it]

Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter I – Introduction
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter II – Formal requirements to be met before the division starts substantive examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter III – The first stage of examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter IV – Examination of replies and further stages of examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter V – The final stage of examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter VI – Time limits and acceleration of examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter VII – Other procedures in examination
Part C – Guidelines for Procedural Aspects of Substantive Examination ; Chapter VIII – Work within the examining div

 38%|███▊      | 3/8 [06:01<09:01, 108.35s/it]

Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter I – General remarks
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter II – The opposition division
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter III – Opposition
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter IV – Procedure up to substantive examination
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter V – Substantive examination of opposition
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter VI – Procedure for the examination of the opposition
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter VII – Details and special features of the proceedings
Part D – Guidelines for Opposition and Limitation/Revocation Procedures ; Chapter VIII – Decisions of the opposition division
Part D – Guidelines for Opposition and L

 50%|█████     | 4/8 [07:24<06:32, 98.24s/it] 

Part E – Guidelines on General Procedural Matters ; Chapter I – Introduction
Part E – Guidelines on General Procedural Matters ; Chapter II – Communications and notifications
Part E – Guidelines on General Procedural Matters ; Chapter III – Oral proceedings
Part E – Guidelines on General Procedural Matters ; Chapter IV – Taking and conservation of evidence
Part E – Guidelines on General Procedural Matters ; Chapter V – Derogations from the language of the proceedings in oral proceedings
Part E – Guidelines on General Procedural Matters ; Chapter VI – Examination by the EPO of its own motion; facts, evidence or grounds not submitted in due time; observations by third parties
Part E – Guidelines on General Procedural Matters ; Chapter VII – Interruption, stay and consolidation of the proceedings
Part E – Guidelines on General Procedural Matters ; Chapter VIII – Time limits, loss of rights, further and accelerated processing and re-establishment of rights
Part E – Guidelines on General Pr

 62%|██████▎   | 5/8 [09:59<05:55, 118.66s/it]

Part F – The European Patent Application ; Chapter I – Introduction
Part F – The European Patent Application ; Chapter II – Content of a European patent application (other than claims)
Part F – The European Patent Application ; Chapter III – Sufficiency of disclosure
Part F – The European Patent Application ; Chapter IV – Claims (Art. 84 and formal requirements)
Part F – The European Patent Application ; Chapter V – Unity of invention
Part F – The European Patent Application ; Chapter VI – Priority


 75%|███████▌  | 6/8 [11:38<03:44, 112.18s/it]

Part G – Patentability ; Chapter I – Patentability
Part G – Patentability ; Chapter II – Inventions
Part G – Patentability ; Chapter III – Industrial application
Part G – Patentability ; Chapter IV – State of the art
Part G – Patentability ; Chapter V – Non-prejudicial disclosures
Part G – Patentability ; Chapter VI – Novelty
Part G – Patentability ; Chapter VII – Inventive step


 88%|████████▊ | 7/8 [12:56<01:41, 101.02s/it]

Part H – Amendments and Corrections ; Chapter I – The right to amend
Part H – Amendments and Corrections ; Chapter II – Admissibility of amendments – general rules
Part H – Amendments and Corrections ; Chapter III – Admissibility of amendments – other procedural matters
Part H – Amendments and Corrections ; Chapter IV – Allowability of amendments
Part H – Amendments and Corrections ; Chapter V – Allowability of amendments – examples
Part H – Amendments and Corrections ; Chapter VI – Correction of errors


100%|██████████| 8/8 [14:18<00:00, 107.27s/it]


In [198]:
import json
with open("EPO_guidelines.json", 'w') as js:
    json.dump(content, js)