In [206]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
from tqdm.notebook import tqdm
# from tqdm import tqdm


In [None]:
# only disable warnings when you're sure you want to do it
import warnings
warnings.filterwarnings("ignore")

In [195]:
# Root of the site; every page that gets scraped lives below it.
ENDPOINT = "https://www.gesetze-im-internet.de/"
# Target page: the list whose links point at the individual law pages.
URL = f"{ENDPOINT}Teilliste_translations.html"
# Local folder that receives one JSON file per scraped law.
DATA_FOLDER = "./data"

In [25]:
# Download the index page. This is a plain document fetch, so GET is the right
# verb; the timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')
# Collect every <a> tag on the page; irrelevant links are filtered out later.
anchor_tags = soup.find_all('a')

In [86]:
# One row per <a> tag collected from the index page.
links = pd.DataFrame(anchor_tags, columns=["anchor_tags"])
# Slice only after the frame has been built: the first 15 anchors are not
# relevant (and presumably neither are the last 6, which are dropped as well).
links = links.iloc[15:-6].reset_index(drop=True)


def _law_page(href):
    # "englisch_abgg/index.html" -> "<ENDPOINT>englisch_abgg/englisch_abgg.html"
    folder = href.split("/")[0]
    return f"{ENDPOINT}{folder}/{folder}.html"


links = links.assign(
    # relative link exactly as it appears in the href attribute
    link=links["anchor_tags"].apply(lambda tag: tag.get("href")),
    # full name of the law, taken from the title of the nested <abbr>
    description=links["anchor_tags"].apply(lambda tag: tag.abbr.get("title")),
    # absolute URL of the page that holds the complete law text
    full_link=lambda frame: frame["link"].apply(_law_page),
)
links.head()

Unnamed: 0,anchor_tags,link,description,full_link
0,[[AbgG]],englisch_abgg/index.html,Members of the Bundestag Act,https://www.gesetze-im-internet.de/englisch_ab...
1,[[AdVermiG]],englisch_advermig/index.html,Act on Adoption Placement and Support and on t...,https://www.gesetze-im-internet.de/englisch_ad...
2,[[AEntG]],englisch_aentg/index.html,Act on Mandatory Working Conditions for Worker...,https://www.gesetze-im-internet.de/englisch_ae...
3,[[AGG]],englisch_agg/index.html,General Act on Equal Treatment,https://www.gesetze-im-internet.de/englisch_ag...
4,[[AktG]],englisch_aktg/index.html,Stock Corporation Act,https://www.gesetze-im-internet.de/englisch_ak...


In [189]:
def html_extration(base_url: str) -> list:
    """
    Download one law page from the gesetze website and split it into sections.

    Only <p> elements whose first <a> carries a ``name`` attribute are looked at.
    Such a paragraph starts a new section when its inline style contains both
    ``text-align: center`` and ``font-weight: bold``; otherwise its text is
    appended to the content of the section that is currently open. Paragraphs
    that appear before the first section heading are ignored.

    For example, given the url
    https://www.gesetze-im-internet.de/englisch_abgg/englisch_abgg.html
    the page could contain:

    <p style="text-align: center; font-weight: bold"><a name="p0017"><!----></a>Section 2<br>Protection of the free exercise of an electoral mandate</p>
    <p><a name="p0018"><!----></a>(1) No one may be prevented from standing as a candidate for a mandate to serve as a Member of the Bundestag or from acquiring, accepting or holding such a mandate.</p>
    <p><a name="p0019"><!----></a>(2) Discrimination at work on the grounds of candidature for or acquisition, acceptance and exercise of a mandate shall be inadmissible.</p>

    The first paragraph is the section heading, the second and third paragraphs
    are law content that belongs to that section. The function then returns:

    [
    {'section': 'Section 2:Protection of the free exercise of an electoral mandate',
    'content': '(1) No one may be prevented from standing as a candidate for a mandate to serve as a Member of the Bundestag or from acquiring, accepting or holding such a mandate. (2) Discrimination at work on the grounds of candidature for or acquisition, acceptance and exercise of a mandate shall be inadmissible.',
    'link': 'https://www.gesetze-im-internet.de/englisch_abgg/englisch_abgg.html#p0017'},
    ]

    Args:
        base_url: URL of the law page to download.

    Returns:
        A list of dicts, one per section heading found, each with the keys
        ``section`` (heading text, <br> replaced by ':'), ``content`` (the
        section's paragraphs joined by single spaces) and ``link`` (``base_url``
        plus ``#`` and the anchor name of the heading). The list is empty when
        no section heading is found.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or a timeout.
    """
    # A document fetch is a GET; the timeout prevents an endless hang and
    # raise_for_status keeps an error page from being parsed as a law.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    def _record(section, parts, anchor_id):
        # Build the output dict for one finished section.
        return {
            'section': section,
            'content': " ".join(parts),
            'link': f"{base_url}#{anchor_id}"
        }

    sections = []
    # State of the section that is currently being collected.
    current_section = None
    current_anchor_id = None
    content_parts = []

    for p in soup.find_all('p'):
        a_tag = p.find('a')
        # Paragraphs without a named anchor are neither headings nor content.
        if not a_tag or 'name' not in a_tag.attrs:
            continue

        style = p.get('style', '')
        if 'text-align: center' in style and 'font-weight: bold' in style:
            # Heading: turn <br> into ':' so number and title stay separated.
            for br in p.find_all('br'):
                br.replace_with(':')
            new_section = p.get_text(strip=True)
            # Close the previous section before opening the new one.
            if current_section:
                sections.append(_record(current_section, content_parts, current_anchor_id))
            current_section = new_section
            content_parts = []
            # The anchor name of the heading is used to build the deep link.
            current_anchor_id = a_tag['name']
        elif current_section:
            # Collapse whitespace instead of using get_text(strip=True): the
            # latter joins the text nodes without any separator, which glues
            # words together wherever the paragraph contains inline tags.
            text = " ".join(p.get_text().split())
            text = text.replace("table of contents", "").strip()
            if text:
                content_parts.append(text)

    # The last section is still open when the loop ends.
    if current_section:
        sections.append(_record(current_section, content_parts, current_anchor_id))
    return sections

In [210]:
import json

# Create the output folder once, before the loop, instead of checking on every
# iteration; exist_ok makes re-running the cell safe.
os.makedirs(DATA_FOLDER, exist_ok=True)

for _, row in tqdm(links.iterrows(), total=links.shape[0], leave=True, desc="Processing"):
    # Short name of the law; "/" is replaced so it can be used as a file name.
    title = row["anchor_tags"].get_text().replace("/", "_")
    base_url = row["full_link"]
    main_topic = row["description"]
    out_path = f"{DATA_FOLDER}/{title}.json"
    # Laws that were saved by an earlier run are not downloaded again.
    if os.path.exists(out_path):
        continue

    sections = html_extration(base_url)
    if not sections:
        # Nothing could be extracted. Do not write a file, so the law is
        # retried on the next run instead of being skipped as "done".
        tqdm.write(f"Skipping {title}: no sections found")
        continue

    json_data = {
        'main_topic': main_topic,
        # drop sections (e.g. chapter headings) that have no content
        'sections': [section for section in sections if section['content'] != ""]
    }

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)

    # print current processing title
    tqdm.write(f"Processing {title}")

Processing:   0%|          | 0/126 [00:00<?, ?it/s]

Processing FreizügG_EU
Processing GeschGehG
Processing GG
Processing GmbHG
Processing EGGmbHG
Processing GVG
Processing GWB
Processing HaagÜbkAG
Processing HGB
Processing IFG
Processing InsO
Processing EGInsO
Processing IntFamRVG
Processing IntVG
Processing IRG
Processing IStGHG
Processing JFDG
Processing JGG
Processing KapMuG
Processing KassenSichV
Processing KGSG
Processing KSG
Processing LasthandhabV
Processing LPartG
Processing LuftSchlichtV
Processing LuftVG
Processing MariMedV
Processing MarkenG
Processing MediationsG
Processing MiLoG
Processing MuSchG
Processing NKRG
Processing Offshore-ArbZV
Processing OWiG
Processing PartGG
Processing PaßG
Processing PatG
Processing PAuswG
Processing ProdHaftG
Processing ProdSG
Processing RDG
Processing RPflG
Processing RVG
Processing SchBesV
Processing SchwarzArbG
Processing SeeArbG
Processing SeeArbÜV
Processing See-ArbZNV
Processing SeeAufgG
Processing See-BAV
Processing See-BV
Processing SeeEigensichV
Processing SeeUnterkunftsV
Processing 