In [1]:
import os
from dotenv import load_dotenv
from typing import List
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
from docx import Document
import tiktoken
import openai
from pydantic import BaseModel
import tqdm
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from schemas.document import Document
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
from collections import defaultdict

def nested_defaultdict():
    """Return a defaultdict whose missing keys are themselves nested defaultdicts.

    This allows assignment at arbitrary depth (``d["a"]["b"]["c"] = 1``)
    without creating the intermediate levels first.
    """
    return defaultdict(nested_defaultdict)

In [7]:
# Load environment variables via python-dotenv, then build the async OpenAI
# client from OPENAI_API_KEY (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

llm_client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Tokenizer for the "gpt-4o" model; used further down to count tokens per article.
tokenizer = tiktoken.encoding_for_model("gpt-4o")
# Firefox WebDriver session shared by the scraping cells below.
driver = webdriver.Firefox()

In [5]:
# Fedlex pages to scrape (all entries carry the /fr language suffix).
# The download cell below iterates over a slice of this list.
urls_xml = [
    "https://www.fedlex.admin.ch/eli/cc/1959/827_857_845/fr",
    "https://www.fedlex.admin.ch/eli/cc/1961/29_29_29/fr",
    "https://www.fedlex.admin.ch/eli/cc/2002/510/fr",
    "https://www.fedlex.admin.ch/eli/cc/2002/569/fr",
    "https://www.fedlex.admin.ch/eli/cc/63/837_843_843/fr",
    "https://www.fedlex.admin.ch/eli/cc/63/1185_1183_1185/fr",
    "https://www.fedlex.admin.ch/eli/cc/1963/37_37_37/fr",
    "https://www.fedlex.admin.ch/eli/cc/2008/51/fr",
    "https://www.fedlex.admin.ch/eli/cc/2007/804/fr",
    "https://www.fedlex.admin.ch/eli/cc/39/55_55_57/fr",
    "https://www.fedlex.admin.ch/eli/cc/1952/1021_1046_1050/fr",
    "https://www.fedlex.admin.ch/eli/cc/2005/187/fr",
]

# Download links on sozialversicherungen.admin.ch, in fr/it/de, for documents 6435 and 6857.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
urls_pdf = [
    "https://sozialversicherungen.admin.ch/fr/d/6435/download",
    "https://sozialversicherungen.admin.ch/it/d/6435/download",
    "https://sozialversicherungen.admin.ch/de/d/6435/download",
    "https://sozialversicherungen.admin.ch/fr/d/6857/download",
    "https://sozialversicherungen.admin.ch/it/d/6857/download",
    "https://sozialversicherungen.admin.ch/de/d/6857/download",
    ]

In [11]:
# Fetch the XML version of a law: open its Fedlex page, find the first ".xml"
# download link in the versions table, and parse the downloaded file.
# Leaves `url`, `xml_url`, `res` and `xml_soup` in the namespace for the cells below.
for url in urls_xml[2:]:
    driver.get(url)
    table = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.table.table-condensed"))
    )
    # Name the HTML parser explicitly: lxml is already required by the
    # features="xml" call below, and an explicit parser avoids bs4's
    # "no parser was explicitly specified" warning.
    table_soup = BeautifulSoup(table.get_attribute("outerHTML"), "lxml")

    xml_href = next(
        (
            link["href"]
            for link in table_soup.find_all("a", href=True)
            if link["href"].endswith(".xml")
        ),
        None,
    )
    if xml_href is None:
        # Fail loudly instead of leaving xml_soup undefined (or stale from a previous run).
        raise RuntimeError(f"No XML download link found on {url}")

    xml_url = "https://www.fedlex.admin.ch" + xml_href
    res = requests.get(xml_url, timeout=30)
    res.raise_for_status()
    xml_soup = BeautifulSoup(res.content, features="xml")
    # Only the first URL of the slice is processed for now.
    break

In [None]:
xml_url

In [None]:
xml_soup.find("preface")

In [None]:
xml_soup.find("preamble")

# Document preprocessing

In [12]:
# Structured-output schema for the per-article metadata LLM call
# (passed as response_format in create_article_metadata).
class AugmentMetadataSchema(BaseModel):
    summary: str  # short descriptive summary of the article
    hyq: List[str]  # questions the article can answer
    hyq_declarative: List[str]  # the same questions reformulated in declarative form

# Structured-output schema for the subtopic-assignment LLM call
# (passed as response_format in assign_subtopics).
class AugmentSubtopicsSchema(BaseModel):
    subtopics: List[str]  # subtopics selected from the provided list

# Prompt template for create_article_metadata; `{doc}` is replaced by the article text.
# The `hyq` field description asks for HIGH-LEVEL questions, matching the <purpose>
# section; the declarative reformulations belong to `hyq_declarative` only.
metadata_prompt = """<purpose>
Write a short summary (3-5 sentences) describing the content of the <doc> to facilitate semantic search in language of <doc>.
Create 3 HIGH-LEVEL questions which <doc> can precisely answer in language of <doc>.
Reformulate the exact 3 questions in a declarative form.
</purpose>

<response_format>
SummarySchema(BaseModel):
    summary: str # 3-5 sentence descriptive summary in language of <doc>
    hyq: List[str] # 3 HIGH-LEVEL questions which <doc> can precisely answer in language of <doc>
    hyq_declarative: List[str] # the 3 reformulated questions in declarative form.
</response_format>

<examples>
For hyq_declarative:
Quelles dispositions de la LPGA s'appliquent à l'AVS? -> Dispositions de la LPGA s'appliquant à l'AVS
La LPGA s'applique-t-elle à l'octroi de subventions pour l'aide à la vieillesse? -> Application de la LPGA à l'octroi de subventions pour l'aide à la vieillesse
Quels articles de la LPGA ne s'appliquent pas à l'octroi de subventions pour l'aide à la vieillesse? -> Articles de la LPGA ne s'appliquant pas à l'octroi de subventions pour l'aide à la vieillesse
</examples>

<doc>
{doc}
</doc>
"""

# Prompt template for assign_subtopics; `{subtopic_data}` is the formatted glossary and
# `{doc}` the document record. Field names are spelled the same way throughout
# (<tags>, <hyq_declarative>) so the model is not pointed at fields that do not exist.
subtopics_prompt = """<purpose>
Your purpose is to assign subtopics from the list of available <subtopics> for the following <doc>.
Assign 1 or more <subtopics> to the provided <doc> based on its <text>, <summary>, <tags>, <hyq> and <hyq_declarative> metadata.
The selected subtopics should be clearly recognizable in the <doc>.
Look at the description of each subtopic to ensure selected subtopics are relevant to the <doc>.
The selected subtopics must be related to the <tags>, <text>, <summary>, <hyq>, <hyq_declarative> and provide lower-level (more precise) information about the document content.
</purpose>

<response_format>
SummarySchema(BaseModel):
    subtopics: List[str] # 1 or more selected subtopics from provided <subtopics>
</response_format>

<subtopics>
{subtopic_data}
</subtopics>

<doc>
{doc}
</doc>
"""

In [13]:
async def create_article_metadata(article):
    """Ask the LLM for a summary and hypothetical questions for one article.

    Fills ``metadata_prompt`` with ``article`` and requests a completion that
    is parsed into ``AugmentMetadataSchema``.

    Returns:
        A ``(summary, hyq, hyq_declarative)`` tuple read from the parsed response.
    """
    prompt = metadata_prompt.format(doc=article)
    completion = await llm_client.beta.chat.completions.parse(
        model="gpt-4o",
        temperature=0,
        top_p=0.95,
        max_tokens=512,
        messages=[{"role": "developer", "content": prompt}],
        response_format=AugmentMetadataSchema,
    )

    parsed = completion.choices[0].message.parsed
    return parsed.summary, parsed.hyq, parsed.hyq_declarative

async def assign_subtopics(doc, subtopic_data):
    """Ask the LLM which glossary subtopics apply to ``doc``.

    Args:
        doc: Document record interpolated into ``subtopics_prompt``.
        subtopic_data: DataFrame whose rows provide "subtopic" and "description".

    Returns:
        The list of subtopics from the parsed ``AugmentSubtopicsSchema`` response.
    """
    glossary_entries = []
    for _, row in subtopic_data.iterrows():
        glossary_entries.append(f'**{row["subtopic"]}**: {row["description"]}')

    prompt = subtopics_prompt.format(
        doc=doc,
        subtopic_data="\n\n".join(glossary_entries),
    )
    completion = await llm_client.beta.chat.completions.parse(
        model="gpt-4o",
        temperature=0,
        top_p=0.95,
        max_tokens=512,
        messages=[{"role": "developer", "content": prompt}],
        response_format=AugmentSubtopicsSchema,
    )

    return completion.choices[0].message.parsed.subtopics

In [None]:
data = nested_defaultdict()
tok_len = []
docs = []

language = url.split("/")[-1]
tags = xml_soup.find_all("FRBRname", {"xml:lang": language})[0]["shortForm"].lower()
doc_title = xml_soup.find_all("FRBRname", {"xml:lang": language})[0]["value"]
organizations = "ZAS,EAK"

subtopic_data = pd.read_csv(f"indexing/data/glossary/glossary_ahv_iv_{language}_normalized.csv")

for part in tqdm.tqdm(xml_soup.find_all("part")):
    part_id = part["eId"]

    # Ensure the part exists in the data structure
    if part_id not in data:
        data[part_id] = {}

    # Handle chapters
    chapters = part.find_all("chapter")
    if chapters:
        for chapter in chapters:
            chapter_id = chapter.get("eId", "NO_CHAPTER")

            # Ensure the chapter exists in the part
            if chapter_id not in data[part_id]:
                data[part_id][chapter_id] = {}

            # Handle articles within the chapter
            for article in chapter.find_all("article"):
                article_id = article["eId"]

                article_str = str(article)
                data[part_id][chapter_id][article_id] = {
                    "content": article_str,
                    "n_toks": len(tokenizer.encode(article_str)),
                }

                summary, hyq, hyq_declarative = await create_article_metadata(article_str)
                doc = {
                        "text": article_str,
                        "url": url,
                        "language": language,
                        "tags": tags,
                        "summary": summary,
                        "hyq": ",".join(hyq),
                        "hyq_declarative": ",".join(hyq_declarative),
                        "doctype": "context_doc",
                        "organization": organizations,
                    }

                subtopics = await assign_subtopics(doc, subtopic_data)
                doc["subtopics"] = ",".join(subtopics + [tags])
                docs.append(doc)
                print(doc["subtopics"])
    else:
        # No chapters: handle articles directly under the part
        chapter_id = "NO_CHAPTER"
        if chapter_id not in data[part_id]:
            data[part_id][chapter_id] = {}

        for article in part.find_all("article"):
            article_id = article["eId"]

            article_str = str(article)
            data[part_id][chapter_id][article_id] = {
                "content": article_str,
                "n_toks": len(tokenizer.encode(article_str)),
            }

            summary, hyq, hyq_declarative = await create_article_metadata(article_str)
            doc = {
                "text": article_str,
                "url": url,
                "language": language,
                "tags": tags,
                "summary": summary,
                "hyq": ",".join(hyq),
                "hyq_declarative": ",".join(hyq_declarative),
                "doctype": "context_doc",
                "organization": organizations,
            }

            subtopics = await assign_subtopics(doc, subtopic_data)
            doc["subtopics"] = ",".join(subtopics + [tags])
            docs.append(doc)
            print(doc["subtopics"])

pd.DataFrame(docs).to_csv(f"indexing/data/to_upsert/fedlex/{doc_title}_{language}.csv", index=None)

# TO DO:
- \<preamble\> and \<preface\> parsing
- dispositions \<proviso\> and \<transitional\> parsing
- table parsing


In [None]:
len(docs)

In [None]:
data.keys()

In [None]:
data["part_5"].keys()

In [None]:
data["part_5"]['NO_CHAPTER'].keys()

### EDA - Token length

In [None]:
df = pd.DataFrame(tok_len, columns=["n_toks"])
df.describe()

In [None]:
# Distribution of per-article token counts, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df['n_toks'], bins=10, edgecolor='black', alpha=0.7)
ax.set_title('Histogram of n_toks in articles')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
data

# --> TO DO: Get all laws

In [None]:
def standardize_category_name(category):
    """Turn a category label into an identifier-like string.

    Applies every substitution in ``replace_mapping`` cumulatively, in
    insertion order. Order matters: "- " is handled before "-", and the
    space-to-underscore rule runs last so that spaces produced by earlier
    rules are converted too.

    Args:
        category: The raw category label.

    Returns:
        The label with punctuation removed or replaced and spaces turned
        into underscores.
    """
    replace_mapping = {
        ".": "_",
        ":": "",
        "'": "_",
        "’": "_",
        '"': "",
        "- ": " ",
        "-": "_",
        "(": "",
        ")": "",
        "/": " ",
        " ": "_",
    }
    # Start from the input and feed each result into the next replacement;
    # replacing on `category` every time would keep only the last rule.
    std_cat = category
    for old, new in replace_mapping.items():
        std_cat = std_cat.replace(old, new)
    return std_cat

In [None]:
# Build an index {lang: {category: {code: {law name: page url}}}} from the Fedlex
# table of contents in each language and save it as JSON.
urls = {
    "de": "https://www.fedlex.admin.ch/de/cc/internal-law/83",
    "fr": "https://www.fedlex.admin.ch/fr/cc/internal-law/83",
    "it": "https://www.fedlex.admin.ch/it/cc/internal-law/83"
}

law_data = nested_defaultdict()

for lang, url in urls.items():

    driver.get(url)
    table = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.table.text-left.table-striped"))
    )

    table_html = table.get_attribute("outerHTML")
    # Explicit parser: avoids bs4's "no parser specified" warning and
    # environment-dependent parser selection.
    table_soup = BeautifulSoup(table_html, "lxml")

    # Reset per language so a heading or code from the previous language's
    # table can never be carried over into this one.
    category = None
    code = None

    for tr in table_soup.find_all("tr"):

        # set category
        heading = tr.find("h2")
        if heading:
            category = heading.text.strip()
        if "name" in tr.attrs:
            code = tr["name"].strip()

        # Links that appear before any heading / code row cannot be filed anywhere.
        if category is None or code is None:
            continue

        # get links (anchors without an href carry no URL to store)
        for a in tr.find_all("a", href=True):
            doc_name = a.text.strip()
            law_data[lang][category][code][doc_name] = "https://www.fedlex.admin.ch" + a["href"]


with open("indexing/data/to_upsert/fedlex/urls.json", "w", encoding="utf-8") as file:
    json.dump(law_data, file, indent=4, ensure_ascii=False)

# NOTE(review): a later cell calls driver.get(...) again; after this quit() it
# needs a fresh webdriver.Firefox() session when the notebook is run top to bottom.
driver.quit()

# Get individual xml/docx content

In [None]:
def get_last_values(nested_dict):
    """Collect every non-dict leaf of a nested dict, depth-first in key order.

    Args:
        nested_dict: Arbitrarily nested dictionaries; any value that is not a
            dict (including lists) counts as a leaf.

    Returns:
        A flat list of the leaves in the order a depth-first walk meets them.
    """
    leaves = []
    pending = [nested_dict]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            # Push children reversed so the first key is popped (visited) first.
            pending.extend(reversed(list(node.values())))
        else:
            leaves.append(node)
    return leaves

In [None]:
# Reload the law index written by the scraping cell (plain string: the path has no placeholders).
with open("indexing/data/to_upsert/fedlex/urls.json", "r", encoding="utf-8") as file:
    urls = json.load(file)

In [None]:
urls["de"]['830 Allgemeiner Teil des Sozialversicherungsrechts']

In [None]:
# Define the SPARQL endpoint
endpoint_url = "https://fedlex.data.admin.ch/sparqlendpoint"

# Initialize the SPARQL wrapper
sparql = SPARQLWrapper(endpoint_url)

# Define your SPARQL query
# For each consolidation the query returns the systematic number (rsNr), the
# applicability date, the title, the short title and the file URL of its XML
# manifestation. It keeps only:
#   - expressions whose language is DEU,
#   - consolidations whose applicability window includes today,
#   - collection entries whose "no longer in force" / end dates are unset or not yet reached.
# NOTE(review): the language filter is DEU only, while later cells match these
# file URLs against de/fr/it page URLs — confirm that is intended.
query = """
PREFIX jolux: <http://data.legilux.public.lu/resource/ontology/jolux#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT DISTINCT (str(?srNotation) AS ?rsNr) (str(?dateApplicabilityNode) AS ?dateApplicability) ?title ?abrev ?fileUrl
WHERE {
  FILTER(?language = <http://publications.europa.eu/resource/authority/language/DEU>)
  ?consolidation a jolux:Consolidation .
  ?consolidation jolux:dateApplicability ?dateApplicabilityNode .
  OPTIONAL { ?consolidation jolux:dateEndApplicability ?dateEndApplicability }
  FILTER(xsd:date(?dateApplicabilityNode) <= xsd:date(now()) && (!BOUND(?dateEndApplicability) || xsd:date(?dateEndApplicability) >= xsd:date(now())))
  ?consolidation jolux:isRealizedBy ?consoExpr .
  ?consoExpr jolux:language ?language .
  ?consoExpr jolux:isEmbodiedBy ?consoManif .
  ?consoManif jolux:userFormat <https://fedlex.data.admin.ch/vocabulary/user-format/xml> .
  ?consoManif jolux:isExemplifiedBy ?fileUrl .
  ?consolidation jolux:isMemberOf ?cc .
  ?cc jolux:classifiedByTaxonomyEntry/skos:notation ?srNotation .
  OPTIONAL { ?cc jolux:dateNoLongerInForce ?ccNoLonger }
  OPTIONAL { ?cc jolux:dateEndApplicability ?ccEnd }
  FILTER(!BOUND(?ccNoLonger) || xsd:date(?ccNoLonger) > xsd:date(now()))
  FILTER(!BOUND(?ccEnd) || xsd:date(?ccEnd) >= xsd:date(now()))
  FILTER(datatype(?srNotation) = <https://fedlex.data.admin.ch/vocabulary/notation-type/id-systematique>)
  OPTIONAL {
    ?cc jolux:isRealizedBy ?ccExpr .
    ?ccExpr jolux:language ?language .
    ?ccExpr jolux:title ?title .
    OPTIONAL {?ccExpr jolux:titleShort ?abrev }
  }
}
ORDER BY ?srNotation
"""

# Set the query and the return format
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

# Execute the query and collect the file URL of every result row.
# Only fileUrl is used downstream, so the other bindings are not unpacked.
file_urls = []
try:
    results = sparql.query().convert()
    # extend() with a generator keeps whatever was collected before a failure,
    # matching the best-effort behavior of this cell.
    file_urls.extend(
        binding.get("fileUrl", {}).get("value", "")
        for binding in results["results"]["bindings"]
    )
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
get_last_values(urls)

In [None]:
"https://www.fedlex.admin.ch/eli/cc/2002/510/de".endswith(("/de", "/fr", "/it"))

In [None]:
# Match every law page URL from the index against the SPARQL file URLs.
target_urls = []
i = 0  # number of laws visited; displayed in the next cell
for lang in urls.keys():
    lang_suffix = "/" + lang
    for category in urls[lang].keys():
        for subcategory in urls[lang][category].keys():
            for law, target_url in urls[lang][category][subcategory].items():
                i += 1

                # Path of the law without the trailing language code, e.g.
                # ".../eli/cc/2002/510/de" -> "/eli/cc/2002/510/". Only the final
                # segment is stripped, so a "de"/"fr"/"it" occurring elsewhere in
                # the path is left alone. Computed once per law, not per file URL.
                eli_path = target_url.split("https://www.fedlex.admin.ch")[1]
                if eli_path.endswith(lang_suffix):
                    eli_path = eli_path[: -len(lang)]

                for url in file_urls:
                    if eli_path in url:
                        target_urls.append(url)

pd.DataFrame(target_urls)

In [None]:
i

In [None]:
target_urls[-1]

In [None]:
for law, url in urls[lang][category][subcategory].items():
    break

In [None]:
law

In [None]:
url

In [None]:
# Scratch check: is there an ".xml" file URL containing /eli/cc/2022/424/it ?
# NOTE(review): fedlex_data is not used anywhere in the visible part of this notebook.
fedlex_data = []

# NOTE(review): the inner condition does not depend on `target_url`, so every
# outer iteration repeats the same scan of `file_urls`.
for target_url in target_urls:
    for url in file_urls:
        if '/eli/cc/2022/424/it' in url and url.endswith(".xml"):
            print("OK")
            break

In [None]:
# Scratch check: same substring matching as the target_urls cell, with the language hard-coded to "it".
# NOTE(review): `target_url` is whatever an earlier cell's loop left behind — confirm it is a
# www.fedlex.admin.ch page URL, otherwise split(...)[1] has no second element to read.
for url in file_urls:
    if target_url.split("https://www.fedlex.admin.ch")[1].replace("it", "") in url:
        print("OK")
        break

In [None]:
url

In [None]:
# Bound the wait and surface HTTP errors instead of parsing an error page later.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [None]:
# `url` comes from file_urls, which the SPARQL query restricts to XML files, so
# parse with the XML parser like the other XML cells in this notebook.
soup = BeautifulSoup(res.content, features="xml")

In [None]:
from io import BytesIO

# Aliased so this import does not rebind `Document`, which the notebook also
# imports from schemas.document.
from docx import Document as DocxDocument

# NOTE(review): machine-specific absolute path — point this at a project-relative data directory.
with open('/Users/kieranschubert/Downloads/fedlex-data-admin-ch-eli-cc-2022-424-20250101-de-docx-4.docx', 'rb') as f:
    # The file is opened in binary mode, so the in-memory stream must be BytesIO.
    source_stream = BytesIO(f.read())
document = DocxDocument(source_stream)
source_stream.close()



In [None]:
pwd

In [None]:
res.content

In [None]:
# For a law page, collect the XML and DOCX download links as
# one-item {download file name: absolute url} dicts.
# NOTE(review): this cell needs a live `driver`; the law-index cell above ends with driver.quit().
languages = ["de", "fr", "it"]

data = {}
for lang in languages:
    for category in urls[lang].keys():
        for subcategory in urls[lang][category].keys():
            # A subcategory may list several laws; iterate rather than unpacking a
            # single item, which raises as soon as there is more than one.
            for law_name, url in urls[lang][category][subcategory].items():

                # get content
                driver.get(url)

                download_div = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.well.well-white"))
                )
                download_div_html = download_div.get_attribute("outerHTML")
                soup = BeautifulSoup(download_div_html, "lxml")

                xml_content = []
                docx_content = []

                for a in soup.find_all("a", href=True):
                    if not a.has_attr("download"):
                        continue
                    file_url = "https://www.fedlex.admin.ch" + a["href"]

                    # get xml
                    if a["href"].endswith(".xml"):
                        xml_content.append({a["download"]: file_url})
                    # get docx
                    elif a["href"].endswith(".docx"):
                        docx_content.append({a["download"]: file_url})

                # Report once per page, and only when nothing usable was found
                # (not once for every unrelated anchor).
                if not xml_content and not docx_content:
                    print("NO AVAILABLE DOC FOR: ", url)

                # Exploratory run: stop after the first law.
                break
            break
        break
    break


In [None]:
law_name

In [None]:
url

In [None]:
download_div_html

In [None]:
soup.find_all("a")

In [None]:
xml_content

In [None]:
docx_content

In [None]:
# Download the first XML file found above. Each xml_content entry is a one-item
# {download file name: url} dict, so take its only value instead of hard-coding
# a file name that changes with the language and the version date.
res = requests.get(next(iter(xml_content[0].values())), timeout=30)
res.raise_for_status()

In [None]:
soup = BeautifulSoup(res.content, 'xml')

In [None]:
soup.find("preface")

In [None]:
# find() returns a single Tag, not a list; indexing a Tag looks up an attribute,
# so read the text directly.
soup.find("preamble").text

# Split by chapter

In [None]:
body = soup.find("body")

In [None]:
for chapter in body.find_all("chapter"):
    print(chapter["eId"])
    break

In [None]:
print("\n".join([str(x) for x in chapter.find_all("article")]))

In [None]:
# Draft of building a database Document record.
# NOTE(review): `organization` is assigned here, but the call below passes obj_in.organization instead.
organization = "ZAS:ALL,EAK:ALL"


language = None

# NOTE(review): `obj_in` is not defined in the visible part of this notebook — confirm where it comes from.
# NOTE(review): `Document` is imported from both docx and schemas.document in this notebook; these
# keyword arguments presumably target the schemas.document class — verify which binding is live.
db_document = Document(
            url=obj_in.url,
            language=language,
            text=obj_in.text,
            tags=obj_in.tags,
            subtopics=obj_in.subtopics,
            summary=obj_in.summary,
            doctype=obj_in.doctype,
            organization=obj_in.organization,
        )