In [36]:
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag

from indexing.pipelines.admin import AdminParser
from indexing.scraper import scraper

In [None]:
# Fetch the /de/ sitemap of eak.admin.ch. `scraper.fetch` is awaited, i.e. it is a coroutine.
sitemap_url = "https://eak.admin.ch/eak/de/home.sitemap.xml"
sitemap = await scraper.fetch(sitemap_url)
# Show the raw response for inspection
sitemap

In [None]:
# Extract the URLs listed in the fetched sitemap.
# NOTE(review): presumably a list of URL strings — confirm against AdminParser.parse_urls.
parser = AdminParser()
url_list = parser.parse_urls(sitemap)
url_list

In [None]:
# Optional language switch: uncomment the next line to rewrite the "/de/" path segment
# to "/fr/" in every URL before scraping. Left disabled, the list is used unchanged.
#url_list = [url.replace("/de/", "/fr/") for url in url_list]
url_list

In [None]:
# Scrape every URL in url_list. Unlike `scraper.fetch` above, `scrap_urls` is called without `await`.
content = scraper.scrap_urls(url_list)
# NOTE(review): displaying the full result may produce very large output
content

### Parser

- https://eak.admin.ch/eak/fr/home/sitemap.html exposes the site's content as a hierarchy -> use it to build the knowledge graph (KG)

In [104]:
# HTML sitemap page (/fr/ version) whose nested lists are parsed in the cells below
url = "https://eak.admin.ch/eak/fr/home/sitemap.html"

In [105]:
content = scraper.scrap_urls([url])

In [139]:
soup = BeautifulSoup(content[0].data)

In [None]:
# Inspect the parsed document (this renders the whole page, so the output can be long)
soup

### Not necessary (?)

In [12]:
sitemap = soup.find_all("div", {"class":"mod mod-sitemap"})[0]

In [None]:
h3_tags = sitemap.find_all("h3")
h3_tags

In [79]:
# Group the sitemap links by the <h3> heading they appear under
sections = []

for h3 in h3_tags:
    # Collect the Tag siblings that follow this heading, stopping at the next <h3>.
    # A dedicated name is used on purpose: calling this list `content` would overwrite
    # the notebook-level `content` (the scraper output) that other cells rely on.
    section_tags = []
    for sibling in h3.next_siblings:
        if sibling.name == "h3":
            break
        # Skip whitespace / text nodes between elements
        if isinstance(sibling, Tag):
            section_tags.append(sibling)

    sections.append({
        'heading': h3.get_text(strip=True),
        # Only the links of the first element after the heading are kept
        'content': section_tags[0].find_all("a") if section_tags else []
    })

### Create nested dict for neo4j

In [141]:
from bs4 import BeautifulSoup
from collections import defaultdict

def build_hierarchy(element, use_href=False, base_url="https://eak.admin.ch"):
    """Recursively convert a nested ``<ul>``/``<li>`` tree into nested dicts.

    Only the direct ``<li>`` children of ``element`` are visited. Every ``<li>``
    that has a direct ``<a>`` child contributes one key; ``<li>`` items without
    such a link are skipped. The value stored under the key is the hierarchy of
    the ``<li>``'s first direct ``<ul>`` child, or an empty dict for a leaf.

    Args:
        element: Tag-like object (typically a ``<ul>``) offering ``find_all``.
        use_href: When true, keys are absolute URLs built from the link's
            ``href``; otherwise keys are the stripped link text.
        base_url: Base that relative hrefs are resolved against. Hrefs that
            are already absolute are kept as they are.

    Returns:
        A plain ``dict`` mapping each key to the nested dict of its children.
        Looking up a missing key raises ``KeyError`` instead of silently
        inserting an entry.
    """
    hierarchy = {}
    for item in element.find_all('li', recursive=False):
        link = item.find('a', recursive=False)
        if link is None:
            continue

        # urljoin handles rooted, relative and absolute hrefs alike. A link
        # without a usable href falls back to its text so the entry (and its
        # subtree) is not lost.
        href = link.get('href') if use_href else None
        key = urljoin(base_url, href) if href else link.get_text(strip=True)

        nested_ul = item.find('ul', recursive=False)
        if nested_ul is None:
            hierarchy[key] = {}
        else:
            hierarchy[key] = build_hierarchy(nested_ul, use_href, base_url)
    return hierarchy

In [None]:
import pprint

# Maps each top-level heading to the nested hierarchy of the <ul> that follows it
structure = {}

# Key style: True -> absolute URLs built from each link's href,
#            False -> the visible link text
use_href = False

# Walk every <h3> in the page; headings without a link are ignored
for h3 in soup.find_all('h3'):
    first_a = h3.find('a')
    if first_a is None:
        continue

    # urljoin copes with rooted, relative and absolute hrefs; a link without
    # a usable href falls back to its text instead of raising TypeError.
    href = first_a.get('href') if use_href else None
    if href:
        h3_heading = urljoin("https://eak.admin.ch", href)
    else:
        h3_heading = first_a.get_text(strip=True)

    # The list belonging to a heading is its next <ul> sibling, if any
    next_ul = h3.find_next_sibling('ul')
    if next_ul is None:
        structure[h3_heading] = {}
    else:
        structure[h3_heading] = build_hierarchy(next_ul, use_href)

# Display the hierarchical structure
pprint.pprint(structure)

- **TODO**: Formulaires/Annonces (forms/notices) pas récupérés
    - e.g. formulaires d'allocations familiales (family allowance forms)

# Postprocessing

- for each subtopic
    - can subtopic be linked to any other subtopic?

# Neo4j

In [150]:
from neo4j import GraphDatabase

In [151]:
# Connection settings are read from the environment so real credentials never
# have to be written into the notebook. The fallbacks are the local development
# values this notebook used before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4j_password")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [None]:
def _merge_children(tx, parent, children):
    """Link ``parent`` to each child with CONTAINS, then recurse into the child's subtree."""
    for child, grandchildren in children.items():
        tx.run("MERGE (p:Category {name: $parent}) "
               "MERGE (c:Category {name: $child}) "
               "MERGE (p)-[:CONTAINS]->(c)", parent=parent, child=child)
        _merge_children(tx, child, grandchildren)


def create_nodes_and_relationships(tx, parent, children):
    """Write ``parent`` and its whole subtree to Neo4j as ``Category`` nodes.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        parent: Name of the root category of this subtree.
        children: Nested dict mapping each child name to the dict of its own
            children (empty dict for a leaf).

    The parent node is merged up front, so a parent without any children still
    ends up in the graph; without that step it would be dropped, because every
    other MERGE happens per child. All statements use MERGE, so re-running the
    function does not duplicate nodes or relationships.

    Nodes are matched on ``name`` alone: two entries with the same name in
    different branches resolve to the same ``Category`` node.
    """
    tx.run("MERGE (p:Category {name: $parent})", parent=parent)
    _merge_children(tx, parent, children)

# Write the hierarchy to Neo4j, one write transaction per top-level heading
with driver.session() as session:
    # `execute_write` is the current name of this API; `write_transaction` is its
    # deprecated predecessor. Use whichever the installed driver provides.
    run_write = getattr(session, "execute_write", None) or session.write_transaction
    for parent, children in structure.items():
        run_write(create_nodes_and_relationships, parent, children)

### Convert to docs

In [None]:
# convert to docs
docs = parser.convert_to_documents(content)
docs