In [None]:
#%pip install bs4
#%pip install chardet
#%pip install Jinja2
#%pip install os
#%pip install lxml


Collecting lxmlNote: you may need to restart the kernel to use updated packages.

  Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   --------------------------- ------------ 2.6/3.8 MB 16.9 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 9.9 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.3.0



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


The Crawler

In [None]:
import time
import requests
from collections import deque
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import os

################################################################################
# Global crawler data
################################################################################
nodes = []  # each: { "url": str, "id": int, "status": str }
links = []  # each: { "source": int, "target": int }

visited_domains = set()       # which unified domains we have fully crawled
visited_pages = set()         # full URLs visited
edges_set = set()             # to avoid duplicate edges
domain_crawl_count = 0
max_domain_crawl_count = 100
sleep_time = 3
negative_list = ["srf.ch", 
                 "nzz.ch", 
                 "tagesanzeiger.ch",
                 "baselwandel.ch",
                 "naturwissenschaften.ch",
                 "20min.ch",
                 "baz.ch",
                 "bzbasel.ch",
                 "klimageschichten.so.ch",
                 "daten.stadt.sg.ch",
                 "ag.ch",
                 "mathias-binswanger.ch",
                 "24heures.ch",
                 "zentrumranft.ch",
                 "redcross.ch",
                 "letemps.ch",
                 "migros-service.ch",
                 "bernerzeitung.ch",
                 "sergeandpeppers.ch",
                 "silviodezanet.ch",
                 "dergewerbeverein.ch",
                 "berneroberlaender.ch",
                 "frapp.ch",
                 "ticketcorner.ch",
                 "lelanceen.ch",
                "mini.ch"
                ]

positive_list = [
    "prospecierara.ch"    
    ]

################################################################################
# Update JSON file with nodes and edges
################################################################################

def load_existing_data(json_file="public/graph_data.json"):
    global nodes, links, visited_domains, visited_pages
    if os.path.exists(json_file):
        if os.path.getsize(json_file) > 0:  # Check if file is not empty
            with open(json_file, 'r') as f:
                data = json.load(f)
                # Convert "label" -> "url" if needed
                raw_nodes = data.get("nodes", [])
                for n in raw_nodes:
                    if "url" not in n and "label" in n:
                        n["url"] = n["label"]
                nodes[:] = raw_nodes
                links[:] = data.get("edges", [])

                # Load visited sets if present
                visited_domains_data = data.get("visited_domains", [])
                visited_pages_data = data.get("visited_pages", [])
                visited_domains = set(visited_domains_data) if visited_domains_data else set()
                visited_pages = set(visited_pages_data) if visited_pages_data else set()
        else:
            print(f"{json_file} is empty. Initializing data structures.")
            nodes = []
            links = []
            visited_domains = set()
            visited_pages = set()
    else:
        print(f"{json_file} does not exist. Initializing data structures.")
        nodes = []
        links = []
        visited_domains = set()
        visited_pages = set()

def generate_json_from_data(nodes, links, output_json="public/graph_data.json"):
    # Build nodes
    nodes_list = []
    for node in nodes:
        nodes_list.append({
            "id": int(node["id"]),
            "label": node["url"],
            "status": node["status"],
            "size": 3,
            "x": node.get("x", 0),
            "y": node.get("y", 0),
        })

    # Build edges
    edges_list = []
    for link in links:
        edges_list.append({
            "source": int(link["source"]),
            "target": int(link["target"])
        })

    # Convert visited sets to lists for JSON
    graph_data = {
        "nodes": nodes_list,
        "edges": edges_list,
        "visited_domains": list(visited_domains),
        "visited_pages": list(visited_pages)
    }

    os.makedirs(os.path.dirname(output_json), exist_ok=True)
    with open(output_json, 'w') as f:
        json.dump(graph_data, f, indent=4)

    print(f"Updated {output_json} with {len(nodes_list)} nodes and {len(edges_list)} edges, plus visited sets.")

def add_node_if_missing(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    new_id = len(nodes) + 1
    nodes.append({"url": domain_str, "id": new_id, "status": "Unknown"})
    return new_id

def set_node_status(domain_str, status):
    for n in nodes:
        if n["url"] == domain_str:
            n["status"] = status
            return

def get_node_id(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    raise KeyError(f"Domain not found in nodes: {domain_str}")


################################################################################
# Utility functions
################################################################################

def unify_domain(url):
    """
    Returns a domain string without scheme and without 'www.' prefix.
    Example: 'https://www.urbanagriculturebasel.ch' -> 'urbanagriculturebasel.ch'
    """
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()
    if netloc.startswith("www."):
        netloc = netloc[4:]
    return netloc

def canonical_domain(url):
    """
    For internal checks, returns the netloc in lowercase (still includes 'www.' if present).
    Used to decide if a link is internal or external within BFS.
    """
    return urlparse(url).netloc.lower()

def normalize_url(url):
    """
    Return a normalized full URL with scheme if missing, and lowercase netloc.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower() if parsed.scheme else "https"
    netloc = parsed.netloc.lower()
    return f"{scheme}://{netloc}{parsed.path}"

def can_crawl(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        robots_url = urljoin(base_url, "/robots.txt")
        print(f"Checking robots.txt at: {robots_url}")  # Debugging print

        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        can_fetch = rp.can_fetch("*", url)
        print(f"Can fetch {url}: {can_fetch}")  # Debugging print
        return can_fetch
    except Exception as e:
        print(f"Error checking robots.txt for {url}: {e}")
        return True

def detect_encoding_and_decode(raw):
    """
    Attempts to decode raw bytes, using chardet if available, otherwise utf-8 fallback.
    """
    try:
        import chardet
        result = chardet.detect(raw)
        enc = result["encoding"] or "utf-8"
        return raw.decode(enc, errors="replace")
    except ImportError:
        return raw.decode("utf-8", errors="replace")

def extract_links(url):
    if not can_crawl(url):
        print(f"Robots.txt disallows crawling => {url}")
        return []
    time.sleep(sleep_time)

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        content = resp.content

        # First try html.parser, then fall back to lxml
        try:
            soup = BeautifulSoup(content, "html.parser")
        except Exception as e:
            print(f"Parser error with html.parser. Falling back to 'lxml'. Error: {e}")
            soup = BeautifulSoup(content, "lxml")

        valid_links = []
        for a in soup.find_all("a", href=True):
            raw_href = a["href"]
            try:
                merged_url = urljoin(url, raw_href)
                valid_links.append(merged_url)
            except ValueError:
                continue

        return valid_links

    except requests.RequestException as e:
        print(f"Failed to extract links from {url}: {e}")
        return []
    except UnicodeDecodeError as e:
        print(f"Unicode decode error for {url}: {e}")
        return []


def extract_visible_text(html):
    """
    Remove non-visible tags like <script>, <style>, <meta>, etc.
    Then get only the text from <body>.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove tags not considered main content
    for tag_name in ["script", "style", "meta", "head", "noscript", "link"]:
        for t in soup.find_all(tag_name):
            t.decompose()

    # If you need to remove display:none elements:
    # for hidden in soup.select("[style*='display:none']"):
    #     hidden.decompose()

    # Some pages may not have a <body> tag; handle that gracefully
    body = soup.body
    if body is not None:
        text = body.get_text(separator=" ", strip=True)
    else:
        # fallback: entire soup
        text = soup.get_text(separator=" ", strip=True)
    
    return text

def contains_keyword(domain_url, keyword_list):
    """
    Check if the domain_url page text contains at least one of the given keywords 
    as a full standalone word. Use regex with negative lookbehind/lookahead 
    and print out what was matched for debugging.
    """
    if not can_crawl(domain_url):
        return False
    time.sleep(sleep_time)

    # Use the same headers for your keyword check
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        resp = requests.get(domain_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove tags containing non-user-facing text
        for tag_name in ["script", "style", "head", "title", "meta", "noscript"]:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Join all remaining text
        visible_text = ' '.join(soup.stripped_strings).lower()

        # Check if any keyword is present
        return any(kw.lower() in visible_text for kw in keyword_list)

    except Exception as e:
        print(f"contains_keyword failed for {domain_url}: {e}")
        return None

def add_node_if_missing(domain_str):
    """
    Use the 'domain_str' from unify_domain() as unique node key.
    If it doesn't exist in 'nodes', add it. Return node ID.
    """
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    new_id = len(nodes) + 1
    nodes.append({"url": domain_str, "id": new_id, "status": "Unknown"})
    return new_id

def set_node_status(domain_str, status):
    for n in nodes:
        if n["url"] == domain_str:
            n["status"] = status
            return

def get_node_id(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    raise KeyError(f"Domain not found in nodes: {domain_str}")

################################################################################
# BFS domain crawl
################################################################################
def bfs_crawl_domain(domain_str, depth=1):
    print(f"\n--- Crawling domain: {domain_str} (Domain #{domain_crawl_count}) ---")
    if domain_crawl_count > max_domain_crawl_count:
        print("Limit of crawlable domains reached. Stopping.")
        return

    start_page = f"https://{domain_str}"
    visited_pages.add(start_page)
    queue = deque([(start_page, 0)])
    
    # We assume the node for this domain already exists (e.g., after add_node_if_missing).
    try:
        source_id = get_node_id(domain_str)
    except KeyError:
        source_id = add_node_if_missing(domain_str)

    while queue:
        page_url, lvl = queue.popleft()
        if lvl > depth:
            break

        lower = page_url.lower()
        if lower.endswith(".pdf"):
            print(f"Skipping PDF: {page_url}")
            continue
        if lower.endswith(".jpg"):
            print(f"Skipping JPG: {page_url}")
            continue
        if lower.endswith(".mp4"):
            print(f"Skipping MP4: {page_url}")
            continue
        if lower.endswith(".exe"):
            print(f"Skipping exe: {page_url}")
            continue
        if lower.endswith(".zip"):
            print(f"Skipping ZIP: {page_url}")
            continue

        print(f"\nCrawling page: {page_url} (depth={lvl})")
        links_found = extract_links(page_url)
        print(f"Found {len(links_found)} links at {page_url}")

        # If it's the homepage (lvl=0) and we see > 200 links, classify + skip further
        if lvl == 0 and len(links_found) > 200:
            print("Homepage has more than 200 links, treating as 'Webshop-like' and skipping BFS.")
            set_node_status(domain_str, "Relevant, but possibly Webshop-like with too many links")
            break

        for link in links_found:
            link_unified = unify_domain(link).strip()
            if not link_unified:
                # Domain is empty or invalid
                continue

            if link_unified != domain_str:
                # External link => only add node and link if you actually want it
                try:
                    target_id = get_node_id(link_unified)
                except KeyError:
                    target_id = add_node_if_missing(link_unified)
                if (source_id, target_id) not in edges_set:
                    edges_set.add((source_id, target_id))
                    links.append({"source": source_id, "target": target_id})

            else:
                # Internal link => BFS deeper
                norm = normalize_url(link)
                if norm not in visited_pages and lvl < depth:
                    visited_pages.add(norm)
                    queue.append((norm, lvl + 1))

    # Finally, update the JSON output after finishing this domain
    generate_json_from_data(nodes, links, "public/graph_data.json")

################################################################################
# Main
################################################################################
def main():
    keyword_list = ["landwirtschaft", "landwirtschaftlich","agriculture","agricoltura","farming","agrar","fattoria","agricole","ferme","paysan","plouc","bauer"]

    # Load existing data from JSON
    load_existing_data()

    start_url = "https://infopro.ch/"
    start_unified = unify_domain(start_url)
    add_node_if_missing(start_unified)

    # If domain is in positive_list, skip keyword check => mark Relevant + BFS
    if any(pdom in start_unified for pdom in positive_list):
        set_node_status(start_unified, "Relevant")
        bfs_crawl_domain(start_unified, depth=1)
    elif any(nd in start_unified for nd in negative_list):
        set_node_status(start_unified, "Negativliste")
    elif start_unified.endswith(".ch"):
        keyword_check = contains_keyword(start_url, keyword_list)
        if keyword_check is None:
            set_node_status(start_unified, "could not test keyword")
        elif keyword_check:
            set_node_status(start_unified, "Relevant")
            bfs_crawl_domain(start_unified, depth=1)
        else:
            set_node_status(start_unified, "Kein Bezug zu Landwirtschaft")
    else:
        set_node_status(start_unified, "Nicht in der Schweiz")

    idx = 0
    while idx < len(nodes) and domain_crawl_count < max_domain_crawl_count:
        node = nodes[idx]
        idx += 1

        if node["url"] in visited_domains:
            continue
        
        if node["status"] in ("Start", "Relevant"):
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        dom_str = node["url"]

        if dom_str in visited_domains:
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        if any(nd in dom_str for nd in negative_list):
            set_node_status(dom_str, "Negativliste")
            visited_domains.add(dom_str)  # Mark visited
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        if any(pdom in dom_str for pdom in positive_list):
            set_node_status(dom_str, "Relevant") 
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            bfs_crawl_domain(dom_str, depth=1)
            continue

        if dom_str.endswith(".ch"):
            keyword_check = contains_keyword(f"https://{dom_str}", keyword_list)
            if keyword_check is None:
                set_node_status(dom_str, "could not test keyword")
            elif keyword_check:
                set_node_status(dom_str, "Relevant")
                bfs_crawl_domain(dom_str, depth=1)
            else:
                set_node_status(dom_str, "Kein Bezug zu Landwirtschaft")
            visited_domains.add(dom_str)  # Mark visited here as well
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
        else:
            set_node_status(dom_str, "Nicht in der Schweiz")
            visited_domains.add(dom_str)  # Mark visited
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON


    





if __name__ == "__main__":
    main()

Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/: True

--- Crawling domain: infopro.ch (Domain #0) ---

Crawling page: https://infopro.ch (depth=0)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch: True
Found 94 links at https://infopro.ch

Crawling page: https://infopro.ch/de/ (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/de/: True
Found 94 links at https://infopro.ch/de/

Crawling page: https://infopro.ch/remote (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/remote: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://infopro.ch/remote
Skipping PDF: https://www.infopro.ch/wp-content/uploads/2020/11/Infopro-Codex.pdf
Skipping PDF: https://infopro.ch/wp-content/uploads/2023/11/Stelleninserat-PL.pdf
Skipping PDF: https://infopro.ch/wp-content/uploads/2022/11/Sys-Engineer.pdf
Skipping exe: https://infopro.ch/downloads/IPCloudInstaller.exe
Skipping exe: https://infopro.ch/downloads/AgroCloudInstaller.exe

Crawling page: https://infopro.ch/client (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/client: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://infopro.ch/client
Skipping exe: https://infopro.ch/downloads/ReceiverCleanupUtility.exe
Skipping exe: https://infopro.ch/downloads/InfoproTeamViewer.exe

Crawling page: https://infopro.ch/downloads/InfoproTeamViewerOSX.zip (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/downloads/InfoproTeamViewerOSX.zip: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://infopro.ch/downloads/InfoproTeamViewerOSX.zip
Skipping exe: https://infopro.ch/downloads/InfoproTeamViewerID.exe

Crawling page: https://infopro.ch/downloads/InfoproTeamViewerOSXID.zip (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/downloads/InfoproTeamViewerOSXID.zip: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://infopro.ch/downloads/InfoproTeamViewerOSXID.zip
Skipping exe: https://infopro.ch/downloads/TriceratRemovalScript.exe

Crawling page: https://infopro.ch/remote-local/ (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/remote-local/: True
Found 4 links at https://infopro.ch/remote-local/

Crawling page: https://infopro.ch/client-local/ (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/client-local/: True
Found 5 links at https://infopro.ch/client-local/

Crawling page: https://infopro.ch/agb (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/agb: True
Found 2 links at https://infopro.ch/agb

Crawling page: https://infopro.ch/en/ (depth=1)
Checking robots.txt at: https://infopro.ch/robots.txt
Can fetch https://infopro.ch/en/: True
Found 95 links at https://infopro.ch/en/

Crawling page: https://infopro.ch/fr/ (depth=1)
Checking robots.txt 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://berghof-hallau.ch/lageplan/
Skipping PDF: https://berghof-hallau.ch/wp-content/uploads/2021/03/berghof-hallau-prospekt.pdf
Skipping PDF: https://berghof-hallau.ch/wp-content/uploads/2022/11/Berghof-Hallau-CD-Manual.pdf

Crawling page: https://berghof-hallau.ch/impressum/ (depth=1)
Checking robots.txt at: https://berghof-hallau.ch/robots.txt
Can fetch https://berghof-hallau.ch/impressum/: True
Found 2 links at https://berghof-hallau.ch/impressum/

Crawling page: https://berghof-hallau.ch/datenschutzerklaerung/ (depth=1)
Checking robots.txt at: https://berghof-hallau.ch/robots.txt
Can fetch https://berghof-hallau.ch/datenschutzerklaerung/: True
Found 27 links at https://berghof-hallau.ch/datenschutzerklaerung/
Skipping JPG: https://berghof-hallau.ch/wp-content/uploads/2021/03/berghof-hallau-gruppenhaus-800px-08-4.jpg
Skipping JPG: https://berghof-hallau.ch/wp-content/uploads/2021/03/berghof-hallau-gruppenhaus-800px-09-4.jpg
Skipping JPG: https://berghof-hallau.ch

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://landwirtschaft.ufasamen.ch/versuchsbericht

Crawling page: https://landwirtschaft.ufasamen.ch/aktuelles-5/vortraege-maistag-2025-online (depth=1)
Checking robots.txt at: https://landwirtschaft.ufasamen.ch/robots.txt
Can fetch https://landwirtschaft.ufasamen.ch/aktuelles-5/vortraege-maistag-2025-online: True
Found 114 links at https://landwirtschaft.ufasamen.ch/aktuelles-5/vortraege-maistag-2025-online

Crawling page: https://landwirtschaft.ufasamen.ch/aktuelles-5/einladung-zum-maistag-am-8.januar-2025-bei-ufa-samen-in-lyssach (depth=1)
Checking robots.txt at: https://landwirtschaft.ufasamen.ch/robots.txt
Can fetch https://landwirtschaft.ufasamen.ch/aktuelles-5/einladung-zum-maistag-am-8.januar-2025-bei-ufa-samen-in-lyssach: True
Found 108 links at https://landwirtschaft.ufasamen.ch/aktuelles-5/einladung-zum-maistag-am-8.januar-2025-bei-ufa-samen-in-lyssach

Crawling page: https://landwirtschaft.ufasamen.ch/aktuelles-5/einladung-maistag-2025-in-winterthur (depth

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at http://1francpourleclimat.ch/wp-content/uploads/2021/06/Comment-sont-utilises-les-15.png

Crawling page: http://1francpourleclimat.ch/faire-un-don/ (depth=1)
Checking robots.txt at: http://1francpourleclimat.ch/robots.txt
Can fetch http://1francpourleclimat.ch/faire-un-don/: True
Found 19 links at http://1francpourleclimat.ch/faire-un-don/
Updated public/graph_data.json with 32137 nodes and 69628 edges, plus visited sets.
Updated public/graph_data.json with 32137 nodes and 69628 edges, plus visited sets.
Checking robots.txt at: https://biofruits-shop.ch/robots.txt
Can fetch https://biofruits-shop.ch: True
Updated public/graph_data.json with 32137 nodes and 69628 edges, plus visited sets.
Checking robots.txt at: https://pepillo.ch/robots.txt
Can fetch https://pepillo.ch: True
Updated public/graph_data.json with 32137 nodes and 69628 edges, plus visited sets.
Updated public/graph_data.json with 32137 nodes and 69628 edges, plus visited sets.
Updated public/graph_data.jso

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://filapi.ch/wp-content/uploads/2023/08/image-55-e1692977919342.jpeg

Crawling page: https://filapi.ch/wp-content/uploads/2023/08/image-54-e1692978173187.jpeg (depth=1)
Checking robots.txt at: https://filapi.ch/robots.txt
Can fetch https://filapi.ch/wp-content/uploads/2023/08/image-54-e1692978173187.jpeg: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://filapi.ch/wp-content/uploads/2023/08/image-54-e1692978173187.jpeg

Crawling page: https://filapi.ch/wp-content/uploads/2023/08/image-58-e1692978119186.jpeg (depth=1)
Checking robots.txt at: https://filapi.ch/robots.txt
Can fetch https://filapi.ch/wp-content/uploads/2023/08/image-58-e1692978119186.jpeg: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://filapi.ch/wp-content/uploads/2023/08/image-58-e1692978119186.jpeg
Skipping JPG: https://filapi.ch/wp-content/uploads/2023/08/image-38-e1692978241906.jpg
Skipping JPG: https://filapi.ch/wp-content/uploads/2023/08/image-32-e1692978305681.jpg
Skipping JPG: https://filapi.ch/wp-content/uploads/2023/08/image-35-e1692974641743.jpg

Crawling page: https://filapi.ch/wp-content/uploads/2023/08/image-57-e1692978459993.jpeg (depth=1)
Checking robots.txt at: https://filapi.ch/robots.txt
Can fetch https://filapi.ch/wp-content/uploads/2023/08/image-57-e1692978459993.jpeg: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://filapi.ch/wp-content/uploads/2023/08/image-57-e1692978459993.jpeg

Crawling page: https://filapi.ch/wp-content/uploads/2023/08/image-52-e1692978591384.jpeg (depth=1)
Checking robots.txt at: https://filapi.ch/robots.txt
Can fetch https://filapi.ch/wp-content/uploads/2023/08/image-52-e1692978591384.jpeg: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://filapi.ch/wp-content/uploads/2023/08/image-52-e1692978591384.jpeg
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Checking robots.txt at: https://guidesocial.ch/robots.txt
Can fetch https://guidesocial.ch: True
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Checking robots.txt at: https://20ans100francs.ch/robots.txt
Can fetch https://20ans100francs.ch: True
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Checking robots.txt at: https://competences-peche.ch/robots.txt
Can fetch https://competences-peche.ch: True
Updated public/graph_data.json with 32363 nodes and 70239 edges, plus visited sets.
Checking robots.txt at: https://formation-pecheurs.ch/robots.txt
Can fetch https://formation-pec

  k = self.parse_starttag(i)


Found 0 links at https://models.geo.lu.ch/ilimodels.xml

Crawling page: https://models.geo.lu.ch/ilisite.xml (depth=1)
Checking robots.txt at: https://models.geo.lu.ch/robots.txt
Can fetch https://models.geo.lu.ch/ilisite.xml: True
Found 0 links at https://models.geo.lu.ch/ilisite.xml
Updated public/graph_data.json with 32590 nodes and 71157 edges, plus visited sets.
Updated public/graph_data.json with 32590 nodes and 71157 edges, plus visited sets.
Checking robots.txt at: https://personal.lu.ch/robots.txt
Can fetch https://personal.lu.ch: True
Updated public/graph_data.json with 32590 nodes and 71157 edges, plus visited sets.
Updated public/graph_data.json with 32590 nodes and 71157 edges, plus visited sets.
Checking robots.txt at: https://umwelt-luzern.ch/robots.txt
Can fetch https://umwelt-luzern.ch: True

--- Crawling domain: umwelt-luzern.ch (Domain #0) ---

Crawling page: https://umwelt-luzern.ch (depth=0)
Checking robots.txt at: https://umwelt-luzern.ch/robots.txt
Can fetch http

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://renzoblumenthal.ch/uploads/1/1/1/2/111237811/edited/logo-homepage.png

Crawling page: https://renzoblumenthal.ch/news-und-fotos/der-neue-hofladen-ist-eroeffnet (depth=1)
Checking robots.txt at: https://renzoblumenthal.ch/robots.txt
Can fetch https://renzoblumenthal.ch/news-und-fotos/der-neue-hofladen-ist-eroeffnet: True
Found 37 links at https://renzoblumenthal.ch/news-und-fotos/der-neue-hofladen-ist-eroeffnet

Crawling page: http://www.renzoblumenthal.ch/store/p13/Blumenthaler_Fondue.html (depth=1)
Checking robots.txt at: http://www.renzoblumenthal.ch/robots.txt
Can fetch http://www.renzoblumenthal.ch/store/p13/Blumenthaler_Fondue.html: True
Found 20 links at http://www.renzoblumenthal.ch/store/p13/Blumenthaler_Fondue.html

Crawling page: http://www.renzoblumenthal.ch/store/p27/Blumenthaler_Trockenwurst_%22Krauseminze%22.html (depth=1)
Checking robots.txt at: http://www.renzoblumenthal.ch/robots.txt
Can fetch http://www.renzoblumenthal.ch/store/p27/Blumenthale

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://www.simplypuro.ch/data/files/puro_pressebilder.zip

Crawling page: https://simplypuro.ch/privacy (depth=1)
Checking robots.txt at: https://simplypuro.ch/robots.txt
Can fetch https://simplypuro.ch/privacy: True
Found 34 links at https://simplypuro.ch/privacy

Crawling page: https://simplypuro.ch/imprint (depth=1)
Checking robots.txt at: https://simplypuro.ch/robots.txt
Can fetch https://simplypuro.ch/imprint: True
Found 35 links at https://simplypuro.ch/imprint
Updated public/graph_data.json with 32727 nodes and 71607 edges, plus visited sets.
Updated public/graph_data.json with 32727 nodes and 71607 edges, plus visited sets.
Checking robots.txt at: https://cafe1842.ch/robots.txt
Can fetch https://cafe1842.ch: True
Updated public/graph_data.json with 32727 nodes and 71607 edges, plus visited sets.
Checking robots.txt at: https://rietberg.ch/robots.txt
Can fetch https://rietberg.ch: True
Updated public/graph_data.json with 32727 nodes and 71607 edges, plus visite

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://bwzuri.ch/fileadmin/dateien/dokumente/News/Zeitungsberichte/250131_NUZ_HotelsterbenUrnersee_NilsWipfli.png

Crawling page: https://bwzuri.ch/ueber-uns/aktuell/news-detail/infoabend-landwirt-in-efz-in-seedorf-180/ (depth=1)
Checking robots.txt at: https://bwzuri.ch/robots.txt
Can fetch https://bwzuri.ch/ueber-uns/aktuell/news-detail/infoabend-landwirt-in-efz-in-seedorf-180/: True
Found 162 links at https://bwzuri.ch/ueber-uns/aktuell/news-detail/infoabend-landwirt-in-efz-in-seedorf-180/

Crawling page: https://bwzuri.ch/ueber-uns/aktuell/news-detail/aktuelles-weiterbildungsprogramm-179/ (depth=1)
Checking robots.txt at: https://bwzuri.ch/robots.txt
Can fetch https://bwzuri.ch/ueber-uns/aktuell/news-detail/aktuelles-weiterbildungsprogramm-179/: True
Found 163 links at https://bwzuri.ch/ueber-uns/aktuell/news-detail/aktuelles-weiterbildungsprogramm-179/
Skipping PDF: https://bwzuri.ch/fileadmin/dateien/dokumente/weiterbildung/Daten/bwz_Inserate_Kurse_135x100mm_Hig

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.092/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.084/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.084/publikation/: True
Found 68 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.084/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.084/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.084/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.084/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.120.180/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.120.180/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.120.180/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.120.180/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.120.180/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.120.180/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.012/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.012/publikation/: True
Found 67 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.012/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.012/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.012/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.012/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.138/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.138/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.138/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.138/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.138/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.138/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.136/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.136/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.136/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.136/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.136/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.136/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.134/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.134/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.134/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.134/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.134/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.134/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.132/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.132/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.132/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.132/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.132/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.132/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.130/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.130/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.130/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.130/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.130/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.130/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.128/publikation/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.128/publikation/: True
Found 66 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.128/publikation/

Crawling page: https://kantonsamtsblatt.gr.ch/ekab/00.121.128/pdf/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/ekab/00.121.128/pdf/: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://kantonsamtsblatt.gr.ch/ekab/00.121.128/pdf/

Crawling page: https://kantonsamtsblatt.gr.ch/impressum/ (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/impressum/: True
Found 62 links at https://kantonsamtsblatt.gr.ch/impressum/

Crawling page: https://kantonsamtsblatt.gr.ch/index.php (depth=1)
Checking robots.txt at: https://kantonsamtsblatt.gr.ch/robots.txt
Can fetch https://kantonsamtsblatt.gr.ch/index.php: True
Found 91 links at https://kantonsamtsblatt.gr.ch/index.php
Updated public/graph_data.json with 32853 nodes and 71958 edges, plus visited sets.
Updated public/graph_data.json with 32853 nodes and 71958 edges, plus visited sets.
Checking robots.txt at: https://academiavivian.ch/robots.txt
Can fetch https://academiavivian.ch: True
Updated public/graph_data.json with 32853 nodes and 71958 edges, plus visited sets.
Checking robots.txt at: https://biblioteca-disentis.ch/robots.txt
Error che

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://www.zuercher-bienenfreunde.ch/wp-content/uploads/2017/12/DSC5192-e1514194539207.gif
Skipping JPG: https://www.zuercher-bienenfreunde.ch/wp-content/uploads/2017/12/DSC5932-freig.jpg
Skipping PDF: https://www.zuercher-bienenfreunde.ch/wp-content/uploads/2025/02/Programm-Beraterabende-2025-v1.pdf
Skipping PDF: https://www.zuercher-bienenfreunde.ch/wp-content/uploads/2024/04/VZB_Jahresbericht_2023-v2_web.pdf

Crawling page: https://www.zuercher-bienenfreunde.ch/reservation-lbs/ (depth=1)
Checking robots.txt at: https://www.zuercher-bienenfreunde.ch/robots.txt
Can fetch https://www.zuercher-bienenfreunde.ch/reservation-lbs/: True
Found 41 links at https://www.zuercher-bienenfreunde.ch/reservation-lbs/

Crawling page: https://www.zuercher-bienenfreunde.ch/goldsiegelbestellung/ (depth=1)
Checking robots.txt at: https://www.zuercher-bienenfreunde.ch/robots.txt
Can fetch https://www.zuercher-bienenfreunde.ch/goldsiegelbestellung/: True
Found 40 links at https://www.zuer

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://www.zuercher-bienenfreunde.ch/wp-content/uploads/2017/12/DSC1491-e1514194964479.gif

Crawling page: https://www.zuercher-bienenfreunde.ch/tag/beraterabende/ (depth=1)
Checking robots.txt at: https://www.zuercher-bienenfreunde.ch/robots.txt
Can fetch https://www.zuercher-bienenfreunde.ch/tag/beraterabende/: True
Found 105 links at https://www.zuercher-bienenfreunde.ch/tag/beraterabende/

Crawling page: https://www.zuercher-bienenfreunde.ch/tag/bienenbuch/ (depth=1)
Checking robots.txt at: https://www.zuercher-bienenfreunde.ch/robots.txt
Can fetch https://www.zuercher-bienenfreunde.ch/tag/bienenbuch/: True
Found 105 links at https://www.zuercher-bienenfreunde.ch/tag/bienenbuch/

Crawling page: https://www.zuercher-bienenfreunde.ch/tag/bienenhaus/ (depth=1)
Checking robots.txt at: https://www.zuercher-bienenfreunde.ch/robots.txt
Can fetch https://www.zuercher-bienenfreunde.ch/tag/bienenhaus/: True
Found 105 links at https://www.zuercher-bienenfreunde.ch/tag/bienen

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://www.naturtalent.ch/wp-content/uploads/2024/12/naturtalent_Dach_Logos_uebersicht.png

Crawling page: https://www.naturtalent.ch/impressum/ (depth=1)
Checking robots.txt at: https://www.naturtalent.ch/robots.txt
Can fetch https://www.naturtalent.ch/impressum/: True
Found 61 links at https://www.naturtalent.ch/impressum/

Crawling page: https://www.naturtalent.ch/datenschutz/ (depth=1)
Checking robots.txt at: https://www.naturtalent.ch/robots.txt
Can fetch https://www.naturtalent.ch/datenschutz/: True
Found 67 links at https://www.naturtalent.ch/datenschutz/
Updated public/graph_data.json with 33437 nodes and 73054 edges, plus visited sets.
Updated public/graph_data.json with 33437 nodes and 73054 edges, plus visited sets.
Checking robots.txt at: https://ig-prokulturland.ch/robots.txt
Can fetch https://ig-prokulturland.ch: True

--- Crawling domain: ig-prokulturland.ch (Domain #0) ---

Crawling page: https://ig-prokulturland.ch (depth=0)
Checking robots.txt at: ht

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://zskm.ch/berufsbildung/

Crawling page: https://www.zskm.ch/feed/ (depth=1)
Checking robots.txt at: https://www.zskm.ch/robots.txt
Can fetch https://www.zskm.ch/feed/: True
Found 0 links at https://www.zskm.ch/feed/
Updated public/graph_data.json with 33491 nodes and 73212 edges, plus visited sets.
Updated public/graph_data.json with 33491 nodes and 73212 edges, plus visited sets.
Checking robots.txt at: https://agvs-upsa.ch/robots.txt
Can fetch https://agvs-upsa.ch: True
Updated public/graph_data.json with 33491 nodes and 73212 edges, plus visited sets.
Checking robots.txt at: https://klima.ur.ch/robots.txt
Can fetch https://klima.ur.ch: True

--- Crawling domain: klima.ur.ch (Domain #0) ---

Crawling page: https://klima.ur.ch (depth=0)
Checking robots.txt at: https://klima.ur.ch/robots.txt
Can fetch https://klima.ur.ch: True
Found 81 links at https://klima.ur.ch

Crawling page: https://klima.ur.ch/klimawandel/ (depth=1)
Checking robots.txt at: https://klima.ur

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://agridigital.ch/wp-content/uploads/2022/03/Selbstcheck-Charta-Digitalisierung_d.docx
Skipping PDF: https://agridigital.ch/wp-content/uploads/2018/06/Unterzeichnungsformular-Charta-Digitalisierung-d-f-i-3.pdf

Crawling page: https://agridigital.ch/traegerschaft/ (depth=1)
Checking robots.txt at: https://agridigital.ch/robots.txt
Can fetch https://agridigital.ch/traegerschaft/: True
Found 40 links at https://agridigital.ch/traegerschaft/

Crawling page: https://agridigital.ch/steigerung-der-stickstoff-n-effizienz/ (depth=1)
Checking robots.txt at: https://agridigital.ch/robots.txt
Can fetch https://agridigital.ch/steigerung-der-stickstoff-n-effizienz/: True
Found 50 links at https://agridigital.ch/steigerung-der-stickstoff-n-effizienz/

Crawling page: https://agridigital.ch/de/impressum-de (depth=1)
Checking robots.txt at: https://agridigital.ch/robots.txt
Can fetch https://agridigital.ch/de/impressum-de: True
Found 42 links at https://agridigital.ch/de/impressum-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_6602.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_6570.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_6570.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_6570.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7101.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7101.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7101.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7140.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7140.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7140.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7853.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7853.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7853.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8030.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8030.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8030.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7860.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7860.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7860.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8027.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8027.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8027.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7840.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7840.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7840.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8108.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8108.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8108.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7942.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7942.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7942.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8033.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8033.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8033.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8074.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8074.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8074.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7941.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7941.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7941.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7900.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7900.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_7900.JPEG

Crawling page: https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8136.JPEG (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8136.JPEG: True


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://vindoeuvre.ch/sites/default/files/styles/pagedesigner_default/public/2025-01/IMG_8136.JPEG

Crawling page: https://vindoeuvre.ch/de/impressum (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/de/impressum: True
Found 55 links at https://vindoeuvre.ch/de/impressum

Crawling page: https://vindoeuvre.ch/de/sitemap (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/de/sitemap: True
Found 71 links at https://vindoeuvre.ch/de/sitemap

Crawling page: https://vindoeuvre.ch/agb (depth=1)
Checking robots.txt at: https://vindoeuvre.ch/robots.txt
Can fetch https://vindoeuvre.ch/agb: True
Found 50 links at https://vindoeuvre.ch/agb
Updated public/graph_data.json with 33583 nodes and 73589 edges, plus visited sets.
Updated public/graph_data.json with 33583 nodes and 73589 edges, plus visited sets.
Updated public/graph_data.json with 33583 nodes and 73589 edges, plus visited sets.

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://regiedecker.ch/files/1568705215-qrcode-10.png

Crawling page: https://regiedecker.ch/plan-du-site-fr39.html (depth=1)
Checking robots.txt at: https://regiedecker.ch/robots.txt
Can fetch https://regiedecker.ch/plan-du-site-fr39.html: True
Found 53 links at https://regiedecker.ch/plan-du-site-fr39.html
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Checking robots.txt at: https://emoi.ch/robots.txt
Can fetch https://emoi.ch: True
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Updated public/graph_data.json with 33600 nodes and 73631 edges, plus visited sets.
Checking robots.txt at: https://cavalcade.ch/robots.txt
Can fetch https://cavalcade.ch

  soup = BeautifulSoup(resp.text, "html.parser")


Updated public/graph_data.json with 33789 nodes and 74260 edges, plus visited sets.
Updated public/graph_data.json with 33789 nodes and 74260 edges, plus visited sets.
Updated public/graph_data.json with 33789 nodes and 74260 edges, plus visited sets.
Checking robots.txt at: https://ricerca.schub.ch/robots.txt
Can fetch https://ricerca.schub.ch: True
Updated public/graph_data.json with 33789 nodes and 74260 edges, plus visited sets.
Checking robots.txt at: https://shop.agricoltura.ch/robots.txt
Can fetch https://shop.agricoltura.ch: True

--- Crawling domain: shop.agricoltura.ch (Domain #0) ---

Crawling page: https://shop.agricoltura.ch (depth=0)
Checking robots.txt at: https://shop.agricoltura.ch/robots.txt
Can fetch https://shop.agricoltura.ch: True
Found 307 links at https://shop.agricoltura.ch
Homepage has more than 200 links, treating as 'Webshop-like' and skipping BFS.
Updated public/graph_data.json with 33789 nodes and 74260 edges, plus visited sets.
Updated public/graph_data.j

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://maisondequartiersousgare.ch/download/cours-de-breakdance/

Crawling page: https://maisondequartiersousgare.ch/events/frankenstein/ (depth=1)
Checking robots.txt at: https://maisondequartiersousgare.ch/robots.txt
Can fetch https://maisondequartiersousgare.ch/events/frankenstein/: True
Found 81 links at https://maisondequartiersousgare.ch/events/frankenstein/

Crawling page: https://maisondequartiersousgare.ch/events/cette-soiree-sera-dediee-a-la-wertkritik-ou-theorie-de-la-valeur/ (depth=1)
Checking robots.txt at: https://maisondequartiersousgare.ch/robots.txt
Can fetch https://maisondequartiersousgare.ch/events/cette-soiree-sera-dediee-a-la-wertkritik-ou-theorie-de-la-valeur/: True
Found 80 links at https://maisondequartiersousgare.ch/events/cette-soiree-sera-dediee-a-la-wertkritik-ou-theorie-de-la-valeur/

Crawling page: https://maisondequartiersousgare.ch/events/piece-de-theatre-par-la-troupe-de-la-hep/ (depth=1)
Checking robots.txt at: https://maisondequarti

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://popjura.ch/wp-content/uploads/2021/09/recommandation.png

Crawling page: https://popjura.ch/2022/09/08/votation-du-15-mai-2022-2/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2022/09/08/votation-du-15-mai-2022-2/: True
Found 71 links at https://popjura.ch/2022/09/08/votation-du-15-mai-2022-2/

Crawling page: https://popjura.ch/2022/04/25/votation-du-15-mai-2022/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2022/04/25/votation-du-15-mai-2022/: True
Found 71 links at https://popjura.ch/2022/04/25/votation-du-15-mai-2022/

Crawling page: https://popjura.ch/2022/01/29/votation-du-13-fevrier-2022/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2022/01/29/votation-du-13-fevrier-2022/: True
Found 74 links at https://popjura.ch/2022/01/29/votation-du-13-fevrier-2022/

Crawling page: https://popjura.ch/wp-content/uploads/2022/01/r

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://popjura.ch/wp-content/uploads/2022/01/referendum-avs-21.png

Crawling page: https://popjura.ch/2022/01/08/non-a-un-demantelement-des-retraites-sur-le-dos-des-femmes/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2022/01/08/non-a-un-demantelement-des-retraites-sur-le-dos-des-femmes/: True
Found 73 links at https://popjura.ch/2022/01/08/non-a-un-demantelement-des-retraites-sur-le-dos-des-femmes/
Skipping JPG: https://popjura.ch/wp-content/uploads/2021/09/logo-bandeau.jpg

Crawling page: https://popjura.ch/2021/09/24/hommage-a-bernard-burkhard-et-dominique-tolotti/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2021/09/24/hommage-a-bernard-burkhard-et-dominique-tolotti/: True
Found 75 links at https://popjura.ch/2021/09/24/hommage-a-bernard-burkhard-et-dominique-tolotti/

Crawling page: https://popjura.ch/2021/09/04/recommandations-de-vote-du-pop-jura-votation-du-26-septem

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 links at https://popjura.ch/wp-content/uploads/2021/07/baniere-referendum-droit-timbre.png

Crawling page: https://popjura.ch/2021/07/08/referendum-contre-suppression-droit-de-timbre/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2021/07/08/referendum-contre-suppression-droit-de-timbre/: True
Found 73 links at https://popjura.ch/2021/07/08/referendum-contre-suppression-droit-de-timbre/

Crawling page: https://popjura.ch/2021/05/05/effectifs-a-letat-ladministration-lenseignement-naugmentent-cette-croyance/ (depth=1)
Checking robots.txt at: https://popjura.ch/robots.txt
Can fetch https://popjura.ch/2021/05/05/effectifs-a-letat-ladministration-lenseignement-naugmentent-cette-croyance/: True
Found 72 links at https://popjura.ch/2021/05/05/effectifs-a-letat-ladministration-lenseignement-naugmentent-cette-croyance/

Crawling page: https://popjura.ch/2021/04/30/promotion-jura-aupres-riches-maltraites-ailleurs/ (depth=1)
Checking robots.tx

Make Graph_data-filtered

In [1]:
import json

# Load the original graph data
with open('public/graph_data.json', 'r') as f:
    data = json.load(f)

# Filter nodes with "status" containing "Relevant"
filtered_nodes = [node for node in data['nodes'] if 'Relevant' in node['status']]

# Debugging: Print the number of filtered nodes
print(f"Number of filtered nodes: {len(filtered_nodes)}")

# Get the IDs of the filtered nodes
filtered_node_ids = {node['id'] for node in filtered_nodes}

# Debugging: Print the filtered node IDs
print(f"Filtered node IDs: {filtered_node_ids}")

# Filter edges that connect the filtered nodes
filtered_edges = [edge for edge in data['edges'] if edge['source'] in filtered_node_ids and edge['target'] in filtered_node_ids]

# Debugging: Print the number of filtered edges
print(f"Number of filtered edges: {len(filtered_edges)}")

# Create the filtered graph data
filtered_data = {
    'nodes': filtered_nodes,
    'edges': filtered_edges
}

# Write the filtered data to a new JSON file
with open('public/graph_data_filtered.json', 'w') as f:
    json.dump(filtered_data, f, indent=2)

print("Filtered graph data has been written to graph_data_filtered.json")

Number of filtered nodes: 2571
Filtered node IDs: {1, 16388, 9, 16395, 12, 8204, 20, 22, 24, 29, 8224, 16417, 8229, 16421, 8235, 53, 61, 16456, 79, 16472, 90, 94, 97, 16481, 100, 8292, 8293, 111, 8303, 125, 16510, 16512, 16513, 134, 16522, 16531, 16536, 16544, 16550, 167, 16551, 16554, 8363, 8367, 16562, 16566, 16571, 193, 8388, 16584, 8396, 16590, 16592, 16594, 214, 215, 8408, 16599, 218, 219, 8410, 16600, 223, 8416, 226, 234, 236, 8430, 239, 8431, 16622, 16625, 243, 244, 8438, 250, 253, 264, 271, 273, 275, 277, 16661, 16664, 281, 16665, 284, 8476, 8477, 287, 290, 16676, 16678, 295, 296, 300, 301, 8492, 303, 304, 16687, 306, 307, 16688, 16689, 16701, 16706, 323, 324, 16708, 330, 8522, 8523, 8524, 16722, 339, 342, 16730, 16734, 8551, 364, 16749, 8559, 8572, 8573, 16766, 16770, 8586, 8589, 8590, 8600, 16793, 8602, 415, 418, 16804, 431, 432, 8623, 8632, 8633, 8634, 452, 453, 8646, 8647, 16836, 460, 16844, 462, 8655, 464, 8657, 16848, 8659, 8660, 8661, 8662, 471, 16849, 8665, 474, 8666, 8