In [13]:
#%pip install bs4
#%pip install chardet
#%pip install networkx
#%pip install Jinja2
#%pip install pyvis
#%pip install ipysigma
#%pip install fa2
#%pip install os
#%pip install lxml


Collecting lxmlNote: you may need to restart the kernel to use updated packages.

  Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   --------------------------- ------------ 2.6/3.8 MB 16.9 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 9.9 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.3.0



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


The Crawler

In [None]:
import time
import requests
from collections import deque
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import os

################################################################################
# Global crawler data
################################################################################
nodes = []  # each: { "url": str, "id": int, "status": str }
links = []  # each: { "source": int, "target": int }

visited_domains = set()       # which unified domains we have fully crawled
visited_pages = set()         # full URLs visited
edges_set = set()             # to avoid duplicate edges
domain_crawl_count = 0
max_domain_crawl_count = 100
sleep_time = 3
negative_list = ["srf.ch", 
                 "nzz.ch", 
                 "tagesanzeiger.ch",
                 "baselwandel.ch",
                 "naturwissenschaften.ch",
                 "20min.ch",
                 "baz.ch",
                 "bzbasel.ch",
                 "klimageschichten.so.ch",
                 "daten.stadt.sg.ch",
                 "ag.ch",
                 "mathias-binswanger.ch",
                 "24heures.ch",
                 "zentrumranft.ch",
                 "redcross.ch",
                 "letemps.ch",
                 "migros-service.ch",
                 "bernerzeitung.ch",
                 "sergeandpeppers.ch",
                 "silviodezanet.ch",
                 "dergewerbeverein.ch",
                 "berneroberlaender.ch",
                 "frapp.ch",
                 "ticketcorner.ch"
                ]

positive_list = [
    "prospecierara.ch"    
    ]

################################################################################
# Update JSON file with nodes and edges
################################################################################

def load_existing_data(json_file="public/graph_data.json"):
    global nodes, links, visited_domains, visited_pages
    if os.path.exists(json_file):
        if os.path.getsize(json_file) > 0:  # Check if file is not empty
            with open(json_file, 'r') as f:
                data = json.load(f)
                # Convert "label" -> "url" if needed
                raw_nodes = data.get("nodes", [])
                for n in raw_nodes:
                    if "url" not in n and "label" in n:
                        n["url"] = n["label"]
                nodes[:] = raw_nodes
                links[:] = data.get("edges", [])

                # Load visited sets if present
                visited_domains_data = data.get("visited_domains", [])
                visited_pages_data = data.get("visited_pages", [])
                visited_domains = set(visited_domains_data) if visited_domains_data else set()
                visited_pages = set(visited_pages_data) if visited_pages_data else set()
        else:
            print(f"{json_file} is empty. Initializing data structures.")
            nodes = []
            links = []
            visited_domains = set()
            visited_pages = set()
    else:
        print(f"{json_file} does not exist. Initializing data structures.")
        nodes = []
        links = []
        visited_domains = set()
        visited_pages = set()

def generate_json_from_data(nodes, links, output_json="public/graph_data.json"):
    # Build nodes
    nodes_list = []
    for node in nodes:
        nodes_list.append({
            "id": int(node["id"]),
            "label": node["url"],
            "status": node["status"],
            "size": 3,
            "x": node.get("x", 0),
            "y": node.get("y", 0),
        })

    # Build edges
    edges_list = []
    for link in links:
        edges_list.append({
            "source": int(link["source"]),
            "target": int(link["target"])
        })

    # Convert visited sets to lists for JSON
    graph_data = {
        "nodes": nodes_list,
        "edges": edges_list,
        "visited_domains": list(visited_domains),
        "visited_pages": list(visited_pages)
    }

    os.makedirs(os.path.dirname(output_json), exist_ok=True)
    with open(output_json, 'w') as f:
        json.dump(graph_data, f, indent=4)

    print(f"Updated {output_json} with {len(nodes_list)} nodes and {len(edges_list)} edges, plus visited sets.")

def add_node_if_missing(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    new_id = len(nodes) + 1
    nodes.append({"url": domain_str, "id": new_id, "status": "Unknown"})
    return new_id

def set_node_status(domain_str, status):
    for n in nodes:
        if n["url"] == domain_str:
            n["status"] = status
            return

def get_node_id(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    raise KeyError(f"Domain not found in nodes: {domain_str}")


################################################################################
# Utility functions
################################################################################

def unify_domain(url):
    """
    Returns a domain string without scheme and without 'www.' prefix.
    Example: 'https://www.urbanagriculturebasel.ch' -> 'urbanagriculturebasel.ch'
    """
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()
    if netloc.startswith("www."):
        netloc = netloc[4:]
    return netloc

def canonical_domain(url):
    """
    For internal checks, returns the netloc in lowercase (still includes 'www.' if present).
    Used to decide if a link is internal or external within BFS.
    """
    return urlparse(url).netloc.lower()

def normalize_url(url):
    """
    Return a normalized full URL with scheme if missing, and lowercase netloc.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower() if parsed.scheme else "https"
    netloc = parsed.netloc.lower()
    return f"{scheme}://{netloc}{parsed.path}"

def can_crawl(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        robots_url = urljoin(base_url, "/robots.txt")
        print(f"Checking robots.txt at: {robots_url}")  # Debugging print

        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        can_fetch = rp.can_fetch("*", url)
        print(f"Can fetch {url}: {can_fetch}")  # Debugging print
        return can_fetch
    except Exception as e:
        print(f"Error checking robots.txt for {url}: {e}")
        return True

def detect_encoding_and_decode(raw):
    """
    Attempts to decode raw bytes, using chardet if available, otherwise utf-8 fallback.
    """
    try:
        import chardet
        result = chardet.detect(raw)
        enc = result["encoding"] or "utf-8"
        return raw.decode(enc, errors="replace")
    except ImportError:
        return raw.decode("utf-8", errors="replace")

def extract_links(url):
    if not can_crawl(url):
        print(f"Robots.txt disallows crawling => {url}")
        return []
    time.sleep(sleep_time)

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, headers=headers, timeout=7)
        resp.raise_for_status()
        content = resp.content

        # First try html.parser, then fall back to lxml
        try:
            soup = BeautifulSoup(content, "html.parser")
        except Exception as e:
            print(f"Parser error with html.parser. Falling back to 'lxml'. Error: {e}")
            soup = BeautifulSoup(content, "lxml")

        valid_links = []
        for a in soup.find_all("a", href=True):
            raw_href = a["href"]
            try:
                merged_url = urljoin(url, raw_href)
                valid_links.append(merged_url)
            except ValueError:
                continue

        return valid_links

    except requests.RequestException as e:
        print(f"Failed to extract links from {url}: {e}")
        return []
    except UnicodeDecodeError as e:
        print(f"Unicode decode error for {url}: {e}")
        return []


def extract_visible_text(html):
    """
    Remove non-visible tags like <script>, <style>, <meta>, etc.
    Then get only the text from <body>.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove tags not considered main content
    for tag_name in ["script", "style", "meta", "head", "noscript", "link"]:
        for t in soup.find_all(tag_name):
            t.decompose()

    # If you need to remove display:none elements:
    # for hidden in soup.select("[style*='display:none']"):
    #     hidden.decompose()

    # Some pages may not have a <body> tag; handle that gracefully
    body = soup.body
    if body is not None:
        text = body.get_text(separator=" ", strip=True)
    else:
        # fallback: entire soup
        text = soup.get_text(separator=" ", strip=True)
    
    return text

def contains_keyword(domain_url, keyword_list):
    """
    Check if the domain_url page text contains at least one of the given keywords 
    as a full standalone word. Use regex with negative lookbehind/lookahead 
    and print out what was matched for debugging.
    """
    if not can_crawl(domain_url):
        return False
    time.sleep(sleep_time)

    # Use the same headers for your keyword check
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        resp = requests.get(domain_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=7)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove tags containing non-user-facing text
        for tag_name in ["script", "style", "head", "title", "meta", "noscript"]:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Join all remaining text
        visible_text = ' '.join(soup.stripped_strings).lower()

        # Check if any keyword is present
        return any(kw.lower() in visible_text for kw in keyword_list)

    except Exception as e:
        print(f"contains_keyword failed for {domain_url}: {e}")
        return None

def add_node_if_missing(domain_str):
    """
    Use the 'domain_str' from unify_domain() as unique node key.
    If it doesn't exist in 'nodes', add it. Return node ID.
    """
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    new_id = len(nodes) + 1
    nodes.append({"url": domain_str, "id": new_id, "status": "Unknown"})
    return new_id

def set_node_status(domain_str, status):
    for n in nodes:
        if n["url"] == domain_str:
            n["status"] = status
            return

def get_node_id(domain_str):
    for n in nodes:
        if n["url"] == domain_str:
            return n["id"]
    raise KeyError(f"Domain not found in nodes: {domain_str}")

################################################################################
# BFS domain crawl
################################################################################
def bfs_crawl_domain(domain_str, depth=1):
    print(f"\n--- Crawling domain: {domain_str} (Domain #{domain_crawl_count}) ---")
    if domain_crawl_count > max_domain_crawl_count:
        print("Limit of crawlable domains reached. Stopping.")
        return

    start_page = f"https://{domain_str}"
    visited_pages.add(start_page)
    queue = deque([(start_page, 0)])
    
    # We assume the node for this domain already exists (e.g., after add_node_if_missing).
    try:
        source_id = get_node_id(domain_str)
    except KeyError:
        source_id = add_node_if_missing(domain_str)

    while queue:
        page_url, lvl = queue.popleft()
        if lvl > depth:
            break

        lower = page_url.lower()
        if lower.endswith(".pdf"):
            print(f"Skipping PDF: {page_url}")
            continue
        if lower.endswith(".jpg"):
            print(f"Skipping JPG: {page_url}")
            continue
        if lower.endswith(".mp4"):
            print(f"Skipping MP4: {page_url}")
            continue

        print(f"\nCrawling page: {page_url} (depth={lvl})")
        links_found = extract_links(page_url)
        print(f"Found {len(links_found)} links at {page_url}")

        # If it's the homepage (lvl=0) and we see > 200 links, classify + skip further
        if lvl == 0 and len(links_found) > 200:
            print("Homepage has more than 200 links, treating as 'Webshop-like' and skipping BFS.")
            set_node_status(domain_str, "Relevant, but possibly Webshop-like with too many links")
            break

        for link in links_found:
            link_unified = unify_domain(link).strip()
            if not link_unified:
                # Domain is empty or invalid
                continue

            if link_unified != domain_str:
                # External link => only add node and link if you actually want it
                try:
                    target_id = get_node_id(link_unified)
                except KeyError:
                    target_id = add_node_if_missing(link_unified)
                if (source_id, target_id) not in edges_set:
                    edges_set.add((source_id, target_id))
                    links.append({"source": source_id, "target": target_id})

            else:
                # Internal link => BFS deeper
                norm = normalize_url(link)
                if norm not in visited_pages and lvl < depth:
                    visited_pages.add(norm)
                    queue.append((norm, lvl + 1))

    # Finally, update the JSON output after finishing this domain
    generate_json_from_data(nodes, links, "public/graph_data.json")

################################################################################
# Main
################################################################################
def main():
    keyword_list = ["landwirtschaft", "landwirtschaftlich","agriculture","agricoltura","farming","agrar","fattoria","agricole","ferme","paysan","plouc","bauer"]

    # Load existing data from JSON
    load_existing_data()

    start_url = "https://ticketcorner.ch/"
    start_unified = unify_domain(start_url)
    add_node_if_missing(start_unified)

    # If domain is in positive_list, skip keyword check => mark Relevant + BFS
    if any(pdom in start_unified for pdom in positive_list):
        set_node_status(start_unified, "Relevant")
        bfs_crawl_domain(start_unified, depth=1)
    elif any(nd in start_unified for nd in negative_list):
        set_node_status(start_unified, "Negativliste")
    elif start_unified.endswith(".ch"):
        keyword_check = contains_keyword(start_url, keyword_list)
        if keyword_check is None:
            set_node_status(start_unified, "could not test keyword")
        elif keyword_check:
            set_node_status(start_unified, "Relevant")
            bfs_crawl_domain(start_unified, depth=1)
        else:
            set_node_status(start_unified, "Kein Bezug zu Landwirtschaft")
    else:
        set_node_status(start_unified, "Nicht in der Schweiz")

    idx = 0
    while idx < len(nodes) and domain_crawl_count < max_domain_crawl_count:
        node = nodes[idx]
        idx += 1

        if node["url"] in visited_domains:
            continue
        
        if node["status"] in ("Start", "Relevant"):
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        dom_str = node["url"]

        if dom_str in visited_domains:
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        if any(nd in dom_str for nd in negative_list):
            set_node_status(dom_str, "Negativliste")
            visited_domains.add(dom_str)  # Mark visited
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            continue

        if any(pdom in dom_str for pdom in positive_list):
            set_node_status(dom_str, "Relevant") 
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
            bfs_crawl_domain(dom_str, depth=1)
            continue

        if dom_str.endswith(".ch"):
            keyword_check = contains_keyword(f"https://{dom_str}", keyword_list)
            if keyword_check is None:
                set_node_status(dom_str, "could not test keyword")
            elif keyword_check:
                set_node_status(dom_str, "Relevant")
                bfs_crawl_domain(dom_str, depth=1)
            else:
                set_node_status(dom_str, "Kein Bezug zu Landwirtschaft")
            visited_domains.add(dom_str)  # Mark visited here as well
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON
        else:
            set_node_status(dom_str, "Nicht in der Schweiz")
            visited_domains.add(dom_str)  # Mark visited
            generate_json_from_data(nodes, links, 'public/graph_data.json')  # Update JSON


    





if __name__ == "__main__":
    main()

Checking robots.txt at: https://ticketcorner.ch/robots.txt


Make Graph_data-filtered

In [1]:
import json

# Load the original graph data
with open('public/graph_data.json', 'r') as f:
    data = json.load(f)

# Filter nodes with "status" containing "Relevant"
filtered_nodes = [node for node in data['nodes'] if 'Relevant' in node['status']]

# Debugging: Print the number of filtered nodes
print(f"Number of filtered nodes: {len(filtered_nodes)}")

# Get the IDs of the filtered nodes
filtered_node_ids = {node['id'] for node in filtered_nodes}

# Debugging: Print the filtered node IDs
print(f"Filtered node IDs: {filtered_node_ids}")

# Filter edges that connect the filtered nodes
filtered_edges = [edge for edge in data['edges'] if edge['source'] in filtered_node_ids and edge['target'] in filtered_node_ids]

# Debugging: Print the number of filtered edges
print(f"Number of filtered edges: {len(filtered_edges)}")

# Create the filtered graph data
filtered_data = {
    'nodes': filtered_nodes,
    'edges': filtered_edges
}

# Write the filtered data to a new JSON file
with open('public/graph_data_filtered.json', 'w') as f:
    json.dump(filtered_data, f, indent=2)

print("Filtered graph data has been written to graph_data_filtered.json")

Number of filtered nodes: 1925
Filtered node IDs: {1, 9, 12, 8204, 20, 22, 24, 29, 8224, 8229, 8235, 53, 61, 79, 90, 94, 97, 100, 8292, 8293, 111, 8303, 125, 134, 167, 8363, 8367, 193, 8388, 8396, 214, 215, 8408, 218, 219, 8410, 223, 8416, 226, 234, 236, 8430, 239, 8431, 243, 244, 8438, 250, 253, 264, 271, 273, 275, 277, 281, 284, 8476, 8477, 287, 290, 295, 296, 300, 301, 8492, 303, 304, 306, 307, 323, 324, 330, 8522, 8523, 8524, 339, 342, 8551, 364, 8559, 8572, 8573, 8586, 8589, 8590, 8600, 8602, 415, 418, 431, 432, 8623, 8632, 8633, 8634, 452, 453, 8646, 8647, 460, 462, 8655, 464, 8657, 8659, 8660, 8661, 8662, 471, 8665, 474, 8666, 8668, 478, 484, 8676, 8693, 506, 8700, 8701, 8702, 519, 520, 8712, 8713, 533, 535, 536, 8744, 8763, 577, 586, 8779, 8781, 591, 592, 594, 596, 597, 8790, 599, 8791, 601, 8805, 616, 8809, 8814, 8815, 625, 8819, 635, 636, 8839, 649, 8845, 667, 8866, 8868, 684, 685, 8878, 8880, 701, 702, 703, 706, 8898, 708, 709, 714, 8908, 718, 719, 8910, 8913, 8915, 8916, 89