In [None]:
# Cell 1 - Title / Notes (Markdown)
# Use a Markdown cell with:
# # Wikipedia Philosophy Crawler
# Follow the first valid link on each article until reaching Philosophy, detecting a loop, or stalling (no valid link, fetch error, or step limit).
# Requires the packages: `requests`, `beautifulsoup4`.

# Cell 2
from __future__ import annotations

import time
import urllib.parse
from dataclasses import dataclass
from typing import Optional, Set, Tuple

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Endpoint of the English Wikipedia API; fetch_random_title and
# fetch_page_html send all of their GET requests here.
API_URL = "https://en.wikipedia.org/w/api.php"
# Title that marks a successful crawl; crawl_to_philosophy compares each
# visited title against it with a plain equality check.
PHILOSOPHY_TITLE = "Philosophy"


class WikipediaAPIError(RuntimeError):
    """Raised when the Wikipedia API returns an unexpected response.

    The fetch helpers in this notebook raise it when the decoded JSON payload
    reports an error or lacks the content they asked for.
    ``crawl_to_philosophy`` catches it while fetching pages and reports the
    message as the crawl's termination reason.
    """


@dataclass
class CrawlResult:
    """Outcome of a single crawl started from one article.

    Attributes:
        start_title: Title the crawl began from.
        degree_of_separation: Number of links followed to reach Philosophy;
            ``crawl_to_philosophy`` stores ``-1`` for every other outcome.
        path: Article titles in the order they were visited.
        termination_reason: Human-readable explanation of why the crawl ended.
    """

    start_title: str
    degree_of_separation: int
    path: Tuple[str, ...]
    termination_reason: str

    def __str__(self) -> str:
        """Render a three-line summary; the path itself is not included."""
        return (
            f"Starting page: {self.start_title}\n"
            f"Degree of separation: {self.degree_of_separation}\n"
            f"Termination reason: {self.termination_reason}"
        )


In [None]:
# Cell 3
def fetch_random_title(session: Optional[requests.Session] = None) -> str:
    """Return the title of a random article in the main namespace.

    Args:
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created for this call and closed before
            the function returns.

    Returns:
        The ``title`` of the first entry in the API's ``query.random`` list.

    Raises:
        requests.RequestException: If the HTTP request fails or the server
            answers with an error status.
        WikipediaAPIError: If the payload reports an API error or contains
            no random page.
    """
    owns_session = not session
    if owns_session:
        session = requests.Session()

    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1,
        "format": "json",
    }
    try:
        response = session.get(API_URL, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    finally:
        # Only release a session this call created; a caller-supplied session
        # stays open for reuse.
        if owns_session:
            session.close()

    # Surface the API's own error text, matching fetch_page_html.
    if "error" in data:
        raise WikipediaAPIError(data["error"].get("info", "Unknown API error."))

    random_pages = data.get("query", {}).get("random")
    if not random_pages:
        raise WikipediaAPIError("No random page returned by the Wikipedia API.")
    return random_pages[0]["title"]


def fetch_page_html(title: str, session: Optional[requests.Session] = None) -> str:
    """Return the rendered HTML for the given article title.

    The request asks the API to resolve redirects, so a title that is only a
    redirect yields the HTML of its target article instead of the short
    "Redirect to" stub (which contains no top-level paragraph or list for
    ``extract_first_link`` to search).

    Args:
        title: Article title to render.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created for this call and closed before
            the function returns.

    Returns:
        The value of ``parse.text`` from the API response.

    Raises:
        requests.RequestException: If the HTTP request fails or the server
            answers with an error status.
        WikipediaAPIError: If the payload reports an API error or has no
            ``parse.text`` entry.
    """
    owns_session = not session
    if owns_session:
        session = requests.Session()

    params = {
        "action": "parse",
        "page": title,
        "prop": "text",
        # Follow redirects server-side; first links frequently point at
        # redirect pages rather than at the canonical title.
        "redirects": 1,
        "format": "json",
        "formatversion": 2,
    }
    try:
        response = session.get(API_URL, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    finally:
        # Only release a session this call created; a caller-supplied session
        # stays open for reuse.
        if owns_session:
            session.close()

    if "error" in data:
        raise WikipediaAPIError(data["error"].get("info", "Unknown API error."))
    parse_data = data.get("parse")
    if not parse_data or "text" not in parse_data:
        raise WikipediaAPIError(f"Missing page content for '{title}'.")
    return parse_data["text"]


In [None]:
# Cell 4
def is_valid_link(tag: Tag) -> bool:
    """Decide whether ``tag`` is an anchor the crawler is allowed to follow.

    A link qualifies only when all of the following hold:

    * the element is an ``<a>`` whose ``href`` begins with ``/wiki/``;
    * the part after ``/wiki/`` contains neither ``:`` nor ``#``;
    * its ``class`` list includes neither ``new`` nor ``mw-selflink``
      (presumably missing-page and self links -- confirm against MediaWiki);
    * none of its ancestors is an ``<i>`` or ``<em>`` element.
    """
    prefix = "/wiki/"

    if tag.name != "a":
        return False

    href = tag.get("href")
    if not (href and href.startswith(prefix)):
        return False

    # Everything after the leading "/wiki/" identifies the target page.
    page_part = href[len(prefix):]
    if any(marker in page_part for marker in (":", "#")):
        return False

    css_classes = tag.get("class", [])
    if any(name in css_classes for name in ("new", "mw-selflink")):
        return False

    # Reject links rendered in italics, however deeply they are nested.
    return not any(ancestor.name in ("i", "em") for ancestor in tag.parents)


def extract_first_link(html: str) -> Optional[str]:
    """Return the title of the first valid link in the article body.

    Only ``<p>``, ``<ul>`` and ``<ol>`` elements that are direct children of
    the ``div.mw-parser-output`` container are searched, in document order.
    Text is scanned for parentheses so that links inside a parenthetical are
    skipped, and ``<sup class="reference">`` subtrees are ignored entirely.

    The parenthesis depth restarts at zero for every top-level element. An
    unbalanced "(" in one paragraph (for example a half-open interval such as
    "[0, 1)" written the other way round, or stray markup text) therefore
    cannot suppress every link in the paragraphs that follow it.

    Args:
        html: Rendered article HTML, as returned by ``fetch_page_html``.

    Returns:
        The link target with the ``/wiki/`` prefix removed, percent-escapes
        decoded and underscores replaced by spaces, or ``None`` when the
        container is missing or no qualifying link is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", class_="mw-parser-output")
    if content is None:
        return None

    def traverse(node, depth: int) -> Tuple[Optional[str], int]:
        """Scan ``node`` depth-first; return (href or None, updated paren depth)."""
        for child in node.children:
            if isinstance(child, NavigableString):
                # Track nesting of parentheses in running text; a stray ")"
                # never drives the depth below zero.
                for char in str(child):
                    if char == "(":
                        depth += 1
                    elif char == ")":
                        depth = max(depth - 1, 0)
                continue

            if isinstance(child, Tag):
                if child.name == "a" and depth == 0 and is_valid_link(child):
                    return child["href"], depth

                # Footnote markers contribute neither links nor parentheses.
                if child.name == "sup" and "reference" in child.get("class", []):
                    continue

                # The depth must flow through nested elements because a
                # parenthetical can open in text and close inside a child tag.
                candidate, depth = traverse(child, depth)
                if candidate:
                    return candidate, depth
        return None, depth

    for section in content.find_all(["p", "ul", "ol"], recursive=False):
        # Start each top-level element with a clean parenthesis count.
        candidate, _ = traverse(section, 0)
        if candidate:
            decoded = urllib.parse.unquote(candidate.split("/wiki/", 1)[1])
            return decoded.replace("_", " ")
    return None


In [None]:
# Cell 5
def crawl_to_philosophy(
    start_title: Optional[str] = None,
    max_steps: int = 100,
    delay: float = 0.1,
    session: Optional[requests.Session] = None,
) -> CrawlResult:
    """Follow the first valid link on successive pages until Philosophy or termination.

    Args:
        start_title: Article to start from. When falsy, a random article is
            chosen with ``fetch_random_title``.
        max_steps: Maximum number of links to follow (pages to fetch).
        delay: Seconds to pause between consecutive page fetches; a falsy
            value disables the pause.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            session is created for this crawl and closed before returning.

    Returns:
        A ``CrawlResult`` whose ``path`` lists every title visited. On success
        ``degree_of_separation`` is the number of links followed; for a loop,
        a fetch failure, a page without valid links, or an exhausted step
        budget it is ``-1`` and ``termination_reason`` explains why.

    Raises:
        requests.RequestException, WikipediaAPIError: Only from choosing the
            random start page; failures while fetching crawled pages are
            reported in the result instead.
    """
    owns_session = not session
    if owns_session:
        session = requests.Session()

    try:
        current_title = start_title or fetch_random_title(session=session)
        visited: Set[str] = set()
        path = [current_title]

        def finish(reason: str, reached: bool = False) -> CrawlResult:
            """Package the current path into a result; -1 marks any non-success."""
            return CrawlResult(
                start_title=path[0],
                degree_of_separation=len(path) - 1 if reached else -1,
                path=tuple(path),
                termination_reason=reason,
            )

        steps_taken = 0
        while True:
            # Classify the current page before consulting the step budget, so
            # a page reached by the final permitted step is still recognised
            # as Philosophy or as a loop instead of "maximum steps exceeded".
            if current_title == PHILOSOPHY_TITLE:
                return finish("Reached Philosophy", reached=True)
            if current_title in visited:
                return finish(f"Detected loop at '{current_title}'.")
            if steps_taken >= max_steps:
                return finish(f"Maximum step count ({max_steps}) exceeded.")

            visited.add(current_title)

            # Pause between consecutive fetches only; nothing to wait for
            # before the first one or after the last.
            if steps_taken and delay:
                time.sleep(delay)

            try:
                html = fetch_page_html(current_title, session=session)
            except (requests.RequestException, WikipediaAPIError) as exc:
                return finish(f"Failed to fetch '{current_title}': {exc}")

            next_title = extract_first_link(html)
            if not next_title:
                return finish(f"No valid links on '{current_title}'.")

            current_title = next_title
            path.append(current_title)
            steps_taken += 1
    finally:
        # Only release a session this call created; a caller-supplied session
        # stays open for reuse.
        if owns_session:
            session.close()


In [None]:
# Cell 6 - Example usage (runs real HTTP requests)
# Share one HTTP session across every request made during the crawl.
session = requests.Session()
# delay=0.0 is falsy, so the crawler does not pause between page fetches here.
result = crawl_to_philosophy(start_title="Kevin Bacon", session=session, delay=0.0)
# CrawlResult.__str__ prints the start page, degree of separation and
# termination reason (the path is printed separately below).
print(result)

print("\nTraversal path (first 15 nodes):")
# Slice the path so a long crawl does not flood the cell output.
for index, page in enumerate(result.path[:15]):
    print(f"{index:>2}: {page}")
