In [72]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# img = mpimg.imread('erdos.jpg')
# imgplot = plt.imshow(img)
plt.show()

## Background 

    Starting from any random Wikipedia page, following the first* page link of the current page will eventually lead you to the "Philosophy" page of Wikipedia (~95% of the time).
     
    * Due to the structure of Wikipedia articles, the first valid link must NOT:
        
        be in either parentheses (usually language pages) or italics (usually disambiguations)
        
        be a link to either a meta page, a page outside Wikipedia, or a broken link
        
        be an in-page citation
        
        
        

## Basic Goal

    Make a webcrawler that starts from a random Wikipedia page, and follows the first page link until it either finds the Philosophy page, a page with no links, or loops back to a previously visited link.
    
    Return the name of the starting page, and the degrees of separation from the Philosophy page (set degree to -1 if terminates otherwise)

        Ex. the page "Kevin Bacon" should have a degree of separation of 12
        
    NOTE: For crawling, use the Wikipedia API. 
    
    You can start from a random Wikipedia page with a URL (https://en.wikipedia.org/wiki/Special:Random)   
    
   

## Advanced Goals

    1.) Get the degrees of separation for 1000 random pages. Find the median degree of separation as well as the distribution.
    
    2.) Considering your results, and the size of the English Wikipedia, estimate how many pages there are of degree 6.
    
    3.) Try the Cebuano Wikipedia. Following the same rules above, determine the network structure. 
    Is there is a page a page that all others tend to lead back to? 
    Does it tend to loop more? 
    In any case, are the degrees generally shorter/longer?
    
        

In [73]:
import requests

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "Grenada",
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

print(DATA)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Wikipedia "Philosophy" Crawler (Final Fixed Version)
# Author: [Your Name]
# Course: Data Mining & Wrangling
# Description: Starts from a Wikipedia page and follows the first valid link until reaching "Philosophy"

import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import matplotlib.pyplot as plt

# === Setup ===
S = requests.Session()
API_URL = "https://en.wikipedia.org/w/api.php"
PHILOSOPHY_PAGE = "Philosophy"

# === Function: Get HTML via Wikipedia API ===
def get_page_html(title):
    """Fetch HTML content for a given Wikipedia page title using the API."""
    params = {"action": "parse", "page": title, "format": "json", "prop": "text"}
    try:
        response = S.get(url=API_URL, params=params, timeout=10)
        response.raise_for_status()
        if response.text.strip() == "":
            return None
        data = response.json()
    except Exception as e:
        print(f"[Error fetching {title}]: {e}")
        return None

    if "error" in data:
        return None
    return data["parse"]["text"]["*"]

# === Function: Extract the First Valid Wikipedia Link ===
def get_first_valid_link_from_html(html):
    """
    Extracts the first valid link from a Wikipedia HTML page.
    Rules:
      - Skip links in parentheses, italics, superscripts, or metadata.
      - Ignore anchors, files, or help pages.
    """
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", class_="mw-parser-output")
    if not content:
        return None

    # Remove unwanted tags before parsing links
    for tag in content.find_all(["table", "i", "em", "sup", "span", "small", "style"]):
        tag.decompose()

    # Scan paragraph text
    paragraphs = content.find_all("p", recursive=True)
    for p in paragraphs:
        if not p.text.strip():
            continue

        # Remove text inside parentheses
        text_no_parens = re.sub(r"\([^()]*\)", "", p.text)

        for a in p.find_all("a", href=True):
            href = a["href"]

            # Must be a valid internal wiki link
            if not href.startswith("/wiki/"):
                continue

            # Skip meta or anchor links
            if any(prefix in href for prefix in [":", "#"]):
                continue

            # Skip if link is in italics or superscripts
            if any(parent.name in ["i", "em", "sup", "small"] for parent in a.parents):
                continue

            # First valid link found
            return href.split("/wiki/")[-1]

    return None

# === Function: Follow the First Link Chain ===
def follow_to_philosophy(start_title="Grenada", max_steps=100):
    visited = set()
    current_title = start_title
    steps = 0

    while steps < max_steps:
        if current_title in visited:
            return start_title, -1, "Loop detected"
        visited.add(current_title)

        print(f"Step {steps}: {current_title}")

        if current_title.lower() == PHILOSOPHY_PAGE.lower():
            return start_title, steps, "Reached Philosophy"

        html = get_page_html(current_title)
        if not html:
            return start_title, -1, "Page fetch failed"

        next_title = get_first_valid_link_from_html(html)
        if not next_title:
            return start_title, -1, "No valid links found"

        current_title = next_title
        steps += 1
        time.sleep(0.6)  # polite delay to avoid throttling

    return start_title, -1, "Exceeded max steps"

# === Function: Get a Random Wikipedia Page Title ===
def get_random_wikipedia_title():
    """Returns a random article title from Wikipedia."""
    params = {
        "action": "query",
        "format": "json",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1
    }
    try:
        r = S.get(API_URL, params=params, timeout=10)
        data = r.json()
        return data["query"]["random"][0]["title"]
    except Exception as e:
        print(f"[Random page error]: {e}")
        return None

# === Main Execution ===
num_pages = 5  # Reduce to 5 for testing; can increase later
results = []

for i in range(num_pages):
    title = get_random_wikipedia_title()
    if not title:
        continue
    print(f"\n=== Random Page {i+1}/{num_pages}: {title} ===")
    start, degree, status = follow_to_philosophy(title)
    results.append({"start_page": start, "degree": degree, "status": status})

# === Results DataFrame & Visualization ===
df = pd.DataFrame(results)
print("\n--- Summary Table ---")
print(df)

valid_degrees = df[df["degree"] >= 0]["degree"]
if not valid_degrees.empty:
    plt.figure(figsize=(8,5))
    plt.hist(valid_degrees, bins=10, edgecolor="black")
    plt.title("Distribution of Degrees to Reach Philosophy")
    plt.xlabel("Degrees of Separation")
    plt.ylabel("Count")
    plt.grid(alpha=0.3)
    plt.show()
    print(f"\nMedian Degrees of Separation: {valid_degrees.median()}")
else:
    print("\nNo valid degree data available.")

print("\nStatus counts:\n", df["status"].value_counts())


[Random page error]: Expecting value: line 1 column 1 (char 0)
[Random page error]: Expecting value: line 1 column 1 (char 0)
[Random page error]: Expecting value: line 1 column 1 (char 0)
[Random page error]: Expecting value: line 1 column 1 (char 0)
[Random page error]: Expecting value: line 1 column 1 (char 0)

--- Summary Table ---
Empty DataFrame
Columns: []
Index: []


KeyError: 'degree'

In [69]:
headers = {
    'User-Agent': 'JOJIE-jbautista'
}

requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'prop': 'links',
        'titles': 'Emu_War',
        'pllimit': 500,
        'format': 'xml',
    },
    headers=headers
).text

'<?xml version="1.0"?><api batchcomplete=""><query><normalized><n from="Emu_War" to="Emu War" /></normalized><pages><page _idx="1274780" pageid="1274780" ns="0" title="Emu War"><links><pl ns="0" title="1933 Western Australian secession referendum" /><pl ns="0" title="Ambush" /><pl ns="0" title="Australia" /><pl ns="0" title="Australian Army" /><pl ns="0" title="Australian Light Horse" /><pl ns="0" title="Australian Senate" /><pl ns="0" title="Bibcode (identifier)" /><pl ns="0" title="Birds of Western Australia (book)" /><pl ns="0" title="Brumby shooting" /><pl ns="0" title="Campion, Western Australia" /><pl ns="0" title="Chandler, Western Australia" /><pl ns="0" title="Cinematographer" /><pl ns="0" title="Coolgardie Miner" /><pl ns="0" title="Cornell University Press" /><pl ns="0" title="Dam" /><pl ns="0" title="Dingo" /><pl ns="0" title="Dingo Fence" /><pl ns="0" title="Doi (identifier)" /><pl ns="0" title="Dominic Serventy" /><pl ns="0" title="Emu" /><pl ns="0" title="Expanding bulle

In [89]:
import requests
import pandas as pd
from urllib.parse import urlparse

headers = {
    "User-Agent": "JOJIE-jbautista"
}

def normalize_wiki_target(target: str) -> str:
    """
    Accept either a full Wikipedia article URL or a bare title and
    return the API-ready page title.
    """
    if target.lower().startswith("http"):
        parsed = urlparse(target)
        if parsed.netloc.endswith("wikipedia.org"):
            path = parsed.path  # e.g. /wiki/Emu_War
            if path.startswith("/wiki/") and len(path) > len("/wiki/"):
                return path.split("/wiki/", 1)[1]
    return target.replace(" ", "_")

# Choose the page to inspect: plain title or full URL both work
page_title = normalize_wiki_target("https://en.wikipedia.org/wiki/Emu_War")
page_title = normalize_wiki_target("https://en.wikipedia.org/wiki/Kevin_Bacon")
# page_title = normalize_wiki_target("Emu War")

params = {
    "action": "query",
    "prop": "links",
    "titles": page_title,
    "pllimit": 500,
    "format": "json"
}

response = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers=headers)

if response.status_code == 200 and "application/json" in response.headers.get("Content-Type", ""):
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    all_links = [
        link["title"]
        for page_info in pages.values()
        if "links" in page_info
        for link in page_info["links"]
    ]
    df = pd.DataFrame(all_links, columns=["Linked Pages"])
    print(df)
else:
    print("Error: Could not get valid JSON response")
    print("Response Code:", response.status_code)
    print("Response Content:", response.text[:500])

df = pd.DataFrame(response)
df


                                          Linked Pages
0       39th Santa Barbara International Film Festival
1       40th Santa Barbara International Film Festival
2                                             8 (play)
3                                       A Few Good Men
4                                A Few Good Men (film)
..                                                 ...
328  Template talk:Golden Globe Best Actor TV Minis...
329                          Template talk:Kevin Bacon
330  Template talk:Saturn Award for Best Actor on T...
331  Template talk:ScreenActorsGuildAward MaleTVMin...
332                             Help:Authority control

[333 rows x 1 columns]


Unnamed: 0,0
0,"b'{""batchcomplete"":"""",""query"":{""normalized"":[{..."
1,"b'itle"":""Kevin Bacon"",""links"":[{""ns"":0,""title""..."
2,"b'ra International Film Festival""},{""ns"":0,""ti..."
3,"b'film)""},{""ns"":0,""title"":""Access Hollywood""},..."
4,"b'ickman""},{""ns"":0,""title"":""Albert Finney""},{""..."
...,...
101,b'te:Saturn Award for Best Actor on Television...
102,"b'1,""title"":""Template talk:American Riviera Aw..."
103,"b'""ns"":11,""title"":""Template talk:Golden Globe ..."
104,"b'11,""title"":""Template talk:Saturn Award for B..."


Working Script but does not hit Philosophy

In [95]:
import requests
from bs4 import BeautifulSoup
import time

# --- Session setup with a custom User-Agent (important for Wikipedia API) ---
S = requests.Session()
S.headers.update({'User-Agent': 'JOJIE-jbautista'})

BASE_URL = "https://en.wikipedia.org"
PHILOSOPHY_PAGE = "Philosophy"

# --- Get a random Wikipedia page title ---
def get_random_page_title():
    resp = S.get("https://en.wikipedia.org/wiki/Special:Random", allow_redirects=True)
    # resp = S.get("https://en.wikipedia.org/wiki/Animal", allow_redirects=True)
    return resp.url.split("/wiki/")[-1]

# --- Get first valid link from a Wikipedia page ---
def get_first_valid_link(title):
    url = f"{BASE_URL}/wiki/{title}"
    resp = S.get(url)
    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find("div", class_="mw-parser-output")
    if not content:
        return None

    # Clean up: remove tables, italics, and small metadata sections
    for tag in content.find_all(["table", "i", "em", "span", "sup", "small"]):
        tag.decompose()

    # Find first valid link in a paragraph
    for paragraph in content.find_all("p", recursive=True):
        if not paragraph.text.strip():
            continue

        # Ignore text inside parentheses
        for a in paragraph.find_all("a", href=True):
            href = a["href"]
            # Must be a valid wiki article link (not Help:, File:, #, etc.)
            if not href.startswith("/wiki/") or any(prefix in href for prefix in [":", "#"]):
                continue

            # Skip links inside italics or superscripts
            if any(parent.name in ["i", "em", "sup", "small"] for parent in a.parents):
                continue

            return href.split("/wiki/")[-1]
    return None

# --- Main crawler function ---
def follow_to_philosophy(start_title=None, max_steps=100):
    if not start_title:
        start_title = get_random_page_title()

    visited = set()
    current_title = start_title
    steps = 0

    print(f"\nStarting from: {current_title}")

    while steps < max_steps:
        if current_title in visited:
            print("Loop detected!")
            return start_title, -1
        visited.add(current_title)

        print(f"Step {steps}: {current_title}")
        if current_title.lower() == PHILOSOPHY_PAGE.lower():
            print("Reached Philosophy!")
            return start_title, steps

        next_title = get_first_valid_link(current_title)
        if not next_title:
            print("No valid links found — dead end.")
            return start_title, -1

        current_title = next_title
        steps += 1
        time.sleep(0.5)  # polite delay

    print("Exceeded max steps.")
    return start_title, -1

# --- Run a single test ---
start, degree = follow_to_philosophy()
print(f"\nResult: Started from '{start}', Degrees of Separation = {degree}")



Starting from: 15_Again
Step 0: 15_Again
Step 1: Electronica
Step 2: Electronic_music
Step 3: Music_genre
Step 4: Music
No valid links found — dead end.

Result: Started from '15_Again', Degrees of Separation = -1


In [13]:
import requests
import re
import time

# --- Session setup ---
S = requests.Session()
S.headers.update({'User-Agent': 'JOJIE-jbautista'})

BASE_URL = "https://en.wikipedia.org"
PHILOSOPHY_PAGE = "Philosophy"

# --- Always start from Kevin Bacon ---
def get_random_page_title():
    resp = S.get("https://en.wikipedia.org/wiki/Kevin_Bacon", allow_redirects=True)
    resp = S.get("https://en.wikipedia.org/wiki/Whale", allow_redirects=True)
    # resp = S.get("https://en.wikipedia.org/wiki/Animal", allow_redirects=True)
    return resp.url.split("/wiki/")[-1]

# --- Regex patterns ---
# Basic pattern for <a href="/wiki/...">text</a>
link_pattern = re.compile(
    r'<a href="(/wiki/[^"#:]+)"[^>]*>(.*?)</a>',
    re.IGNORECASE | re.DOTALL
)

# Pattern to strip HTML tags (for text-only cleanup)
tag_cleaner = re.compile(r'<[^>]+>')

# --- Find first valid link in HTML using regex ---
def get_first_valid_link_regex(html: str) -> str | None:
    """
    Return the first valid /wiki/... link in the article HTML,
    starting after the bolded name (</b>), skipping <figcaption> sections,
    italicized links, and ignoring links inside parentheses.
    """

    # Focus only on the main article body
    m = re.search(r'<div id="mw-content-text"[^>]*>(.*?)<div id="mw-navigation"', html, flags=re.DOTALL)
    if m:
        html = m.group(1)

    # 🔹 Start parsing after the bolded name (</b>)
    marker_index = html.find("</b>")
    if marker_index != -1:
        html = html[marker_index + len("</b>"):]

    # 🔹 Remove all <figcaption>...</figcaption> sections
    html = re.sub(r'<figcaption.*?</figcaption>', '', html, flags=re.DOTALL | re.IGNORECASE)

    # 🔹 Remove all italicized sections (<i>...</i> and <em>...</em>)
    html = re.sub(r'<i.*?</i>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<em.*?</em>', '', html, flags=re.DOTALL | re.IGNORECASE)

    # Limit to lead section (before first <h2>)
    lead_section = html.split("<h2")[0]

    # Remove citation superscripts
    lead_section = re.sub(r'<sup.*?</sup>', '', lead_section, flags=re.DOTALL)

    paren_depth = 0
    pos = 0

    for match in link_pattern.finditer(lead_section):
        href, text = match.groups()
        before = lead_section[pos:match.start()]
        pos = match.end()

        # Track parentheses nesting
        for ch in before:
            if ch == "(":
                paren_depth += 1
            elif ch == ")":
                paren_depth = max(paren_depth - 1, 0)

        # Only consider links outside parentheses
        if paren_depth == 0:
            clean_text = tag_cleaner.sub('', text).strip()
            if clean_text:
                # Skip irrelevant or self links
                if "Main_Page" in href or href.lower().endswith("kevin_bacon"):
                    continue
                return href.split("/wiki/")[-1]

    return None




# --- Get first valid link from a Wikipedia title ---
def get_first_valid_link(title: str) -> str | None:
    url = f"{BASE_URL}/wiki/{title}"
    r = S.get(url)
    if r.status_code != 200:
        print(f"[HTTP {r.status_code}] {url}")
        return None

    html = r.text
    link = get_first_valid_link_regex(html)
    return link

# --- Follow links until reaching Philosophy ---
def follow_to_philosophy(start_title=None, max_steps=100):
    if not start_title:
        start_title = get_random_page_title()

    visited = set()
    current = start_title
    steps = 0
    path = [current]

    print(f"\nStarting from: {current}")

    while steps < max_steps:
        if current in visited:
            print("Loop detected!")
            return start_title, -1, path
        visited.add(current)

        print(f"Step {steps}: {current}")
        if current.lower() == PHILOSOPHY_PAGE.lower():
            print("Reached Philosophy!")
            return start_title, steps, path

        next_title = get_first_valid_link(current)
        if not next_title:
            print("No valid links found — dead end.")
            return start_title, -1, path

        if steps == 0:
            print(f"First link found from {current} → {next_title}")

        path.append(next_title)
        current = next_title
        steps += 1
        time.sleep(0.4)

    print("Exceeded max steps.")
    return start_title, -1, path

# --- Run the crawler ---
start_title = get_random_page_title()
start, degree, path = follow_to_philosophy(start_title)

print(f"\nResult: Started from '{start}', Degrees of Separation = {degree}")
print("Full Path:")
print(" → ".join(path))



Starting from: Whale
Step 0: Whale
First link found from Whale → Aquatic_animal
Step 1: Aquatic_animal
Step 2: Animal
Step 3: Multicellular
Step 4: Organism
Step 5: Life
Step 6: Matter
Step 7: Mass
Step 8: Analytical_mechanics
Step 9: Lagrangian_mechanics
Loop detected!

Result: Started from 'Whale', Degrees of Separation = -1
Full Path:
Whale → Aquatic_animal → Animal → Multicellular → Organism → Life → Matter → Mass → Analytical_mechanics → Lagrangian_mechanics → Analytical_mechanics


Random Slop

In [91]:
import requests
import re
import time

# --- Session setup ---
S = requests.Session()
S.headers.update({'User-Agent': 'JOJIE-jbautista'})

BASE_URL = "https://en.wikipedia.org"
PHILOSOPHY_PAGE = "Philosophy"

# --- Always start from Kevin Bacon ---
def get_random_page_title():
    resp = S.get("https://en.wikipedia.org/wiki/Kevin_Bacon", allow_redirects=True)
    return resp.url.split("/wiki/")[-1]

# --- Regex patterns ---
# Basic pattern for <a href="/wiki/...">text</a>
link_pattern = re.compile(
    r'<a href="(/wiki/[^"#:]+)"[^>]*>(.*?)</a>',
    re.IGNORECASE | re.DOTALL
)

# Pattern to strip HTML tags (for text-only cleanup)
tag_cleaner = re.compile(r'<[^>]+>')

# --- Find first valid link in HTML using regex ---
def get_first_valid_link_regex(html: str) -> str | None:
    """
    Return the first valid /wiki/... link in the lead section of a Wikipedia article,
    skipping links inside parentheses.
    """
    # Focus only on the article content (ignore navigation/sidebar/header)
    m = re.search(r'<div id="mw-content-text"[^>]*>(.*?)<div id="mw-navigation"', html, flags=re.DOTALL)
    if m:
        html = m.group(1)
    else:
        # fallback: if pattern not found, use entire HTML
        pass

    # Limit search to lead section (before first <h2>)
    lead_section = html.split("<h2")[0]

    # Remove citation superscripts
    lead_section = re.sub(r'<sup.*?</sup>', '', lead_section, flags=re.DOTALL)

    paren_depth = 0
    pos = 0

    for match in link_pattern.finditer(lead_section):
        href, text = match.groups()
        before = lead_section[pos:match.start()]
        pos = match.end()

        for ch in before:
            if ch == "(":
                paren_depth += 1
            elif ch == ")":
                paren_depth = max(paren_depth - 1, 0)

        if paren_depth == 0:
            clean_text = tag_cleaner.sub('', text).strip()
            if clean_text:
                # skip Main_Page or other irrelevant links
                if href.endswith("Main_Page"):
                    continue
                return href.split("/wiki/")[-1]
    return None


# --- Get first valid link from a Wikipedia title ---
def get_first_valid_link(title: str) -> str | None:
    url = f"{BASE_URL}/wiki/{title}"
    r = S.get(url)
    if r.status_code != 200:
        print(f"[HTTP {r.status_code}] {url}")
        return None

    html = r.text
    link = get_first_valid_link_regex(html)
    return link

# --- Follow links until reaching Philosophy ---
def follow_to_philosophy(start_title=None, max_steps=100):
    if not start_title:
        start_title = get_random_page_title()

    visited = set()
    current = start_title
    steps = 0
    path = [current]

    print(f"\nStarting from: {current}")

    while steps < max_steps:
        if current in visited:
            print("Loop detected!")
            return start_title, -1, path
        visited.add(current)

        print(f"Step {steps}: {current}")
        if current.lower() == PHILOSOPHY_PAGE.lower():
            print("Reached Philosophy!")
            return start_title, steps, path

        next_title = get_first_valid_link(current)
        if not next_title:
            print("No valid links found — dead end.")
            return start_title, -1, path

        if steps == 0:
            print(f"First link found from {current} → {next_title}")

        path.append(next_title)
        current = next_title
        steps += 1
        time.sleep(0.4)

    print("Exceeded max steps.")
    return start_title, -1, path

# --- Run the crawler ---
start_title = get_random_page_title()
start, degree, path = follow_to_philosophy(start_title)

print(f"\nResult: Started from '{start}', Degrees of Separation = {degree}")
print("Full Path:")
print(" → ".join(path))



Starting from: Kevin_Bacon
Step 0: Kevin_Bacon
No valid links found — dead end.

Result: Started from 'Kevin_Bacon', Degrees of Separation = -1
Full Path:
Kevin_Bacon


Random slop 2

In [3]:
import requests
import time
from bs4 import BeautifulSoup

S = requests.Session()
S.headers.update({'User-Agent': 'JOJIE-jbautista'})

BASE_URL = "https://en.wikipedia.org"
PHILOSOPHY_PAGE = "Philosophy"

def get_random_page_title():
    resp = S.get("https://en.wikipedia.org/wiki/Kevin_Bacon", allow_redirects=True)
    return resp.url.split("/wiki/")[-1]

def get_first_valid_link_regex(html: str) -> str | None:
    """
    Return the first valid /wiki/... link in the HTML,
    starting only after the first <tbody> and skipping links inside parentheses.
    """
    # Find the position after the first <tbody>
    tbody_index = html.find("<tbody>")
    if tbody_index != -1:
        html = html[tbody_index:]  # Trim everything before <tbody>

    # Remove citation superscripts
    html = re.sub(r'<sup.*?</sup>', '', html, flags=re.DOTALL)

    # Limit to content before first <h2> if you still want to stop at lead section
    lead_section = html.split("<h2")[0]

    paren_depth = 0
    pos = 0

    for match in link_pattern.finditer(lead_section):
        href, text = match.groups()
        before = lead_section[pos:match.start()]
        pos = match.end()

        # Track parentheses nesting
        for ch in before:
            if ch == "(":
                paren_depth += 1
            elif ch == ")":
                paren_depth = max(paren_depth - 1, 0)

        if paren_depth == 0:
            clean_text = tag_cleaner.sub('', text).strip()
            if clean_text:
                # skip links to Main_Page or Help/Portal/Filespaces
                if any(x in href for x in ["Main_Page", "Help:", "File:", "Portal:"]):
                    continue
                return href.split("/wiki/")[-1]

    return None


def follow_to_philosophy(start_title=None, max_steps=100):
    if not start_title:
        start_title = get_random_page_title()

    visited = set()
    current = start_title
    steps = 0
    path = [current]

    print(f"\nStarting from: {current}")

    while steps < max_steps:
        if current in visited:
            print("Loop detected!")
            return start_title, -1, path
        visited.add(current)

        print(f"Step {steps}: {current}")
        if current.lower() == PHILOSOPHY_PAGE.lower():
            print("Reached Philosophy!")
            return start_title, steps, path

        next_title = get_first_valid_link(current)
        if not next_title:
            print("No valid links found — dead end.")
            return start_title, -1, path

        if steps == 0:
            print(f"First link found from {current} → {next_title}")

        path.append(next_title)
        current = next_title
        steps += 1
        time.sleep(0.4)

    print("Exceeded max steps.")
    return start_title, -1, path

# --- Run the crawler ---
start_title = get_random_page_title()
start, degree, path = follow_to_philosophy(start_title)

print(f"\nResult: Started from '{start}', Degrees of Separation = {degree}")
print("Full Path:")
print(" → ".join(path))



Starting from: Kevin_Bacon
Step 0: Kevin_Bacon
No valid links found — dead end.

Result: Started from 'Kevin_Bacon', Degrees of Separation = -1
Full Path:
Kevin_Bacon
