<a href="https://colab.research.google.com/github/EmirhanEge1/Wikipedia-Speedrunner/blob/main/Speedrun_Wiki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U sentence-transformers wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.9.0.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.9.0-py3-none-any.whl size=15422 sha256=c6f75a3a09d5a5f372457d8c23d942a1a10eb0c24435eb544bd9d73cf8c70790
  Stored in directory: /root/.cache/pip/wheels/08/22/bd/5181c75f59d48538eb0c0f3246ac541b8a3f0bce3bfd097047
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.9.0


In [None]:
import wikipediaapi
import time
import heapq
import torch
import pandas as pd
import random
import re
from sentence_transformers import SentenceTransformer, util

# Pick the compute device: GPU when torch reports CUDA, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence-embedding model used to score link titles against the target title.
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# English Wikipedia client; every page lookup in this notebook goes through it.
wiki_en = wikipediaapi.Wikipedia(
    language='en',
    user_agent='DataEngineerBot/1.0 (example@email.com)',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)




In [3]:

def is_valid_link(link):
    """Return True unless ``link`` begins with one of the excluded prefixes.

    Args:
        link: A page title (string).

    Returns:
        False when the title starts with any prefix in the tuple below,
        True otherwise. The comparison is case-sensitive.
    """
    excluded_prefixes = (
        "Category:", "File:", "Help:", "Template:",
        "Talk:", "Portal:", "Special:", "Wikipedia:",
    )
    # str.startswith accepts a tuple and matches if any member is a prefix.
    return not link.startswith(excluded_prefixes)

def fast_pre_filter(links, target_page, top_n=50):
    """Keep the ``top_n`` links whose titles share the most words with the target.

    The target title is lower-cased and split into its ``\\w+`` tokens. A link
    scores one point for every distinct target token that occurs as a
    *substring* of the lower-cased link title (so "art" also matches "party").

    Args:
        links: Iterable of link titles (strings).
        target_page: Title of the page being searched for.
        top_n: Maximum number of links to return.

    Returns:
        A list of at most ``top_n`` link titles ordered by descending score.
        Links with equal scores keep their original relative order, because
        the sort is stable.
    """
    target_words = set(re.findall(r'\w+', target_page.lower()))

    def overlap(link):
        # Number of distinct target tokens contained in this link title.
        link_lower = link.lower()
        return sum(1 for word in target_words if word in link_lower)

    ranked = sorted(links, key=overlap, reverse=True)
    return ranked[:top_n]

In [4]:
def start_wikipedia_speedrunner(start_page, target_page, timeout=60):
    """Best-first search for a chain of Wikipedia links from start to target.

    Pages are expanded in order of a priority built from the embedding
    similarity between each link title and the target title, plus keyword and
    ranking bonuses and minus a depth penalty. The search uses the module-level
    ``model`` (sentence embeddings) and ``wiki_en`` (Wikipedia client).

    Args:
        start_page: Title of the page to start from.
        target_page: Title of the page to reach (compared case-insensitively).
        timeout: Time budget in seconds. It is checked once per expanded page,
            so a single iteration may run past it.

    Returns:
        A dict with the keys:
            "steps":    number of link hops on success, 0 otherwise.
            "explored": number of entries popped from the priority queue.
            "success":  True only when the target was reached.
            "status":   "Success", "Timed Out" or "No Path".
            "duration": elapsed seconds.
            "path":     list of titles from start to target ([] on failure).
    """
    start_time = time.perf_counter()
    target_lower = target_page.lower()
    target_emb = model.encode(target_page, convert_to_tensor=True, normalize_embeddings=True)
    # Words of the target longer than two characters; each one found inside a
    # link title earns that link a bonus below.
    target_keywords = [w.lower() for w in target_page.split() if len(w) > 2]

    # heapq is a min-heap, so scores are stored negated to pop the best first.
    queue = [(-1.0, start_page, [start_page])]
    # Lower-cased titles that have already been enqueued (not merely expanded).
    visited = {start_page.lower()}
    step_count = 0

    while queue:
        elapsed = time.perf_counter() - start_time
        if elapsed > timeout:
            return {"steps": 0, "explored": step_count, "success": False, "status": "Timed Out", "duration": elapsed, "path": []}

        _, current_title, path = heapq.heappop(queue)
        step_count += 1

        if current_title.lower() == target_lower:
            return {"steps": len(path)-1, "explored": step_count, "success": True, "status": "Success", "duration": elapsed, "path": path}

        try:
            # Short pause before each fetch — presumably to stay polite to the API.
            time.sleep(0.04)
            page = wiki_en.page(current_title)
            if not page.exists():
                continue
            all_links = list(page.links.keys())
        except Exception:
            # Best-effort: back off for a second and move on to the next candidate.
            time.sleep(1.0)
            continue

        raw_links = [l for l in all_links if l.lower() not in visited and is_valid_link(l)]
        turbo_links = fast_pre_filter(raw_links, target_page, top_n=50)

        if not turbo_links:
            continue

        try:
            link_embs = model.encode(turbo_links, convert_to_tensor=True, normalize_embeddings=True, batch_size=128)
            # Embeddings are normalized, so the dot product is the cosine
            # similarity. Convert to Python floats once instead of per element.
            cos_scores = util.dot_score(target_emb, link_embs)[0].tolist()
        except Exception:
            # Skip this page if encoding fails, but let KeyboardInterrupt and
            # SystemExit propagate so the run can still be stopped.
            continue

        # The penalty depends only on the current path, so compute it once.
        depth_penalty = len(path) * 0.02

        for i, (link_name, link_score) in enumerate(zip(turbo_links, cos_scores)):
            link_lower = link_name.lower()

            # +0.5 for every target keyword contained in the link title.
            for kw in target_keywords:
                if kw in link_lower:
                    link_score += 0.5

            # Small bonus for the 15 best results of the word-overlap pre-filter.
            if i < 15:
                link_score += 0.05

            link_score -= depth_penalty

            # Re-check: two links on the same page may differ only by case.
            if link_lower not in visited:
                visited.add(link_lower)
                heapq.heappush(queue, (-link_score, link_name, path + [link_name]))

    return {"steps": 0, "explored": step_count, "success": False, "status": "No Path", "duration": time.perf_counter() - start_time, "path": []}

In [None]:
# Single demo search; the returned result dict is displayed as the cell output.
start_wikipedia_speedrunner(start_page="Walmart", target_page="Hong Kong")

{'steps': 2,
 'explored': 3,
 'success': True,
 'status': 'Success',
 'duration': 1.1575472109998373,
 'path': ['Walmart', 'Hong Kong Supermarket', 'Hong Kong']}

In [9]:
"""
Modes:
- 'vital': Targets the top 1,000/10,000 essential articles.
- 'academic': Focuses on academic disciplines and their specialized branches.
- 'deep': Scrapes links from the deeper layers of a random topic.
"""
def run_benchmark(num_tests=50, mode="vital"):
    """Run random start/target searches and collect the results in a DataFrame.

    A pool of candidate titles is built according to ``mode``; each test then
    draws two distinct titles from it and runs ``start_wikipedia_speedrunner``
    with a 60-second timeout.

    Modes:
        'vital':    links found on the "Wikipedia:Vital_articles/Level/4/<category>"
                    pages for a fixed list of categories.
        'academic': links found on two hub pages listing academic disciplines.
        'deep':     picks a random base topic, samples up to 30 of its links,
                    and samples up to 10 links from each of those pages.

    Args:
        num_tests: Number of start/target pairs to run.
        mode: One of 'vital', 'academic' or 'deep'.

    Returns:
        A DataFrame with one row per test and the columns Start, End, Steps,
        Duration, Status and Path.

    Raises:
        ValueError: If ``mode`` is unknown, or if fewer than two candidate
            titles could be collected while ``num_tests`` is positive.

    Side effects:
        Prints progress, writes "benchmark_<mode>_interim.csv" after every
        10th test and "benchmark_<mode>_final.csv" at the end.
    """
    valid_modes = ("vital", "academic", "deep")
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode {mode!r}; expected one of {valid_modes}")

    print(f"\n Mode '{mode}'")
    pool = []

    if mode == "vital":
        categories = [
            "History", "Geography", "Arts", "Philosophy_and_religion",
            "Everyday_life", "Society_and_social_sciences", "Biology_and_health_sciences",
            "Physical_sciences", "Technology", "Mathematics"
        ]
        for cat in categories:
            try:
                p = wiki_en.page(f"Wikipedia:Vital_articles/Level/4/{cat}")
                valid_links = [l for l in p.links.keys() if is_valid_link(l)]
                pool.extend(valid_links)
            except Exception:
                # Best-effort: a category that fails to load is skipped.
                continue
    elif mode == "academic":
        hubs = ["Outline of academic disciplines", "List of academic fields"]
        for hub in hubs:
            p = wiki_en.page(hub)
            pool.extend([l for l in p.links.keys() if is_valid_link(l)])
    else:  # mode == "deep"
        base_topics = [
            "Science", "History", "Technology", "Arts",
            "Geography", "Philosophy", "Religion", "Mathematics",
            "Biology", "Medicine", "Economics", "Law",
            "Literature", "Music", "Politics", "Environment",
            "Psychology", "Sociology", "Sports", "Engineering"
        ]
        chosen = wiki_en.page(random.choice(base_topics))
        first_tier = random.sample(list(chosen.links.keys()), min(30, len(chosen.links)))
        for link in first_tier:
            try:
                p = wiki_en.page(link)
                sub_links = list(p.links.keys())
                pool.extend(random.sample(sub_links, min(10, len(sub_links))))
            except Exception:
                # Best-effort: a first-tier page that fails to load is skipped.
                continue
        pool = [l for l in pool if is_valid_link(l)]

    # Deduplicate in every mode so random.sample can never return the same
    # title as both start and end. Sorting gives the pool a stable order
    # (set iteration order of strings varies between interpreter runs).
    pool = sorted(set(pool))

    if num_tests > 0 and len(pool) < 2:
        raise ValueError(
            f"Need at least 2 candidate pages for mode {mode!r}, found {len(pool)}"
        )

    print(f"\nBenchmark Start: {num_tests} Test")
    print("=" * 110)
    results = []

    for i in range(num_tests):
        start, end = random.sample(pool, 2)
        print(f"[{i+1:03d}/{num_tests}] {start[:25]:<25} ➜ {end[:25]:<25}", end=" | ", flush=True)

        res = start_wikipedia_speedrunner(start, end, timeout=60)

        success = res.get("success", False)
        status = res.get("status", "Error")
        duration = round(res.get("duration", 0), 2)
        steps = res.get("steps", 0)
        path = res.get("path", [])

        icon = "✅" if success else "❌"
        print(f"{icon} {status:<10} | {duration:>5.2f}s | Adım: {steps}")

        if success:
            print(f"   Route: {' ➜ '.join(path)}")

        results.append({
            "Start": start, "End": end, "Steps": steps,
            "Duration": duration, "Status": status, "Path": " ➜ ".join(path)
        })

        # Checkpoint every 10 tests so a crash does not lose the whole run.
        if (i + 1) % 10 == 0:
            pd.DataFrame(results).to_csv(f"benchmark_{mode}_interim.csv", index=False)

    df = pd.DataFrame(results)
    df.to_csv(f"benchmark_{mode}_final.csv", index=False)

    if not df.empty:
        success_rate = df["Status"].eq("Success").mean() * 100
        print("=" * 110)
        print(f"Result %{success_rate:.2f} Success | Avg Time: {df['Duration'].mean():.2f}s")

    return df

In [6]:
# 300 random pairs drawn from the 'vital' pool.
df_vital = run_benchmark(mode="vital", num_tests=300)


 Mode 'vital'

Benchmark Start: 300 Test
[001/300] Metamorphic rock          ➜ The Raven                 | ❌ Timed Out  | 60.02s | Adım: 0
[002/300] Militarism                ➜ Battle of Vienna          | ✅ Success    |  7.08s | Adım: 6
   Route: Militarism ➜ Battle ➜ Battle of Leipzig ➜ Battle of Austerlitz ➜ Battle of Austerlitz order of battle ➜ Vienna ➜ Battle of Vienna
[003/300] Reconquista               ➜ Like a Rolling Stone      | ✅ Success    | 54.01s | Adım: 18
   Route: Reconquista ➜ Age of Enlightenment ➜ Mary Wollstonecraft ➜ A Sculpture for Mary Wollstonecraft ➜ Gladstone Memorial, London ➜ Portland stone ➜ Freestone (masonry) ➜ Stone carving ➜ Carved stone balls ➜ Bird stone ➜ Stone circle ➜ Standing stone ➜ Stone slab ➜ Ledger stone ➜ Grave stone ➜ Stone ➜ Stones (disambiguation) ➜ The Rolling Stones ➜ Like a Rolling Stone
[004/300] Renaissance               ➜ Container                 | ✅ Success    |  6.38s | Adım: 5
   Route: Renaissance ➜ Aeropittura ➜ Fillìa ➜ Dev

In [10]:
# 300 random pairs drawn from the 'academic' pool.
df_academic = run_benchmark(mode="academic", num_tests=300)


 Mode 'academic'

Benchmark Start: 300 Test
[001/300] Administrative law        ➜ Foreign policy            | ✅ Success    |  1.40s | Adım: 2
   Route: Administrative law ➜ Drug policy ➜ Foreign policy
[002/300] Apiculture                ➜ Knowledge management      | ✅ Success    |  3.03s | Adım: 4
   Route: Apiculture ➜ Agricultural Research Service ➜ Risk Management Agency ➜ Risk management ➜ Knowledge management
[003/300] Emergency management      ➜ Caodaism                  | ✅ Success    | 23.87s | Adım: 7
   Route: Emergency management ➜ Adhocracy ➜ Anarchy ➜ Absolute monarchy ➜ Absolutism (European history) ➜ Enlightened absolutism ➜ Confucianism ➜ Caodaism
[004/300] Marketing geography       ➜ Paleontology              | ✅ Success    |  1.09s | Adım: 2
   Route: Marketing geography ➜ Geochronology ➜ Paleontology
[005/300] S2CID (identifier)        ➜ Community organizing      | ✅ Success    |  3.77s | Adım: 6
   Route: S2CID (identifier) ➜ Conference proceedings ➜ Academic conf

In [11]:
# 300 random pairs drawn from the 'deep' pool.
df_deep = run_benchmark(mode="deep", num_tests=300)


 Mode 'deep'

Benchmark Start: 300 Test
[001/300] Economist                 ➜ Politeia                  | ❌ Timed Out  | 60.50s | Adım: 0
[002/300] Mixed economy             ➜ Historical school of econ | ✅ Success    |  1.54s | Adım: 2
   Route: Mixed economy ➜ Chicago school of economics ➜ Historical school of economics
[003/300] Reputation of William Sha ➜ New classical macroeconom | ✅ Success    |  7.68s | Adım: 5
   Route: Reputation of William Shakespeare ➜ Historical background of the New Testament ➜ New Testament ➜ Classical antiquity ➜ Classical economics ➜ New classical macroeconomics
[004/300] Resource economics        ➜ Sovereignty               | ✅ Success    |  2.65s | Adım: 4
   Route: Resource economics ➜ Agricultural land ➜ English land law ➜ Parliamentary sovereignty ➜ Sovereignty
[005/300] Financial law             ➜ Topolobampo               | ❌ Timed Out  | 60.41s | Adım: 0
[006/300] Philology                 ➜ Order of Saint Benedict   | ✅ Success    |  4.64s | Ad