In [5]:
import pywikibot as pwb
import webbrowser as wb
import pandas as pd
import requests
import re
import os
import json
from typing import Dict, List

site = pwb.Site("en", "marble_kingdoms")  # (language_code, family)
site.login()

In [6]:
def get_episode_data() -> Dict[str, Dict[str, List[str]]]:
    """Get all episode data using pywikibot"""
    all_episodes = {}
    
    # Process episodes 3 through 37
    for episode_num in range(3, 38):
        page_title = f"MK {episode_num}"
        try:
            page = pwb.Page(site, page_title)
            if page.exists():
                print(f"Processing {page_title}...")
                episode_data = parse_episode_page(page)
                if episode_data:
                    all_episodes[page_title] = episode_data
                    print(f"  Found {sum(len(heroes) for heroes in episode_data.values())} heroes in {len(episode_data)} levels")
                else:
                    print(f"  No hero data found in {page_title}")
            else:
                print(f"Page {page_title} does not exist")
        except Exception as e:
            print(f"Error processing {page_title}: {e}")
    
    return all_episodes

def parse_episode_page(page: pwb.Page) -> Dict[str, List[str]]:
    """Parse a single episode page to extract heroes by level"""
    episode_data = {}
    wikitext = page.text
    
    # Find the "Heroes of MK X" section using a more robust pattern
    heroes_pattern = r'==+[^=]*Heroes of MK[^=]*==+(.*?)(?==+=|\{\{|$)'
    heroes_section_match = re.search(heroes_pattern, wikitext, re.IGNORECASE | re.DOTALL)
    
    if not heroes_section_match:
        # Try alternative pattern in case the section naming varies
        heroes_pattern_alt = r'==+[^=]*Heroes[^=]*==+(.*?)(?==+=|\{\{|$)'
        heroes_section_match = re.search(heroes_pattern_alt, wikitext, re.IGNORECASE | re.DOTALL)
        
        if not heroes_section_match:
            return {}
    
    heroes_section = heroes_section_match.group(1)
    
    # Find all level sections (Level X, Special, Stage X, etc.)
    level_pattern = r'===+([^=]+)===+(.*?)(?===+=|\{\{|$)'
    
    for match in re.finditer(level_pattern, heroes_section, re.DOTALL):
        level_name = match.group(1).strip()
        level_content = match.group(2).strip()
        
        # Check if this is actually a level section (not some other heading)
        if not re.match(r'(?i)(level|stage|special|lvl)\s*\d*', level_name) and 'special' not in level_name.lower():
            continue
        
        # Extract hero names from wikilinks [[Hero Name]] or [[Hero Name|display text]]
        heroes = []
        
        # Method 1: Direct wikilink patterns
        wikilink_matches = re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', level_content)
        for hero in wikilink_matches:
            hero_name = hero.strip()
            if hero_name and hero_name not in heroes:
                heroes.append(hero_name)
        
        # Method 2: Look for bullet lists with hero names
        if not heroes:
            bullet_lines = re.findall(r'\*+\s*\[\[([^\]|]+)', level_content)
            for hero in bullet_lines:
                hero_name = hero.strip()
                if hero_name and hero_name not in heroes:
                    heroes.append(hero_name)
        
        # Method 3: Look for comma-separated hero names in paragraphs
        if not heroes:
            # Find text that might contain hero names
            text_blocks = re.split(r'\n\n+', level_content)
            for block in text_blocks:
                if '[[[' in block:  # Likely contains wikilinks
                    block_heroes = re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', block)
                    for hero in block_heroes:
                        hero_name = hero.strip()
                        if hero_name and hero_name not in heroes:
                            heroes.append(hero_name)
        
        if heroes:
            episode_data[level_name] = heroes
    
    return episode_data

def save_to_json(data: Dict, filename: str = 'marble_kingdoms_heroes.json'):
    """Save the data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Data saved to {filename}")

def save_to_csv(data: Dict, filename: str = 'marble_kingdoms_heroes.csv'):
    """Save the data to a CSV file using pandas"""
    rows = []
    for episode, levels in data.items():
        for level, heroes in levels.items():
            for hero in heroes:
                rows.append({
                    'episode': episode,
                    'level': level,
                    'hero': hero
                })
    
    df = pd.DataFrame(rows)
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Data saved to {filename}")

def display_results(data: Dict):
    """Display the results in a readable format"""
    print("\n" + "="*60)
    print("MARBLE KINGDOMS HEROES DATABASE")
    print("="*60)
    
    for episode, levels in data.items():
        print(f"\n{episode}")
        for level, heroes in levels.items():
            print(f"  --{level}")
            print(f"    ----{', '.join(heroes)}")
    
    total_episodes = len(data)
    total_levels = sum(len(levels) for levels in data.values())
    total_heroes = sum(len(heroes) for levels in data.values() for heroes in levels.values())
    
    print(f"\nSUMMARY: {total_episodes} episodes, {total_levels} levels, {total_heroes} hero appearances")

def main():
    """Main function to execute the scraping process"""
    print("Starting Marble Kingdoms hero scraping...")
    print("Logged in as:", site.user())
    
    # Get all episode data
    heroes_dict = get_episode_data()
    
    # Display results
    display_results(heroes_dict)
    
    # Save to files
    save_to_json(heroes_dict)
    save_to_csv(heroes_dict)
    
    # Optional: Open the JSON file in default application
    if input("\nOpen JSON file? (y/n): ").lower() == 'y':
        wb.open('marble_kingdoms_heroes.json')

if __name__ == "__main__":
    main()

Starting Marble Kingdoms hero scraping...
Logged in as: Louisa-Bot
Processing MK 3...
  No hero data found in MK 3
Processing MK 4...
  No hero data found in MK 4
Processing MK 5...
  No hero data found in MK 5
Processing MK 6...
  No hero data found in MK 6
Processing MK 7...
  No hero data found in MK 7
Processing MK 8...
  No hero data found in MK 8
Processing MK 9...
  No hero data found in MK 9
Processing MK 10...
  No hero data found in MK 10
Processing MK 11...
  No hero data found in MK 11
Processing MK 12...
  No hero data found in MK 12
Processing MK 13...
  No hero data found in MK 13
Processing MK 14...
  No hero data found in MK 14
Processing MK 15...
  No hero data found in MK 15
Processing MK 16...
  No hero data found in MK 16
Processing MK 17...
  No hero data found in MK 17
Processing MK 18...
  No hero data found in MK 18
Processing MK 19...
  No hero data found in MK 19
Processing MK 20...
  No hero data found in MK 20
Processing MK 21...
  No hero data found in MK 


Open JSON file? (y/n):  n


In [9]:
def debug_episode_extraction(page: pwb.Page) -> Dict[str, List[str]]:
    """Debug function to show exactly what's extracted from an episode"""
    episode_data = {}
    wikitext = page.text
    
    print(f"\n{'='*60}")
    print(f"DEBUGGING EXTRACTION FOR: {page.title()}")
    print(f"{'='*60}")
    
    # Show the raw heroes section first - use a more flexible pattern
    heroes_match = re.search(r'== Heroes of MK \d+ ==\s*\n(.*?)(?=\n==|\n\{\{|$)', wikitext, re.DOTALL)
    
    if not heroes_match:
        print("❌ NO HEROES SECTION FOUND!")
        return {}
    
    heroes_section = heroes_match.group(1)
    print("📋 HEROES SECTION CONTENT:")
    print("-" * 40)
    print(repr(heroes_section))  # Use repr to see hidden characters
    print("-" * 40)
    
    # Find all level subsections - FIXED PATTERN with optional spaces
    level_pattern = r'===+\s*(Level \d+|Special|Stage \d+)\s*===+\s*\n(.*?)(?=\n===+|\n==|\n\{\{|$)'
    level_matches = list(re.finditer(level_pattern, heroes_section, re.DOTALL))
    
    print(f"🔍 Found {len(level_matches)} level sections")
    
    if not level_matches:
        print("❌ NO LEVEL SECTIONS FOUND! Trying alternative patterns...")
        
        # Try alternative patterns
        alt_patterns = [
            r'===+([^=]+)===+\s*\n(.*?)(?=\n===+|\n==|\n\{\{|$)',
            r'===+(.*?)\n(.*?)(?=\n===+|\n==|\n\{\{|$)',
            r'(Level \d+|Special|Stage \d+)[\s\S]*?\[\[(.*?)\]\]'
        ]
        
        for i, pattern in enumerate(alt_patterns):
            alt_matches = list(re.finditer(pattern, heroes_section, re.DOTALL))
            print(f"Alternative pattern {i+1}: Found {len(alt_matches)} matches")
            for match in alt_matches:
                print(f"  Match: {match.groups()}")
    
    for i, match in enumerate(level_matches):
        level_name = match.group(1).strip()
        level_content = match.group(2).strip()
        
        print(f"\n📊 LEVEL {i+1}: {level_name}")
        print(f"   Content: '{level_content}'")
        
        # Extract hero names
        heroes = []
        
        # Method: Find all wikilinks
        wikilink_heroes = re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', level_content)
        print(f"   Wikilinks found: {wikilink_heroes}")
        
        for hero in wikilink_heroes:
            hero_clean = hero.strip()
            if hero_clean and hero_clean not in heroes:
                heroes.append(hero_clean)
        
        print(f"   ✅ FINAL HEROES: {heroes}")
        
        if heroes:
            episode_data[level_name] = heroes
        else:
            print(f"   ❌ NO HEROES EXTRACTED from {level_name}")
    
    return episode_data

def parse_episode_page_final(page: pwb.Page) -> Dict[str, List[str]]:
    """Final parser that handles the actual wiki format"""
    episode_data = {}
    wikitext = page.text
    
    # Find heroes section
    heroes_match = re.search(r'== Heroes of MK \d+ ==\s*\n(.*?)(?=\n==|\n\{\{|$)', wikitext, re.DOTALL)
    if not heroes_match:
        return {}
    
    heroes_section = heroes_match.group(1)
    
    # Find all level subsections - more flexible pattern
    level_pattern = r'===+\s*([^=]+)\s*===+\s*\n(.*?)(?=\n===+|\n==|\n\{\{|$)'
    level_matches = re.finditer(level_pattern, heroes_section, re.DOTALL)
    
    for match in level_matches:
        level_name = match.group(1).strip()
        level_content = match.group(2).strip()
        
        # Only process if it looks like a level heading
        if not any(keyword in level_name.lower() for keyword in ['level', 'special', 'stage', 'lvl']):
            continue
        
        # Extract all wikilinks
        hero_matches = re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', level_content)
        heroes = []
        
        for hero in hero_matches:
            hero_clean = hero.strip()
            if hero_clean and hero_clean not in heroes:
                heroes.append(hero_clean)
        
        if heroes:
            episode_data[level_name] = heroes
    
    return episode_data

# Test with MK 17 first
print("Testing extraction with MK 17...")
mk17_page = pwb.Page(site, "MK 17")
if mk17_page.exists():
    # First debug to see what's happening
    debug_data = debug_episode_extraction(mk17_page)
    print(f"\n🎯 DEBUG RESULT: {debug_data}")
    
    # Then use the final parser
    final_data = parse_episode_page_final(mk17_page)
    print(f"🎯 FINAL RESULT: {final_data}")

Testing extraction with MK 17...

DEBUGGING EXTRACTION FOR: MK 17
📋 HEROES SECTION CONTENT:
----------------------------------------
"''Monks and Kingsguards debuted in this episode.''\n"
----------------------------------------
🔍 Found 0 level sections
❌ NO LEVEL SECTIONS FOUND! Trying alternative patterns...
Alternative pattern 1: Found 0 matches
Alternative pattern 2: Found 0 matches
Alternative pattern 3: Found 0 matches

🎯 DEBUG RESULT: {}
🎯 FINAL RESULT: {}


In [14]:
import pywikibot as pwb
import re
import json
from typing import Dict, List

site = pwb.Site("en", "marble_kingdoms")
site.login()

def parse_episode_page_flexible(page: pwb.Page) -> Dict[str, List[str]]:
    """Flexible parser that handles different formatting variations"""
    episode_data = {}
    wikitext = page.text
    
    # More flexible pattern to find heroes section - handles MK34, MK 34, etc.
    heroes_patterns = [
        r'==\s*Heroes of MK\s*\d+\s*==\s*\n(.*?)(?=\n==[^=]|\{\{|$)',
        r'==\s*Heroes of Marble Kingdoms\s*\d+\s*==\s*\n(.*?)(?=\n==[^=]|\{\{|$)',
        r'==\s*Heroes\s*==\s*\n(.*?)(?=\n==[^=]|\{\{|$)',
        r'===+\s*Heroes\s*===+\s*\n(.*?)(?=\n==[^=]|\{\{|$)'
    ]
    
    heroes_section = None
    for pattern in heroes_patterns:
        heroes_match = re.search(pattern, wikitext, re.DOTALL | re.IGNORECASE)
        if heroes_match:
            heroes_section = heroes_match.group(1)
            break
    
    if not heroes_section:
        print(f"  No heroes section found in {page.title()} with any pattern")
        return {}
    
    # Find all level sections - handle various level formats
    level_pattern = r'===+\s*(Level\s*\d+|Special|Stage\s*\d+|Lvl\s*\d+)\s*===+\s*\n(.*?)(?=\n===+\s*(?:Level|Special|Stage|Lvl)|\n==[^=]|\{\{|$)'
    level_matches = list(re.finditer(level_pattern, heroes_section, re.DOTALL | re.IGNORECASE))
    
    if not level_matches:
        # Try alternative pattern without the lookahead
        level_pattern_alt = r'===+\s*(Level\s*\d+|Special|Stage\s*\d+|Lvl\s*\d+)\s*===+\s*\n(.*)'
        level_matches = list(re.finditer(level_pattern_alt, heroes_section, re.DOTALL | re.IGNORECASE))
    
    for match in level_matches:
        level_name = match.group(1).strip()
        level_content = match.group(2).strip()
        
        # Extract all wikilinks
        hero_matches = re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', level_content)
        heroes = []
        
        for hero in hero_matches:
            hero_clean = hero.strip()
            if hero_clean and hero_clean not in heroes:
                heroes.append(hero_clean)
        
        if heroes:
            episode_data[level_name] = heroes
    
    return episode_data

def debug_specific_episodes():
    """Debug specific episodes that are failing"""
    problematic_episodes = [34, 35, 36]
    
    for ep_num in problematic_episodes:
        page_title = f"MK {ep_num}"
        print(f"\n{'='*60}")
        print(f"DEBUGGING {page_title}")
        print(f"{'='*60}")
        
        page = pwb.Page(site, page_title)
        if page.exists():
            # First, let's see the raw content around where heroes should be
            wikitext = page.text
            
            # Look for any mention of "heroes" in the text
            heroes_lines = []
            lines = wikitext.split('\n')
            for i, line in enumerate(lines):
                if 'hero' in line.lower():
                    heroes_lines.append((i, line))
                    # Show context around this line
                    start = max(0, i-3)
                    end = min(len(lines), i+4)
                    print(f"\nFound 'hero' at line {i}:")
                    for j in range(start, end):
                        marker = ">>> " if j == i else "    "
                        print(f"{marker}{j}: {lines[j]}")
            
            # Now try the flexible parser
            data = parse_episode_page_flexible(page)
            print(f"\nParser result: {data}")
            
        else:
            print(f"Page {page_title} not found")

def get_all_episode_data_fixed():
    """Get data for all episodes with improved parsing"""
    all_data = {}
    
    for episode_num in range(3, 38):
        page_title = f"MK {episode_num}"
        try:
            page = pwb.Page(site, page_title)
            if page.exists():
                print(f"Processing {page_title}...")
                data = parse_episode_page_flexible(page)
                if data:
                    all_data[page_title] = data
                    print(f"  ✅ Found {len(data)} levels, {sum(len(h) for h in data.values())} heroes")
                else:
                    print(f"  ⚠️  No heroes found in {page_title}")
                    # For problematic episodes, let's debug more
                    if episode_num >= 34:
                        debug_episode_content(page)
            else:
                print(f"  ❌ Page {page_title} not found")
        except Exception as e:
            print(f"  ❌ Error with {page_title}: {e}")
    
    return all_data

def debug_episode_content(page: pwb.Page):
    """Debug the content of a specific episode"""
    wikitext = page.text
    print(f"  Debugging {page.title()} content...")
    
    # Look for the exact heroes section structure
    lines = wikitext.split('\n')
    for i, line in enumerate(lines):
        if 'heroes' in line.lower() and '==' in line:
            print(f"  Found potential heroes heading at line {i}: {line}")
            # Show surrounding content
            for j in range(max(0, i-2), min(len(lines), i+15)):
                marker = " --> " if '===' in lines[j] and 'level' in lines[j].lower() else "     "
                print(f"  {j:3d}{marker}{lines[j]}")
            break
'''
# First debug the problematic episodes
print("Debugging problematic episodes (34, 35, 36)...")
debug_specific_episodes()

# Then process all episodes
print("\n\nProcessing all episodes...")
all_episodes_data = get_all_episode_data_fixed()

# Save results
with open('marble_kingdoms_heroes.json', 'w') as f:
    json.dump(all_episodes_data, f, indent=2)
print(f"\n💾 Saved data for {len(all_episodes_data)} episodes")

# Show summary
print("\n📊 SUMMARY:")
for episode, levels in all_episodes_data.items():
    hero_count = sum(len(heroes) for heroes in levels.values())
    print(f"{episode}: {len(levels)} levels, {hero_count} heroes")
'''
print(all_episodes_data)

{'MK 3': {'Level 4': ['The Red Captain']}, 'MK 4': {'Level 6': ['The Orange Captain', 'The Orange Fighter'], 'Level 5': ['The Orange Berserker', 'The Yellow Captain'], 'Level 3': ['The Magenta King']}, 'MK 5': {'Level 4': ['Mireille (MK 5)']}, 'MK 6': {'Level 3': ['Thalia'], 'Special': ['Pherthy']}, 'MK 7': {'Level 6': ['Zaria'], 'Level 4': ['Average Silksong Fan'], 'Level 3': ['Joanwerd']}, 'MK 8': {'Level 8': ['Trigger'], 'Level 7': ['Verbole (MK 8)', 'Da Gaming Boi'], 'Level 6': ['Anya', 'Frithpaul', 'Aiden Lucky', 'Ethelse', 'Manntho', 'Delgundleof', 'Marbelous Racing'], 'Level 5': ['Obstagoon the Mastermind', 'Blox Dealer', 'Malte V', 'Beorthird', 'Helmvid', 'Nightlock', 'Kozladoev', 'Han Chen', 'Pherord', 'Mireille (MK 8)', 'Gia Thuong Nguyen', 'Tolry', 'Jeff Heff', 'Dorgrif'], 'Level 4': ['Dead Buter', 'Treee', 'Category:Episodes', 'zh:MK 8']}, 'MK 9': {'Level 3': ['Warinus', 'Category:Episodes', 'zh:MK 9']}, 'MK 10': {'Level 6': ['Agobard'], 'Level 5': ['Ferumald', 'Amara', 'Cr