In [3]:
# palworld_overall_tiers_to_csv.py
# pip install requests beautifulsoup4 pandas
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString

# Page that is downloaded and parsed when this file is run as a script.
TIER_URL = "https://palworld.gg/tier-list"

# Request headers sent by fetch_html(): a desktop Chrome-on-Windows
# User-Agent string plus an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# accept "S", "S Tier", " A ", etc.
# Each pattern must match the whole string and captures the tier letter in
# group 1. The bare-letter form is case-sensitive (uppercase only); the
# "<letter> Tier" form is matched case-insensitively.
TIER_PATTERNS = [
    re.compile(r"^\s*([SABCD])\s*$"),
    re.compile(r"^\s*([SABCD])\s*Tier\s*$", re.I),
]

def fetch_html(url: str) -> str:
    """Download *url* and return the decoded response body.

    The request is sent with the module-level ``HEADERS`` and a 30-second
    timeout.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
    """
    # Use the session as a context manager so its connection pool is
    # released even when the request or the status check raises.
    with requests.Session() as session:
        response = session.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.text

def normalize_tier(text: str) -> str | None:
    """Return the canonical tier letter found in *text*, or ``None``.

    *text* is stripped and tried against every pattern in ``TIER_PATTERNS``;
    the first pattern that matches supplies the tier letter (its group 1).

    Returns:
        One of ``"S"``, ``"A"``, ``"B"``, ``"C"``, ``"D"`` (always uppercase),
        or ``None`` when *text* is empty/``None`` or matches no pattern.
    """
    if not text:
        return None
    candidate = text.strip()
    for pattern in TIER_PATTERNS:
        match = pattern.match(candidate)
        if match:
            # The "<letter> Tier" pattern is case-insensitive, so the captured
            # letter may be lowercase ("s tier"); upper-case it so callers
            # always receive the canonical label ("S Tier" -> "S").
            return match.group(1).upper()
    return None

def nearest_previous_tier(tag) -> str | None:
    """Find the closest tier label that appears before *tag* in the document.

    Looks at up to 800 text strings preceding *tag* (nearest first) and
    returns the first one that ``normalize_tier`` recognises as a tier;
    returns ``None`` when none of them does.
    """
    preceding_strings = tag.find_all_previous(string=True, limit=800)
    recognised = (normalize_tier(str(text)) for text in preceding_strings)
    return next((tier for tier in recognised if tier), None)

def clean_name(text: str) -> str | None:
    if not text: 
        return None
    t = text.strip()
    # Strip common "Image:" prefix (varies by site markup; keep generic)
    t = re.sub(r"(?i)^\s*image\s*:\s*", "", t).strip()
    # Filter obvious non-names / layout tokens
    if not t or len(t) > 48 or "/" in t:
        return None
    return t

def names_from_block(node) -> list[str]:
    """Collect candidate Pal names found inside *node*.

    Three sources are consulted, in this order:

    1. links whose ``href`` starts with ``/pal/`` (title, aria-label or text,
       falling back to a title-cased version of the URL slug);
    2. ``<img>`` elements (``alt`` or ``title``);
    3. ``<figcaption>`` and ``<span>`` elements (title, aria-label or text).

    Every label goes through ``clean_name``; rejected labels are skipped.
    The result keeps first-occurrence order and contains no duplicates.
    """
    candidates = []

    # 1) Pal links: prefer a human-readable label, else derive one from the slug.
    for anchor in node.select('a[href^="/pal/"]'):
        label = (
            anchor.get("title")
            or anchor.get("aria-label")
            or anchor.get_text(" ", strip=True)
        )
        candidates.append(
            clean_name(label)
            or clean_name(
                anchor.get("href", "").rsplit("/", 1)[-1].replace("-", " ").title()
            )
        )

    # 2) Images: alt text (possibly "Image: Name") or title.
    candidates.extend(
        clean_name(image.get("alt") or image.get("title") or "")
        for image in node.find_all("img")
    )

    # 3) Captions and spans, the weakest signal.
    for tag_name in ("figcaption", "span"):
        for element in node.find_all(tag_name):
            label = (
                element.get("title")
                or element.get("aria-label")
                or element.get_text(" ", strip=True)
            )
            candidates.append(clean_name(label))

    # dict keys keep insertion order, which gives an order-preserving de-dup;
    # rejected labels (None) are filtered out first.
    return list(dict.fromkeys(name for name in candidates if name))

def parse_overall_tiers(html: str) -> pd.DataFrame:
    """Parse tier-list HTML into a DataFrame with ``pal`` and ``tier`` columns.

    Every ``<img>`` and every link whose ``href`` starts with ``/pal/`` is
    treated as a candidate entry. Names are extracted with
    ``names_from_block`` and paired with the tier returned by
    ``nearest_previous_tier`` for that candidate; candidates with no names or
    no tier are skipped.

    Returns:
        A de-duplicated DataFrame. When it is non-empty, rows are sorted by
        tier in S, A, B, C, D order and then by name, with a fresh index.
    """
    document = BeautifulSoup(html, "html.parser")

    # Candidate entries: all images first, then all /pal/ links.
    entry_nodes = [*document.find_all("img"), *document.select('a[href^="/pal/"]')]

    records = []
    for entry in entry_nodes:
        # Links are inspected through their parent element; images are
        # inspected directly.
        # NOTE(review): names_from_block only searches inside the node it is
        # given (select/find_all), so an <img> passed as its own block
        # presumably never yields names -- confirm whether the parent was
        # intended here as well.
        scope = entry.parent if entry.name != "img" else entry
        pal_names = names_from_block(scope)
        if pal_names:
            tier = nearest_previous_tier(entry)
            if tier:
                records.extend({"pal": name, "tier": tier} for name in pal_names)

    table = pd.DataFrame(records, columns=["pal", "tier"]).dropna().drop_duplicates()
    if table.empty:
        return table

    tier_rank = {"S": 0, "A": 1, "B": 2, "C": 3, "D": 4}

    def sort_key(column):
        # Rank tiers by strength instead of alphabetically; names sort as-is.
        return column.map(tier_rank) if column.name == "tier" else column

    return table.sort_values(["tier", "pal"], key=sort_key).reset_index(drop=True)

if __name__ == "__main__":
    # Script entry point: download the tier page, parse it, and write a CSV.
    html = fetch_html(TIER_URL)

    # Print a few facts about the raw page first; they help explain an empty
    # result (e.g. the expected markers are missing from the HTML).
    print("HTML length:", len(html))
    print("Contains '/pal/' links?", "/pal/" in html)
    print(
        "Contains 'Image:' tokens?",
        any(token in html for token in ("Image:", "image:")),
    )

    df = parse_overall_tiers(html)
    print(df.head())

    # Write the table without the index column.
    df.to_csv("tierlist_overall.csv", index=False)
    print(f"Saved tierlist_overall.csv with {len(df)} rows")


HTML length: 192537
Contains '/pal/' links? True
Contains 'Image:' tokens? True
                pal tier
0         Bellanoir    S
1  Bellanoir Libero    S
2       Frostallion    S
3  Frostallion Noct    S
4          Jetragon    S
Saved tierlist_overall.csv with 226 rows
