In [1]:
import csv
import time
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Site root; relative links found on the pages are resolved against it.
BASE = "https://palworld.gg"

# Listing page whose anchors point at the individual pal pages.
INDEX_URL = urljoin(BASE, "/pals")

# Headers sent with every HTTP request made by get_soup().
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PalScraper/1.0; +https://example.com)"}

def get_soup(url, retries=3, backoff=1.5):
    """Fetch *url* and return its body parsed as a ``BeautifulSoup`` tree.

    The request is attempted up to ``retries`` times (at least once, even if
    ``retries`` is zero or negative). Between attempts the function sleeps
    ``backoff * attempt_number`` seconds, so the delay grows linearly.

    Args:
        url: Absolute URL to download.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds between attempts.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: The last attempt returned a status other than 200.
        requests.ConnectionError, requests.Timeout: The last attempt failed
            at the network level.
    """
    # Guarantee one attempt so there is always a result or an error to report.
    attempts = max(1, retries)
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=20)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Transient network failures are retried like bad statuses.
            last_error = exc
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.text, "html.parser")
            # Build the error explicitly: raise_for_status() is silent for
            # non-200 codes below 400, which would make this return None.
            last_error = requests.HTTPError(
                f"Unexpected status {response.status_code} for url: {url}",
                response=response,
            )

        # No point in sleeping once the final attempt has failed.
        if attempt < attempts:
            time.sleep(backoff * attempt)

    raise last_error

def parse_index():
    """Scrape the listing page and return one entry per pal detail URL.

    Every anchor whose ``href`` starts with ``/pal/`` is inspected. Its
    whitespace-normalised text is matched against the pattern
    ``<name> [#]<number> <rarity number> <rarity label>``; when the text does
    not match, the name is derived from the URL slug and ``number`` and
    ``rarity`` are left empty.

    Returns:
        A list of dicts with the keys ``name``, ``number``, ``rarity`` and
        ``url`` (all strings), in the order the URLs first appear on the
        page. Each URL occurs once; if the same URL is linked several times,
        an entry with parsed data is preferred over a slug-only fallback.
    """
    soup = get_soup(INDEX_URL)
    card_pattern = re.compile(r"(.+?)\s+#?(-?\d+)\s+(\d+)\s+(\w+)")

    # Keyed by absolute URL; dict insertion order keeps the page order.
    by_url = {}

    for a in soup.select('a[href^="/pal/"]'):
        href = a.get("href")
        url = urljoin(BASE, href)
        text = " ".join(a.get_text(" ", strip=True).split())
        match = card_pattern.match(text)

        if match:
            # The numeric rarity is captured by the pattern but not exported.
            name, number, _rarity_num, rarity_label = match.groups()
            entry = {
                "name": name,
                "number": number,
                "rarity": rarity_label,
                "url": url,
            }
        else:
            # Fallback: use the slug as the name. Strip a trailing slash
            # first so "/pal/foo/" still yields "foo" instead of "".
            slug = href.rstrip("/").split("/")[-1]
            entry = {
                "name": slug.replace("-", " ").title(),
                "number": "",
                "rarity": "",
                "url": url,
            }

        existing = by_url.get(url)
        # Keep the first entry per URL, but let a parsed entry replace a
        # slug-only fallback so the richer data wins.
        if existing is None or (existing["number"] == "" and entry["number"] != ""):
            by_url[url] = entry

    return list(by_url.values())

def parse_detail(url):
    """Scrape one pal detail page into a flat dict of strings.

    The page is reduced to newline-separated text and each field is located
    by searching for its label in that text.

    Args:
        url: Absolute URL of the pal detail page.

    Returns:
        A dict of strings with these keys:

        * ``name_detail`` - text of the first ``<h1>`` (key absent when the
          page has no ``<h1>``).
        * ``element`` - the known element name that appears earliest on the
          page on a line of its own, or ``""``.
        * one key per stat label (``hp``, ``defense``, ``crafting_speed``, ...)
          holding the digits that follow the label, or ``""``.
        * ``work_suitability`` - entries joined with ``"; "``.
        * ``partner_skill``, ``passive_skills``, ``active_skills``,
          ``possible_drops`` - raw section text, ``""`` when absent.
    """
    soup = get_soup(url)
    data = {}

    h1 = soup.select_one("h1")
    if h1:
        data["name_detail"] = h1.get_text(strip=True)

    text_all = soup.get_text("\n", strip=True)

    # A tuple plus "earliest position wins" keeps the result deterministic.
    # Iterating a set of strings would depend on per-process hash
    # randomisation whenever more than one element name is on the page.
    elements = ("Fire", "Water", "Electricity", "Leaf", "Earth", "Ice",
                "Dragon", "Dark", "Normal")
    found_el = ""
    best_pos = len(text_all)
    for el in elements:
        pos = text_all.find(f"\n{el}\n")
        if pos != -1 and pos < best_pos:
            best_pos = pos
            found_el = el
    data["element"] = found_el

    stat_keys = ["HP", "Defense", "Crafting Speed", "Melee Attack", "Shot Attack",
                 "Price", "Stamina", "Support", "Running Speed", "Sprinting Speed",
                 "Slow Walk Speed"]
    for k in stat_keys:
        # Escape the label so it is always matched literally.
        m = re.search(rf"\b{re.escape(k)}\b\s+(\d+)", text_all)
        data[k.lower().replace(" ", "_")] = m.group(1) if m else ""

    # Work suitability: everything after the last "Work Suitability" label,
    # up to the first skills heading.
    work = []
    work_section = ""
    if "Work Suitability" in text_all:
        work_section = text_all.split("Work Suitability")[-1]

    section_headings = ("Partner Skill", "Passive Skills", "Active Skills")
    for line in work_section.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith(section_headings):
            break
        # A "Lv" line is attached to the preceding entry when that entry has
        # no level yet; otherwise it is kept as an entry of its own.
        if "Lv" in line and work and "Lv" not in work[-1]:
            work[-1] = f"{work[-1]} ({line})"
        else:
            work.append(line)
    # Drop entries without any letter (bare numbers or punctuation).
    data["work_suitability"] = "; ".join(
        w for w in work if any(c.isalpha() for c in w)
    )

    # Partner skill: text after the label, cut at the next skills heading.
    partner = ""
    if "Partner Skill" in text_all:
        part = text_all.split("Partner Skill")[1]
        if "Passive Skills" in part:
            partner = part.split("Passive Skills")[0]
        elif "Active Skills" in part:
            partner = part.split("Active Skills")[0]
        else:
            partner = part
        partner = partner.strip()
    data["partner_skill"] = partner

    passive = ""
    if "Passive Skills" in text_all:
        part = text_all.split("Passive Skills")[1]
        if "Active Skills" in part:
            part = part.split("Active Skills")[0]
        passive = part.strip()
    data["passive_skills"] = passive

    active = ""
    if "Active Skills" in text_all:
        part = text_all.split("Active Skills")[1]
        # Trim trailing text that starts at either of these markers.
        for stop in ["Contact us", "Privacy Policy"]:
            if stop in part:
                part = part.split(stop)[0]
        active = part.strip()
    data["active_skills"] = active

    drops = ""
    if "Possible Drops" in text_all:
        part = text_all.split("Possible Drops")[1]
        # stop at "Stats" usually
        if "Stats" in part:
            part = part.split("Stats")[0]
        drops = re.sub(r"\s+", " ", part).strip()
    data["possible_drops"] = drops

    return data

def main():
    """Scrape the pal index and every detail page, writing two CSV files.

    ``pals_index.csv`` receives the index entries (name, number, rarity,
    url). ``pals_full.csv`` receives one row per pal combining the index
    entry with the fields returned by ``parse_detail``.

    Detail rows are written as soon as each page is scraped, so an
    interruption keeps the rows collected so far. If the request for one
    detail page fails, a warning is printed and the pal is written with its
    index fields only instead of aborting the whole run.
    """
    index = parse_index()

    with open("pals_index.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["name", "number", "rarity", "url"])
        w.writeheader()
        w.writerows(index)

    cols = ["name", "number", "rarity", "url", "name_detail", "element",
            "hp", "defense", "crafting_speed", "melee_attack", "shot_attack",
            "price", "stamina", "support", "running_speed", "sprinting_speed",
            "slow_walk_speed", "work_suitability", "partner_skill",
            "passive_skills", "active_skills", "possible_drops"]

    with open("pals_full.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for i, p in enumerate(index, start=1):
            time.sleep(0.75)  # be polite
            try:
                detail = parse_detail(p["url"])
            except requests.RequestException as exc:
                # One unreachable page must not discard the rest of the run.
                print(f"WARNING: could not fetch {p['url']}: {exc}")
                detail = {}
            row = {**p, **detail}
            # Fill missing columns with "" so every row has the same shape.
            w.writerow({k: row.get(k, "") for k in cols})
            print(f"[{i}/{len(index)}] {p['name']}")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


[1/225] Anubis
[2/225] Arsox
[3/225] Astegon
[4/225] Azurmane
[5/225] Azurobe
[6/225] Azurobe Cryst
[7/225] Bastigor
[8/225] Beakon
[9/225] Beegarde
[10/225] Bellanoir
[11/225] Bellanoir Libero
[12/225] Blazamut
[13/225] Blazamut Ryu
[14/225] Blazehowl
[15/225] Blazehowl Noct
[16/225] Blue Slime
[17/225] Braloha
[18/225] Bristla
[19/225] Broncherry
[20/225] Broncherry Aqua
[21/225] Bushi
[22/225] Bushi Noct
[23/225] Caprity
[24/225] Caprity Noct
[25/225] Cattiva
[26/225] Cave Bat
[27/225] Cawgnito
[28/225] Celaray
[29/225] Celaray Lux
[30/225] Celesdir
[31/225] Chikipi
[32/225] Chillet
[33/225] Chillet Ignis
[34/225] Cinnamoth
[35/225] Cremis
[36/225] Croajiro
[37/225] Croajiro Noct
[38/225] Cryolinx
[39/225] Cryolinx Terra
[40/225] Daedream
[41/225] Dazemu
[42/225] Dazzi
[43/225] Dazzi Noct
[44/225] Demon Eye
[45/225] Depresso
[46/225] Digtoise
[47/225] Dinossom
[48/225] Dinossom Lux
[49/225] Direhowl
[50/225] Dogen
[51/225] Dumud
[52/225] Dumud Gild
[53/225] Eikthyrdeer
[54/225] Eikt