In [2]:
import asyncio
import contextlib
import itertools
import os

import polars as pl
from pydoll.browser.chrome import Chrome
from pydoll.browser.options import Options
from pydoll.constants import By
from pydoll.element import WebElement
from tqdm import tqdm

# Number of detail pages opened concurrently in one batch while scraping.
SCRAPE_CHUNK_PAGES = 60
# Root of the local repository checkout; parquet outputs are written under
# <repo_path>/data. Set the MH_WILDS_REPO_PATH environment variable to run the
# notebook on another machine; the fallback is the original hard-coded path.
repo_path = os.environ.get(
    "MH_WILDS_REPO_PATH", r"D:\projets_python_ssd\Sencrop\perso\MH_Wilds_tools"
)

In [3]:
# Chrome launch settings shared by every scraping cell in this notebook.
headless = False
chrome_user_data = os.path.join(os.getcwd(), "chrome_user_data")

# Command-line switches, in the order they are added to `options`.
# Switches tried earlier and currently disabled: --no-sandbox,
# --disable-blink-features=AutomationControlled, --remote-debugging-port=9223,
# and the "goog:loggingPrefs" performance-log capability.
chrome_arguments = [
    "--mute-audio",
    "--disable-dev-shm-usage",
    # The browser profile is stored under the current working directory.
    f"--user-data-dir={chrome_user_data}",
    "--profile-directory=Default",
    "--referer=https://patreon.com",
]
if headless:
    # Headless mode is opt-in and, when enabled, is the first switch added.
    chrome_arguments.insert(0, "--headless")

options = Options()
for chrome_argument in chrome_arguments:
    options.add_argument(chrome_argument)

In [4]:
async def get_quest_rewards(
    quest: dict[str, str], browser, root_url: str
) -> dict[str, object]:
    """
    Open a quest's detail page and attach its reward list to the quest record.

    Args:
        quest: Quest record containing at least an "href" key, which is appended
            to ``root_url`` to build the page URL. The dict is updated in place
            with a "rewards" key.
        browser: Object exposing ``get_page()``; one page is opened for this call
            and closed before returning, including when an error occurs.
        root_url: Site root prepended to ``quest["href"]``.

    Returns:
        The same ``quest`` dict, with "rewards" set to a list of
        ``{"item": str, "quantity": int}`` dicts. The quantity is 1 when the cell
        text holds nothing besides the item name.

    Raises:
        ValueError: If the text left after removing the item name and "x" is not
            an integer.
    """
    href = quest["href"]
    page = await browser.get_page()
    try:
        await page.go_to(url=f"{root_url}{href}")

        # The reward rows are looked up inside the parent of the "Récompenses" heading.
        heading = await page.find_element(
            By.XPATH,
            "//h3[contains(text(), 'Récompenses')]",
        )
        container = await heading.find_element(By.XPATH, "..")

        rewards = []
        for row in await container.find_elements(By.TAG_NAME, "tr"):
            cells = await row.find_elements(By.TAG_NAME, "td")
            if not cells:
                # Rows without any <td> (e.g. a header row) carry no reward.
                continue
            first_cell = cells[0]
            link = await first_cell.find_element(By.TAG_NAME, "a")
            item = await link.get_element_text()
            cell_text = await first_cell.get_element_text()
            # What remains after removing the item name and the "x" marker is
            # the quantity; an empty remainder means a single item.
            quantity_text = cell_text.replace(item, "").strip().replace("x", "").strip()
            rewards.append(
                {"item": item, "quantity": int(quantity_text) if quantity_text else 1}
            )
        quest["rewards"] = rewards
        return quest
    finally:
        # Release the tab even when a lookup or parse above fails, so failed
        # quests do not leave pages open in the shared browser.
        await page.close()


async def get_quest_details(quest: WebElement) -> dict[str, str]:
    """
    Read a quest's name and link from one row of the quest table.

    The link is the first ``<a>`` found inside the row's first ``<td>``.

    Returns:
        A dict with "name" (the link text) and "href" (the link's ``href``
        attribute).
    """
    first_cell = (await quest.find_elements(By.TAG_NAME, "td"))[0]
    link = await first_cell.find_element(By.TAG_NAME, "a")
    name = await link.get_element_text()
    href = link.get_attribute("href")
    return {"name": name, "href": href}

In [5]:
root_url = "https://mhwilds.kiranico.com"
quest_page = f"{root_url}/fr/data/missions"

async with Chrome(options=options) as browser:
    await browser.start()
    page = await browser.get_page()
    await page.go_to(url=quest_page, timeout=10)

    quest_scroll_element = await page.find_element(
        By.XPATH, "/html/body/div[1]/div/div/div[2]/div/div[2]"
    )
    quests = await quest_scroll_element.find_elements(By.TAG_NAME, "tr")

    all_quests = await asyncio.gather(*[get_quest_details(quest) for quest in quests])

    all_quests_rewards = []
    for i in tqdm(range(0, len(all_quests) + 1, SCRAPE_CHUNK_PAGES)):
        chunk = all_quests[i : i + SCRAPE_CHUNK_PAGES]
        quest_rewards_chunk = await asyncio.gather(
            *(
                get_quest_rewards(
                    quest=quest,
                    browser=browser,
                    root_url=root_url,
                )
                for quest in chunk
            )
        )
        all_quests_rewards.extend(quest_rewards_chunk)
    quest_df = (
        pl.DataFrame(all_quests_rewards)
        #
        .explode("rewards")
        .with_columns(
            pl.col("rewards").struct.field("item").alias("item"),
            pl.col("rewards").struct.field("quantity").alias("quantity"),
        )
        .drop("rewards")
        .filter(pl.col("item").is_not_null())
        .sort("item", "quantity", descending=[False, True])
        .to_pandas()
        .get(["name", "item", "quantity"])
    )
    quest_df.to_parquet(os.path.join(repo_path, "data", "quests.parquet"))


100%|██████████| 2/2 [00:15<00:00,  7.93s/it]


# Scrape armors

In [6]:
async def extract_armor_data(
    browser,
    root_url: str,
    href: str,
) -> list[dict[str, object]]:
    """
    Extract armor data from a web page, parsing talent table rows to collect piece details.

    Navigates to ``root_url + href``, locates the table whose header contains
    "Talents de l'équipement", and reads every row after the first one.

    Args:
        browser: Object exposing ``get_page()``; one page is opened for this call
            and closed before returning, including when an error occurs.
        root_url: Site root prepended to ``href``.
        href: Path of the armor series page.

    Returns:
        One dict per table row, with keys:
        - 'piece' and 'name': text of the first two cells;
        - 'jewels': dict mapping slot level "0".."4" to the number of slots of
          that level;
        - 'talents': list of ``{"talent_name": str, "talent_level": int}``.
        Each piece appears exactly once; a piece whose talent links cannot be
        read is returned with an empty 'talents' list.

    Raises:
        ValueError: If a row does not have exactly four cells, or a talent label
            does not end with "+<integer>".
        KeyError: If the jewel cell contains a character other than "[", "]"
            and "0".."4".
    """
    page = await browser.get_page()
    try:
        await page.go_to(url=f"{root_url}{href}")

        # Get skill table: header cell -> its row -> the element holding all rows.
        header_cell = await page.find_element(
            By.XPATH,
            '//th[contains(text(), "Talents de l\'équipement")]',
        )
        header_row = await header_cell.find_element(By.XPATH, "..")
        talent_table = await header_row.find_element(By.XPATH, "..")

        all_pieces = []
        # The first <tr> is skipped (it is not parsed as a piece).
        talent_table_rows = (await talent_table.find_elements(By.TAG_NAME, "tr"))[1:]
        for talent_table_row in talent_table_rows:
            piece, name, jewels, talent = await talent_table_row.find_elements(
                By.TAG_NAME, "td"
            )
            piece, name, jewels = await asyncio.gather(
                *[x.get_element_text() for x in (piece, name, jewels)]
            )

            # Extract jewel levels: count how many slots of each level appear.
            all_jewels = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0}
            for char in jewels.replace("[", "").replace("]", ""):
                all_jewels[char] += 1

            # Extract talent information. The lookup is best-effort, so start
            # from an empty list: otherwise a failure would leave `talents`
            # unbound on the first row, or silently reuse the previous row's
            # talents on later rows.
            talents = []
            with contextlib.suppress(Exception):
                talents = await talent.find_elements(By.TAG_NAME, "a")

            all_talents = []
            for _talent in talents:
                label = await _talent.get_element_text()
                # Labels look like "<name> +<level>"; split on the last "+".
                talent_name, _, talent_level = label.rpartition("+")
                all_talents.append(
                    {
                        "talent_name": talent_name.strip(),
                        "talent_level": int(talent_level),
                    }
                )

            # Append once per piece (not once per talent) so pieces are not
            # duplicated and pieces without talents are still reported.
            all_pieces.append(
                {
                    "piece": piece,
                    "name": name,
                    "jewels": all_jewels,
                    "talents": all_talents,
                }
            )
        return all_pieces
    finally:
        # Release the tab even when parsing fails part-way through.
        await page.close()


root_url = "https://mhwilds.kiranico.com"
armor_page = f"{root_url}/fr/data/armor-series"

async with Chrome(options=options) as browser:
    await browser.start()
    page = await browser.get_page()
    await page.go_to(armor_page)

    scroll_element = await page.find_element(By.TAG_NAME, "table")
    hrefs = await scroll_element.find_elements(By.TAG_NAME, "a")
    hrefs = [element.get_attribute("href") for element in hrefs]

    all_armor_data = []
    for i in tqdm(range(0, len(hrefs) + 1, SCRAPE_CHUNK_PAGES)):
        chunk = hrefs[i : i + SCRAPE_CHUNK_PAGES]
        armor_data_chunk = await asyncio.gather(
            *(
                extract_armor_data(
                    browser=browser,
                    root_url=root_url,
                    href=href,
                )
                for href in chunk
            )
        )
        all_armor_data.extend(armor_data_chunk)

    all_armor_data = list(itertools.chain.from_iterable(all_armor_data))


100%|██████████| 3/3 [00:30<00:00, 10.13s/it]


In [None]:
# Flatten the scraped armor records into one row per (piece, talent) and
# expand the jewel-slot counts into jewel_0 .. jewel_4 columns.
talent_columns = [
    pl.col("talents").struct.field(field_name).alias(field_name)
    for field_name in ("talent_name", "talent_level")
]
jewel_columns = [
    pl.col("jewels").struct.field(slot_level).alias(f"jewel_{slot_level}")
    for slot_level in ("0", "1", "2", "3", "4")
]

armor_pieces = (
    pl.DataFrame(all_armor_data)
    # One row per talent, then lift the talent struct fields to columns.
    .explode("talents")
    .with_columns(talent_columns)
    .drop("talents")
    # Lift each jewel level count out of the jewels struct.
    .with_columns(jewel_columns)
    .drop("jewels")
    # Remove exact duplicate rows.
    .unique()
)

armor_pieces_path = os.path.join(repo_path, "data", "armor_pieces.parquet")
armor_pieces.write_parquet(armor_pieces_path)
armor_pieces


piece,name,talent_name,talent_level,jewel_0,jewel_1,jewel_2,jewel_3,jewel_4
str,str,str,i64,i64,i64,i64,i64,i64
"""Tête""","""Masque d'espoir""","""Crâne d'acier""",1,3,0,0,0,0
"""Torse""","""Cotte d'espoir""","""Bénédiction""",1,3,0,0,0,0
"""Bras""","""Avant-bras d'espoir""","""Bénédiction""",1,3,0,0,0,0
"""Taille""","""Tassette d'espoir""","""Crâne d'acier""",1,3,0,0,0,0
"""Jambes""","""Grèves d'espoir""","""Bénédiction""",1,3,0,0,0,0
…,…,…,…,…,…,…,…,…
"""Tête""","""Mimiflore α""","""Embuscade""",1,2,1,0,0,0
"""Tête""","""Heaume d'expédition α""","""Totem élémentaire""",1,2,1,0,0,0
"""Tête""","""Heaume d'expédition α""","""Union""",1,2,1,0,0,0
"""Tête""","""Heaume d'expédition α""","""Totem élémentaire""",1,2,1,0,0,0


In [10]:
# Display-only check: `armor_pieces` already had `.unique()` applied when it was
# built, so this should show the same set of rows (row order may differ).
armor_pieces.unique()

piece,name,talent_name,talent_level,jewel_0,jewel_1,jewel_2,jewel_3,jewel_4
str,str,str,i64,i64,i64,i64,i64,i64
"""Taille""","""Tassette Congalala α""","""Pelage de renforcement""",1,2,1,0,0,0
"""Bras""","""Gants de mailles α""","""Géologiste""",1,2,1,0,0,0
"""Bras""","""Avant-bras Commission α""","""Rengainage éclair""",2,2,1,0,0,0
"""Bras""","""Avant-bras Vespoid α""","""Embuscade""",1,3,0,0,0,0
"""Torse""","""Cotte Uth Duna""","""Performance optimale""",1,2,1,0,0,0
…,…,…,…,…,…,…,…,…
"""Bras""","""Avant-bras Melahoa α""","""Sagesse transmise""",1,2,0,1,0,0
"""Jambes""","""Grèves Xu Wu α""","""Poussée d'adrénaline""",1,1,1,1,0,0
"""Tête""","""Heaume Quematrice""","""Maîtrise des écailles""",1,3,0,0,0,0
"""Tête""","""Heaume Congalala""","""Mycologue extrême""",1,3,0,0,0,0


# Charms

In [8]:
# Site root (repeated so this cell does not depend on the earlier scraping
# cells) and the charms index page scraped below.
root_url = "https://mhwilds.kiranico.com"
charms_url = f"{root_url}/fr/data/charms"


async def extract_charm_row_data(charm_element: WebElement) -> dict[str, str]:
    """
    Read a charm's name and link from one row of the charms table.

    The first ``<a>`` inside the row is looked up once and used for both values
    (the previous version queried the browser twice for the same element).

    Returns:
        A dict with "name" (the link text) and "href" (the link's ``href``
        attribute).
    """
    link = await charm_element.find_element(By.TAG_NAME, "a")
    return {
        "name": await link.get_element_text(),
        "href": link.get_attribute("href"),
    }


async def extract_charm_data(
    browser,
    root_url: str,
    charm_element: dict[str, str],
) -> dict[str, object]:
    """
    Open a charm's detail page and attach its talents to the charm record.

    Args:
        browser: Object exposing ``get_page()``; one page is opened for this call
            and closed before returning, including when an error occurs.
        root_url: Site root prepended to ``charm_element["href"]``.
        charm_element: Charm record containing at least an "href" key. The dict
            is updated in place with a "talents" key.

    Returns:
        The same ``charm_element`` dict, with "talents" set to a list of
        ``{"name": str, "lvl": int}`` dicts, one per row of the page's first
        ``<tbody>``.

    Raises:
        ValueError: If a row does not have exactly three cells, or the level
            cell is not "Lv" followed by an integer.
    """
    page = await browser.get_page()
    try:
        await page.go_to(f"{root_url}{charm_element['href']}")

        talent_table = await page.find_element(By.TAG_NAME, "tbody")
        rows = await talent_table.find_elements(By.TAG_NAME, "tr")

        charm_talents = []
        for row in rows:
            # Each row has three cells; the third one is not used, so its
            # text is no longer fetched from the browser.
            name_cell, lvl_cell, _desc_cell = await row.find_elements(
                By.TAG_NAME, "td"
            )
            name = await name_cell.get_element_text()
            lvl_text = await lvl_cell.get_element_text()
            charm_talents.append(
                {"name": name, "lvl": int(lvl_text.replace("Lv", "").strip())}
            )
        charm_element["talents"] = charm_talents
        return charm_element
    finally:
        # Release the tab even when a lookup or parse above fails.
        await page.close()


async with Chrome(options=options) as browser:
    await browser.start()
    page = await browser.get_page()
    await page.go_to(url=charms_url)

    scroll_element = await page.find_element(By.TAG_NAME, "table")
    charm_elements = await page.find_elements(By.TAG_NAME, "tr")

    all_charms = await asyncio.gather(
        *[extract_charm_row_data(charm_element) for charm_element in charm_elements]
    )

    all_charm_data = []
    for i in tqdm(range(0, len(all_charms) + 1, SCRAPE_CHUNK_PAGES)):
        chunk = all_charms[i : i + SCRAPE_CHUNK_PAGES]
        charm_data_chunk = await asyncio.gather(
            *[
                extract_charm_data(
                    browser=browser,
                    root_url=root_url,
                    charm_element=charm_element,
                )
                for charm_element in chunk
            ]
        )
        all_charm_data.extend(charm_data_chunk)


100%|██████████| 3/3 [00:27<00:00,  9.06s/it]


In [9]:
# Flatten the scraped charms into one row per (charm, talent) and save them.
# Built from `all_charm_data` (the records returned by the detail scrape) rather
# than `all_charms`, which only carried "talents" as a side effect of the
# scraper updating the same dicts in place.
charms_data = (
    pl.DataFrame(all_charm_data)
    # One row per talent, then lift the talent struct fields to columns.
    .explode("talents")
    .with_columns(
        pl.col("talents").struct.field("name").alias("talent_name"),
        pl.col("talents").struct.field("lvl").alias("talent_lvl"),
    )
    .drop("talents")
    # Alphabetical by talent, highest level first within each talent.
    .sort("talent_name", "talent_lvl", descending=[False, True])
)
charms_data.write_parquet(os.path.join(repo_path, "data", "charms.parquet"))
charms_data


name,href,talent_name,talent_lvl
str,str,str,i64
"""Talisman d'absorption""","""/fr/data/charms/talisman-dabso…","""Absorption élémentaire""",1
"""Talisman de survie III""","""/fr/data/charms/talisman-de-su…","""Ami de la nature""",3
"""Talisman de survie II""","""/fr/data/charms/talisman-de-su…","""Ami de la nature""",2
"""Talisman de survie""","""/fr/data/charms/talisman-de-su…","""Ami de la nature""",1
"""Talisman anti-immobilisation I…","""/fr/data/charms/talisman-anti-…","""Anti-immobilisation""",3
…,…,…,…
"""Talisman d'incision II""","""/fr/data/charms/talisman-dinci…","""Écorcheur""",2
"""Talisman d'incision""","""/fr/data/charms/talisman-dinci…","""Écorcheur""",1
"""Talisman d'eau III""","""/fr/data/charms/talisman-deau-…","""Étanchéité""",3
"""Talisman d'eau II""","""/fr/data/charms/talisman-deau-…","""Étanchéité""",2
