In [None]:
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
                                        WebDriverException)
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException


import sqlite3
import re

In [None]:
# Chrome launch options shared by every webdriver session in this notebook.
chrome_opt = Options()
for _chrome_flag in (
    "--incognito",  # incognito mode; intended to avoid personalized ads and speed up browsing
    "--headless",  # headless mode (no browser window)
    "disable-extensions",  # disable the extensions installed in Chrome
    "disable-popup-blocking",  # Chrome's pop-up handling flag (original note: suppress pop-up windows)
    "disable-infobars",  # hide the "Chrome is being controlled by automated software" notice
):
    chrome_opt.add_argument(_chrome_flag)

# Card-search listing URL that the crawler starts from.
# The pieces are joined by implicit string concatenation: a backslash
# continuation inside the literal would keep the next line's leading spaces
# and embed them in the query string.
#
# Alternative covering all regulations:
# url = (
#     "https://www.pokemon-card.com/card-search/index.php?"
#     "keyword=&se_ta=&regulation_sidebar_form=all&pg=&illust=&sm_and_keyword=true"
# )
url = (
    "https://www.pokemon-card.com/card-search/index.php?"
    "keyword=&se_ta=&regulation_sidebar_form=XY&pg=&illust=&sm_and_keyword=true"
)  # standard

In [None]:
import collections

# Japanese section heading found on a card detail page -> English card-type
# label used throughout this notebook. check_card_type() compares page
# headings against these keys, so a card showing the "ワザ" (attacks) heading
# is labelled "pokemon".
# "グッズ" is the Item category and "ポケモンのどうぐ" is the Pokémon Tool
# category; the two labels were previously swapped.
head_text_mapping_jp_en = {
    "ワザ": "pokemon",
    "グッズ": "item",
    "ポケモンのどうぐ": "tool",
    "スタジアム": "stadium",
    "サポート": "supporter",
    "基本エネルギー": "basic_energy",
    "特殊エネルギー": "special_energy",
}
# Reverse lookup: English card-type label -> Japanese heading text.
head_text_mapping_en_jp = {en: jp for jp, en in head_text_mapping_jp_en.items()}


def process_type_icon(type_icon_str):
    """Return the type name embedded in an icon CSS class string.

    Searches *type_icon_str* for the first ``icon-<word>`` token and returns
    the ``<word>`` part. Returns the string ``'NaN'`` when no such token is
    present.
    """
    found = re.search(r'icon-(\w+)', type_icon_str)
    if found is None:
        return 'NaN'
    return found.group(1)


def extract_ability(driver):
    """Return the card's ability as ``"<name>: <description>"``.

    Looks up the ``<h2>`` whose text contains '特性', then reads the first
    following-sibling ``<h4>`` (name) and ``<p>`` (description). Returns the
    string ``"NaN"`` when any of those elements is missing.
    """
    try:
        header = driver.find_element(By.XPATH, "//h2[contains(text(), '特性')]")
        name = header.find_element(By.XPATH, "following-sibling::h4").text
        description = header.find_element(By.XPATH, "following-sibling::p").text
    except NoSuchElementException:
        return "NaN"

    return f"{name}: {description}"

def extract_attack(driver):
    """Return one formatted string per attack listed on the card page.

    Each entry has the form ``"<cost types>, <name and damage>, <description>"``
    where the cost types are the space-joined results of
    ``process_type_icon`` for every icon span inside the attack's ``<h4>``.
    If an element lookup raises ``NoSuchElementException`` part-way through,
    ``"NaN"`` is appended and the attacks gathered so far are returned.
    """
    attacks = []

    try:
        headings = driver.find_elements(
            By.XPATH, "//h2[contains(text(), 'ワザ')]/following-sibling::h4"
        )
        for heading in headings:
            # Energy-cost icons live in <span> children of the heading.
            cost_types = ' '.join(
                process_type_icon(span.get_attribute('class'))
                for span in heading.find_elements(By.XPATH, ".//span[contains(@class, 'icon')]")
            )

            # The heading text holds the name and damage on separate lines.
            name_and_damage = heading.text.replace('\n', ', ')

            # The effect text is the first <p> after the heading.
            effect = heading.find_element(By.XPATH, "following-sibling::p[1]").text.strip()

            attacks.append(", ".join([cost_types, name_and_damage, effect]))
    except NoSuchElementException:
        attacks.append("NaN")

    return attacks

def extract_hp_type(driver):
    """Return the card's type, read from the icon next to the 'タイプ' label.

    Finds the ``<span>`` whose text contains 'タイプ', then an icon ``<span>``
    under the same parent, and converts its CSS class with
    ``process_type_icon``. Returns ``"NaN"`` when either element is missing.
    """
    try:
        label = driver.find_element(By.XPATH, "//span[contains(text(), 'タイプ')]")
        icon = label.find_element(By.XPATH, "../span[contains(@class, 'icon')]")
    except NoSuchElementException:
        return "NaN"

    return process_type_icon(icon.get_attribute('class'))

def extract_img_url(driver):
    """Return the ``src`` of the card image (the element with class 'fit').

    Waits up to 10 seconds for the image to become visible. Returns the
    string ``"NaN"`` when the image does not appear in time or has no
    ``src`` attribute.
    """
    try:
        fit = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'fit'))
        )
    except (TimeoutException, NoSuchElementException):
        # WebDriverWait signals a missing element with TimeoutException,
        # not NoSuchElementException.
        return "NaN"

    return fit.get_attribute('src') or "NaN"

def extract_card_code(driver):
    """Return ``"<product code>-<card code>"`` for the card on the page.

    The product code is the second-to-last path segment of the card image
    URL (element with class 'fit'); the card code is the text of the element
    with class 'subtext' with all spaces removed. Each element is waited on
    for up to 10 seconds. Returns ``"NaN"`` when either element does not
    appear in time or the image URL cannot be split into a product code.
    """
    try:
        fit = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'fit'))
        )
        subtext_div = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'subtext'))
        )
    except (TimeoutException, NoSuchElementException):
        # WebDriverWait signals a missing element with TimeoutException,
        # not NoSuchElementException.
        return "NaN"

    img_url = fit.get_attribute('src')
    if not img_url:
        return "NaN"

    path_parts = img_url.split('/')
    if len(path_parts) < 2:
        return "NaN"

    product_code = path_parts[-2]
    card_code = subtext_div.text.strip().replace(' ', '')
    return f'{product_code}-{card_code}'

def extract_rarity_code(driver):
    """Return the rarity code parsed from the rarity icon's image URL.

    Finds the first ``<img>`` whose ``src`` contains 'ic_rare' and returns the
    text between ``ic_rare_`` and ``.gif``. Returns ``"NaN"`` when no such
    image exists or its ``src`` does not match that pattern.
    """
    try:
        rarity_img = driver.find_element(By.XPATH, "//img[contains(@src, 'ic_rare')]")
    except NoSuchElementException:
        return "NaN"

    found = re.search(r'ic_rare_(.+?)\.gif', rarity_img.get_attribute('src'))
    if found:
        return found.group(1)
    return "NaN"

def extract_pokemon_desc(driver):
    """Return the flavour block of a Pokémon card as a single string.

    Joins, with single spaces, the text of the ``.card h4`` element and of
    the first and second ``<p>`` inside ``.card``, replacing full-width
    spaces (U+3000) with ASCII spaces. Returns ``"NaN"`` when any of the
    three elements is missing.
    """
    selectors = ('.card h4', '.card p:nth-of-type(1)', '.card p:nth-of-type(2)')
    try:
        pieces = [driver.find_element(By.CSS_SELECTOR, selector).text for selector in selectors]
    except NoSuchElementException:
        return "NaN"

    return ' '.join(pieces).replace('\u3000', ' ')
    
def extract_desc(driver, head):
    """Return the description paragraph for a non-Pokémon card.

    *head* is an English card-type label; it is translated to the Japanese
    heading text via ``head_text_mapping_en_jp`` (empty string when unknown).
    The first ``<p>`` following the ``<h2>`` containing that text is returned
    with full-width spaces replaced by ASCII spaces and newlines removed.
    Returns ``"NaN"`` when the paragraph cannot be found.
    """
    heading_jp = head_text_mapping_en_jp.get(head, '')
    xpath = f"//h2[contains(text(), '{heading_jp}')]/following-sibling::p[1]"
    try:
        text = driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "NaN"

    return text.replace('\u3000', ' ').replace('\n', '')

def extract_special_rule(driver):
    """Return which special-rule category the card belongs to.

    Scans every ``<p>`` that follows the ``<h2>`` containing '特別なルール' and
    returns the first rule label found in a paragraph's text. Returns
    ``'NaN'`` when the heading is missing or no paragraph mentions a known
    label.
    """
    # Order matters: 'ポケモンV' is a prefix of the VMAX / VSTAR / V-UNION
    # labels, so the longer labels must be tested before it or they could
    # never be returned.
    rule_labels = (
        'ポケモンex',
        'ポケモンVMAX',
        'ポケモンVSTAR',
        'ポケモンV-UNION',
        'ポケモンV',
        'かがやくポケモン',
    )

    try:
        special_rules_header = driver.find_element(By.XPATH, "//h2[contains(text(), '特別なルール')]")
        paragraphs = special_rules_header.find_elements(By.XPATH, "following-sibling::p")
        for paragraph in paragraphs:
            text = paragraph.text
            for label in rule_labels:
                if label in text:
                    return label
        return 'NaN'
    except NoSuchElementException:
        return 'NaN'
    
def _read_type_cell(table, column):
    """Return '<type><cell text>' for *column* of the table's second row, or 'NaN'."""
    cell_xpath = f".//tr[2]/td[{column}]"
    try:
        icon_class = table.find_element(By.XPATH, f"{cell_xpath}/span").get_attribute('class')
        return process_type_icon(icon_class) + table.find_element(By.XPATH, cell_xpath).text
    except NoSuchElementException:
        return 'NaN'


def extract_weak_resis_retreat(driver):
    """Return the card's weakness, resistance and retreat cost.

    Reads the second row of the table inside ``div.RightBox``:

    * column 1 -> ``'weakness'``: icon type followed by the cell text
    * column 2 -> ``'resistance'``: icon type followed by the cell text
    * column 3 -> ``'retreat'``: icon types counted and rendered as
      ``"<type>x<count>"`` entries joined by ``", "``

    Every value defaults to ``'NaN'`` and stays that way when the table, the
    cell icon, or (for retreat) any icon is missing.
    """
    data = {'weakness': 'NaN', 'resistance': 'NaN', 'retreat': 'NaN'}

    try:
        table = driver.find_element(By.CSS_SELECTOR, 'div.RightBox table')
    except NoSuchElementException:
        # No stats table at all: report every field as missing instead of raising.
        return data

    data['weakness'] = _read_type_cell(table, 1)
    data['resistance'] = _read_type_cell(table, 2)

    # find_elements returns an empty list rather than raising, so an explicit
    # emptiness check is what keeps the 'NaN' default for cards without a
    # retreat cost.
    retreat_icons = table.find_elements(By.XPATH, ".//tr[2]/td[3]/span[contains(@class, 'icon')]")
    if retreat_icons:
        counts = collections.Counter(
            process_type_icon(icon.get_attribute('class')) for icon in retreat_icons
        )
        data['retreat'] = ", ".join(f"{key}x{value}" for key, value in counts.items())

    return data

def check_card_type(driver):
    """Return the English card-type label for the card on the current page.

    Waits up to 10 seconds for the elements with class 'mt20' to be visible,
    then returns the ``head_text_mapping_jp_en`` value for the first one whose
    text exactly matches a known Japanese heading. Returns ``"NaN"`` when the
    headings do not appear in time or none of them is a known heading.
    """
    try:
        subheadings = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, 'mt20'))
        )
    except (TimeoutException, NoSuchElementException):
        # WebDriverWait signals a missing element with TimeoutException,
        # not NoSuchElementException.
        return "NaN"

    for subheading in subheadings:
        # One text read and one dict lookup per heading instead of re-reading
        # the element text for every mapping key.
        card_type = head_text_mapping_jp_en.get(subheading.text)
        if card_type is not None:
            return card_type
    return "NaN"

# =====
def extract_detail_info_jp(url):
    """Scrape one card detail page and return its fields as a dict.

    Opens *url* in a fresh Chrome session configured with ``chrome_opt``.
    Every card gets ``card_type``, ``card_name_jp``, ``img_url_jp``,
    ``card_code_jp`` and ``rarity_code``. Pokémon cards additionally get
    ``desc_jp``, ``type``, ``hp``, ``hp_type``, ``attacks``, ``ability``,
    ``special_rule`` plus the weakness/resistance/retreat keys; tool, stadium,
    supporter, item and special-energy cards get ``desc_jp`` only.

    Raises:
        TimeoutException: if the card-name heading does not become visible
            within 10 seconds.
        NoSuchElementException: if a Pokémon card page lacks the ``.type`` or
            ``hp-num`` element.
    """
    detail_info = {}

    with webdriver.Chrome(options=chrome_opt) as driver:
        driver.implicitly_wait(2)
        driver.get(url)
        card_type = check_card_type(driver)
        # The heading carries two classes, so it is addressed with a CSS
        # selector; By.CLASS_NAME is meant for a single class name.
        heading = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, '.Heading1.mt20'))
        )

        # Fields common to every card type.
        detail_info['card_type'] = card_type
        detail_info['card_name_jp'] = heading.text
        detail_info['img_url_jp'] = extract_img_url(driver)
        detail_info['card_code_jp'] = extract_card_code(driver)
        detail_info['rarity_code'] = extract_rarity_code(driver)

        if card_type == "pokemon":
            detail_info['desc_jp'] = extract_pokemon_desc(driver)
            detail_info['type'] = driver.find_element(By.CSS_SELECTOR, '.type').text
            detail_info['hp'] = driver.find_element(By.CLASS_NAME, 'hp-num').text
            detail_info['hp_type'] = extract_hp_type(driver)
            detail_info['attacks'] = extract_attack(driver)
            detail_info['ability'] = extract_ability(driver)
            detail_info['special_rule'] = extract_special_rule(driver)
            detail_info.update(extract_weak_resis_retreat(driver))
        elif card_type in ["tool", "stadium", "supporter", "item", "special_energy"]:
            # Basic energy (and unrecognised types) have no description to read.
            detail_info['desc_jp'] = extract_desc(driver, card_type)

    return detail_info

# Sample detail pages, one per card type, used to smoke-test the extractor.
urls = [
    "https://www.pokemon-card.com/card-search/details.php/card/45186/regu/XY", # pokemon
    "https://www.pokemon-card.com/card-search/details.php/card/44501/regu/XY", # tool
    "https://www.pokemon-card.com/card-search/details.php/card/41298/regu/XY", # stadium
    "https://www.pokemon-card.com/card-search/details.php/card/42203/regu/XY", # supporter
    "https://www.pokemon-card.com/card-search/details.php/card/42183/regu/XY", # item
    "https://www.pokemon-card.com/card-search/details.php/card/44968/regu/XY", # special energy
    "https://www.pokemon-card.com/card-search/details.php/card/44950/regu/XY", # basic energy
]

# The loop variable is deliberately not called `url`: that name holds the
# card-search listing URL which the crawl cell loads later, and rebinding it
# here would point the crawler at the last sample page instead.
for detail_url in urls:
    detail_info = extract_detail_info_jp(detail_url)
    display(detail_info)

# Test cases to cover:
# - Pokémon: basic, and special-rule cards (ex, V, "kaka" — presumably かがやく; confirm)
#   - with an ability
#   - with 2 attacks
#   - without a retreat cost
#   - without resistance
#   - without weakness
# - energy and trainer cards (supporter, tool, item, stadium)



In [None]:
# Crawl the card-search listing page by page and derive each card's detail URL.
with webdriver.Chrome(options=chrome_opt) as driver:
    # Init driver.
    # NOTE: `url` must hold the card-search listing URL at this point; any
    # earlier cell that rebinds `url` sends the crawler to the wrong page.
    driver.implicitly_wait(2)
    driver.get(url)

    is_next = True
    while is_next:
        list_items = WebDriverWait(driver, 10).until(
                EC.visibility_of_all_elements_located((By.CLASS_NAME, 'List_item'))
        )

        # Remembered so that, after clicking "next", we can tell when the
        # listing has actually been replaced by the following page.
        first_data_src = None

        for list_item in list_items:
            try:
                img_item = WebDriverWait(list_item, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'li.List_item img[data-src]'))
                )
                # e.g. /assets/images/card_images/large/SV2a/043491_P_ZENIGAME.jpg
                data_src = img_item.get_attribute('data-src')
            except StaleElementReferenceException:
                # Retrying on the same element cannot succeed once it is stale.
                print("Stale list item, skipping")
                continue
            except TimeoutException:
                print("No image item found within the wait time")
                continue

            print(data_src)
            if first_data_src is None:
                first_data_src = data_src

            try:
                # The numeric prefix of the image file name is the card id.
                card_id = int(data_src.split('/')[-1].split('_')[0])
            except ValueError:
                print(f"Unexpected image path, skipping: {data_src}")
                continue

            # e.g. https://www.pokemon-card.com/card-search/details.php/card/45147/regu/XY
            detail_info_url = f"https://www.pokemon-card.com/card-search/details.php/card/{card_id}/regu/XY"
            # TODO: scrape detail_info_url and inject the info into the db

        # Pagination happens once per page, after every item has been handled
        # (previously it only ran when an image lookup timed out, so a fully
        # loaded page was re-read forever).
        try:
            next_page_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//li[contains(.,"次のページ")]'))
            )
        except TimeoutException:
            print("No next page button found within the wait time")
            is_next = False
            continue

        try:
            next_page_button.click()
        except WebDriverException:
            print("Next page button could not be clicked")
            is_next = False
            continue
        print("Next page button found")

        # Wait for the listing to change before reading it again; if it never
        # does, treat this as the last page so the loop terminates.
        try:
            WebDriverWait(
                driver, 10, ignored_exceptions=[StaleElementReferenceException]
            ).until(
                lambda drv: drv.find_element(
                    By.CSS_SELECTOR, 'li.List_item img[data-src]'
                ).get_attribute('data-src') != first_data_src
            )
        except TimeoutException:
            print("Listing did not change after clicking next page; stopping")
            is_next = False