# Day31
## Selenium 物件定位 – XPath
- 在 Selenium 中使用 XPath
- 盤點 XPath 語法



## 作業說明
練習更多在 selenium 中使用 XPath 的變化用法
- 目標網站： https://channel.jd.com/outdoor.html

目標：
- 取得 Day29 作業所有小類別名稱下的
  - 品牌列表（名稱、連結）

![](https://i.imgur.com/SbV4W35.png)

Hint: 
- 請根據引導完成這份代碼
- 記得先安裝 Chrome 瀏覽器，才能順利啟動 chromedriver
- 會用到我們 Day20 所學的 Xpath

### 套件安裝

In [1]:
!pip install -U selenium
!pip install webdriver_manager
!pip install fake-useragent



### 套件導入

In [2]:
from fake_useragent import UserAgent
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
import time
from tqdm import tqdm

In [3]:
from selenium.webdriver.chrome.service import Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

### 使用 fake-useragent 產生 User Agent

In [5]:
driver = webdriver.Chrome(service=service)

# 目標網址
base_url = 'https://channel.jd.com/outdoor.html'
driver.get(base_url)

opt = webdriver.ChromeOptions()
user_agent = UserAgent()
opt.add_argument('--user-agent=%s' % user_agent)

driver.close()

### 獲取所有小類別頁面連結 
> Day29/Day30 內容，可自行載入儲存的爬取表格，也可以再爬一次

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

# ✅ 設定 Chrome 選項
options = Options()
options.add_argument("--headless")  # 無頭模式 (不顯示瀏覽器)
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")  # 避免載入手機版

# ✅ 啟動 WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# ✅ 目標網址
base_url = 'https://channel.jd.com/outdoor.html'
driver.get(base_url)

# ✅ 等待網頁完全加載 (最多等待 15 秒)
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, "Categorys"))
)

# ✅ 爬取所有大分類 (`dt`)
categories = driver.find_elements(By.XPATH, '//div[@id="Categorys"]//dl[@class="item-inner"]/dt')

# ✅ 爬取所有小分類 (`a`)
subcategories = driver.find_elements(By.XPATH, '//div[@id="Categorys"]//dl[@class="item-inner"]/dd/a')

# ✅ 存放爬取結果
data = []

# ✅ 遍歷所有分類
for dt in categories:
    category_name = dt.text.strip()
    
    # ✅ 找到該分類下所有的小分類
    sub_links = dt.find_elements(By.XPATH, "./following-sibling::dd/a")
    
    for sub in sub_links:
        sub_name = sub.text.strip()
        sub_href = sub.get_attribute("href")
        
        # ✅ 確保連結完整 (部分是 `//` 開頭)
        if sub_href.startswith("//"):
            sub_href = "https:" + sub_href
        
        data.append((category_name, sub_name, sub_href))

# ✅ 顯示爬取結果
for item in data:
    print(item)

# ✅ 關閉瀏覽器
driver.quit()


('户外鞋服', '冲锋衣裤', 'https://list.jd.com/list.html?cat=1318%2C2628%2C12123&go=0')
('户外鞋服', '徒步鞋', 'https://list.jd.com/list.html?cat=1318%2C2628%2C12136&go=0')
('户外鞋服', '抓绒衣裤', 'https://list.jd.com/list.html?cat=1318,2628,12128')
('户外鞋服', '羽绒服棉服', 'https://list.jd.com/list.html?cat=1318,2628,12126')
('户外鞋服', '越野跑鞋', 'https://list.jd.com/list.html?cat=1318,2628,12137')
('户外鞋服', '软壳', 'https://list.jd.com/list.html?cat=1318,2628,12129')
('户外鞋服', '登山鞋', 'https://list.jd.com/list.html?cat=1318,2628,12134')
('户外鞋服', '休闲鞋', 'https://list.jd.com/list.html?cat=1318,2628,12138')
('户外装备', '帐篷', 'https://list.jd.com/list.html?cat=1318,1462,1473')
('户外装备', '照明', 'https://list.jd.com/list.html?cat=1318,1462,1476')
('户外装备', '背包', 'https://list.jd.com/list.html?cat=1318,1462,1472')
('户外装备', '户外仪表', 'https://list.jd.com/list.html?cat=1318,1462,2631')
('户外装备', '工具', 'https://list.jd.com/list.html?cat=1318,1462,1479')
('户外装备', '望远镜', 'https://list.jd.com/list.html?cat=1318,1462,1480')
('户外装备', '旅游用品', 'htt

In [7]:
import pandas as pd

# 將小類別頁面連結存成 DataFrame
df_cates = pd.DataFrame(data, columns=["medium_cate","small_cate","url"])
df_cates.head()

Unnamed: 0,medium_cate,small_cate,url
0,户外鞋服,冲锋衣裤,https://list.jd.com/list.html?cat=1318%2C2628%...
1,户外鞋服,徒步鞋,https://list.jd.com/list.html?cat=1318%2C2628%...
2,户外鞋服,抓绒衣裤,"https://list.jd.com/list.html?cat=1318,2628,12128"
3,户外鞋服,羽绒服棉服,"https://list.jd.com/list.html?cat=1318,2628,12126"
4,户外鞋服,越野跑鞋,"https://list.jd.com/list.html?cat=1318,2628,12137"


### 取得類別下的品牌列表

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

# ✅ 設定 Chrome 選項
options = Options()
options.add_argument("--headless")  # 無頭模式
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")  # 避免載入手機版

# ✅ 啟動 WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# ✅ 目標網址
base_url = 'https://channel.jd.com/outdoor.html'
driver.get(base_url)

# ✅ 等待網頁完全加載
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, "Categorys"))
)

# ✅ 存放爬取結果
data = []

# ✅ 重新獲取 `categories` 避免失效
for _ in range(3):  # 最多重試 3 次
    try:
        categories = driver.find_elements(By.XPATH, '//div[@id="Categorys"]//dl[@class="item-inner"]/dt')
        WebDriverWait(driver, 5).until(EC.visibility_of(categories[0]))  # 確保第一個分類可見
        break  # 成功取得分類後跳出
    except:
        print("⚠️ [警告] 無法獲取大分類，重試中...")
        time.sleep(2)

# ✅ 遍歷所有分類
for i in range(len(categories)):
    try:
        dt = categories[i]  # 重新獲取有效的元素
        category_name = dt.text.strip()
    except:
        print(f"⚠️ [警告] 大分類失效，跳過...")
        continue  # 如果元素失效，跳過這個分類

    # ✅ 重新獲取所有小分類，確保不會 `stale`
    sub_links = dt.find_elements(By.XPATH, "./following-sibling::dd/a")
    
    for j in range(len(sub_links)):
        try:
            # ✅ 重新獲取 `sub_links`
            sub_links = dt.find_elements(By.XPATH, "./following-sibling::dd/a")
            sub = sub_links[j]

            # ✅ 等待小分類可見
            WebDriverWait(driver, 5).until(EC.visibility_of(sub))

            sub_name = sub.text.strip()
            sub_href = sub.get_attribute("href")

            # ✅ 確保連結完整
            if sub_href.startswith("//"):
                sub_href = "https:" + sub_href

            # ✅ 進入小分類頁面，抓取品牌
            driver.get(sub_href)
            time.sleep(2)  # 確保頁面加載

            # ✅ 等待品牌列表載入
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div#J_selector div.J_selectorLine div[title] a"))
                )

                # ✅ 爬取品牌列表
                brand_elements = driver.find_elements(By.CSS_SELECTOR, "div#J_selector div.J_selectorLine div[title] a")
                brand_list = [(brand.text.strip(), brand.get_attribute("href")) for brand in brand_elements if brand.text.strip()]

            except:
                brand_list = []  # 沒有品牌篩選的頁面

            # ✅ 存儲爬取的品牌數據
            for brand_name, brand_href in brand_list:
                if brand_href.startswith("//"):
                    brand_href = "https:" + brand_href
                data.append((category_name, sub_name, sub_href, brand_name, brand_href))

            # ✅ 回到主頁
            driver.get(base_url)
            time.sleep(1)

        except:
            print(f"⚠️ [警告] {category_name} - 小分類失效，跳過...")
            continue

# ✅ 存入 CSV
with open("jd_brands.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["大分類", "小分類", "小分類網址", "品牌", "品牌網址"])
    writer.writerows(data)

# ✅ 顯示爬取結果
print("爬取到的分類-品牌數據 (儲存於 jd_brands.csv):")
for item in data[:10]:  # 只顯示前 10 筆
    print(item)

# ✅ 關閉瀏覽器
driver.quit()


⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 户外鞋服 - 小分類失效，跳過...
⚠️ [警告] 大分類失效，跳過...
⚠️ [警告] 大分類失效，跳過...
⚠️ [警告] 大分類失效，跳過...
⚠️ [警告] 大分類失效，跳過...
爬取到的分類-品牌數據 (儲存於 jd_brands.csv):


In [11]:
df_brands = pd.DataFrame(data, columns=["small_cate", "brand_name", "brand_page", "brand_logo"])
print(df_brands.shape)
df_brands.head()

(0, 4)


Unnamed: 0,small_cate,brand_name,brand_page,brand_logo
