In [3]:
pip install requests beautifulsoup4 pandas openpyxl selenium webdriver-manager

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client<2.0,>=1.8.0 (from selenium)
  Downloading websocket_client-1.9.0-py3-none-any.whl.metadata (8.3 kB)
Collecting attrs>=23.2.0 (from trio<1.0,>=0.31.0->selenium)
  Downloading attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio<1.0,>=0.31.0->selenium)
  D


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import random
import re
import time
from collections import Counter
from urllib.parse import urlencode, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Selenium and browser-driver management
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# =========================================================
# Config: skill keywords commonly seen in data-engineer postings
# (used when scanning the job-description text)
# =========================================================
# Extend any of the groups below as needed; entries are written in lowercase.
DATA_ENGINEER_KEYWORDS = (
    # Programming languages
    ["python", "java", "scala", "go", "golang", "r", "c++", "c#", "sql", "nosql"]
    # Databases & warehouses
    + ["mysql", "postgresql", "postgres", "mongodb", "redis", "cassandra"]
    + ["oracle", "sql server", "bigquery", "redshift", "snowflake", "dynamodb"]
    # Big data & processing frameworks
    + ["spark", "hadoop", "kafka", "flink", "hive", "hbase", "airflow", "dbt"]
    + ["pandas", "numpy", "databricks", "presto", "trino"]
    # Cloud platforms
    + ["aws", "azure", "gcp", "google cloud", "aliyun"]
    # DevOps & tooling
    + ["docker", "kubernetes", "k8s", "git", "jenkins", "gitlab", "linux", "bash", "shell"]
    + ["terraform", "ansible"]
    # BI & visualization
    + ["tableau", "power bi", "looker", "superset"]
)

def extract_skills_from_content(text, keywords=None):
    """
    Scan a block of job-description text for known skill keywords.

    Matching is case-insensitive and requires the keyword not to be glued to
    another ASCII letter, digit or underscore, so "sql" is not reported just
    because "mysql" appears.

    Args:
        text: Job-description text. ``None`` or an empty string yields ``""``.
        keywords: Optional iterable of keywords to look for. Defaults to the
            module-level ``DATA_ENGINEER_KEYWORDS``.

    Returns:
        The matched keywords joined with ``", "``, in the order they appear in
        ``keywords``; ``""`` when nothing matches.
    """
    if not text:
        return ""
    if keywords is None:
        keywords = DATA_ENGINEER_KEYWORDS

    text_lower = text.lower()
    found_skills = []

    for keyword in keywords:
        # Lookarounds are used instead of \b so keywords ending in punctuation
        # ("c++", "c#") still match. re.ASCII restricts \w to [A-Za-z0-9_]:
        # without it CJK characters count as word characters and a keyword
        # written directly next to Chinese text would never be found.
        pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)"
        if re.search(pattern, text_lower, flags=re.ASCII):
            found_skills.append(keyword)

    return ", ".join(found_skills)

# =========================================================
# 1. 核心邏輯：單頁資料抓取
# =========================================================
def safe_get_text(tag):
    """Return the whitespace-stripped text of ``tag``, or ``None`` when ``tag`` is falsy."""
    if not tag:
        return None
    return tag.get_text(strip=True)

def match_any(text, keywords):
    """Tell whether at least one entry of ``keywords`` occurs inside ``text``."""
    for candidate in keywords:
        if candidate in text:
            return True
    return False

def _split_breadcrumbs(breadcrumbs):
    """Pick (job_title, company, industry, category) out of the breadcrumb labels."""
    if len(breadcrumbs) < 4:
        return None, None, None, None

    # NOTE(review): the original inline comment described the trail as
    # 0=home, 1=company, 2=company name, 3=industry, 4=job category, 5=job title,
    # which does not agree with the indices used below -- confirm on a live page.
    job_title = breadcrumbs[2]
    company = breadcrumbs[1]
    industry = breadcrumbs[3]
    # A trail of exactly four labels has no index 4; reading it unguarded
    # raised IndexError and aborted the whole crawl.
    category = breadcrumbs[4] if len(breadcrumbs) > 4 else None
    return job_title, company, industry, category


def _fill_job_info(job_data, job_info_container):
    """Classify each direct child row of the right-hand info panel into ``job_data``."""
    for row in job_info_container.find_all("div", recursive=False):
        text = safe_get_text(row)
        if not text:
            continue

        # The first rule that matches wins, so the order of the branches matters.
        if match_any(text, ["全職", "兼職", "實習", "Contract", "派遣"]):
            parts = text.split("・")
            job_data["職務型態"] = parts[0]
            if len(parts) > 1:
                job_data["職務等級"] = parts[1]
        elif row.find(class_=re.compile("locationsWrapper")):
            job_data["地點"] = text
        elif match_any(text, ["TWD", "USD", "月薪", "年薪", "時薪"]) and any(c.isdigit() for c in text):
            job_data["薪資"] = text
        elif "經驗" in text or "year" in text.lower():
            job_data["經驗"] = text
        elif "管理" in text:
            job_data["管理責任"] = text
        elif "遠端" in text or "Remote" in text:
            job_data["遠端工作"] = text
        elif (text.isdigit() or "人" in text) and len(text) < 10:
            # Rows containing "管理" were already consumed by an earlier branch,
            # so no extra exclusion is needed here.
            number_match = re.search(r"(\d+)", text)
            if number_match:
                job_data["招募人數"] = number_match.group(1)
        elif row.find("a"):
            job_data["其他標籤(右側欄)"] = row.get_text(separator=", ", strip=True)


def get_job_details(url):
    """
    Download one job page and extract its fields into a flat dict.

    Args:
        url: Absolute URL of the job page.

    Returns:
        A dict keyed by the (Chinese) column names used for the export, or
        ``None`` when the request fails or the response status is not 200.
        Fields that cannot be found keep the defaults set below.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best effort: report the failure and let the caller skip this job.
        print(f"    無法取得頁面，略過: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"    無法取得頁面 (HTTP {response.status_code})，略過: {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # --- Breadcrumbs ---
    breadcrumbs = [bc.get_text(strip=True)
                   for bc in soup.find_all(class_=re.compile("Breadcrumbs_labelText"))]
    job_title, company, industry, category = _split_breadcrumbs(breadcrumbs)

    # --- Last-updated label: first inline label that mentions "更新" ---
    updated_date = None
    for lab in soup.find_all("div", class_=re.compile("InlineMessage_label")):
        txt = safe_get_text(lab)
        if txt and ("更新" in txt):
            updated_date = txt
            break

    # --- Job description (also used for skill analysis) ---
    # Match on the class-name prefix, like the other selectors in this function,
    # instead of pinning the generated hash suffix of one particular site build.
    content_sections = soup.find_all("div", class_=re.compile(r"RailsHtml_container__"))
    content_text = "\n".join([c.get_text("\n", strip=True) for c in content_sections])
    skills_from_content = extract_skills_from_content(content_text)

    # --- Right-hand info panel ---
    job_info_container = soup.find("div", class_=re.compile("JobDescriptionRightColumn_jobInfo"))

    job_data = {
        "職缺名稱": job_title,
        "公司名稱": company,
        "產業類別": industry,
        "職務類別": category,
        "職缺連結": url,
        "更新時間": updated_date,
        "職務型態": "未標示",
        "職務等級": "未標示",
        "招募人數": "未標示",
        "地點": None,
        "薪資": None,
        "經驗": "經驗不拘",
        "管理責任": "不需負擔管理責任",
        "遠端工作": "不支援遠端",
        "其他標籤(右側欄)": "",       # tags shown by the site itself
        "技能工具(內文分析)": skills_from_content,  # skills detected by our own scan
        "工作內容": content_text,
    }

    if job_info_container:
        _fill_job_info(job_data, job_info_container)

    return job_data

# =========================================================
# 2. 列表抓取：改為分頁模式 (Page Pagination)
# =========================================================
def get_job_links_by_page(keyword, max_pages=3):
    """
    Collect job-page URLs from the Cake search results, one result page at a time.

    Pages are requested through the ``page`` query parameter (1, 2, ...). The
    crawl stops early as soon as a page contributes no new link.

    Args:
        keyword: Search term; it is URL-encoded before being sent.
        max_pages: Maximum number of result pages to visit.

    Returns:
        A list of absolute job URLs, de-duplicated, in discovery order.
    """
    print(f"啟動瀏覽器搜尋：{keyword}，預計抓取 {max_pages} 頁...")

    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # left off so the browser stays visible while developing

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    site_root = "https://www.cake.me"
    base_url = f"{site_root}/jobs"
    job_links = []
    seen_urls = set()

    # try/finally guarantees the browser is closed even when a page load or
    # the parsing raises; otherwise the Chrome process would be left running.
    try:
        for page in range(1, max_pages + 1):
            # urlencode escapes characters such as "&", "#" or spaces in the keyword.
            query = urlencode({"q": keyword, "page": page})
            target_url = f"{base_url}?{query}"
            print(f"  -> 正在讀取第 {page} 頁: {target_url}")

            driver.get(target_url)
            time.sleep(4)  # fixed wait for the page to finish rendering

            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Every anchor on the page that points at a job page
            atags = soup.find_all("a", href=re.compile(r"/companies/.+/jobs/"))

            new_links_count = 0
            for a in atags:
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                full_url = urljoin(site_root, a.get("href"))
                if full_url not in seen_urls:
                    job_links.append(full_url)
                    seen_urls.add(full_url)
                    new_links_count += 1

            if new_links_count == 0:
                # Nothing new on this page: the results are exhausted.
                print("    找不到更多職缺，提前結束抓取。")
                break

            print(f"     第 {page} 頁找到 {new_links_count} 筆新職缺 (目前累計: {len(job_links)})")
    finally:
        driver.quit()

    return job_links

# =========================================================
# 3. 主程式
# =========================================================
if __name__ == "__main__":
    keyword = "資料工程師"
    MAX_PAGES = 1  # number of result pages to crawl, e.g. 5 or 10

    # A. Gather the job links from the search results
    links = get_job_links_by_page(keyword, max_pages=MAX_PAGES)

    # B. Visit each link and keep whatever could be parsed
    all_jobs_data = []
    print("\n開始逐筆抓取詳細資料並分析內文技能...")
    for index, link in enumerate(links):
        print(f"[{index+1}/{len(links)}] 分析中: {link}")

        details = get_job_details(link)
        if details:
            all_jobs_data.append(details)

        # Polite random delay between two requests
        time.sleep(random.uniform(1.5, 3))

    # C. Export and quick statistics
    if not all_jobs_data:
        print("\n沒有抓取到任何資料。")
    else:
        df = pd.DataFrame(all_jobs_data)

        # Preferred column order; the skill columns are placed near the front
        columns_order = [
            "職缺名稱", "公司名稱", "地點", "薪資", "經驗",
            "技能工具(內文分析)", "其他標籤(右側欄)",
            "職務型態", "職務等級", "招募人數", "管理責任", "遠端工作",
            "更新時間", "產業類別", "職務類別", "職缺連結", "工作內容",
        ]
        final_cols = [c for c in columns_order if c in df.columns]
        df = df.loc[:, final_cols]

        filename = f"Cake_{keyword}_含技能分析_test.xlsx"
        df.to_excel(filename, index=False, engine="openpyxl")
        print(f"\n成功！資料已儲存至: {filename}")

        # --- Extra: show the most requested skills ---
        print("\n=== 技能需求統計 (Top 10) ===")
        all_skills = [
            skill
            for skills_str in df["技能工具(內文分析)"]
            if skills_str
            for skill in skills_str.split(", ")
        ]

        if not all_skills:
            print("未分析到任何技能關鍵字。")
        else:
            counter = Counter(all_skills)
            for skill, count in counter.most_common(10):
                print(f"{skill}: {count} 次")

啟動瀏覽器搜尋：資料工程師，預計抓取 1 頁...
  -> 正在讀取第 1 頁: https://www.cake.me/jobs?q=資料工程師&page=1
     第 1 頁找到 10 筆新職缺 (目前累計: 10)

開始逐筆抓取詳細資料並分析內文技能...
[1/10] 分析中: https://www.cake.me/companies/commeet/jobs/data-assistant-engineerdata-science-intern
[2/10] 分析中: https://www.cake.me/companies/pinkoi/jobs/b2ec807e-05ce-449b-bd06-42404466362d-data-engineer-10e1700edfb1a970396752254773d1
[3/10] 分析中: https://www.cake.me/companies/7-eleven-vietnam/jobs/data-engineer-intern-b66
[4/10] 分析中: https://www.cake.me/companies/WorldQuant/jobs/4607049006-data-engineer-8207e93ed0b79a6e5296899e2822d0
[5/10] 分析中: https://www.cake.me/companies/opennet-limited/jobs/data-engineer-99f
[6/10] 分析中: https://www.cake.me/companies/WorldQuant/jobs/4329394006-senior-data-engineer-fd4904f48a6d46e057cf48beed7c19
[7/10] 分析中: https://www.cake.me/companies/Google/jobs/120947988674552518-senior-data-engineer-youtube-b2162348c9f8a6da1a3301ea7cdac7
[8/10] 分析中: https://www.cake.me/companies/Google/jobs/109015621412233926-technical-solutions