In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import threading
import queue
import time

from crawler import login_linkedin_driver, CrawlerJob, result_router, linkedin_page_crawler
from url_generator import generate_urls
from cookies import save_cookies, load_cookies

In [None]:
import sys
import os

# 设置工作路径为当前notebook所在目录
notebook_dir = os.getcwd()
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

CHROMEDRIVER_PATH = crawler

In [20]:
def init_driver(cookies_file="cookies.pkl"):
    # options = Options()

    # # 禁止浏览器检测自动化行为
    # options.add_argument("disable-blink-features=AutomationControlled")

    # # 伪造请求头
    # options.add_argument(
    #     "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49"
    # )

    # # 隐藏“正受到自动测试软件的控制”提示
    # options.add_experimental_option("excludeSwitches", ["enable-automation"])

    # # 禁用自动化扩展
    # options.add_experimental_option("useAutomationExtension", False)

    # # 初始化 WebDriver
    # driver = webdriver.Chrome(options=options)
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--remote-debugging-port=0")

    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH),options=options)
    driver.get("https://www.linkedin.com")  # 必须先打开域名才能加 cookie


    # 载入 cookie
    if load_cookies(driver, cookies_file):
        driver.refresh()
        time.sleep(3)  # 等待页面刷新完成
    else:
        print("没有 cookies，需要人工登录")
        # 如果第一次跑，需要人工登录并保存 cookie
        login_linkedin_driver(driver)
        save_cookies(driver, cookies_file)
        driver.refresh()
        time.sleep(3)

    return driver


def worker(worker_id, job_queue, results, results_lock, cookies_file="cookies.pkl"):
    driver = init_driver(cookies_file)
    try:
        while True:
            job = job_queue.get(timeout=60)
            try:
                data = job.handler(driver, job.url, time_sleep=3, wait_time=10)
                # use result_router to handle the result
                result_router(data, job_queue, results, results_lock)
            finally:
                job_queue.task_done()
    except queue.Empty:
        pass
    finally:
        driver.quit()
        print(f"[Worker {worker_id}] finished")


def run_crawler(urls, num_workers=3):
    """运行爬虫调度"""
    job_queue = queue.Queue()
    results: list[dict] = []
    results_lock = threading.Lock()

    for url in urls:
        job_queue.put(url)

    threads = []
    for i in range(num_workers):
        t = threading.Thread(
            target=worker,
            args=(i, job_queue, results, results_lock),
            kwargs={"cookies_file": "cookies.pkl"},
        )
        t.start()
        threads.append(t)

    # 等待所有任务完成
    job_queue.join()

    # 等待所有线程退出
    for t in threads:
        t.join()

    return results


In [21]:
def save_results(results, output_file="results.json"):
    import json
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Results saved to {output_file}")

def main(args):
    # 载入args
    keywords = args.keywords
    states = args.states
    num_workers = args.workers


    # 生成爬虫队列
    urls = generate_urls(keyword=keywords, states=states)
    jobs = [CrawlerJob(url, linkedin_page_crawler) for url in urls]

    # 运行爬虫
    results = run_crawler(jobs, num_workers)
    print(f"爬取完成，共获得 {len(results)} 条结果")

    # 保存结果
    save_results(results)

    # 退出
    print("所有任务完成，退出")
    

def run_main(keywords, states=None, workers=3):
    urls = generate_urls(keyword=keywords, states=states)
    jobs = [CrawlerJob(url, linkedin_page_crawler) for url in urls]
    results = run_crawler(jobs, workers)
    print(f"爬取完成，共获得 {len(results)} 条结果")
    save_results(results)
    print("所有任务完成，退出")
    return results

In [22]:
results = run_main("data center", workers=4)

Cookies 已载入
Cookies 已载入
Cookies 已载入
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center
Cookies 已载入
Total jobs found: 48522
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=1
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=3
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=4
Total jobs found: 3512
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=5
Total jobs found: 26030
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=6
Total jobs found: 8013
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=1
Total jobs found: 2067
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=4
Total 

Exception in thread Thread-14 (worker):
Traceback (most recent call last):
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 1411, in getresponse
    response.begin()
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 324, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 285, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^

Total jobs found: 7690
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2


    self.run()
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/threading.py", line 989, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/26/4nzkhp6s3dlbh_7nlm1_f4g48rkqy_/T/ipykernel_61461/252523320.py", line 52, in worker
  File "/Users/50357691/Desktop/LinkedIn-DataCenter-Jobs/crawler.py", line 171, in linkedin_page_crawler
    job_main = get_linkedin_job_main_page(driver, url, time_sleep, wait_time)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/Desktop/LinkedIn-DataCenter-Jobs/crawler.py", line 149, in get_linkedin_job_main_page
    driver.get(url)
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/selenium/webdriver/remote/webdriver.py", line 483, in get
    self.execute(Command.GET, {"url": url})
  File "/

Total jobs found: 7689
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7689
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7677
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7643
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7648
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7621
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7635
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+center&f_E=2
Total jobs found: 7635
超过 1000 条，生成细化筛选的任务...
访问页面: https://www.linkedin.com/jobs/search/?sortBy=R&keywords=data+cente

Exception in thread Thread-15 (worker):
Traceback (most recent call last):
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 1411, in getresponse
    response.begin()
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 324, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 285, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^

[Worker 1] finished


Exception in thread Thread-17 (worker):
Traceback (most recent call last):
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 1411, in getresponse
    response.begin()
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 324, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 285, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^

[Worker 3] finished


Exception in thread Thread-16 (worker):
Traceback (most recent call last):
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 1411, in getresponse
    response.begin()
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 324, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/http/client.py", line 285, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^

[Worker 2] finished
爬取完成，共获得 17 条结果
Results saved to results.json
所有任务完成，退出


In [25]:
import json
import pandas as pd

# 读取结果 JSON
with open("results.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 确认 data 是列表（每个元素是一个 dict）
if isinstance(data, dict):
    # 如果是 dict，可能里面有 'results' 或类似 key
    if "results" in data:
        data = data["results"]
    else:
        data = [data]

# 转成 DataFrame
df = pd.DataFrame(data)

# 确保 original_url 在列里
if "original_url" not in df.columns:
    # 如果 JSON 里有 url / JobURL / source_url，可以映射一下
    if "JobURL" in df.columns:
        df = df.rename(columns={"JobURL": "original_url"})
    elif "url" in df.columns:
        df = df.rename(columns={"url": "original_url"})
    else:
        # 如果没有，先加一个空列，避免报错
        df["original_url"] = None

In [26]:
df

Unnamed: 0,original_url,jobs
0,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4306223827', 'job_name': 'CIO Con..."
1,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4303687184', 'job_name': 'Control..."
2,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4295731709', 'job_name': 'Busines..."
3,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4302356976', 'job_name': 'Vice Pr..."
4,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4292462907', 'job_name': 'Global ..."
5,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4258058923', 'job_name': 'Head of..."
6,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4258058922', 'job_name': 'Head of..."
7,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4293338580', 'job_name': 'Head of..."
8,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4293337591', 'job_name': 'Head of..."
9,https://www.linkedin.com/jobs/search/?sortBy=R...,"[{'job_id': '4183460094', 'job_name': 'Head of..."
