In [1]:
import time, random, re, urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from getpass import getpass
from selenium.webdriver.common.keys import Keys

In [2]:
import sys
import os

# Add the current working directory to sys.path (if it is not already there)
# so that modules located in it can be imported. This relies on the kernel's
# working directory being the notebook's folder.
notebook_dir = os.getcwd()
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

In [3]:
from crawler import linkedin_common_crawler, login_linkedin
from url_generator import generate_urls
from cookies import save_cookies, load_cookies

In [4]:
# Build the list of URLs to crawl (generate_urls comes from the local
# url_generator module; its parameters/defaults are not visible here).
urls = generate_urls()

Generating 270 combinations...
Successfully generated 270 URLs.


In [6]:
from concurrent.futures import ThreadPoolExecutor

# Thread pool used by the executor-based crawl cell further down.
# NOTE(review): no executor.shutdown() call is visible in this notebook —
# consider shutting the pool down once crawling is finished.
max_workers = 10
executor = ThreadPoolExecutor(max_workers=max_workers)

print(f"Max Pool: {max_workers}")

Max Pool: 10


In [7]:
from queue import Queue

# Create a queue and enqueue every generated URL.
# NOTE: a later cell rebinds `url_queue` to a fresh queue.Queue().
url_queue = Queue()
for url in urls:
    url_queue.put(url)

print(f"Already added {url_queue.qsize()} URLs in the queue.")

Already added 270 URLs in the queue.


In [17]:
# Start a Chrome session whose chromedriver binary is provided by
# webdriver_manager, requested with driver_version="140".
# NOTE(review): the pinned version presumably has to match the locally
# installed Chrome major version — confirm before changing it.
service = Service(ChromeDriverManager(driver_version="140").install())
driver = webdriver.Chrome(service=service)

Service process refused to terminate gracefully with SIGTERM, escalating to SIGKILL.
Traceback (most recent call last):
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/site-packages/selenium/webdriver/common/service.py", line 179, in _terminate_process
    self.process.wait(60)
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/subprocess.py", line 1264, in wait
    return self._wait(timeout=timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/50357691/miniforge3/envs/crawler/lib/python3.12/subprocess.py", line 2038, in _wait
    raise TimeoutExpired(self.args, timeout)
subprocess.TimeoutExpired: Command '['/Users/50357691/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver', '--port=49697']' timed out after 60 seconds


In [18]:
# Open LinkedIn, then hand the session to load_cookies (local cookies module).
# NOTE(review): presumably the page is opened first because cookies can only
# be added for the domain that is currently loaded — confirm in cookies.py.
# The return value of load_cookies is displayed as the cell output.
driver.get("https://www.linkedin.com/")
load_cookies(driver)

Cookies 已载入


True

In [36]:
def crawl_url(url):
    """Crawl a single URL and return the crawler's result, or None on failure.

    Any exception raised while crawling is reported on stdout and swallowed,
    so one failing URL does not abort the whole batch.

    NOTE(review): this calls ``linkedin_common_crawler(url)`` with one
    argument, whereas ``worker()`` in a later cell calls
    ``linkedin_common_crawler(driver, url)`` — confirm which signature
    crawler.py actually exposes.
    """
    try:
        crawled = linkedin_common_crawler(url)
        print(f"Crawl Successfully: {url}")
    except Exception as err:
        print(f"Crawl Failed: {url}, Error: {err}")
        crawled = None
    return crawled

# Submit one crawl task per queued URL to the shared thread pool.
futures = []
while not url_queue.empty():
    url = url_queue.get()
    future = executor.submit(crawl_url, url)
    futures.append(future)

# `results` is otherwise only created by a later cell, so on a fresh kernel
# this cell failed with NameError. Create it here when it does not exist yet;
# an already existing list is kept and appended to.
if "results" not in globals():
    results = []

# Collect in submission order, skipping URLs whose crawl failed
# (crawl_url returns None for those).
for future in futures:
    data = future.result()
    if data is not None:
        results.append(data)

In [None]:
import threading
import queue
import time

# Work queue consumed by the crawler threads. This rebinds `url_queue`,
# replacing the Queue that an earlier cell filled; run_crawler() refills it.
url_queue = queue.Queue()

# Crawl results appended by the worker threads; appends are done while
# holding results_lock.
results = []
results_lock = threading.Lock()

def init_driver(cookies_file="cookies.pkl"):
    """Start a Chrome session on LinkedIn and restore or create saved cookies.

    When ``load_cookies`` reports success the saved cookies are used.
    Otherwise the login page is opened, the function blocks on ``input()``
    until the user presses Enter, and the session's cookies are written with
    ``save_cookies``. In both cases the page is refreshed afterwards.

    Args:
        cookies_file: Path passed to ``load_cookies`` / ``save_cookies``.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        Whatever Selenium or the cookie helpers raise. The browser is closed
        before the error propagates, so a failed setup does not leave a
        stray Chrome process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.linkedin.com")

        if not load_cookies(driver, cookies_file):
            print("no cookies found.")
            driver.get("https://www.linkedin.com/login")
            # NOTE(review): when several worker threads reach this point at
            # the same time, each one blocks on its own prompt.
            input("Please press Enter...")
            save_cookies(driver, cookies_file)

        driver.refresh()
        time.sleep(3)  # fixed pause after the refresh
    except BaseException:
        # Includes KeyboardInterrupt during the prompt: never leak the browser.
        driver.quit()
        raise

    return driver


def worker(worker_id, url_queue, cookies_file="cookies.pkl"):
    """Crawl URLs from ``url_queue`` with a dedicated browser until it runs dry.

    The worker starts its own driver via ``init_driver`` and stops once no
    URL arrives within 5 seconds. Truthy crawl results are appended to the
    module-level ``results`` list while holding ``results_lock``. Errors for
    a single URL are printed and do not stop the worker.

    Args:
        worker_id: Identifier used only as a prefix in log lines.
        url_queue: ``queue.Queue`` of URLs; ``task_done()`` is called once
            per URL taken from it.
        cookies_file: Path forwarded to ``init_driver``.
    """
    driver = init_driver(cookies_file)
    try:
        while True:
            try:
                url = url_queue.get(timeout=5)
            except queue.Empty:
                break

            try:
                data = linkedin_common_crawler(driver, url)
                if data:
                    with results_lock:
                        results.append(data)
                    # A result without a "jobs" key is already stored above;
                    # report it as 0 jobs instead of raising KeyError and
                    # logging the stored result as an error.
                    job_count = len(data.get("jobs", []))
                    print(f"[Worker {worker_id}] OK: {url}, got {job_count} jobs")
                else:
                    print(f"[Worker {worker_id}] No data: {url}")
            except Exception as e:
                print(f"[Worker {worker_id}] Error on {url}: {e}")
            finally:
                url_queue.task_done()
    finally:
        # Always close the browser, even if the loop exits abnormally.
        driver.quit()

    print(f"[Worker {worker_id}] finished")


def run_crawler(urls, num_workers=3):
    """Crawl ``urls`` with ``num_workers`` threads and return the results.

    The URLs are put on the module-level ``url_queue``; each thread runs
    ``worker`` with its own browser. The function blocks until every thread
    has exited.

    Args:
        urls: Iterable of URLs to crawl.
        num_workers: Number of worker threads to start.

    Returns:
        The module-level ``results`` list. It is not cleared here, so
        repeated calls keep appending to the same list.
    """
    for url in urls:
        url_queue.put(url)

    threads = [
        threading.Thread(target=worker, args=(i, url_queue))
        for i in range(num_workers)
    ]
    for t in threads:
        t.start()

    # Wait on the threads only. A worker exits once the queue stays empty, so
    # joined threads imply all taken URLs were handled. The previous
    # url_queue.join() blocked forever when workers died before draining the
    # queue (e.g. the browser failed to start).
    for t in threads:
        t.join()

    remaining = url_queue.qsize()
    if remaining:
        print(f"{remaining} URLs were left unprocessed in the queue.")

    return results


In [None]:
# Bind the return value instead of leaving it as the cell's last expression,
# which would render the entire nested results list as output.
# run_crawler returns the module-level `results` list, so this rebinding
# refers to the same object.
results = run_crawler(urls, num_workers=5)
len(results)

In [42]:
import pandas as pd

def results_to_dataframe(results):
    """Build a DataFrame with one row per job from the crawl results.

    Each element of ``results`` is read with ``.get("url")`` and
    ``.get("jobs", [])``. Every job dict becomes one row, extended with an
    ``original_url`` column holding the URL of the block it came from.
    The input job dicts are not modified.
    """
    def _tagged_jobs():
        for search_block in results:
            source_url = search_block.get("url")
            for posting in search_block.get("jobs", []):
                tagged = posting.copy()
                tagged["original_url"] = source_url
                yield tagged

    return pd.DataFrame(list(_tagged_jobs()))

In [43]:
# Flatten the crawl results into one row per job and preview the first rows.
df = results_to_dataframe(results)
df.head()

Unnamed: 0,job_id,job_name,company_name,job_location,job_metadata,job_url,original_url
0,4092074034,Experienced Apprentice Electrician - Commercial,Weifield Group Contracting | A LOENBRO Company,"Centennial, CO (On-site)","$23/hr - $29/hr · 401(k), Medical, Vision, +1 ...",https://www.linkedin.com/jobs/view/4092074034/...,https://www.linkedin.com/jobs/search/?keywords...
1,4150887505,2025 Summer Civil Engineer Assistant,BKF Engineers,"Pleasanton, CA (On-site)",$20/hr - $31.47/hr,https://www.linkedin.com/jobs/view/4150887505/...,https://www.linkedin.com/jobs/search/?keywords...
2,4268452702,Engineer Intern Applicant - Engineer Intern 2,Louisiana Department Of Transportation and Dev...,"Baton Rouge, LA (On-site)",$96.3K/yr,https://www.linkedin.com/jobs/view/4268452702/...,https://www.linkedin.com/jobs/search/?keywords...
3,4208295571,Engineering Intern - Secure Design,Walter P Moore,"Washington, DC (On-site)",$25.30/hr - $35.20/hr,https://www.linkedin.com/jobs/view/4208295571/...,https://www.linkedin.com/jobs/search/?keywords...
4,4145060930,Praktikant (w/m/d),Rödl & Partner USA,"Atlanta, GA (On-site)",$20/hr - $32/hr,https://www.linkedin.com/jobs/view/4145060930/...,https://www.linkedin.com/jobs/search/?keywords...


In [54]:
# Keep only the first row for each job_id; the original_url that survives is
# the one from that first occurrence.
# NOTE(review): duplicates presumably arise because different search URLs
# return the same posting — confirm.
df = df.drop_duplicates(subset=["job_id"], keep="first")


In [55]:
# (rows, columns) of the job table at this point.
df.shape

(366, 7)

In [77]:
# Write the job table to a CSV file (relative path, so it lands in the
# current working directory) without the DataFrame index column.
df.to_csv("Raw_linkedin_jobs.csv", index=False)