In [1]:
pip install selenium webdriver-manager beautifulsoup4


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
import time
from bs4 import BeautifulSoup

# setup Firefox WebDriver
options = webdriver.FirefoxOptions()
options.add_argument('--headless')  # 无头模式，避免弹出浏览器界面
options.add_argument('--disable-gpu')  # 禁用GPU加速
options.add_argument('start-maximized')  # 最大化窗口

# 初始化 WebDriver
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)

# 目标网址
url = "https://www.arabidopsis.org/news/jobs"

# 请求网页
driver.get(url)

# 等待页面加载，适当延时
time.sleep(5)  # 根据需要增加等待时间，确保所有动态内容加载完毕

# 获取网页内容
webpage = driver.page_source

# 使用 BeautifulSoup 解析网页
soup = BeautifulSoup(webpage, 'html.parser')

# 查找职位列表
job_list = soup.find_all('div', class_='news-item')

# 确认是否找到了职位数据
if not job_list:
    print("没有找到职位信息，请检查网页结构")
    driver.quit()  # 关闭浏览器
    exit()

# 创建一个列表，用于保存抓取的数据
jobs_data = []

# 遍历职位列表
for job in job_list:
    try:
        # 提取职位标题和链接
        job_title_tag = job.find('div', class_='news-header').find('a')
        job_title = job_title_tag.text.strip() if job_title_tag else 'N/A'

        # 提取职位链接
        job_link = job_title_tag['href'] if job_title_tag else 'N/A'
        job_link = f"https://www.arabidopsis.org{job_link}"  # 补全链接

        # 提取位置（假设位置是下一个 div 标签）
        job_location = job.find('div', class_='news-header').find_next('div').text.strip() if job.find('div', class_='news-header').find_next('div') else 'N/A'

        # 提取申请截止日期（假设截止日期在下一个 div 标签）
        application_deadline = job.find('div', class_='news-header').find_next('div').find_next('div').text.strip() if job.find('div', class_='news-header').find_next('div').find_next('div') else 'N/A'

        # 添加职位信息到 jobs_data 列表
        jobs_data.append([job_title, job_location, application_deadline, job_link])

    except Exception as e:
        print(f"抓取职位时出错: {e}")

# 输出为 CSV 文件
csv_filename = 'job_listings.csv'

# 打开文件并写入数据
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    # 写入 CSV 表头
    writer.writerow(['Job Title', 'Location', 'Application Deadline', 'Job Link'])
    
    # 写入每个职位的数据
    for job_data in jobs_data:
        writer.writerow(job_data)

print(f"数据已保存到 {csv_filename}")

# 关闭浏览器
driver.quit()


数据已保存到 job_listings.csv


In [7]:
import pandas as pd
data = pd.read_csv("job_listings.csv")
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Job Title,Location,Application Deadline,Job Link
0,"Dec 11, 2024, Graduate research assistanship MS","North Dakota State University, Dickinson, ND, USA","Dec 10, 2024, Plant Microbe Interaction Postdo...",https://www.arabidopsis.org/news/jobs/view?fil...
1,"Dec 10, 2024, Plant Microbe Interaction Postdo...","Lawrence Berkeley National Laboratory, Emeryvi...","*Application Deadline: Jan 15, 2025",https://www.arabidopsis.org/news/jobs/view?fil...
2,"Dec 10, 2024, PhD positionin Plant Evolutionar...","Yunnan University, Kunming, Yunnan, China","*Application Deadline: Jan 12, 2025",https://www.arabidopsis.org/news/jobs/view?fil...
3,"Dec 6, 2024, Master's, Ph.D., and Postdoc Posi...","Texas Tech University, Lubbock, TX, USA","Dec 6, 2024, Technician, stomata, photosynthes...",https://www.arabidopsis.org/news/jobs/view?fil...
4,"Dec 6, 2024, Technician, stomata, photosynthes...","University of Illinois Urbana Champaign, Urban...","*Application Deadline: Jan 2, 2025",https://www.arabidopsis.org/news/jobs/view?fil...
