In [1]:
# HTTP 请求配置
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Turn off pandas' output truncation so frames are rendered in full.
pd.options.display.max_rows = None      # no limit on displayed rows
pd.options.display.max_columns = None   # no limit on displayed columns
pd.options.display.width = None         # let pandas auto-detect the display width
pd.options.display.max_colwidth = None  # never truncate cell contents

# Scrape target: the Python.org homepage
URL = "https://www.python.org"

# 1. Request headers that mimic a real desktop browser.
#    The User-Agent is split across adjacent literals purely for line length;
#    Python concatenates them into one string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
}

# Fetch the page, then report the status code and elapsed time.
# perf_counter() is a monotonic clock intended for measuring durations,
# unlike time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
try:
    response = requests.get(URL, headers=headers, timeout=10)
    response.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
except requests.exceptions.RequestException as e:
    print(f"❌ 请求失败: {e}")
    # Re-raise so execution stops here. Swallowing the error would leave
    # `response` undefined (or stale from an earlier run), and the parsing
    # cell would then fail with a misleading NameError or silently parse
    # old data.
    raise
else:
    print(f"✅ 请求成功! 状态码: {response.status_code}")
    print(f"⏱️ 耗时: {time.perf_counter() - start_time:.4f} 秒")



✅ 请求成功! 状态码: 200
⏱️ 耗时: 1.2987 秒


In [2]:
# BeautifulSoup parsing with CSS selectors
soup = BeautifulSoup(response.text, 'html.parser')

# Flat list of record dicts; turned into a DataFrame in the next cell.
data_list = []

# 1. Navigation links: every <a> inside an <li> inside a <nav>.
nav_links = soup.select('nav li a')
for link in nav_links:
    text = link.get_text(strip=True)
    if not text:
        # Anchors without visible text are not recorded.
        continue
    href = link.get('href')
    if href:
        # Resolve the href against the site URL. urljoin handles
        # root-relative ("/psf/") and protocol-relative ("//host/path")
        # hrefs and leaves absolute URLs unchanged; plain `URL + href`
        # turned "//host/path" into "https://www.python.org//host/path".
        href = urljoin(URL, href)
    data_list.append({'Category': 'Nav', 'Text': text, 'URL': href})

# 2. Latest News. Assumed structure: each `.blog-widget li` contains a
#    <time> element and an <a> element.
news_items = soup.select('.blog-widget li')
for item in news_items:
    time_tag = item.find('time')
    anchor = item.find('a')
    if time_tag is None or anchor is None:
        # Entry does not match the assumed structure; skip it explicitly
        # rather than relying on a caught AttributeError, which could also
        # hide unrelated bugs.
        continue
    data_list.append({
        'Category': 'News',
        'Text': anchor.get_text(strip=True),
        'Date': time_tag.get_text(strip=True),
    })



In [3]:
# Structure the scraped records and preview them.
# The column list is pinned so the frame always has the same schema.
# Without it, an empty scrape yields a frame with no columns at all (and
# `df['Category']` below raises KeyError), and the URL/Date columns would
# exist only if a Nav/News record happened to be found.
df = pd.DataFrame(data_list, columns=['Category', 'Text', 'URL', 'Date'])
print(f"\n=== 抓取结果 ({len(df)} 条) ===")
display(df.head(10))

# Keep only the news rows.
news_df = df[df['Category'] == 'News']
print("\n=== 最新新闻 ===")
display(news_df)




=== 抓取结果 (69 条) ===


Unnamed: 0,Category,Text,URL,Date
0,Nav,Python,https://www.python.org/,
1,Nav,PSF,https://www.python.org/psf/,
2,Nav,Docs,https://docs.python.org,
3,Nav,PyPI,https://pypi.org/,
4,Nav,Jobs,https://www.python.org/jobs/,
5,Nav,Community,https://www.python.org/community/,
6,Nav,About,https://www.python.org/about/,
7,Nav,Applications,https://www.python.org/about/apps/,
8,Nav,Quotes,https://www.python.org/about/quotes/,
9,Nav,Getting Started,https://www.python.org/about/gettingstarted/,



=== 最新新闻 ===


Unnamed: 0,Category,Text,URL,Date
64,News,"Python 3.15.0 alpha 5 (yes, another alpha!)",,2026-01-14
65,News,Python 3.15.0 alpha 4,,2026-01-13
66,News,Anthropic invests $1.5 million in the Python S...,,2026-01-13
67,News,PSF News: $500K+ Raised for Python for Everyon...,,2026-01-08
68,News,Python 3.15.0 alpha 3,,2025-12-16


In [4]:
# Write the scraped records to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(path_or_buf='python_org_data.csv', index=False)