In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
脚本功能：从 Female detainees' records.html 中提取案件链接并生成 CSV
"""

import re
from bs4 import BeautifulSoup
import pandas as pd

# Input page and output table; both paths are relative to the working directory.
INPUT_HTML = "Female detainees' records.html"
OUTPUT_CSV = "female_detainees_cases_1882_1892.csv"

# Column order of the exported table. Passing it to the DataFrame constructor
# keeps the columns present even when no link matched.
CASE_COLUMNS = ["case_number", "description", "url"]

# Link text of interest: "Case <number>", optionally followed by a description
# (e.g. "Case 2657 Moy Chin See his wife"). The \b stops "2657a" from being
# read as case 2657.
CASE_RE = re.compile(r"Case\s+(\d+)\b\s*(.*)", re.IGNORECASE)

# Descriptions containing any of these words (case-insensitively) are dropped
# from the export as non-primary entries (appeals, testimony, records).
EXCLUDE_PATTERN = r"appeal|testimony|records"


def parse_case_text(text, href=None):
    """Turn the visible text of one link into a case record.

    Whitespace is normalised first: runs of spaces, non-breaking spaces and
    line breaks collapse to a single space. Without this, link text that is
    hard-wrapped in the HTML source would have its description cut at the
    first newline, because ``.`` does not match a newline.

    Args:
        text: Visible text of the link.
        href: Value of the link's ``href`` attribute, stored unchanged.

    Returns:
        A dict with the keys ``case_number``, ``description`` and ``url``,
        or ``None`` when the text does not start with ``Case <number>``.
        A link that reads only ``Case <number>`` yields an empty description.
    """
    normalized = " ".join(text.split())
    found = CASE_RE.match(normalized)
    if found is None:
        return None
    return {
        "case_number": found.group(1),
        "description": found.group(2),
        "url": href,
    }


def build_case_frame(records):
    """Build the export table from the parsed case records.

    Non-primary entries (see ``EXCLUDE_PATTERN``) are removed *before*
    de-duplicating. Otherwise a case whose first link is, say, an appeal
    would lose its main entry as well: the appeal would win the
    de-duplication and then be filtered out.

    Args:
        records: List of dicts as produced by ``parse_case_text``.

    Returns:
        A DataFrame with the columns in ``CASE_COLUMNS`` and one row per
        case number (first remaining occurrence). An empty input gives an
        empty DataFrame that still has those columns.
    """
    table = pd.DataFrame(records, columns=CASE_COLUMNS)
    if table.empty:
        return table

    is_excluded = table["description"].str.contains(
        EXCLUDE_PATTERN, case=False, regex=True, na=False
    )
    table = table[~is_excluded]
    return table.drop_duplicates(subset="case_number", keep="first")


def main(input_path=INPUT_HTML, output_path=OUTPUT_CSV):
    """Extract the case links from the HTML page and write them to a CSV file.

    Args:
        input_path: HTML file to read (decoded as windows-1252).
        output_path: CSV file to write (UTF-8, no index column).

    Returns:
        The exported DataFrame.
    """
    # The page declares windows-1252. Python's cp1252 codec leaves a few byte
    # values undefined, so replace those instead of aborting the whole run.
    with open(input_path, encoding="windows-1252", errors="replace") as f:
        html = f.read()

    soup = BeautifulSoup(html, "lxml")
    # Fall back to the whole document when the parser produced no <body>.
    root = soup.body if soup.body is not None else soup

    records = []
    for link in root.find_all("a"):
        # Join nested text nodes with a space so "<b>Case</b> 2657 ..." reads
        # "Case 2657 ..." rather than "Case2657 ...".
        record = parse_case_text(link.get_text(" ", strip=True), link.get("href"))
        if record is not None:
            records.append(record)

    table = build_case_frame(records)

    # Optional follow-up (the pattern needs manual checking against the data):
    # split the description into a name and the remaining text, e.g.
    # table["description"].str.extract(r"^([\w\s\.\']+)(.*)$", expand=True)

    table.to_csv(output_path, index=False, encoding="utf-8")
    print(f"提取完成，文件已保存为：{output_path}")
    return table


# __name__ is "__main__" both when run as a script and inside a notebook cell,
# so `df` stays available at top level for later inspection.
if __name__ == "__main__":
    df = main()


提取完成，文件已保存为：female_detainees_cases_1882_1892.csv
