In [2]:
from pathlib import Path
import requests

# List of PMC article URLs
urls = [
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC4863427",
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC7223160",
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC7264388",
]

# Browser-like request headers sent with every download.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://pmc.ncbi.nlm.nih.gov/",
}

# One session for all downloads: the headers are configured once and the
# underlying connection can be reused instead of being rebuilt per URL.
with requests.Session() as s:
    s.headers.update(headers)
    for url in urls:
        # rstrip guards against a trailing slash producing an empty PMCID.
        pmcid = url.rstrip("/").split("/")[-1]
        outfile = Path(f"{pmcid}.html")

        resp = s.get(url, timeout=30)
        resp.raise_for_status()  # abort on HTTP errors instead of saving an error page
        if not resp.encoding:
            resp.encoding = "utf-8"

        # Always store as UTF-8: the parsing cell reads these files back with
        # encoding='utf-8', so writing in any other charset would lose characters.
        outfile.write_text(resp.text, encoding="utf-8")
        print(f"Saved {outfile.resolve()} ({outfile.stat().st_size:,} bytes)")

Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC4863427.html (180,248 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7223160.html (357,542 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7223160.html (357,542 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7264388.html (205,454 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7264388.html (205,454 bytes)


In [10]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

# HTML files to parse, one per PMC article, named "<PMCID>.html".
html_files = [
    f'{article_id}.html'
    for article_id in ('PMC4863427', 'PMC7223160', 'PMC7264388')
]

# Accumulator for the extracted records (one dict per coordinate triple).
results = list()

# --------- Utility functions ---------
def normalize_number(val: str) -> str:
    """Clean a table cell so that it can be parsed as a number.

    Steps, in order:
      1. strip surrounding whitespace;
      2. map typographic minus/dash characters to the ASCII ``-``;
      3. turn non-breaking spaces into regular spaces;
      4. drop trailing footnote markers (``*``, ``†``, ``‡``, ``^``, ASCII
         letters) together with any whitespace around them;
      5. remove whitespace between a leading sign and the digits.

    Args:
        val: Raw cell text.

    Returns:
        The cleaned text. It is NOT guaranteed to be numeric; a cell made only
        of letters, for instance, becomes an empty string.
    """
    val = val.strip()

    # U+2212 minus, U+2013 en dash, U+2012 figure dash, U+2010 hyphen,
    # U+2011 non-breaking hyphen, U+FE63 small hyphen-minus, U+FF0D fullwidth.
    for dash in ('\u2212', '\u2013', '\u2012', '\u2010', '\u2011', '\uFE63', '\uFF0D'):
        val = val.replace(dash, '-')

    val = val.replace('\u00a0', ' ')

    # Whitespace is part of the class so "12 a" becomes "12" rather than "12 ".
    val = re.sub(r'[\s*†‡^a-zA-Z]+$', '', val)

    # "- 12" (sign separated from the digits) is not accepted by float().
    val = re.sub(r'^([+-])\s+(?=[\d.])', r'\1', val)
    return val

def is_number(val: str) -> bool:
    """Tell whether a cell, once cleaned by ``normalize_number``, parses as a float.

    Any exception raised while cleaning or converting is treated as
    "not a number".
    """
    try:
        float(normalize_number(val))
    except Exception:
        return False
    else:
        return True

def extract_keywords(soup, html_text):
    """Extract the article keywords, trying three sources in order.

    1. The ``<meta name="citation_keywords">`` tag.
    2. The first ``b``/``strong``/``h3``/``h4``/``p``/``span`` element whose
       text starts with "keywords": the text after its first colon, else the
       next text sibling of the element, else the next text sibling of its
       parent.
    3. A regular-expression search for ``Keyword(s): ...`` in the raw HTML.

    Args:
        soup: Parsed document (BeautifulSoup-like object exposing ``find`` and
            ``find_all``).
        html_text: Raw HTML text of the same document.

    Returns:
        The keyword string with surrounding whitespace removed, or ``''`` when
        nothing was found.
    """
    meta_keywords = soup.find('meta', attrs={'name': 'citation_keywords'})
    if meta_keywords:
        # Strip so a whitespace-only attribute does not short-circuit the fallbacks.
        content = (meta_keywords.get('content') or '').strip()
        if content:
            return content

    for tag in soup.find_all(['b', 'strong', 'h3', 'h4', 'p', 'span']):
        if not tag.get_text(strip=True).lower().startswith('keywords'):
            continue

        text = tag.get_text(separator=' ', strip=True)
        next_text = text.split(':', 1)[1].strip() if ':' in text else ''

        # `string=` replaces the deprecated `text=` keyword of the find_* methods.
        if not next_text:
            sibling = tag.find_next_sibling(string=True)
            if sibling:
                next_text = sibling.strip()
        if not next_text and tag.parent:
            sibling = tag.parent.find_next_sibling(string=True)
            if sibling:
                next_text = sibling.strip()

        if next_text:
            return next_text

    kw_match = re.search(r'Keywords?[:\s]+([\w\s;,-]+)', html_text, re.IGNORECASE)
    if kw_match:
        return kw_match.group(1).strip()

    return ''

def find_all_xyz_indices(header_rows):
    """Locate every X/Y/Z column triple in the bottom row of a table header.

    Only the last header row is inspected. A cell counts as an axis label when
    its text, stripped and lower-cased, is exactly ``x``, ``y`` or ``z``; the
    match is deliberately exact so that columns such as "Z score" are not
    mistaken for a coordinate. Cells are scanned left to right and a group is
    emitted as soon as all three axes have been seen, which lets one table
    carry several triples (e.g. a Left and a Right set).

    Args:
        header_rows: List of header rows, each a list of cell strings.

    Returns:
        A list of ``(x_index, y_index, z_index)`` tuples, or ``None`` when no
        complete triple exists (including when ``header_rows`` is empty).
    """
    # Guard: indexing [-1] on an empty header list would raise IndexError.
    if not header_rows:
        return None

    bottom_labels = [cell.strip().lower() for cell in header_rows[-1]]

    xyz_groups = []
    pending = {}
    for position, label in enumerate(bottom_labels):
        if label in ('x', 'y', 'z'):
            pending[label] = position

        if len(pending) == 3:  # one full triple collected
            xyz_groups.append((pending['x'], pending['y'], pending['z']))
            pending = {}

    return xyz_groups or None

# --------- Main program ---------
# Column order of the output CSV; also guarantees a header row when nothing was extracted.
OUTPUT_COLUMNS = ['PMID', 'PMCID', 'Keywords', 'Table', 'X', 'Y', 'Z']


def _span(cell, attr):
    """Return the cell's ``rowspan``/``colspan`` as an int >= 1 (1 if missing or malformed)."""
    try:
        return max(1, int(cell.get(attr, 1)))
    except (TypeError, ValueError):
        return 1


def _expand_rows(rows):
    """Turn ``<tr>`` elements into a grid of cell texts aligned on true columns.

    A cell with ``colspan=n`` is repeated in n consecutive columns and a cell
    with ``rowspan=n`` is repeated in the same column of the following n-1
    rows, so an index found in the header addresses the same column in every
    data row. Row spans are not carried across a change of parent element
    (e.g. from ``<thead>`` into ``<tbody>``).
    """
    grid = []
    carried = {}  # column index -> (text, number of further rows still covered)
    current_group = None
    for row in rows:
        if row.parent is not current_group:
            current_group = row.parent
            carried.clear()

        line = []
        # The trailing None flushes row-spanned cells located after the last real cell.
        for cell in list(row.find_all(['td', 'th'])) + [None]:
            while len(line) in carried:
                col = len(line)
                text, remaining = carried.pop(col)
                line.append(text)
                if remaining > 1:
                    carried[col] = (text, remaining - 1)
            if cell is None:
                break

            text = cell.get_text(strip=True)
            rowspan = _span(cell, 'rowspan')
            for _ in range(_span(cell, 'colspan')):
                if rowspan > 1:
                    carried[len(line)] = (text, rowspan - 1)
                line.append(text)
        grid.append(line)
    return grid


for html_file in html_files:
    html_path = Path(html_file)
    if not html_path.exists():
        print(f"File not found: {html_file}")
        continue

    html_text = html_path.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')

    # PMCID comes from the file name.
    pmcid_match = re.search(r'PMC\d+', html_file)
    pmcid = pmcid_match.group() if pmcid_match else ''

    # PMID: meta tag first; fall back to a text search when the tag is absent or empty.
    meta_pmid = soup.find('meta', attrs={'name': 'citation_pmid'})
    pmid = (meta_pmid.get('content') or '').strip() if meta_pmid else ''
    if not pmid:
        pmid_match = re.search(r'PMID[:\s]+(\d+)', html_text)
        if pmid_match:
            pmid = pmid_match.group(1)

    # Keywords
    keywords = extract_keywords(soup, html_text)

    # Scan every table for coordinate columns.
    for table_idx, table in enumerate(soup.find_all('table'), 1):
        rows = table.find_all('tr')
        if not rows:
            continue

        # Span-aware view of the table: grid[i] holds the cell texts of rows[i].
        grid = _expand_rows(rows)

        # Header rows: the <thead> rows if any (taken as the leading rows of the
        # table), else those of the first two rows made only of <th> cells,
        # else the very first row.
        thead = table.find('thead')
        header_rows = grid[:len(thead.find_all('tr'))] if thead else []
        if not header_rows:
            header_rows = [
                grid[i]
                for i, row in enumerate(rows[:2])
                if all(cell.name == 'th' for cell in row.find_all(['th', 'td']))
            ]
        if not header_rows:
            header_rows = [grid[0]]

        xyz_groups = find_all_xyz_indices(header_rows)
        if not xyz_groups:
            continue

        for cols in grid[len(header_rows):]:
            for x_idx, y_idx, z_idx in xyz_groups:
                if len(cols) <= max(x_idx, y_idx, z_idx):
                    continue
                x_val, y_val, z_val = cols[x_idx], cols[y_idx], cols[z_idx]

                # Each X/Y/Z group becomes its own record; a row is kept as soon as
                # one of the three values is numeric, the others being left blank.
                if any(is_number(v) for v in (x_val, y_val, z_val)):
                    results.append({
                        'PMID': pmid,
                        'PMCID': pmcid.replace('PMC', ''),
                        'Keywords': keywords,
                        'Table': table_idx,
                        'X': normalize_number(x_val) if is_number(x_val) else '',
                        'Y': normalize_number(y_val) if is_number(y_val) else '',
                        'Z': normalize_number(z_val) if is_number(z_val) else '',
                    })

# Save the results
df = pd.DataFrame(results, columns=OUTPUT_COLUMNS)
df.to_csv('b11504006_info_data.csv', index=False, encoding='utf-8-sig')
print('Saved to b11504006_info_data.csv')


Saved to b11504006_info_data.csv
