In [19]:
import asyncio
import os
import re

import requests
from bs4 import BeautifulSoup
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright

GSE_ID = "GSE59867"

# Output location: both text files are written to the user's Desktop folder.
BASE = os.path.expanduser("~/Desktop")
FIRST_GSM_FILE = os.path.join(BASE, GSE_ID + "_FIRST_GSM.txt")
ALL_GSM_FILE = os.path.join(BASE, GSE_ID + "_ALL_GSM.txt")

# Echo the resolved paths so the reader knows where to look afterwards.
print(f"Single GSM file: {FIRST_GSM_FILE}")
print(f"Merged file: {ALL_GSM_FILE}")


# 1. Collect every GSM accession linked from the GSE page
def get_gsm_list(gse_id):
    """Return the sorted, de-duplicated GSM accessions linked from a GSE page.

    Args:
        gse_id: GEO series accession, e.g. "GSE59867".

    Returns:
        A lexicographically sorted list of unique "GSM<digits>" strings found
        in the ``href`` of the page's ``<a>`` tags (empty if none are found).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not answer within the timeout.
    """
    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse_id}"

    # Without a timeout requests can block indefinitely; without the status
    # check an error page would be parsed and silently yield an empty list.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    found = set()
    for a in soup.find_all("a"):
        href = a.get("href", "")
        # Capture only the accession itself so that trailing query parameters
        # or fragments (e.g. "&view=full") never end up inside the ID.
        match = re.search(r"acc=(GSM\d+)", href)
        if match:
            found.add(match.group(1))
    return sorted(found)

gsm_ids = get_gsm_list(GSE_ID)
print("Found", len(gsm_ids), "GSM samples")

# Stop with an explicit message rather than an opaque IndexError below when the
# series page yielded no sample links (wrong accession, changed page layout, ...).
if not gsm_ids:
    raise RuntimeError(f"No GSM samples found on the GEO page for {GSE_ID}")

# The first accession doubles as a single-sample smoke test before the full run.
FIRST_GSM = gsm_ids[0]
print("First GSM:", FIRST_GSM)


# 2. Fetch a single GSM page with Playwright
async def fetch_one_gsm(page, gsm, settle_ms=1500):
    """Open the GEO accession page for one sample and return its body text.

    Args:
        page: Playwright (async API) page object used for navigation.
        gsm: Sample accession to request, e.g. "GSM1448335".
        settle_ms: Extra pause, in milliseconds, applied after navigation has
            reached the "networkidle" state. Defaults to 1500, the value that
            was previously hard-coded; pass 0 to skip the pause.

    Returns:
        The inner text of the page's ``<body>`` element.
    """
    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gsm}"
    await page.goto(url, wait_until="networkidle")
    if settle_ms > 0:
        await page.wait_for_timeout(settle_ms)
    return await page.inner_text("body")


# 3. Main flow: write one test txt (first GSM) plus one merged txt (all GSMs)
async def main():
    """Scrape every GSM page and write the single-sample and merged text files.

    Reads the module-level ``gsm_ids``, ``FIRST_GSM``, ``FIRST_GSM_FILE`` and
    ``ALL_GSM_FILE``. The first sample is fetched once and written to
    ``FIRST_GSM_FILE`` as a quick test; the merged file then receives one
    banner-delimited section per accession in ``gsm_ids``.

    A Playwright error on the initial test fetch propagates to the caller.
    During the bulk loop a Playwright error for one sample is printed, noted in
    the merged file in place of the page text, and the loop continues; the
    failed accessions are listed once the run is finished.
    """
    failed = []  # accessions whose page could not be fetched in the bulk loop

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # -------------------------
            # Write the first GSM on its own (quick test)
            # -------------------------
            print("Fetching FIRST GSM:", FIRST_GSM)
            first_text = await fetch_one_gsm(page, FIRST_GSM)
            with open(FIRST_GSM_FILE, "w", encoding="utf-8") as f:
                f.write(first_text)

            # -------------------------
            # Write every GSM into one merged txt
            # -------------------------
            total = len(gsm_ids)
            with open(ALL_GSM_FILE, "w", encoding="utf-8") as f:
                for i, gsm in enumerate(gsm_ids, 1):
                    print(f"[{i}/{total}] Fetching: {gsm}")

                    if gsm == FIRST_GSM:
                        # Already downloaded above; reuse it instead of
                        # requesting the same page a second time.
                        t = first_text
                    else:
                        try:
                            t = await fetch_one_gsm(page, gsm)
                        except PlaywrightError as exc:
                            # One bad page should not discard a long run.
                            print(f"  !! Failed to fetch {gsm}: {exc}")
                            failed.append(gsm)
                            t = f"[FETCH FAILED: {exc}]"

                    f.write("=" * 80 + "\n")
                    f.write(gsm + "\n")
                    f.write("=" * 80 + "\n")
                    f.write(t)
                    f.write("\n\n")
        finally:
            # Release the browser even when a fetch or file write raised.
            await browser.close()

    if failed:
        print(f"{len(failed)} GSM page(s) could not be fetched:", ", ".join(failed))


# Top-level `await` is only valid because this runs as a notebook cell; in a
# plain Python script the equivalent call would be `asyncio.run(main())`.
await main()

Single GSM file: /Users/wa/Desktop/GSE59867_FIRST_GSM.txt
Merged file: /Users/wa/Desktop/GSE59867_ALL_GSM.txt
Found 436 GSM samples
First GSM: GSM1448335
Fetching FIRST GSM: GSM1448335
[1/436] Fetching: GSM1448335
[2/436] Fetching: GSM1448336
[3/436] Fetching: GSM1448337
[4/436] Fetching: GSM1448338
[5/436] Fetching: GSM1448339
[6/436] Fetching: GSM1448340
[7/436] Fetching: GSM1448341
[8/436] Fetching: GSM1448342
[9/436] Fetching: GSM1448343
[10/436] Fetching: GSM1448344
[11/436] Fetching: GSM1448345
[12/436] Fetching: GSM1448346
[13/436] Fetching: GSM1448347
[14/436] Fetching: GSM1448348
[15/436] Fetching: GSM1448349
[16/436] Fetching: GSM1448350
[17/436] Fetching: GSM1448351


  funcs = list(self._events.get(event, OrderedDict()).values())
ERROR:asyncio:Future exception was never retrieved
future: <Future finished exception=Error('Connection closed')>
playwright._impl._api_types.Error: Connection closed


[18/436] Fetching: GSM1448352
[19/436] Fetching: GSM1448353
[20/436] Fetching: GSM1448354
[21/436] Fetching: GSM1448355
[22/436] Fetching: GSM1448356
[23/436] Fetching: GSM1448357
[24/436] Fetching: GSM1448358
[25/436] Fetching: GSM1448359
[26/436] Fetching: GSM1448360
[27/436] Fetching: GSM1448361
[28/436] Fetching: GSM1448362
[29/436] Fetching: GSM1448363
[30/436] Fetching: GSM1448364
[31/436] Fetching: GSM1448365
[32/436] Fetching: GSM1448366
[33/436] Fetching: GSM1448367
[34/436] Fetching: GSM1448368
[35/436] Fetching: GSM1448369
[36/436] Fetching: GSM1448370
[37/436] Fetching: GSM1448371
[38/436] Fetching: GSM1448372
[39/436] Fetching: GSM1448373
[40/436] Fetching: GSM1448374
[41/436] Fetching: GSM1448375
[42/436] Fetching: GSM1448376
[43/436] Fetching: GSM1448377
[44/436] Fetching: GSM1448378
[45/436] Fetching: GSM1448379
[46/436] Fetching: GSM1448380
[47/436] Fetching: GSM1448381
[48/436] Fetching: GSM1448382
[49/436] Fetching: GSM1448383
[50/436] Fetching: GSM1448384
[51/436] F