In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib import parse

# Browser options: use the Chrome build in ./chrome-win64 and load the
# EasyPubMedicine extension (presumably the source of the impact-factor
# controls the later cells interact with — confirm against the extension).
option = webdriver.ChromeOptions()
option.binary_location = './chrome-win64/chrome.exe'
option.add_extension('./EasyPubMedicine.crx')

# Driver service backed by the chromedriver executable in the working directory
service = Service(executable_path='./chromedriver.exe')

# Launch the browser session that every later cell drives
browser = webdriver.Chrome(options=option, service=service)

# Search term sent to PubMed
key_word = 'leukemia'
# Publication-year window (inclusive bounds, kept as strings for the URL)
min_year, max_year = '2010', '2025'
# Crawl until at least this many articles have been recorded
article_count = 70
# Lowest impact factor passed to the impact-factor filter
min_impact_factor = 20

# Open page 1 of the PubMed results for the keyword, restricted to the year window
search_url = f'https://pubmed.ncbi.nlm.nih.gov/?term={parse.quote(key_word)}&filter=years.{min_year}-{max_year}&page=1'
browser.get(search_url)

In [10]:
from selenium.webdriver.common.by import By
# Apply the impact-factor filter
def fliter_if(min_if_value):
    """Enter ``min_if_value`` into the minimum impact-factor box and apply it.

    Clicks the "imf-activate" control when it is displayed; otherwise clicks
    "imf-refresh". Every element is looked up again right before it is used
    rather than being cached.
    """
    def locate(xpath):
        # Fresh lookup on every call
        return browser.find_element(By.XPATH, xpath)

    min_box_xpath = '//*[@id="imf-min"]'
    activate_xpath = '//*[@id="imf-activate"]'

    locate(min_box_xpath).clear()
    locate(min_box_xpath).send_keys(str(min_if_value))

    if locate(activate_xpath).is_displayed():
        button_xpath = activate_xpath
    else:
        button_xpath = '//*[@id="imf-refresh"]'
    locate(button_xpath).click()

In [11]:
import re
# Extract article information from the current results page
def extract_list():
    """Collect the articles shown on the current results page.

    Returns:
        list[tuple]: one ``(title, cite, impact_factor, pdf_url)`` tuple per
        usable article. ``title`` has the characters ``/\\|*><?:"`` replaced by
        underscores so it can be used as a file name, ``cite`` is the text of
        the "full-authors" element, ``impact_factor`` is a float parsed from
        the "ep-if" element and ``pdf_url`` is the ``href`` of the "pdf"
        element.

    Articles with empty text, without a PDF link, or whose impact-factor text
    contains no number are skipped instead of aborting the whole page.
    """
    filename_safe = str.maketrans('/\\|*><?:"', '_________')
    result = []
    for article in browser.find_elements(By.XPATH,'//*[@id="search-results"]/section/div[2]/div/article'):
        if len(article.text.strip()) == 0:
            continue
        # Title, authors, impact factor, PDF link
        article_title = article.find_element(By.CLASS_NAME,'docsum-title').text
        article_cite = article.find_element(By.CLASS_NAME,'full-authors').text

        # Match the full decimal part; the previous pattern kept only one digit
        # after the point (e.g. "50.717" became 50.7).
        if_match = re.search(r'\d+(?:\.\d+)?', article.find_element(By.CLASS_NAME,'ep-if').text)
        if if_match is None:
            continue
        article_if = float(if_match.group())

        # find_elements returns an empty list instead of raising, so an article
        # that has no PDF link is skipped rather than failing the page.
        pdf_links = article.find_elements(By.CLASS_NAME,'pdf')
        if not pdf_links:
            continue
        article_pdf = pdf_links[0].get_attribute('href')
        if article_pdf is None:
            continue

        result.append((article_title.translate(filename_safe),article_cite,article_if,article_pdf))
    return result

In [12]:
# Next page
def next():
    """Click the pagination button that advances the search results by one page.

    Note: this name shadows the built-in ``next`` for the rest of the notebook.
    """
    next_page_button = browser.find_element(
        By.XPATH,
        '//*[@id="search-results"]/div[6]/button[3]',
    )
    next_page_button.click()

In [13]:
import httpx
from curl_cffi import requests

# Download: browser-like request headers used by the httpx-based attempts
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'),
    ('Accept-Encoding', 'gzip, deflate, br'),
])
def download(url,save_file):
    """Download ``url`` to ``save_file``, trying three clients in turn.

    Attempts, in order:
      1. ``curl_cffi`` ``requests.get`` impersonating Chrome, following redirects;
      2. ``httpx.get`` with the module-level ``headers``, following redirects;
      3. ``httpx.stream`` with ``headers`` and TLS verification disabled.

    Data is written to ``<save_file>.part`` and only moved to ``save_file``
    after an attempt succeeds, so a failed download never leaves an empty or
    truncated file behind at ``save_file``.

    Args:
        url: address of the file to fetch.
        save_file: destination path.

    Returns:
        bool: True when one attempt returned HTTP 200 and the body was saved,
        False otherwise.

    Raises:
        OSError: if the temporary file cannot be opened or moved into place.
    """
    # Local import: the notebook only imports os in a later cell.
    import os

    timeout = 120
    part_file = f'{save_file}.part'

    def _curl_attempt(out):
        # curl_cffi request with a Chrome TLS fingerprint
        response = requests.get(url, impersonate="chrome101", allow_redirects=True, timeout=timeout)
        if response.status_code != httpx.codes.OK:
            return False
        out.write(response.content)
        return True

    def _httpx_attempt(out):
        # Plain httpx request with browser-like headers
        response = httpx.get(url, headers = headers, follow_redirects=True, timeout=timeout)
        if response.status_code != httpx.codes.OK:
            return False
        out.write(response.content)
        return True

    def _httpx_stream_attempt(out):
        # Streaming httpx request without certificate verification
        with httpx.stream("GET", url, headers = headers, verify=False, timeout=timeout) as response:
            if response.status_code != httpx.codes.OK:
                return False
            for chunk in response.iter_bytes():
                out.write(chunk)
            return True

    try:
        for attempt in (_curl_attempt, _httpx_attempt, _httpx_stream_attempt):
            # Re-opening with 'wb' truncates whatever a failed attempt wrote.
            with open(part_file,'wb') as download_file:
                try:
                    succeeded = attempt(download_file)
                except Exception:
                    # Best effort: fall through to the next client. Unlike a bare
                    # ``except:``, KeyboardInterrupt still stops the download.
                    succeeded = False
            if succeeded:
                os.replace(part_file, save_file)
                return True
        return False
    finally:
        # Remove the temporary file on failure or interruption.
        if os.path.exists(part_file):
            os.remove(part_file)

In [14]:
# Accumulator for the crawled articles; each crawl appends its rows here
article_list = list()

In [15]:
import time
import os

# Output locations: the CSV goes into save_path, the PDFs into a per-keyword subfolder
save_path = './result'
save_pdf_path = f'{save_path}/{key_word}'
# makedirs also creates the './result' parent when it is missing, where
# os.mkdir would raise FileNotFoundError; exist_ok makes re-running the cell safe.
os.makedirs(save_pdf_path, exist_ok=True)

# Start crawling
def start():
    """Crawl result pages until ``article_list`` holds at least ``article_count`` rows.

    For every page: apply the impact-factor filter, wait briefly, extract the
    visible articles, download each PDF that is not already on disk and append
    ``[title, cite, impact_factor, url, status]`` to ``article_list``, where
    ``status`` is ``''`` on success and ``'下载失败'`` when the download failed.
    Then move on to the next page.

    Articles whose URL is already recorded are skipped, so calling ``start()``
    again after a refresh-and-retry does not record the same page twice. A whole
    page is always processed, so the final count may exceed ``article_count``.
    """
    # URLs recorded by earlier (possibly interrupted) calls
    seen_urls = {row[3] for row in article_list}
    while(len(article_list) < article_count):
        fliter_if(min_impact_factor)
        # Short pause before reading the page after the filter is applied
        time.sleep(0.5)
        for title,cite,article_if,url in extract_list():
            if url in seen_urls:
                continue
            seen_urls.add(url)
            print(f'{title} {url}')
            # Keep the path short to stay under the Windows path-length limit
            save_file = f'{save_pdf_path}/{title}'[0:100] + '.pdf'
            # A zero-byte file is the leftover of a failed download, not a finished one
            if os.path.exists(save_file) and os.path.getsize(save_file) > 0:
                status = ''
            else:
                status = '' if download(url,save_file) else '下载失败'
            article_list.append([title,cite,article_if,url,status])
        next()

# The browser occasionally misbehaves: refresh the page and retry.
# `retry` counts the retries left after the first attempt (up to 4 attempts in total).
retry = 3
try:
    while retry >= 0:
        try:
            start()
            break
        except Exception as error:
            # Exception (not a bare except) so that Ctrl-C / kernel interrupt
            # stops the crawl instead of triggering another retry.
            print(f'重试{retry}')
            # Show what went wrong instead of hiding it
            print(repr(error))
            browser.refresh()
            retry = retry - 1
finally:
    # Always close the browser, even when refresh() itself fails or the run is interrupted
    browser.quit()

Worldwide comparison of survival from childhood leukaemia for 1995-2009, by subtype, age, and sex (CONCORD-2)_ a population-based study of individual data for 89 828 children from 198 registries in 53 countries. http://www.thelancet.com/article/S2352302617300522/pdf
Depalmitoylation rewires FLT3-ITD signaling and exacerbates leukemia progression. https://ashpublications.org/blood/article-pdf/138/22/2244/1848214/bloodbld2021011582.pdf
Vulnerabilities in mIDH2 AML confer sensitivity to APL-like targeted combination therapy. https://www.nature.com/articles/s41422-019-0162-7.pdf
Leukaemia cutis. http://www.thelancet.com/article/S0140673612603523/pdf
Targeting leukemia's _fatty tooth_. https://ashpublications.org/blood/article-pdf/126/16/1874/1389107/1874.pdf
Single-cell analysis of acute lymphoblastic and lineage-ambiguous leukemia_ approaches and molecular insights. https://ashpublications.org/blood/article-pdf/141/4/356/2074818/blood_bld-2022-016954-c-main.pdf
Cancer_ A gene-expression p

In [16]:
import pandas as pd

# Save the collected rows as a CSV named after the search keyword
csv_columns = ['title','cite','if','url','logger']
article_df = pd.DataFrame(data=article_list, columns=csv_columns)
article_df.to_csv(f'{save_path}/{key_word}.csv')