In [95]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib import parse

# Configure the ChromeDriver service
service = Service(executable_path='./chromedriver.exe')
# Configure the browser binary
option = webdriver.ChromeOptions()
option.binary_location = './chrome-win64/chrome.exe'
# Load the EasyPubMedicine extension into the browser
option.add_extension('./EasyPubMedicine.crx')

browser = webdriver.Chrome(service=service,options=option)

# Search keywords (URL-quoted before being placed in the PubMed query below)
key_word = '(leukemia) AND (virus)AND(DNA damage)' 
# Publication year range used in the PubMed "years" filter
min_year = '2010'
max_year = '2025'
# Number of articles to crawl
article_count = 200
# Minimum impact factor
min_impact_factor = 8
# Whether to download the PDFs
download_pdf = True
# Whether to tick the meta-analysis filter (read by fliter_if)
is_mate = False
# Current page number
page = 1

# Open the first result page for the configured search
browser.get(f'https://pubmed.ncbi.nlm.nih.gov/?term={parse.quote(key_word)}&filter=years.{min_year}-{max_year}&page={page}')

In [96]:
from selenium.webdriver.common.by import By
# Apply the impact-factor filter
def fliter_if(min_if_value):
    """Type the minimum impact factor into the page and trigger the filter.

    The value is written into the element with id "imf-min"; then the
    "imf-activate" button is clicked when it is displayed, otherwise the
    "imf-refresh" button is clicked. When the module-level flag ``is_mate``
    is true, the meta-analysis checkbox is ticked unless it is already
    selected.

    NOTE(review): the "imf-*" elements are presumably injected by the
    EasyPubMedicine extension - confirm.

    Args:
        min_if_value: minimum impact factor; converted with ``str`` before
            being typed into the input.
    """
    def locate(xpath):
        # Every lookup goes through the shared browser session
        return browser.find_element(By.XPATH, xpath)

    locate('//*[@id="imf-min"]').clear()
    locate('//*[@id="imf-min"]').send_keys(str(min_if_value))

    # Only one of the two buttons is expected to be usable at a time
    activate_shown = locate('//*[@id="imf-activate"]').is_displayed()
    if activate_shown:
        button_xpath = '//*[@id="imf-activate"]'
    else:
        button_xpath = '//*[@id="imf-refresh"]'
    locate(button_xpath).click()

    # Tick the meta-analysis filter only when requested
    if not is_mate:
        return
    if not locate('//*[@id="id_filter_pubt.meta-analysis"]').is_selected():
        locate('//*[@id="static-filters-form"]/div/div[1]/div[3]/ul/li[3]/label').click()

In [97]:
import re
# Extract the article information from the current result page
def extract_list():
    """Collect the visible articles of the current search-result page.

    Articles whose element has no visible text are skipped.

    Returns:
        list of tuples ``(title, authors, impact_factor, pdf_url, article_url)``.
        ``title`` has the characters ``/ \\ | * > < ? : "`` replaced by ``_``
        (start() uses it to build the PDF file name); ``impact_factor`` is a
        float; ``pdf_url`` and ``article_url`` are the ``href`` attributes
        and may be None when the attribute is absent.

    Raises:
        ValueError: if the impact-factor text of an article contains no number.
    """
    # Loop-invariant, so built once
    unsafe_chars = str.maketrans('/\\|*><?:"', '_________')
    result = []
    for article in browser.find_elements(By.XPATH,'//*[@id="search-results"]/section/div[2]/div/article'):
        if len(article.text.strip()) == 0:
            continue
        # Title, authors, impact factor, PDF link, article link
        title_link = article.find_element(By.CLASS_NAME,'docsum-title')
        article_title = title_link.text
        article_cite = article.find_element(By.CLASS_NAME,'full-authors').text
        if_text = article.find_element(By.CLASS_NAME,'ep-if').text
        # Keep every decimal digit: the previous pattern stopped after the
        # first one, so "12.345" was read as 12.3
        if_match = re.search(r'\d+(?:\.\d+)?', if_text)
        if if_match is None:
            raise ValueError(f'no impact factor found in {if_text!r}')
        article_if = float(if_match.group())
        article_pdf = article.find_element(By.CLASS_NAME,'pdf').get_attribute('href')
        article_url = title_link.get_attribute('href')
        result.append((article_title.translate(unsafe_chars),article_cite,article_if,article_pdf,article_url))
    return result

In [98]:
# Go to the next page
def next():
    """Click the "next page" button and advance the module-level ``page`` counter.

    The counter is incremented only after the click succeeded, so that a
    failed click (which raises and is retried by the caller) does not leave
    ``page`` one ahead of the page that is actually displayed.

    Note: the name shadows the built-in ``next``; it is kept because other
    cells call it.
    """
    global page
    browser.find_element(By.XPATH,'//*[@id="search-results"]/div[6]/button[3]').click()
    page = page + 1

In [99]:
import httpx
from curl_cffi import requests

# Download
# Request headers passed to the httpx-based download attempts in download()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br'
}
def download(url,save_file):
    """Download ``url`` and store the response body in ``save_file``.

    Three strategies are tried in order until one returns HTTP 200:
      1. curl_cffi impersonating Chrome,
      2. httpx with the module-level ``headers``,
      3. httpx streaming with TLS certificate verification disabled.

    The file is created only after a body has been fetched successfully, so
    a failed download never leaves an empty or partial file behind (callers
    treat an existing file as "already downloaded").

    Args:
        url: address of the file to fetch.
        save_file: path the content is written to.

    Returns:
        True if the content was fetched and written, False otherwise.
    """
    timeout = 120

    def fetch_with_curl_cffi():
        response = requests.get(url, impersonate="chrome101", allow_redirects=True, timeout=timeout)
        return response.content if response.status_code == httpx.codes.OK else None

    def fetch_with_httpx():
        response = httpx.get(url, headers = headers, follow_redirects=True, timeout=timeout)
        return response.content if response.status_code == httpx.codes.OK else None

    def fetch_with_httpx_unverified():
        # Last resort: verify=False skips certificate verification.
        # Redirects are followed like in the other two attempts.
        with httpx.stream("GET", url, headers = headers, follow_redirects=True, verify=False, timeout=timeout) as response:
            if response.status_code != httpx.codes.OK:
                return None
            return b''.join(response.iter_bytes())

    for fetch in (fetch_with_curl_cffi, fetch_with_httpx, fetch_with_httpx_unverified):
        try:
            content = fetch()
        except Exception as error:
            # Best effort: report the failure and fall through to the next
            # strategy. KeyboardInterrupt is deliberately not caught.
            print(f'{fetch.__name__} failed for {url}: {error}')
            continue
        if content is not None:
            with open(save_file,'wb') as download_file:
                download_file.write(content)
            return True
    return False

In [100]:
# Collected articles; start() appends one [title, cite, if, url, pdf] row per article
article_list = []

In [101]:
import time
import os
import pandas as pd

# Output locations: the CSV goes into save_path, the PDFs into a sub-directory named after the search
save_path = './result'
save_pdf_path = f'{save_path}/{key_word}'
# makedirs also creates a missing './result' parent (os.mkdir would raise
# FileNotFoundError) and exist_ok makes re-running the cell harmless
os.makedirs(save_pdf_path, exist_ok=True)

def save_csv():
    """Write the rows collected in ``article_list`` to '{save_path}/{key_word}.csv'.

    The file is rewritten from scratch on every call.
    """
    column_names = ['title','cite','if','url','pdf']
    csv_file = f'{save_path}/{key_word}.csv'
    pd.DataFrame(article_list,columns=column_names).to_csv(csv_file)

# Start crawling
def start():
    """Crawl result pages until ``article_count`` articles have been collected.

    For every page: apply the impact-factor filter, extract the articles,
    optionally download each PDF, append one row per article to
    ``article_list`` and rewrite the CSV. The last column of a row holds the
    PDF URL, or a marker string when there is no URL or the download failed.

    The function can be called again after a failure: articles whose URL is
    already present in ``article_list`` are skipped, so re-processing a page
    does not produce duplicate rows. The page that reaches the target is
    processed completely, so slightly more than ``article_count`` rows may
    be collected.
    """
    # Column 3 of every row is the article URL
    seen_urls = {row[3] for row in article_list}
    while len(article_list) < article_count:
        print(f'start page {page}')
        fliter_if(min_impact_factor)
        # Give the page a moment to apply the filter
        time.sleep(0.5)
        for title,cite,article_if,pdf,url in extract_list():
            if url is not None:
                if url in seen_urls:
                    continue
                seen_urls.add(url)
            print(f'{title} {pdf}')
            # Keep the path short to stay below the Windows path-length limit (about 250)
            save_file = f'{save_pdf_path}/{title}'[0:100] + '.pdf'
            if os.path.exists(save_file) or not download_pdf:
                pdf_status = pdf
            elif pdf is None or len(pdf.strip()) == 0:
                pdf_status = '无下载地址'
            else:
                pdf_status = pdf if download(pdf,save_file) else '下载失败'
            article_list.append([title,cite,article_if,url,pdf_status])
        save_csv()
        # Do not click "next" once enough articles have been collected
        if len(article_list) >= article_count:
            break
        next()

# The browser occasionally misbehaves: refresh the page and try again.
# start() is attempted up to four times (retry counts 3, 2, 1, 0).
retry = 3
try:
    while retry >= 0:
        try:
            start()
            break
        except Exception as e:
            print(f'重试{retry} {e}')
            browser.refresh()
            retry = retry - 1
finally:
    # Close the browser even when the run is interrupted or refresh() itself fails
    browser.quit()

start page 1
PPM1D Mutations Drive Clonal Hematopoiesis in Response to Cytotoxic Chemotherapy. http://www.cell.com/article/S1934590918304855/pdf
NF-kappaB-induced R-loop accumulation and DNA damage select for nucleotide excision repair deficiencies in adult T cell leukemia. https://www.pnas.org/doi/pdf/10.1073/pnas.2005568118
start page 2
PIM1 (Provirus Integration Site For Moloney Murine Leukemia Virus) as a Novel Biomarker and Therapeutic Target in Pulmonary Arterial Hypertension_ Another Evidence for Cancer Theory. None
start page 3
Human T-cell leukemia virus type 1 (HTLV-1) and leukemic transformation_ viral infectivity, Tax, HBZ and therapy. https://europepmc.org/articles/pmc3413891?pdf=render
start page 4
Human T-lymphotropic type 1 virus p30 inhibits homologous recombination and favors unfaithful DNA repair. https://ashpublications.org/blood/article-pdf/117/22/5897/1339290/zh802211005897.pdf
A Dual Role of Caspase-8 in Triggering and Sensing Proliferation-Associated DNA Damage,