# Download

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

def template(x):
    """Return the FTP URL of baseline archive number ``x``.

    ``x`` is an integer; it is zero-padded to four digits in the file name.
    """
    return 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed23n{0:04d}.xml.gz'.format(x)


# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version -- confirm it matches the environment this notebook targets.
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/')

# Fixed pause after navigation before any link is looked up.
time.sleep(5)

# NOTE(review): the later cells in this notebook handle file numbers above 1015;
# confirm whether this upper bound is intentional.
for file_number in range(1, 1016):
    # Only the last path component is used: it is the link text searched for
    # on the listing page.
    archive_name = template(file_number).split('/')[-1]
    driver.find_element(By.LINK_TEXT, archive_name).click()
    # Fixed pause between clicks.
    time.sleep(2)

# From xml to json

In [3]:
__author__ = 'Qiao Jin'
import os
import glob
import json
import xml.etree.ElementTree as ET

XML_DIR = "pubmed_2023"
JSON_DIR = "pubmed_2023d"


def citation_to_record(citation):
    """Flatten one ``MedlineCitation`` element into a plain dict.

    Returns ``None`` when the citation has no ``PMID`` child. Otherwise returns
    a dict with the keys ``'pmid'``, ``'texts'`` (holding ``'Title'``,
    ``'Journal'`` and ``'Ab'``), ``'Meshhead'`` and ``'date'``. Missing title,
    journal or completion year are stored as empty strings; a missing MeSH
    list is stored as an empty list.
    """
    # PMID
    pmid_el = citation.find('PMID')
    if pmid_el is None:
        return None

    texts = {}

    # Title
    title = citation.find('Article/ArticleTitle')
    texts["Title"] = "" if title is None else " ".join(title.itertext())

    # Journal
    journal = citation.find("Article/Journal/Title")
    texts["Journal"] = "" if journal is None else " ".join(journal.itertext())

    # Abstract: it may be split into several AbstractText sections.
    sections = [" ".join(section.itertext()) for section in citation.iter('AbstractText')]
    # "Ab" is the title followed by every abstract section, space separated.
    texts["Ab"] = texts["Title"] + " " + " ".join(sections)

    # MeSH terms: one inner list per child of MeshHeadingList, holding the
    # text of each of that child's own children.
    mesh = citation.find("MeshHeadingList")
    mesh_headings = [] if mesh is None else [[term.text for term in heading] for heading in mesh]

    # Date: year of DateCompleted.
    year = citation.find("DateCompleted/Year")

    return {'pmid': pmid_el.text,
            'texts': texts,
            'Meshhead': mesh_headings,
            "date": "" if year is None else year.text}


def convert_xml_to_json(xml_file, json_file):
    """Parse ``xml_file`` and write its citations to ``json_file``.

    The JSON maps each PMID to the record built by ``citation_to_record``.
    The data is first written to ``json_file + '.tmp'`` and then moved into
    place with ``os.replace``, so ``json_file`` only ever exists once it is
    complete; this matters because the driver loop below skips any input whose
    output file already exists. Returns the number of records written.
    """
    root = ET.parse(xml_file).getroot()

    output = {}
    for citation in root.iter('MedlineCitation'):
        record = citation_to_record(citation)
        if record is not None:
            output[record['pmid']] = record

    tmp_file = json_file + ".tmp"
    try:
        with open(tmp_file, 'w') as f:
            json.dump(output, f, indent=4)
        os.replace(tmp_file, json_file)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # a partial file left behind by a failed or interrupted write.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return len(output)


# Only *.xml entries are considered, so stray directory entries (checkpoint
# folders, checksum files, ...) are never handed to the XML parser.
files = sorted(os.path.basename(p) for p in glob.glob(os.path.join(XML_DIR, "*.xml")))
if not files:
    print('No .xml files found in %s' % XML_DIR)

for xml_path in files:
    # One expression builds the output path, used both for the "already
    # converted?" check and for the write.
    json_path = os.path.join(JSON_DIR, os.path.splitext(xml_path)[0] + ".json")
    if os.path.exists(json_path):
        continue

    print('Processing %s' % xml_path)
    os.makedirs(JSON_DIR, exist_ok=True)
    convert_xml_to_json(os.path.join(XML_DIR, xml_path), json_path)

Processing pubmed23n1004.xml
Processing pubmed23n1005.xml
Processing pubmed23n1006.xml
Processing pubmed23n1007.xml
Processing pubmed23n1008.xml
Processing pubmed23n1009.xml
Processing pubmed23n1010.xml
Processing pubmed23n1011.xml
Processing pubmed23n1012.xml
Processing pubmed23n1013.xml
Processing pubmed23n1014.xml
Processing pubmed23n1015.xml
Processing pubmed23n1016.xml
Processing pubmed23n1017.xml
Processing pubmed23n1018.xml
Processing pubmed23n1019.xml
Processing pubmed23n1020.xml
Processing pubmed23n1021.xml
Processing pubmed23n1022.xml
Processing pubmed23n1023.xml
Processing pubmed23n1024.xml
Processing pubmed23n1025.xml
Processing pubmed23n1026.xml
Processing pubmed23n1027.xml
Processing pubmed23n1028.xml
Processing pubmed23n1029.xml
Processing pubmed23n1030.xml
Processing pubmed23n1031.xml
Processing pubmed23n1032.xml
Processing pubmed23n1033.xml
Processing pubmed23n1034.xml
Processing pubmed23n1035.xml
Processing pubmed23n1036.xml
Processing pubmed23n1037.xml
Processing pub

# Merge (合并)

In [12]:
# 大文件
import os
import glob
import json

JSON_DIR = "pubmed_2023d"
MERGE_DIR = "pubmed_2023_merge"
DONE_PATH = os.path.join(MERGE_DIR, "pubmed_done.json")
BATCH_SIZE = 100


def is_flush_point(index, total, batch_size=BATCH_SIZE):
    """Return True when the batch should be written after file ``index``.

    ``index`` is the zero-based position in the list of files processed by the
    current run and ``total`` is the length of that list. A flush happens after
    every ``batch_size`` files and after the last file, so a trailing partial
    batch is always written regardless of how many files the run covers.
    """
    return (index + 1) % batch_size == 0 or index == total - 1


# Only *.json entries are merged; anything else in the directory is ignored.
files = sorted(os.path.basename(p) for p in glob.glob(os.path.join(JSON_DIR, "*.json")))
papers_d = {}

# Resume support: pubmed_done.json lists the files merged by earlier runs.
if not os.path.exists(DONE_PATH):
    done_l = []
    print("done_l is empty")
else:
    with open(DONE_PATH) as f:
        done_l = json.load(f)
        print("done_l is exists")
already_done = set(done_l)
files = [name for name in files if name not in already_done]

for i, file in enumerate(files):

    with open(os.path.join(JSON_DIR, file), "r") as f:
        paper_d = json.load(f)
    print(file)

    papers_d.update(paper_d)
    done_l.append(file)

    if is_flush_point(i, len(files)):
        os.makedirs(MERGE_DIR, exist_ok=True)
        # The batch file is named after the cumulative number of merged files.
        with open(os.path.join(MERGE_DIR, "pubmed_%s.txt" % len(done_l)), "w") as f:
            f.write(json.dumps(papers_d))
        # The progress list is saved together with the batch so that a later
        # run skips exactly the files that have been written out.
        with open(DONE_PATH, "w") as f:
            json.dump(done_l, f)
        papers_d = {}

done_l is empty
pubmed23n0001.json
pubmed23n0002.json
pubmed23n0003.json
pubmed23n0004.json
pubmed23n0005.json
pubmed23n0006.json
pubmed23n0007.json
pubmed23n0008.json
pubmed23n0009.json
pubmed23n0010.json
pubmed23n0011.json
pubmed23n0012.json
pubmed23n0013.json
pubmed23n0014.json
pubmed23n0015.json
pubmed23n0016.json
pubmed23n0017.json
pubmed23n0018.json
pubmed23n0019.json
pubmed23n0020.json
pubmed23n0021.json
pubmed23n0022.json
pubmed23n0023.json
pubmed23n0024.json
pubmed23n0025.json
pubmed23n0026.json
pubmed23n0027.json
pubmed23n0028.json
pubmed23n0029.json
pubmed23n0030.json
pubmed23n0031.json
pubmed23n0032.json
pubmed23n0033.json
pubmed23n0034.json
pubmed23n0035.json
pubmed23n0036.json
pubmed23n0037.json
pubmed23n0038.json
pubmed23n0039.json
pubmed23n0040.json
pubmed23n0041.json
pubmed23n0042.json
pubmed23n0043.json
pubmed23n0044.json
pubmed23n0045.json
pubmed23n0046.json
pubmed23n0047.json
pubmed23n0048.json
pubmed23n0049.json
pubmed23n0050.json
pubmed23n0051.json
pubmed23n0052.j