In [1]:
import gzip
import os
from xml.dom.minidom import parse
import xml.dom.minidom
import pandas as pd

In [2]:
# Parse the basic information of one article
def _append_field(store, key, text):
    """Store ``text`` under ``key``; repeated keys are joined with ' | '."""
    if key in store:
        store[key] += " | " + text
    else:
        store[key] = text


def get_paper_infor(paper):
    """Collect flat and grouped text fields from one article element.

    Parameters
    ----------
    paper : xml.dom.minidom.Element
        Element searched with ``getElementsByTagName`` (the caller passes a
        ``PubmedArticle`` element).

    Returns
    -------
    dict
        * For PMID, ArticleTitle, ELocationID, AbstractText, Language and
          PublicationType: a string holding every non-blank direct text
          chunk of every matching element, joined with ' | '.
        * For PubDate, MedlineJournalInfo, ArticleIdList and Journal: a dict
          mapping a child (or grandchild) tag name to its joined text.
          The key is only present when at least one such element exists.
    """
    Paper = {}

    # Flat fields: one joined string per tag.
    for tag1 in ["PMID", "ArticleTitle", "ELocationID", "AbstractText", "Language", "PublicationType"]:
        for node in paper.getElementsByTagName(tag1):
            for n in node.childNodes:
                # Element children have nodeValue None; whitespace-only text
                # is skipped so it does not produce empty " | " segments.
                if n.nodeValue and n.nodeValue.strip():
                    _append_field(Paper, tag1, n.nodeValue.strip())

    # Grouped fields: one sub-dict per tag, filled from up to three levels of
    # descendants.
    for tag2 in ["PubDate", "MedlineJournalInfo", "ArticleIdList", "Journal"]:
        nodes = paper.getElementsByTagName(tag2)
        if len(nodes) == 0:
            continue
        fields = Paper[tag2] = {}
        for node in nodes:
            for n in node.childNodes:
                if n.nodeValue:
                    # Text sitting directly inside the grouped element: a text
                    # node has no tagName, so key it by the enclosing element.
                    if n.nodeValue.strip():
                        _append_field(fields, node.tagName, n.nodeValue.strip())
                    continue
                for c in n.childNodes:
                    if c.nodeValue:
                        if c.nodeValue.strip():
                            _append_field(fields, n.tagName, c.nodeValue.strip())
                        continue
                    for c2 in c.childNodes:
                        if c2.nodeValue and c2.nodeValue.strip():
                            # Appended (not overwritten) so repeated values are
                            # kept, like at the other levels.
                            _append_field(fields, c.tagName, c2.nodeValue.strip())
    return Paper

In [3]:
# Parse list-style entries such as MeshHeading keywords
def _append_keyword_value(record, key, text):
    """Store ``text`` under ``key``; repeated keys are joined with ' | '."""
    if key in record:
        record[key] += " | " + text
    else:
        record[key] = text


def _record_element_text(record, element, text, ui_seen):
    """Append ``text`` under the element's tag name and, once per element,
    append its ``UI`` attribute (when present) under the key "UI"."""
    _append_keyword_value(record, element.tagName, text)
    if id(element) not in ui_seen and element.hasAttribute("UI"):
        ui_seen.add(id(element))
        _append_keyword_value(record, "UI", element.getAttribute("UI"))


def get_keyword_list(tag, paper, Paper):
    """Add one sub-dict per ``<tag>`` element of the first ``<tagList>``.

    Parameters
    ----------
    tag : str
        Item tag name; the container searched for is ``tag + "List"``.
    paper : xml.dom.minidom.Element
        Element searched with ``getElementsByTagName``.
    Paper : dict
        Dictionary that is updated in place and also returned.

    Returns
    -------
    dict
        ``Paper`` with keys ``"<tag>_1"``, ``"<tag>_2"``, ... added. Each
        value maps a descendant tag name (children and grandchildren of the
        item) to its non-blank text, repeated tags joined with ' | '.
        Text placed directly inside the item is stored under the item's own
        tag name. ``UI`` attributes of elements that contributed text are
        collected under the key "UI".
    """
    nodes = paper.getElementsByTagName(tag + "List")
    if len(nodes) == 0:
        return Paper

    ui_seen = set()
    # Only the first matching list is used.
    for count, item in enumerate(nodes[0].getElementsByTagName(tag), start=1):
        record = Paper[tag + "_" + str(count)] = {}
        for c in item.childNodes:
            # Direct text of the item: a text node has no tagName, so the
            # item's tag is used as the key.
            if c.nodeValue and c.nodeValue.strip():
                _append_keyword_value(record, item.tagName, c.nodeValue.strip())
            for c2 in c.childNodes:
                if c2.nodeValue:
                    if c2.nodeValue.strip():
                        # minidom elements are not subscriptable, so the UI
                        # attribute is read through getAttribute in the helper.
                        _record_element_text(record, c, c2.nodeValue.strip(), ui_seen)
                else:
                    # c2 carries no text itself: look one level deeper.
                    for c3 in c2.childNodes:
                        if c3.nodeValue and c3.nodeValue.strip():
                            _record_element_text(record, c2, c3.nodeValue.strip(), ui_seen)
    return Paper

In [4]:
# Convert the article dictionary into a DataFrame
def get_paper_dataframe(Paper):
    """Flatten an article dictionary into a long-format DataFrame.

    Parameters
    ----------
    Paper : dict
        Maps a first-level key either to a string or to a dict of
        second-level key -> value. ``Paper["PMID"]`` is read for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``PMID``, ``Level_1_Key``, ``Level_2_Key``, ``Value``; one row
        per string value. String entries get ``"."`` as ``Level_2_Key``.
        An input that produces no rows yields an empty frame that still has
        the four columns.

    Raises
    ------
    KeyError
        If ``Paper`` produces at least one row but has no ``"PMID"`` key.
    """
    columns = ["PMID", "Level_1_Key", "Level_2_Key", "Value"]
    records = []
    for key, value in Paper.items():
        if isinstance(value, str):
            records.append((Paper["PMID"], key, ".", value))
        else:
            for sub_key, sub_value in value.items():
                records.append((Paper["PMID"], key, sub_key, sub_value))
    # Build the frame once from the records instead of assigning columns one
    # by one from separate DataFrames.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Turn the key/value rows into a wide table: PMID plus text content columns,
# intended for classification
def get_content(Result, f):
    """Build one row per PMID with title, abstract and joined MeSH values.

    Parameters
    ----------
    Result : pandas.DataFrame
        Long-format table with columns ``PMID``, ``Level_1_Key`` and
        ``Value`` (as produced by ``get_paper_dataframe``).
    f : str
        Name of the source file. Not used by this function; kept so existing
        callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``PMID``, ``ArticleTitle``, ``AbstractText``, ``MH``.
        Missing values are empty strings. ``MH`` joins the ``Value`` of every
        row whose ``Level_1_Key`` contains "MeshHeading" with ' | '.

    Raises
    ------
    ValueError
        If ``Result`` has no rows with ``Level_1_Key == "PMID"``.
    """
    by_key = {key: frame for key, frame in Result.groupby("Level_1_Key")}
    if "PMID" not in by_key:
        raise ValueError("Result contains no PMID rows")

    content = by_key["PMID"][["PMID", "Value"]].set_index("PMID")

    for column in ("ArticleTitle", "AbstractText"):
        if column in by_key:
            part = (
                by_key[column]
                .rename(columns={"Value": column})[["PMID", column]]
                .set_index("PMID")
            )
            content = content.join(part, how="left")
        else:
            # No article in this table has the field: keep the column, empty.
            content[column] = ""

    mesh_frames = [frame for key, frame in by_key.items() if "MeshHeading" in key]
    if mesh_frames:
        mesh = (
            pd.concat(mesh_frames)
            .groupby("PMID")["Value"]
            .agg(" | ".join)
            .rename("MH")
        )
        content = content.join(mesh, how="left")
    else:
        content["MH"] = ""

    # "Value" of the PMID rows is the PMID itself; the index copy is dropped.
    return (
        content.fillna("")
        .rename(columns={"Value": "PMID"})
        .reset_index(drop=True)
    )

In [12]:
# Main program: extract the content of each PubMed XML file
INPUT_DIR = "../test_sample"
OUTPUT_DIR = "../Result"
# CSV files are only written for files whose number (the digits after "22n"
# in the file name) is above this value.
# NOTE(review): presumably this selects update files rather than baseline
# files -- confirm.
MIN_FILE_NUMBER = 1000

xml_files = os.listdir(INPUT_DIR)
# Open each XML document with the minidom parser
for f in xml_files:
    if not (f.startswith("pubmed") and f.endswith(".xml")):
        continue

    # Directory name is the file name up to the first dot.
    file_path = f.split(".")[0]
    out_dir = os.path.join(OUTPUT_DIR, file_path)
    os.makedirs(out_dir, exist_ok=True)

    DOMTree = xml.dom.minidom.parse(os.path.join(INPUT_DIR, f))
    collection = DOMTree.documentElement
    papers = collection.getElementsByTagName("PubmedArticle")

    # One long-format frame per article, concatenated once per file.
    frames = []
    for paper in papers:
        Paper = get_paper_infor(paper)
        for tag in ["Author", "Chemical", "MeshHeading", "Reference"]:
            Paper = get_keyword_list(tag, paper, Paper)
        frames.append(get_paper_dataframe(Paper))
    if not frames:
        # pd.concat raises on an empty list; nothing to export for this file.
        continue

    Result = pd.concat(frames)
    # "Author_3" -> "Author": group numbered entries under their tag name.
    Result["Key"] = Result["Level_1_Key"].str.split("_").str[0]

    # Best-effort step: a failure is reported and the export still runs.
    # The return value is not used here.
    try:
        get_content(Result, f)
    except Exception as exc:
        print(f, repr(exc))

    try:
        file_number = int(f.split("22n")[1].split(".")[0])
    except (IndexError, ValueError):
        # File name does not contain "22n" followed by a number.
        file_number = None

    if file_number is not None and file_number > MIN_FILE_NUMBER:
        for k, k_df in Result.groupby("Key"):
            k_df.to_csv(
                os.path.join(out_dir, k + "_" + f.replace(".xml", ".csv")),
                index=False,
            )
    del Result  # release the per-file table before the next file
    #os.remove(f)