### 筛选待下载新文献，手动下载 XML格式文件
- https://www.ncbi.nlm.nih.gov/labs/pmc/
- ((spatial[All Fields] AND "tissue sections"[All Fields]) OR "spatially resolved transcriptomic"[All Fields] OR "spatial expression pattern"[All Fields] OR "spatial transcriptomic"[All Fields] OR "spatial gene expression"[All Fields] OR "spatial omics"[All Fields] OR "spatial profile"[All Fields] OR "spatial profiling"[All Fields]) AND ("2021/07/01"[PubDate] : "3000"[PubDate])
- "High-Resolution Transcriptomic" 

In [1]:
import os
import pandas as pd
# Show every column when a DataFrame is printed. The fully-qualified option
# key is used because the bare "max_columns" pattern matches more than one
# option in recent pandas releases and then raises OptionError.
pd.set_option("display.max_columns",None)
import tarfile
import requests
import shutil
import warnings
warnings.filterwarnings("ignore")
# coding = utf-8
from collections import Counter
from bs4 import BeautifulSoup

In [2]:
from datetime import date

# Build today's date stamp as "YYYY-MM-DD"; month and day are left-padded
# with a zero so they are always two characters wide.
d = date.today()
year = str(d.year)
month = str(d.month).zfill(2)
day = str(d.day).zfill(2)
Taday_date = "-".join((year, month, day))

In [3]:
def parse_article_meta(soup,pmc_file):
    """Extract article- and journal-level metadata from a parsed PMC article.

    Args:
        soup: BeautifulSoup node (whole document or a single ``<article>``)
            that is searched for ``<article-meta>`` and ``<journal-meta>``.
        pmc_file: Article identifier. Accepted for interface compatibility;
            the parser itself does not use it.

    Returns:
        An 11-tuple of strings, in this order: article ids, paper type,
        title, authors, author affiliations, publication dates, abstract,
        keywords, journal ids, journal title, ISSNs. Multi-valued fields are
        joined with "; " (abstract paragraphs with a newline). Any field
        that is absent from the document is returned as "".
    """
    def _text(node, tag):
        # Stripped text of the first <tag> below node, or "" when absent.
        found = node.find(tag)
        return found.text.strip() if found is not None else ""

    def _prefixed(prefix, value):
        # "prefix:value" when the attribute exists, otherwise the bare value.
        return value if prefix is None else prefix+":"+value

    Ids =[]
    paper_type =""
    title =""
    Authors =[]
    Author_Infor =[]
    PubDate =[]
    Abstract =[]
    Keywords =[]
    jour_Ids =[]
    journal_title =""
    issn_Number =""

    for meta in soup.find_all("article-meta"):
        # Article identifiers, e.g. "pmid:123".
        for art_id in meta.find_all("article-id"):
            Ids.append(_prefixed(art_id.get("pub-id-type"), art_id.text))

        # Keep the previous value when the element is missing.
        paper_type = _text(meta, "article-categories") or paper_type
        title = _text(meta, "article-title") or title

        # Authors, tagged with their first cross-reference when there is one.
        for author in meta.find_all("contrib"):
            given = author.find("given-names")
            surname = author.find("surname")
            if given is None or surname is None:
                # Collaborations and similar entries have no structured name.
                Authors.append(author.text.strip())
                continue
            author_name = given.text.strip()+" "+surname.text.strip()
            xref = author.find("xref")
            ref_type = xref.get("ref-type") if xref is not None else None
            rid = xref.get("rid") if xref is not None else None
            if ref_type is not None and rid is not None:
                Authors.append(ref_type+"-"+rid+":"+author_name)
            else:
                Authors.append(author_name)

        # Affiliations, tagged with their id so they can be matched to authors.
        for infor in meta.find_all("aff"):
            aff_id = infor.get("id")
            aff_text = infor.text.strip()
            Author_Infor.append(aff_text if aff_id is None else "aff-"+aff_id+":"+aff_text)

        # Publication dates; missing parts stay empty inside "year-month-day".
        for dates in meta.find_all("pub-date"):
            stamp = "-".join([_text(dates, "year"), _text(dates, "month"), _text(dates, "day")])
            PubDate.append(_prefixed(dates.get("pub-type"), stamp))

        # Abstract paragraphs: each paragraph is collected exactly once, even
        # when the article carries several <abstract> elements.
        for ab in meta.find_all("abstract"):
            for p in ab.find_all("p"):
                Abstract.append(p.text)

        # Keywords
        for kwd in meta.find_all("kwd"):
            Keywords.append(kwd.text.strip())

    # Journal information is parsed once, independently of <article-meta>, so
    # the journal fields are always defined when the function returns.
    for journal in soup.find_all("journal-meta"):
        for jour_id in journal.find_all("journal-id"):
            jour_Ids.append(_prefixed(jour_id.get("journal-id-type"), jour_id.text))
        journal_title = _text(journal, "journal-title")
        issn_Number = "; ".join(issn.text.strip() for issn in journal.find_all("issn"))

    return (
        "; ".join(Ids),
        paper_type,
        title,
        "; ".join(Authors),
        "; ".join(Author_Infor),
        "; ".join(PubDate),
        "\n".join(Abstract),
        "; ".join(Keywords),
        "; ".join(jour_Ids),
        journal_title,
        issn_Number,
    )


In [4]:
# Parse the article body
def parse_body(body,pmc_file,Work_Path):
    """Collect every ``<sec>`` of an article into a table and export it.

    Args:
        body: BeautifulSoup node searched (recursively) for ``<sec>``.
        pmc_file: Article identifier; fills the PMCID column and names the
            output folder and file.
        Work_Path: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns PMCID, Title, Text_Id and Content, one row per
        section. When at least one section exists it is also written to
        ``../ResultPMC/<pmc_file>/<pmc_file>_paragraphs.xlsx``.
    """
    columns = ["PMCID","Title","Text_Id","Content"]
    rows = []
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        rows.append({
            "PMCID": pmc_file,
            "Title": title_tag.text.strip() if title_tag is not None else "",
            "Text_Id": sec.get("id", ""),
            "Content": "\n".join(p.text.strip() for p in sec.find_all("p")),
        })

    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Build the path with os.path so it works on every platform, and make
        # sure the per-article folder exists before writing into it.
        out_dir = os.path.join("..", "ResultPMC", pmc_file)
        os.makedirs(out_dir, exist_ok=True)
        df.to_excel(os.path.join(out_dir, pmc_file+"_paragraphs.xlsx"),index=False)
    return df

In [5]:
# Parse figure labels, captions and image locations
import urllib.request
def parse_figure(soup,pmc_file,Work_Path):
    """Collect label, caption and image reference of every ``<fig>``.

    Args:
        soup: BeautifulSoup node searched for ``<fig>`` elements.
        pmc_file: Article identifier; fills the PMCID column, is embedded in
            the image URL and names the output folder and file.
        Work_Path: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns PMCID, Label, Caption, Name and Full_URL, also
        written to ``../ResultPMC/<pmc_file>/<pmc_file>_figures.xlsx``; the
        empty string when the node contains no figure.
    """
    rows = []
    for position, fig in enumerate(soup.find_all("fig"), start=1):
        label_tag = fig.find("label")
        caption = fig.text.strip()
        if label_tag is not None:
            label = label_tag.text.strip()
            if label:
                # Drop only the label itself; later mentions of the same
                # label inside the caption text are part of the caption.
                caption = caption.replace(label, "", 1)
        else:
            # No <label>: number the figure by its position in the document.
            label = "Figure "+str(position)
        caption = caption.strip("\n")

        graphic = fig.find("graphic")
        url = graphic.get("xlink:href", "") if graphic is not None else ""
        full_url = ""
        if url:
            full_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"+pmc_file+"/bin/"+url+".jpg"

        rows.append({
            "PMCID": pmc_file,
            "Label": label,
            "Caption": caption,
            "Name": url,
            "Full_URL": full_url,
        })

    if not rows:
        return ""

    df_infor = pd.DataFrame(rows, columns=["PMCID","Label","Caption","Name","Full_URL"])
    # Portable path; create the per-article folder if it is not there yet.
    out_dir = os.path.join("..", "ResultPMC", pmc_file)
    os.makedirs(out_dir, exist_ok=True)
    df_infor.to_excel(os.path.join(out_dir, pmc_file+"_figures.xlsx"),index=False)
    return df_infor

In [6]:
# Parse table contents
def parse_table(soup,pmc_file,Work_Path):
    """Export every ``<table-wrap>`` to its own workbook and list the tables.

    Args:
        soup: BeautifulSoup node searched for ``<table-wrap>`` elements.
        pmc_file: Article identifier; fills the PMCID column and names the
            output folder and files.
        Work_Path: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns Label, Caption and PMCID, one row per table
        that had at least one data row; the empty string when there is none.
        Each such table is written to
        ``../ResultPMC/<pmc_file>/<pmc_file>_tables_<label>.xlsx``.
    """
    out_dir = os.path.join("..", "ResultPMC", pmc_file)
    Label =[]
    Title =[]
    for position, tb in enumerate(soup.find_all("table-wrap"), start=1):
        # Label and caption of the table.
        label_tag = tb.find("label")
        label = label_tag.text.strip() if label_tag is not None else "Table "+str(position)
        title = "".join(p.text.strip()+". " for p in tb.find_all("p"))

        # Header cells; every empty header is dropped (a comprehension is
        # used because removing from a list while iterating it skips items).
        cols = [text for text in (th.text.strip() for th in tb.find_all("th")) if text != ""]

        # Data rows: the first <tr> is skipped as the header row. Rows that
        # are shorter than the header are left-padded with "-".
        values = []
        for tr in tb.find_all("tr")[1:]:
            row = [td.text for td in tr.find_all("td")]
            missing = len(cols)-len(row)
            if missing > 0:
                row = ["-"]*missing+row
            values.append(row)
        if not values:
            continue

        max_col = max(len(row) for row in values)
        if len(cols) < max_col:
            # More data cells than headers: fall back to default column names.
            df_t = pd.DataFrame(values)
        else:
            df_t = pd.DataFrame(values,columns=cols)
        # Label and caption are stored on the first row only.
        df_t["Label"]=""
        df_t.loc[0,"Label"]=label
        df_t["Title"]=""
        df_t.loc[0,"Title"]=title

        os.makedirs(out_dir, exist_ok=True)
        prefix = os.path.join(out_dir, pmc_file+"_tables_")
        try:
            df_t.to_excel(prefix+str(label)+".xlsx",index=False)
        except Exception:
            # Deliberate best effort: a label may not be usable as a file
            # name, so retry with the table's position instead.
            df_t.to_excel(prefix+str(position)+".xlsx",index=False)

        Label.append(label)
        Title.append(title)

    if not Label:
        return ""
    return pd.DataFrame({"Label": Label, "Caption": Title, "PMCID": pmc_file})

In [7]:
# Parse the reference list
def parse_ref_list(soup,pmc_file,Work_Path):
    """Collect every ``<ref>`` of every ``<ref-list>`` and export them.

    Each bibliographic field is looked up on its own, so one missing element
    (for example ``<issue>``) leaves only that field empty instead of
    blanking the neighbouring fields or the whole reference.

    Args:
        soup: BeautifulSoup node searched for ``<ref-list>`` elements.
        pmc_file: Article identifier; fills the PMCID column and names the
            output folder and file.
        Work_Path: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns PMCID, Ref_ID, Ref_PubID, Title, Authors and
        Ref_citation, also written to
        ``../ResultPMC/<pmc_file>/<pmc_file>_References.xlsx``; the empty
        string when no reference is found.
    """
    def _text(node, tag):
        # Stripped text of the first <tag> below node, or "" when absent.
        found = node.find(tag)
        return found.text.strip() if found is not None else ""

    rows = []
    for ref_list in soup.find_all("ref-list"):
        for position, ref in enumerate(ref_list.find_all("ref"), start=1):
            # Fall back to the 1-based position inside this list when the
            # reference carries no id attribute.
            ref_id = ref.get("id")
            ref_id = ref_id.strip() if ref_id is not None else position

            authors = []
            for name in ref.find_all("name"):
                full_name = (_text(name, "surname")+" "+_text(name, "given-names")).strip()
                if full_name:
                    authors.append(full_name)

            title = _text(ref, "article-title")
            pub_id = "".join(
                p.get("pub-id-type", "")+" : "+p.text.strip()+"; "
                for p in ref.find_all("pub-id")
            )
            citation = (
                ", ".join(authors)+". "+title+". "+_text(ref, "source")+". "
                +_text(ref, "year")+";"+_text(ref, "volume")
                +"("+_text(ref, "issue")+"):"
                +_text(ref, "fpage")+"-"+_text(ref, "lpage")+"."+pub_id
            )
            rows.append({
                "PMCID": pmc_file,
                "Ref_ID": ref_id,
                "Ref_PubID": pub_id,
                "Title": title,
                "Authors": "; ".join(authors),
                "Ref_citation": citation,
            })

    if not rows:
        return ""

    df = pd.DataFrame(rows, columns=["PMCID","Ref_ID","Ref_PubID","Title","Authors","Ref_citation"])
    # Portable path; create the per-article folder if it is not there yet.
    out_dir = os.path.join("..", "ResultPMC", pmc_file)
    os.makedirs(out_dir, exist_ok=True)
    df.to_excel(os.path.join(out_dir, pmc_file+"_References.xlsx"),index=False)
    return df

In [8]:
def get_file_data(PMC_file,Taday_date):
    """Parse every XML file of one download batch and export the metadata.

    For each ``<article>`` that has an ``<article-meta>``, the body, figures,
    tables and references are exported by the ``parse_*`` helpers and the
    article metadata is collected into one summary table.

    Args:
        PMC_file: Name of the batch folder below ``E:\\PMC\\``; also part of
            the summary file name.
        Taday_date: Date stamp used as prefix of the summary file name.

    Returns:
        DataFrame with one row per article and the columns PMCID, IDs,
        Paper_Type, Title, Authors, Author_Infor, PubDate, Abstract,
        Keywords, Journal_Ids, Journal_Title, ISSN and IF. The same table is
        written to ``../Result/``.

    Note:
        ``IF_dic`` is read as a module-level name that is not defined in this
        function; presumably it maps upper-cased journal titles to impact
        factors (confirm against its definition). Journals missing from it
        get -1.
    """
    Work_Path ="E:\\PMC\\"+PMC_file+"\\"
    columns = ["PMCID","IDs","Paper_Type","Title","Authors","Author_Infor","PubDate",
               "Abstract","Keywords","Journal_Ids","Journal_Title","ISSN"]
    records = []

    for file in os.listdir(Work_Path):
        if ".xml" not in file:
            continue

        # Read the XML file; lines are joined with a space as before so the
        # extracted text stays unchanged.
        PMC_xml = os.path.join(Work_Path, file)
        try:
            with open(PMC_xml,encoding="utf-8") as f:
                lines = " ".join(f.readlines())
        except (OSError, UnicodeDecodeError) as e:
            print("Skipping unreadable file:", PMC_xml, e)
            continue

        BS = BeautifulSoup(lines,"lxml")
        for body in BS.find_all("article"):
            article_meta = body.find_all("article-meta")
            if not article_meta:
                continue

            # The PMC accession number identifies the article and names its
            # output folder.
            pmc_file = ""
            for art_id in article_meta[0].find_all("article-id"):
                if art_id.get("pub-id-type") == "pmc":
                    pmc_file = "PMC"+art_id.text.strip()

            # The parse_* helpers write into this folder.
            os.makedirs(os.path.join("..", "ResultPMC", pmc_file), exist_ok=True)
            parse_body(body,pmc_file,Work_Path)
            parse_figure(body,pmc_file,Work_Path)
            parse_table(body,pmc_file,Work_Path)
            parse_ref_list(body,pmc_file,Work_Path)

            # Article metadata: one summary row per article.
            records.append((pmc_file,)+tuple(parse_article_meta(body,pmc_file)))

    Paper_Infor = pd.DataFrame(records, columns=columns)
    Paper_Infor["IF"]=Paper_Infor["Journal_Title"].apply(lambda x:IF_dic[str(x).upper()] if str(x).upper() in IF_dic else -1)
    print("解析文章数目：",Paper_Infor.shape)
    Paper_Infor.to_excel("../Result/"+Taday_date+"_"+PMC_file+"_PMC_Articals_Paper_Infor.xlsx",index=False)
    return Paper_Infor

In [11]:
# Number of exports performed so far; starts at zero.
Round = 0
# Folder that holds the downloaded PMC XML files to parse.
PMC_file = "../test_sample/"

In [13]:
Work_Path =PMC_file # Folder with the XML files downloaded from the PMC search
PMC =[]
IDs =[]
Paper_Type =[]
Title =[]
Authors =[]
Author_Infor =[]
PubDate =[]
Abstract =[]
Keywords =[]
Journal_Ids=[]
Journal_Title =[]
Journal_ISSN=[]
count =0

files= os.listdir(Work_Path)
for file in files:
    if not (file.endswith(".xml") and file.startswith("pmc")):
        continue
    print(file)
    # Read the XML file; os.path.join keeps the path valid on every platform.
    PMC_xml = os.path.join(Work_Path, file)
    try:
        with open(PMC_xml,encoding="utf-8") as f:
            lines = " ".join(f.readlines())
    except (OSError, UnicodeDecodeError) as e:
        print("Skipping unreadable file:", PMC_xml, e)
        continue

    # Parse the file content; one file may hold many <article> elements.
    BS = BeautifulSoup(lines,"lxml")
    article_list = BS.find_all("article")
    print(len(article_list))
    for article in article_list:
        article_meta = article.find_all("article-meta")
        if not article_meta:
            continue

        # 1) Article identifiers
        pmcid=""
        pmid=""
        doi=""
        for art_id in article_meta[0].find_all("article-id"):
            id_type = art_id.get("pub-id-type")
            if id_type=="pmc":
                pmcid="PMC"+art_id.text.strip()
            elif id_type=="pmid":
                pmid="PMID: "+art_id.text.strip()
            elif id_type=="doi":
                doi="DOI: "+art_id.text.strip()

        # Per-article folder next to the XML files (as before), plus the
        # folder the parse_* helpers actually write into. exist_ok avoids
        # listing the directory for every article and tolerates reruns.
        os.makedirs(os.path.join(Work_Path, pmcid), exist_ok=True)
        os.makedirs(os.path.join("..", "ResultPMC", pmcid), exist_ok=True)

        para_df=parse_body(article,pmcid,Work_Path)
        fig_df=parse_figure(article,pmcid,Work_Path)
        tb_df=parse_table(article,pmcid,Work_Path)
        ref_df=parse_ref_list(article,pmcid,Work_Path)
        Ids,paper_type,title,authors,author_infor,pubdate,abstract,keywords,jour_Ids,journal_title,issn_number = parse_article_meta(article,pmcid)

        # Store the metadata of this article
        PMC.append(pmcid)
        IDs.append(Ids)
        Paper_Type.append(paper_type)
        Title.append(title)
        Authors.append(authors)
        Author_Infor.append(author_infor)
        PubDate.append(pubdate)
        Abstract.append(abstract)
        Keywords.append(keywords)
        Journal_Ids.append(jour_Ids)
        Journal_Title.append(journal_title)
        Journal_ISSN.append(issn_number)

# Building the frame from a dict also works when no article was parsed.
Paper_Infor = pd.DataFrame({
    "PMCID": PMC,
    "IDs": IDs,
    "Paper_Type": Paper_Type,
    "Title": Title,
    "Authors": Authors,
    "Author_Infor": Author_Infor,
    "PubDate": PubDate,
    "Abstract": Abstract,
    "Keywords": Keywords,
    "Journal_Ids": Journal_Ids,
    "Journal_Title": Journal_Title,
    "ISSN": Journal_ISSN,
})
#Paper_Infor["IF"]=Paper_Infor["Journal_Title"].apply(lambda x:IF_dic[str(x).upper()] if str(x).upper() in IF_dic else -1)
print("解析文章数目：",Paper_Infor.shape)      

pmc_result.xml
131
解析文章数目： (131, 12)


In [14]:
def get_pmid(ids):
    """Return the PubMed id from a "; "-separated id string.

    The first entry that starts with "pmid:" is used and the text between
    its first and second colon is returned; "" when there is no such entry.
    """
    candidates = (
        entry.split(":")[1]
        for entry in ids.split("; ")
        if entry.startswith("pmid:")
    )
    return next(candidates, "")

In [15]:
# Derive a PMID column from the combined id string of each article.
Paper_Infor["PMID"] = Paper_Infor["IDs"].map(get_pmid)

In [17]:
# Count this export, then write the article summary to a dated workbook.
Round = Round + 1
Paper_Infor.to_excel(f"../Result/{Taday_date}_PMC_Articals_Paper_Infor.xlsx", index=False)