## PunMed Data Parsing

In [1]:
import csv
import numpy as np
from xml.etree.ElementTree import parse

In [2]:
tree = parse('pubmed21n0909.xml')
root = tree.getroot()

DateClass = ['Year', 'Month', 'Day']
NameClass = ['LastName', 'ForeName', 'Initials']

In [3]:
PubmedArticle = root.findall("PubmedArticle")

MedlineCitation = sum([x.findall("MedlineCitation") for x in PubmedArticle], [])

### Article

In [4]:
Article = sum([x.findall("Article") for x in MedlineCitation], [])

TempDate = []
PubMedPubDate = []
ArticleTitle = []
AbstractText = []
TempText = ""

# PMID (PK)
PMID = [x.findtext("PMID") for x in MedlineCitation]

# PubMedPubDate
PubMedPubDateTag = sum([x.findall("./PubmedData/History/PubMedPubDate[@PubStatus='pubmed']") for x in PubmedArticle], [])
for x in DateClass:
    TempDate.append([y.findtext(x) for y in PubMedPubDateTag])
for x in zip(*TempDate):
    PubMedPubDate.append(x[0]+'-'+x[1]+'-'+x[2])

# ArticleTitle
ArticleTitleTag = sum([x.findall("ArticleTitle") for x in Article], [])
for tag in ArticleTitleTag:
    for t in tag.itertext():
        TempText += t
    ArticleTitle.append(TempText)
    TempText = ""
    
# AbstractText
AbstractTextTag = [x.findall("./Abstract/AbstractText") for x in Article]
for i in range(len(AbstractTextTag)):
    for tag in AbstractTextTag[i]:
        for t in tag.itertext():
            TempText += t
    AbstractText.append(TempText)
    TempText = ""

### Journal

In [5]:
Journal = sum([x.findall("Journal") for x in Article], [])
JournalIssue = sum([x.findall("JournalIssue") for x in Journal], [])

# ISSN (PK)
ISSN = [x.findtext("ISSN") for x in Journal]
ISSN = sum([["\\n" if issn is None else issn] for issn in ISSN], [])

# # Volume
# Volume = [x.findtext("Volume") for x in JournalIssue]

# # PubDate
# PubDateTag = sum([x.findall("PubDate") for x in JournalIssue], [])
# PubDate = [x.findtext("Year") for x in PubDateTag]

# Title
Title = [x.findtext("Title") for x in Journal]

# # ISOAbbreviation
# ISOAbbreviation = [x.findtext("ISOAbbreviation") for x in Journal]

# Country
MedlineJournalInfo = sum([x.findall("MedlineJournalInfo") for x in MedlineCitation], [])
Country = [x.findtext("Country") for x in MedlineJournalInfo]

### Author

In [31]:
Author = [x.findall("./AuthorList/Author") for x in Article]

TempName = []
TempNameList = [] # 논문별 구분을 위함
AuthorName = []

# AuthorName (PK)
for i in range(len(Author)):
    for x in NameClass:
        TempName.append([y.findtext(x) for y in Author[i]])
    for x in zip(*TempName):
        varName = ""
        for j in range(len(NameClass)):
            if x[j] is None:
                continue
            else:
                varName += x[j]
                if j != 2:
                    varName += " "
        if varName == "":
            varName = "\\n"
        TempNameList.append(varName)
    AuthorName.append(TempNameList)
    TempName = []
    TempNameList = []
# AuthorName = sum(AuthorName, []) # 논문별로 구분하려면 이 줄 주석처리

In [32]:
# Affiliation
# 대표 1개 Affiliation만 추출
TempAffiList = [] # 논문별 구분을 위함
Affiliation = []
for i in range(len(Author)):
#     AffiliationInfo = [x.findall("AffiliationInfo") for x in Author[i]]
    for j in range(len(Author[i])):
        AffiliationInfo = Author[i][j].find('AffiliationInfo')
        if AffiliationInfo is None:
            TempAffiList.append('\\n')
        else:
            TempAffiList.append(AffiliationInfo.findtext("Affiliation"))
    Affiliation.append(TempAffiList)
    TempAffiList = []
# Affiliation = sum(Affiliation, []) # 논문별로 구분하려면 이 줄 주석처리

### 모든 Affiliation 추출
# TempAffiList = []
# Temp = [] # 저자별 구분을 위함
# Affiliation = []
# for i in range(len(Author)):
#     AffiliationInfo = [x.findall("AffiliationInfo") for x in Author[i]]
#     for j in range(len(Author[i])):
#         for k in range(len(AffiliationInfo[j])):
#             Temp.append(AffiliationInfo[j][k].findtext("Affiliation"))
#         TempAffiList.append(Temp)
#         Temp = []
#     Affiliation.append(TempAffiList)
#     TempAffiList = []

### Test

In [8]:
print('------Article Test------')
print('* PMID(PK)      : ', PMID[:3])
print('* PubMedPubDate : ', PubMedPubDate[:3])
print('* ArticleTitle  : ', ArticleTitle[:3])
print('* AbstractText  : ', AbstractText[0])
print('')
print('-----Journal Test------')
print('* ISSN(PK)      : ', ISSN[:3])
print('* Title         : ', Title[:3])
print('* Country       : ', Country[:3])
print('')
print('------Author Test------')
print('* AuthorName(PK): ', AuthorName[:3])
print('* Affiliation   : ', Affiliation[:3])

------Article Test------
* PMID(PK)      :  ['28588640', '28588641', '28588642']
* PubMedPubDate :  ['2017-6-8', '2017-6-8', '2017-6-8']
* ArticleTitle  :  ['Compounds from Cynomorium songaricum with Estrogenic and Androgenic Activities Suppress the Oestrogen/Androgen-Induced BPH Process.', 'The Significant Pathways and Genes Underlying the Colon Cancer Treatment by the Traditional Chinese Medicine PHY906.', 'The Preventive Control of Zoonotic Visceral Leishmaniasis: Efficacy and Economic Evaluation.']
* AbstractText  :  To investigate the phytoestrogenic and phytoandrogenic activities of compounds isolated from CS and uncover the role of CS in prevention of oestrogen/androgen-induced BPH.Cells were treated with CS compounds, and immunofluorescence assay was performed to detect the nuclear translocation of ERα or AR in MCF-7 or LNCaP cells; luciferase reporter assay was performed to detect ERs or AR transcriptional activity in HeLa or AD293 cells; MTT assay was performed to detect the 

### CSV 파일 만들기

In [14]:
ANameInput = sum(AuthorName, [])
AffInput = sum(Affiliation, [])

In [14]:
# SQL 삽입에 방해되는 특수문자들 제거
ANameInput = [AffInput[i].replace(',', "") for i in range(len(AffInput))]
AffInput = [AffInput[i].replace('"', "") for i in range(len(AffInput))]
AffInput = [AffInput[i].replace(',', "") for i in range(len(AffInput))]
AffInput = [AffInput[i].replace('\n', "") for i in range(len(AffInput))]
ArticleTitle = [ArticleTitle[i].replace('"', "") for i in range(len(ArticleTitle))]
ArticleTitle = [ArticleTitle[i].replace(',', "") for i in range(len(ArticleTitle))]
AbstractText = [AbstractText[i].replace('"', "") for i in range(len(AbstractText))]
AbstractText = [AbstractText[i].replace(',', "") for i in range(len(AbstractText))]
AbstractText = [AbstractText[i].replace('\n', "") for i in range(len(AbstractText))]

In [15]:
ArticleListTemp = []
ArticleListTemp.extend([PMID, PubMedPubDate, ArticleTitle, AbstractText])
ArticleList = np.array(list(zip(*ArticleListTemp)))

with open('Article.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['PMID', 'PubMedPubDate', 'ArticleTitle', 'AbstractText'])
    for i in range(len(ArticleList)):
        writer.writerow(ArticleList[i])

In [17]:
JournalListTemp = []
JournalListTemp.extend([ISSN, Title, Country])
JournalList = np.array(list(set(zip(*JournalListTemp)))) # set: 중복 제거
# JournalListUnique = JournalList[np.unique(JournalList[:, 0], return_index=True)[1]] # ISSN 기준 중복 저널 제거

with open('Journal.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['ISSN', 'Title', 'Country'])
    for i in range(len(JournalList)):
        writer.writerow(JournalList[i])

In [48]:
AuthorListTemp = []
AuthorListTemp.extend([ANameInput, AffInput])
AuthorList = np.array(list(set(zip(*AuthorListTemp))))
# AuthorListUnique = AuthorList[np.unique(AuthorList[:, 0], return_index=True)[1]] # AuthorName 기준으로 중복 제거

with open('Author.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['ID', 'AuthorName', 'Affiliation'])
    for i in range(len(AuthorList)):
        writer.writerow(np.append(i+1, AuthorList[i]))

In [136]:
# Write: Author - Article
PMIDList = []
WriteListTemp = []
for i in range(len(AuthorName)): 
    PMIDList.append([PMID[i]]*len(AuthorName[i]))
PMIDList = sum(PMIDList, [])
WriteListTemp.extend([ANameInput, PMIDList])
WriteList = np.array(list(set(zip(*WriteListTemp))))
WriteList = np.delete(WriteList, np.where('\\n' == WriteList.T[0]), 0) # AuthorName이 '\n'인 행 삭제

with open('Write.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['AuthorName', 'PMID'])
    for i in range(len(WriteList)):
        writer.writerow(WriteList[i])

In [137]:
# Submit: Author - Journal
ISSNList = []
SubmitListTemp = []
for i in range(len(AuthorName)):
    ISSNList.append([ISSN[i]]*len(AuthorName[i]))
ISSNList = sum(ISSNList, [])
SubmitListTemp.extend([ANameInput, ISSNList])
SubmitList = np.array(list(set(zip(*SubmitListTemp))))
SubmitList = np.delete(SubmitList, np.where('\\n' == SubmitList.T[0]), 0) # AuthorName이 '\n'인 행 삭제
SubmitList = np.delete(SubmitList, np.where('\\n' == SubmitList.T[1]), 0) # ISSN이 '\n'인 행 삭제

with open('Submit.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['AuthorName', 'ISSN'])
    for i in range(len(SubmitList)):
        writer.writerow(SubmitList[i])

In [111]:
# Include: Article - Journal
IncludeList = []
IncludeList.extend([PMID, ISSN])
IncludeList = np.array(list(zip(*IncludeList)))

with open('Include.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['PMID', 'ISSN'])
    for i in range(len(IncludeList)):
        writer.writerow(IncludeList[i])

In [46]:
print(len(max(PMID, key=len)))
print(len(max(PubMedPubDate, key=len)))
print(len(max(ArticleTitle, key=len)))
print(len(max(AbstractText, key=len)))
print('')
print(len(max(ISSN, key=len)))
print(len(max(Title, key=len)))
print(len(max(Country, key=len)))
print('')
print(len(max(ANameInput, key=len)))
print(len(max(AffInput, key=len)))

8
10
582
6705

9
232
25

54
4685


In [20]:
print(len(ArticleList))
print(len(JournalList))
print(len(AuthorList))

30000
3943
165464
