# Ask PubMed and PubMed Central

Clemence SEBE

In [None]:
import xml.etree.ElementTree as et 
import entrezpy.conduit
import datetime
import time
import sys
import os

# Keep a handle on the notebook's original stdout so it can be restored after
# the query cells temporarily point sys.stdout at an output file.
normal = sys.stdout

In [None]:
# Switch for the query cells: when True, each query is sent again and its XML
# result file is (re)written; when False, the query cells only set the file
# name and the parsing cells expect the XML files to exist already.
boolean = True 

In [None]:
# Shared entrezpy conduit used to build and run every query pipeline below.
# NOTE(review): 'email' looks like a placeholder for the contact address
# passed to Conduit — confirm a real address is supplied before running.
c = entrezpy.conduit.Conduit('email')

In [None]:
# Build a day_month_year label (no zero padding) for today's run, then create
# the matching directory and move into it so every output file lands there.
today = datetime.date.today()
day, month, year = today.day, today.month, today.year
date = '_'.join(str(field) for field in (day, month, year))
print(date)
os.makedirs(date, exist_ok=True)
os.chdir(date)

# Recherche globale : 

## <span style='color:red'> Query 1 : </span> Pubmed : 

$\rightarrow$ nextflow or snakemake

### Requête

In [None]:
# Query 1: search PubMed for 'nextflow or snakemake' and save the fetched XML
# records to a dated file that the analysis cells parse afterwards.
name1 = 'q1_' + date + ".xml"
if boolean:
    # The fetched records are captured by pointing sys.stdout at the output
    # file while the pipeline runs (presumably entrezpy prints the fetch
    # result to stdout by default — confirm against the entrezpy docs).
    with open(name1, 'w') as res:
        sys.stdout = res
        try:
            extract_article = c.new_pipeline()
            sid = extract_article.add_search({'db' : 'pubmed',
                                      'term' : 'nextflow or snakemake',
                                      'rettype' : 'uilist'})

            fid = extract_article.add_fetch({'retmode' : 'xml','rettype' : 'abstract' }, dependency=sid)
            pipeline = c.run(extract_article)
        finally:
            # Always hand stdout back to the notebook, even when the request
            # fails; otherwise every later print would end up in the XML file.
            sys.stdout = normal
    # Short pause before the next cell runs (presumably to space out the
    # requests sent to NCBI — confirm).
    time.sleep(3)

### Analyse

In [None]:
# Show the directory the XML is read from, then load the query 1 result file
# and keep the root's direct children: one element per returned record.
print(os.getcwd())
file1 = et.parse(name1)
pubmedXml = file1.getroot()
articles = [*pubmedXml]

In [None]:
# Report how many records the first query returned.
print(f"Nombre de résultats avec la première requete : {len(articles)}")

Extraction of the title, abstract and keywords for each article found :

In [None]:
def extractText(part, sup=False):
    """Return the text carried by an XML element and its descendants.

    Args:
        part: the ``xml.etree.ElementTree`` element to flatten.
        sup: when True, the element's content is wrapped as `` [content]``
            (used for ``<sup>`` children).

    Returns:
        The concatenated text. ``fig``, ``table`` and ``table-wrap`` elements
        yield an empty string (their tail text is dropped as well). A
        ``list-item`` yields one ``'** ...'`` line per child other than
        ``label`` and does not include its own tail text.
    """
    tag = part.tag

    # Figures and tables are skipped entirely.
    if tag in ('fig', 'table', 'table-wrap'):
        return ''

    # List items: one bullet line per child, ignoring the item label.
    if tag == 'list-item':
        return ''.join(
            '** ' + extractText(child) + '\n'
            for child in part
            if child.tag != 'label'
        )

    out = ' [' if sup else ''

    lead = part.text
    if lead is not None:
        # Leaf text is kept verbatim; text followed by child elements is
        # trimmed and separated from the first child by a single space.
        out += lead if len(part) == 0 else lead.strip() + ' '

    for child in part:
        kind = child.tag
        if kind == 'xref':
            # Cross-references are glued to the preceding text.
            out = out.strip()
        elif kind == 'ext-link':
            # External links are separated from it by exactly one space.
            out = out.strip() + ' '

        is_sup = kind == 'sup'
        if is_sup:
            out = out.strip()
        out += extractText(child, is_sup)

    if sup:
        out += ']'

    # Tail text (what follows the closing tag) is kept unless it is only
    # whitespace.
    trailing = part.tail
    if trailing is not None and trailing.strip():
        out += trailing

    return out

In [None]:
# For every record returned by query 1, store its PMID and a text made of the
# title, the abstract (when present) and the keywords (when present).
idPubmed = []      # PMIDs, in result order
abPubmedAll = {}   # PMID -> title + abstract + keywords text
for article in articles:
    medlineCitation = article.find('MedlineCitation')
    pmid = medlineCitation.find('PMID')
    idPubmed.append(pmid.text)

    art = medlineCitation.find('Article')
    title = art.find('ArticleTitle')
    abstract = art.find('Abstract')

    titre = extractText(title)
    txt = titre + '\n\n'

    # find() returns None when the element is absent; test for that
    # explicitly instead of hiding every possible error behind a bare except.
    if abstract is not None:
        ab = extractText(abstract)
        txt += ab + '\n\n'

    keyword = medlineCitation.find('KeywordList')
    if keyword is not None:
        for k in keyword:
            txt += extractText(k) + ' '

    abPubmedAll[pmid.text] = txt

## <span style='color:red'> Query 2 : </span> Pubmed : 

$\rightarrow$ ((nextflow[Title/Abstract]) OR (snakemake[Title/Abstract])) AND (github[Title/Abstract])

In [None]:
# Query 2: search PubMed for nextflow/snakemake AND github, all restricted to
# title/abstract, and save the fetched XML records to a dated file.
name2 = 'q2_' + date + ".xml"
if boolean:
    # The fetched records are captured by pointing sys.stdout at the output
    # file while the pipeline runs (presumably entrezpy prints the fetch
    # result to stdout by default — confirm against the entrezpy docs).
    with open(name2, 'w') as res:
        sys.stdout = res
        try:
            extract_article = c.new_pipeline()
            sid = extract_article.add_search({'db' : 'pubmed',
                                      'term' : '((nextflow[Title/Abstract]) OR (snakemake[Title/Abstract])) AND (github[Title/Abstract])',
                                      'rettype' : 'uilist'})

            fid = extract_article.add_fetch({'retmode' : 'xml','rettype' : 'abstract' }, dependency=sid)
            pipeline = c.run(extract_article)
        finally:
            # Always hand stdout back to the notebook, even when the request
            # fails; otherwise every later print would end up in the XML file.
            sys.stdout = normal
    # Short pause before the next cell runs (presumably to space out the
    # requests sent to NCBI — confirm).
    time.sleep(3)

In [None]:
# Load the query 2 result file and keep the root's direct children: one
# element per returned record.
file2 = et.parse(name2)
pubmedXml = file2.getroot()
articles = [*pubmedXml]

In [None]:
# Report how many records the second query returned.
print(f"Number of results with second query : {len(articles)}")

In [None]:
# PMIDs of the records returned by query 2, in result order.
idPubmed2 = []
for article in articles:
    pmid = article.find('MedlineCitation').find('PMID')
    idPubmed2.append(pmid.text)

In [None]:
# Sanity check: flag every PMID from query 2 that query 1 did not return.
for idx in idPubmed2:
    if idx in idPubmed:
        continue
    print('Strange')

## <span style='color:red'> Query 3 : </span> PMC : 

$\rightarrow$ (nextflow[Abstract] OR snakemake[Abstract]) AND github[All Fields] 

In [None]:
# Query 3: search PMC for nextflow/snakemake in the abstract AND github in any
# field, and save the fetched XML records to a dated file.
name3 = 'q3_' + date + ".xml"
if boolean:
    # The fetched records are captured by pointing sys.stdout at the output
    # file while the pipeline runs (presumably entrezpy prints the fetch
    # result to stdout by default — confirm against the entrezpy docs).
    with open(name3, 'w') as res:
        sys.stdout = res
        try:
            extract_article = c.new_pipeline()
            sid = extract_article.add_search({'db' : 'pmc',
                                      'term' : '(nextflow[Abstract] OR snakemake[Abstract]) AND github[All Fields]',
                                      'rettype' : 'uilist'})

            fid = extract_article.add_fetch({'retmode' : 'xml','rettype' : 'abstract' }, dependency=sid)
            pipeline = c.run(extract_article)
        finally:
            # Always hand stdout back to the notebook, even when the request
            # fails; otherwise every later print would end up in the XML file.
            sys.stdout = normal
    # Short pause before the next cell runs (presumably to space out the
    # requests sent to NCBI — confirm).
    time.sleep(3)

### Analyse

In [None]:
# Load the query 3 (PMC) result file and keep the root's direct children: one
# element per returned record.
file3 = et.parse(name3)
pubmedXml = file3.getroot()
articles = [*pubmedXml]

In [None]:
# Report how many records the third query returned.
print(f"Number of results with the third request : {len(articles)}")

Extraction of Pubmed IDs from the various PMC IDs:

In [None]:
# Map every PMC record returned by query 3 to its PubMed id.
linkId = {}   # 'PMC<id>' -> PMID
noOk = 0      # records that could not be linked (inaccessible, or id missing)
for article in articles:
    # 'Reply' elements are reported as inaccessible records and only counted.
    if article.tag == 'Reply':
        print("Can't acces to this article : ",article.attrib)
        noOk += 1
        continue

    articleMeta = article.find("front/article-meta")
    ids = articleMeta.findall("article-id")

    # Index the record's identifiers by their 'pub-id-type' attribute.
    temp = {}
    for idx in ids:
        temp[idx.attrib['pub-id-type']] = idx.text

    # Look both ids up explicitly rather than relying on a bare except: the
    # previous handler read temp['pmc'] itself and therefore crashed with a
    # KeyError whenever the pmc id was the one missing.
    pmc_id = temp.get('pmc')
    pubmed_id = temp.get('pmid')
    if pmc_id is not None and pubmed_id is not None:
        linkId['PMC' + pmc_id] = pubmed_id
    else:
        print('This article have a problem PMC Id : {} -> no pmid'.format(pmc_id))
        noOk += 1

We check whether the ids found in query 3 are included in query 1:

In [None]:
# Count, and list, the PMC records whose PMID was not returned by query 1.
nb = 0
for idx, pmid in linkId.items():
    if pmid in idPubmed:
        continue
    print(idx, pmid)
    nb += 1

# Only the records that could be linked to a PMID are taken into account.
tot = len(articles) - noOk
print(f"\n{nb} out of {tot} from PMC is not in my results for query 1 on PUBMED")

We check whether the ids found in query 3 are included in query 2:

In [None]:
# Count the PMC records whose PMID was not returned by query 2 (the ids are
# not listed here, only counted).
nb = 0
for idx, pmid in linkId.items():
    if pmid in idPubmed2:
        continue
    nb += 1

# Only the records that could be linked to a PMID are taken into account.
tot = len(articles) - noOk
print(f"\n{nb} out of {tot} from PMC is not in my results for query 2 on PUBMED")