# Extraction and creation of txt files containing articles
Clemence SEBE

In [None]:
import xml.etree.ElementTree as et 
import pandas as pd
import numpy as np
import re
import os

## Extracting PMC items from the xml file

In [None]:
# Parse the PMC XML export and keep its root element; the main loop further
# down iterates over the children of `pmc`, handling each child as one article.
pmcAll = et.parse('14_7_2023/q3_14_7_2023.xml')
pmc = pmcAll.getroot()

Helper functions for extracting information and text from the XML:

In [None]:
def extractText(part, sup=False):
    """Flatten an XML element and its descendants into plain text.

    Figures and tables ('fig', 'table', 'table-wrap') contribute nothing,
    not even their tail text. Each non-label child of a 'list-item' is
    rendered on its own line prefixed with '** '. The content of a 'sup'
    child is wrapped as ' [...]' and glued to the text that precedes it.

    Args:
        part: the ElementTree element to convert.
        sup: True when `part` is a 'sup' element being rendered for its
            parent; its content is then enclosed in square brackets.

    Returns:
        The extracted text, followed by the element's tail unless that tail
        is missing or whitespace-only.
    """
    tag = part.tag
    if tag in ('fig', 'table', 'table-wrap'):
        return ''

    if tag == 'list-item':
        # One bullet line per child; the item's label and tail are dropped.
        return ''.join(
            '** ' + extractText(child) + '\n'
            for child in part
            if child.tag != 'label'
        )

    out = ' [' if sup else ''

    lead = part.text
    if lead is not None:
        # Leaf text is kept verbatim; text followed by children is trimmed
        # and re-spaced so the children attach after exactly one blank.
        out += lead if len(part) == 0 else lead.strip() + " "

    for child in part:
        kind = child.tag
        if kind == 'xref':
            # Cross-references are attached directly to the preceding word.
            out = out.strip()
        elif kind == 'ext-link':
            # External links are separated from the preceding text by one space.
            out = out.strip() + " "
        elif kind == 'sup':
            out = out.strip()
        out += extractText(child, kind == 'sup')

    if sup:
        out += ']'

    trailing = part.tail
    if trailing is not None and trailing.strip():
        out += trailing

    return out

In [None]:
def extracTitle(body, tab, idx):
    """Collect the titles of all nested 'sec' elements, depth-first.

    Args:
        body: element whose direct 'sec' children are visited.
        tab: list that receives one [depth, title text] pair per titled
            section; it is extended in place.
        idx: depth assigned to the direct 'sec' children of `body`.

    Returns:
        The same `tab` list, after it has been extended.
    """
    for child_sec in body.findall('sec'):
        heading = child_sec.find('title')
        # `is not None` matters: an Element without children is falsy.
        if heading is not None:
            tab.append([idx, extractText(heading)])
        # Untitled sections still count as one nesting level.
        extracTitle(child_sec, tab, idx + 1)
    return tab

In [None]:
def whichSection(titres, wordsAccepted):
    """Select the top-level section titles that match an accepted word.

    A title matches when its lower-cased text contains one of the accepted
    fragments. A matching depth-0 title is selected itself; a matching
    nested title selects the nearest depth-0 title that precedes it.

    Args:
        titres: sequence of entries whose first item is the depth and whose
            last item is the title text.
        wordsAccepted: iterable of lower-case fragments to look for.

    Returns:
        List of selected depth-0 titles, without duplicates, in the order
        in which they were first selected.
    """
    accepted = []
    for pos, entry in enumerate(titres):
        lowered = entry[-1].lower()
        for fragment in wordsAccepted:
            if fragment not in lowered:
                continue
            # Walk backwards from the matching title (itself included) to
            # the closest top-level title.
            for back in range(pos, -1, -1):
                candidate = titres[back]
                if candidate[0] == 0:
                    if candidate[-1] not in accepted:
                        accepted.append(candidate[-1])
                    break
    return accepted

In [None]:
def extractTextSection(file, section):
    """Write the text of a section's children to an open text file.

    Titles are written indented with a tab and surrounded by blank lines,
    paragraphs and footnotes ('p', 'fn') on one line each, lists as
    produced by extractText. Container elements ('fn-group', 'notes',
    'sec') are descended into recursively; any other tag is ignored.

    Args:
        file: object with a write(str) method.
        section: element whose direct children are written.
    """
    for node in section:
        kind = node.tag
        if kind in ('fn-group', 'notes', 'sec'):
            extractTextSection(file, node)
        elif kind == 'title':
            file.write('\n\n\t' + extractText(node) + '\n\n')
        elif kind in ('p', 'fn'):
            file.write(extractText(node) + '\n')
        elif kind == 'list':
            file.write(extractText(node))

"Main" creating the various txt files that make up the corpus:

In [None]:
# Build the corpus. For every article under `pmc` this cell:
#   * writes 'article/all/<PMID|PMC><id>.all.txt' holding the title, the
#     abstract(s) and, when a <body> exists, the full text plus the <back>;
#   * when a <body> exists, writes 'article/<idx>_<PMID|PMC><id>.txt' with only
#     the top-level sections selected by whichSection, logs the section
#     titles to 'sectionTitle.txt' and collects the github.com links;
#   * appends one row of metadata to `df`.

# Lower-case fragments looked for in the section titles (see whichSection).
wordsAccepted = ['implementation', 'material', 'method', 'operation', 'pipeline', 'workflow', 'tool', '\u2003']
access = 0      # number of articles that have a <body>
nonAccess = 0   # number of articles that do not
dicoArticle = {}
idx = 1

# makedirs creates both levels and tolerates directories that already exist.
# The former pair of mkdir calls under a bare except never created
# 'article/all' when 'article' was already present.
os.makedirs("article/all", exist_ok=True)


titre = []

# Raw string: same pattern as before, without invalid-escape warnings.
paternGit = r'(https:\/\/)?(www\.)?github\.com\/(\w|\/|-|_)*'

df = pd.DataFrame(columns=['id','titre', 'pmid', 'pmc', 'doi', 'XMLaccess', 'git', 'language'])

with open("sectionTitle.txt", 'w') as titleSection:
    for article in pmc:
        print(str(idx) + "/" + str(len(pmc)))
        front = article.find('front')
        body = article.find('body')

        # Extract global information on the article.
        articleMeta = front.find('article-meta')
        info = {}
        dicoSubArticle = {idx: info}
        for idA in articleMeta.findall('article-id'):
            idType = idA.attrib['pub-id-type']
            if idType not in info:
                info[idType] = idA.text
            else:
                print('NORMALLY NO - NOT TWO ID DIFFERENT')

        info['title'] = [
            extractText(child)
            for child in articleMeta.find('title-group')
            if 'title' in child.tag
        ]
        mainTitle = info['title'][0]

        # File names are based on the PMID when there is one, else on the
        # PMC id (a KeyError is raised if the article has neither).
        if info.get('pmid') is not None:
            stem = "PMID" + info['pmid']
        else:
            stem = "PMC" + info['pmc']
        allPath = 'article/all/' + stem + '.all.txt'

        with open(allPath, 'w') as fileAll:
            fileAll.write(mainTitle + '\n\n')

            # Abstract(s)
            for i, abstractPart in enumerate(articleMeta.findall('abstract')):
                if i > 0:
                    fileAll.write("\n" + "Abstract " + str(i) + ' :\n')
                else:
                    fileAll.write("Abstract :\n")
                extractTextSection(fileAll, abstractPart)
            # Separator line; the language-tagging cell relies on it to
            # isolate the title and abstract.
            fileAll.write("\n" + "-"*100 + "\n")

            if body is None:
                nonAccess += 1
                info['XMLaccess'] = 'No'
            else:
                access += 1
                info['XMLaccess'] = 'Free'

                # Log the section hierarchy of the article.
                titres = extracTitle(body, [], 0)
                titleSection.write(str(idx) + " " + mainTitle)
                titleSection.write('\n')
                for t in titres:
                    titleSection.write('\t'*t[0])
                    titleSection.write(t[1])
                    titleSection.write('\n')
                titleSection.write('\n\n')

                # Extract the text - only some sections.
                nameSection = whichSection(titres, wordsAccepted)
                section = body.findall('sec')
                with open('article/' + str(idx) + "_" + stem + '.txt', 'w') as sectionFile:
                    sectionFile.write(mainTitle + '\n')
                    print(mainTitle)
                    for sec in section:
                        secTitle = sec.find('title')
                        if secTitle is not None and extractText(secTitle) in nameSection:
                            extractTextSection(sectionFile, sec)

                # Extract the whole article.
                for sec in section:
                    extractTextSection(fileAll, sec)

                # And the <back> part (e.g. data availability) when present;
                # articles without <back> used to raise a TypeError here.
                fileAll.write("\n" + "-"*100 + "\n")
                back = article.find('back')
                if back is not None:
                    extractTextSection(fileAll, back)

        # Collect the distinct github.com links found in the full text,
        # stripped of their 'https://' and 'www.' prefixes.
        tabGit = []
        if body is not None:
            with open(allPath, 'r') as fileR:
                txt = fileR.read()
            for match in re.finditer(paternGit, txt):
                git = match.group(0)
                if git.startswith('https://'):
                    git = git[len('https://'):]
                if git.startswith('www.'):
                    git = git[len('www.'):]
                if git not in tabGit:
                    tabGit.append(git)
        stringGit = ' , '.join(tabGit)
        info['git'] = stringGit

        pmidid = info.get('pmid', '')
        pmcid = info.get('pmc', '')
        doiid = info.get('doi', '')

        df_new_row = pd.DataFrame([{'id':idx, 'titre':mainTitle, 'pmid':pmidid, 'pmc':pmcid,
                    'doi':doiid, 'XMLaccess':info['XMLaccess'], 'git':info['git']}])

        df = pd.concat([df, df_new_row], ignore_index=True)

        print("~~~~~~~~~~~~~~~~~")

        dicoArticle.update(dicoSubArticle)
        idx += 1

In [None]:
# Summary: total number of articles, then how many had a <body> (`access`)
# and how many did not (`nonAccess`).
print("On {} articles, {} open access Articles and {} no XML open acess Articles".format(len(pmc),access, nonAccess))

# Sorting the articles by workflow language (Nextflow, Snakemake or both)

In [None]:
# Tag every article of the corpus with the workflow language named in its
# title or abstract, and count the articles per language.
path = 'article/all/'
# Only the corpus files are considered, so stray directory entries
# (checkpoints, hidden files, ...) no longer break the loop.
listArticle = [f for f in os.listdir(path) if f.endswith('.all.txt')]

nbNextflow = 0
nbSnakemake = 0
nbBoth = 0
# NOTE(review): resets the counter computed by the corpus-building cell and
# is not used below - confirm whether this reset is intended.
nonAccess = 0

# Line written after the abstract in every '.all.txt' file.
separator = '-' * 100

for article in listArticle:
    name = article.replace(".all.txt", '')

    # Files are named after the PMID when there is one, otherwise after the
    # PMC id: look the row up by PMID first, then fall back to the PMC id.
    matches = np.where(df['pmid'] == name.replace('PMID', ''))[0]
    if len(matches) == 0:
        matches = np.where(df['pmc'] == name.replace('PMC', ''))[0]
    # Raises IndexError when the file matches no row of `df`.
    idxDf = matches[0]

    with open(path + article, 'r') as a:
        txt = a.read()

    # Everything before the separator is the title and the abstract(s); the
    # whole text is used if the separator is missing.
    abstract = txt.partition(separator)[0].lower()

    hasNextflow = 'nextflow' in abstract
    hasSnakemake = 'snakemake' in abstract

    if hasNextflow and hasSnakemake:
        # NOTE(review): unlike the two counters below, this one also counts
        # articles without XML access - confirm this is intended.
        nbBoth += 1
        df.at[idxDf,'language']='Both'

    elif hasNextflow:
        df.at[idxDf,'language']='Nextflow'
        if df.iloc[idxDf]['XMLaccess'] == 'Free':
            nbNextflow += 1

    elif hasSnakemake:
        df.at[idxDf,'language']='Snakemake'
        if df.iloc[idxDf]['XMLaccess'] == 'Free':
            nbSnakemake += 1

print("Number of articles describing Nextflow wf : {}".format(nbNextflow))
print("Number of articles describing Snakemake wf : {}".format(nbSnakemake))
print("Number of items with both Nextflow and Snakemake : {}".format(nbBoth))


In [None]:
# Export the article table to 'corpus.html', without the index column.
df.to_html('corpus.html',justify='center',index=False)

In [None]:
# Display the final table as the cell output.
df