In [1]:
import asyncio
import gzip
import json
import os
import shutil
import time
from urllib.request import urlopen
from xml.etree import ElementTree

import aiohttp
import nest_asyncio
import pandas as pd
import requests
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup

nest_asyncio.apply()

In [2]:
# Driver cell: reads the PubMed baseline index page, then processes the .gz
# archives it lists one at a time in the loop below (the original note counted
# 1114 archives).
# NOTE: the loop body currently ends in `break`, so a run handles only the
# first archive.


baseurl = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'

# Open up the link for pubmed. The context manager closes the HTTP response
# once the page has been read and parsed.
with urlopen(baseurl) as html:
    soup = BeautifulSoup(html.read(), 'html.parser')

# Keep only the hrefs that point at .gz files. An anchor without an href makes
# `.get('href')` return None, so such entries are dropped before `.endswith`
# is called on them.
links = [
    href
    for href in (anchor.get('href') for anchor in soup.find_all('a'))
    if href and href.endswith(".gz")
]

for link in links:

    # The url to the file
    fileurl = baseurl + link

    # Download the .gz file.
    # - stream=True + iter_content writes the archive to disk chunk by chunk
    #   instead of holding the whole body in memory.
    # - raise_for_status() stops an HTTP error page from being saved under the
    #   archive's name (it would only fail later, inside gzip).
    # - the local file is opened only after the request has succeeded.
    # - timeout bounds how long a stalled connection/read may hang.
    with requests.get(fileurl, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(link, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Unzip the .gz file
    with gzip.open(link, 'rb') as f_in:
        with open('pubmedcurrent.xml', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Parse the xml file
    dom = ElementTree.parse('pubmedcurrent.xml')
    # One <Article> element per record; the field lists below follow this order.
    Articles = dom.findall('PubmedArticle/MedlineCitation/Article')
    # A path ending in '/' is treated by ElementTree as ending in '/*', so this
    # selects every child element of each record's ArticleIdList.
    pubmedID = dom.findall('PubmedArticle/PubmedData/ArticleIdList/')
    # One list per output column; entry k of every list describes Articles[k].
    allTitles = []
    allAuthors = []
    allJournalName = []
    allPageNumber = []
    allVolume = []
    allPublicationYear = []
    allIssueNumber = []

    for i, eachArticles in enumerate(Articles):
        # Look the shared parents up once; any of them may be absent, in which
        # case the dependent fields fall back to their placeholder strings.
        journalNode = eachArticles.find('Journal')
        journalIssue = journalNode.find('JournalIssue') if journalNode is not None else None

        # Get article title. itertext() also gathers text that sits inside
        # child elements of the title, which `.text` alone would cut off at
        # the first child. An empty or missing title is stored as None.
        titleNode = eachArticles.find('ArticleTitle')
        title = (''.join(titleNode.itertext()) if titleNode is not None else '') or None
        allTitles.append(title)

        # Get journal name
        journalTitle = journalNode.find('Title') if journalNode is not None else None
        journal = journalTitle.text if journalTitle is not None else 'JournalNameNotFound'
        allJournalName.append(journal)

        # Get page number
        pageNode = eachArticles.find('Pagination/MedlinePgn')
        page = pageNode.text if pageNode is not None else 'PageNumberNotFound'
        allPageNumber.append(page)

        # Get volume
        volumeNode = journalIssue.find('Volume') if journalIssue is not None else None
        volume = volumeNode.text if volumeNode is not None else "VolumeNotFound"
        allVolume.append(volume)

        # Get publication year: prefer <Year>; otherwise take the first four
        # characters of <MedlineDate>; otherwise store a placeholder instead of
        # aborting the whole run on one incomplete record.
        pubDate = journalIssue.find('PubDate') if journalIssue is not None else None
        pubYear = pubDate.findtext('Year') if pubDate is not None else None
        if not pubYear and pubDate is not None:
            medlineDate = pubDate.findtext('MedlineDate')
            pubYear = medlineDate[0:4] if medlineDate else None
        if not pubYear:
            pubYear = "YearNotFound"
        allPublicationYear.append(pubYear)

        # Get issue number
        issueNode = journalIssue.find('Issue') if journalIssue is not None else None
        issueNumber = issueNode.text if issueNode is not None else "IssueNotFound"
        allIssueNumber.append(issueNumber)

        # Get authors
        allArticleAuthors = []
        for authors in eachArticles.findall('AuthorList/Author'):
            # findtext() yields None for a missing tag and '' for a tag with no
            # text; both become "NULL" so the concatenation below can never
            # receive None (ForeName used to be unguarded against empty text).
            LastName = authors.findtext('LastName') or "NULL"
            ForeName = authors.findtext('ForeName') or "NULL"
            allArticleAuthors.append(ForeName + " " + LastName)
        allAuthors.append(allArticleAuthors)
        
    # Get pubmed IDs, exactly one per record, so IDs[k] belongs to the same
    # record as entry k of the field lists built from Articles. (Collecting the
    # ids in a separate flat pass lets a single record without a pubmed id
    # shift or shorten the whole column.)
    IDs = []
    for record in dom.findall('PubmedArticle'):
        # Articles only contains records that have an <Article>; mirror that
        # filter here to keep both sequences the same length.
        if record.find('MedlineCitation/Article') is None:
            continue

        pmid = None
        for each in record.findall('PubmedData/ArticleIdList/ArticleId'):
            if each.get('IdType') == 'pubmed' and each.text:
                pmid = each.text
                break

        if pmid is None:
            # No pubmed-typed ArticleId: fall back to the citation's PMID
            # element, and only then to the placeholder.
            pmid = record.findtext('MedlineCitation/PMID') or "NULLID"
        IDs.append(pmid)

    # Make a dataframe out of the field lists
    components = {
        'Title' : allTitles,
        'Author': allAuthors,
        'PubMedIDs' : IDs,
        'JournalName': allJournalName,
        'PageNumber': allPageNumber,
        'Volume': allVolume,
        'Year': allPublicationYear,
        'Issue': allIssueNumber
    }
    bigDf = pd.DataFrame(components)
    
    # Get raw string citations

    customString = "https://pubmed.ncbi.nlm.nih.gov/"
    endCString = "/citations/"
    styles = ["ama", "apa", "mla", "nlm"]

    # One result slot per ID for each citation style (0 = not filled yet).
    # Sizing the buffers from IDs, rather than a fixed 30000, keeps the citation
    # frame the same length as the field frame, so concatenating them does not
    # append placeholder-only rows when a file holds fewer records.
    ama, apa, mla, nlm = ([0] * len(IDs) for _ in styles)

    # IDs whose citation request or parsing failed; used later to drop rows.
    errorList = []
    
    async def download_link(url:str, session:ClientSession, eachID, index=None):
        """Fetch one citation payload and store its four styles in slot `index`.

        Parameters:
            url: citation endpoint for this record.
            session: shared aiohttp session used for the request.
            eachID: the record's ID; appended to errorList on any failure.
            index: position of the record in IDs / the citation buffers. When
                omitted it is looked up with IDs.index(eachID), which is a
                linear scan and resolves duplicate IDs to their first position,
                so callers that know the position should pass it.

        Side effects:
            Writes ama/apa/mla/nlm[index] (the citation text, or "NULL" when a
            style is absent from the payload) or appends eachID to errorList.
        """
        if index is None:
            index = IDs.index(eachID)

        try:
            async with session.get(url) as response:
                result = await response.text()

            # SECURITY: the body is remote input. It is parsed as JSON data and
            # never executed; the previous eval() call would have run whatever
            # the server (or anything in between) returned.
            matchDict = json.loads(result)

            for style, column in (("ama", ama), ("apa", apa), ("mla", mla), ("nlm", nlm)):
                try:
                    column[index] = matchDict[style]['orig']
                except (KeyError, TypeError):
                    # Style missing, or payload not shaped like {style: {'orig': ...}}
                    column[index] = "NULL"
        except Exception:
            # Deliberately broad: any failure (connection error, timeout,
            # non-JSON body, out-of-range slot) must be recorded, otherwise the
            # row would keep its unfilled placeholder and still pass the
            # errorList filter applied after the downloads.
            errorList.append(eachID)

    async def download_all(IDs:list):
        """Request the citations for every ID concurrently (at most 17 connections).

        Each ID is paired with its position so download_link writes straight
        into the matching buffer slot without searching the ID list.
        """
        my_conn = aiohttp.TCPConnector(limit=17)
        async with aiohttp.ClientSession(connector=my_conn) as session:
            tasks = [
                asyncio.ensure_future(
                    download_link(
                        url=customString + eachID + endCString,
                        session=session,
                        eachID=eachID,
                        index=index,
                    )
                )
                for index, eachID in enumerate(IDs)
            ]
            # The await has to stay inside the session block: the session must
            # remain open until every request has finished.
            await asyncio.gather(*tasks, return_exceptions=True)
    
    asyncio.run(download_all(IDs))
    
    rawStringDf = pd.DataFrame(list(zip(ama,apa,mla,nlm)), columns = ["ama","apa","mla","nlm"])
    
    # Concatenate the raw references and the fields
    df = pd.concat([bigDf, rawStringDf],axis=1)

    # Remove instances where we couldn't get the raw references
    df = df[~df['PubMedIDs'].isin(errorList)]
    df.reset_index(inplace=True)

    df.drop(["index"], axis = 1, inplace=True)
    
#     # Bounce to json (or pickle)
#     try:
#         df.to_json(r'jsonTheSecond/' + link + 'ParsedAllFields.json')
#     except:
#         df.to_pickle('jsonTheSecond/' + link + 'ParsedAllFields.json')
#     # Delete the current unzipped xml
#     if os.path.exists("pubmedcurrent.xml"):
#         os.remove("pubmedcurrent.xml")
#     else:
#         print("The file does not exist") 

#     # Delete the current zipped .gz file
#     if os.path.exists(link):
#         os.remove(link)
#     else:
#         print("The file does not exist")
                
    break
    # ... get another .gz file and do it again

In [3]:
df

Unnamed: 0,Title,Author,PubMedIDs,JournalName,PageNumber,Volume,Year,Issue,ama,apa,mla,nlm
0,Formate assay in body fluids: application in m...,"[A B Makar, K E McMartin, M Palese, T R Tephly]",1,Biochemical medicine,117-26,13,1975,2,"Makar AB, McMartin KE, Palese M, Tephly TR. Fo...","Makar, A. B., McMartin, K. E., Palese, M., & T...","Makar, A B et al. “Formate assay in body fluid...","Makar AB, McMartin KE, Palese M, Tephly TR. Fo..."
1,Delineation of the intimate details of the bac...,"[K S Bose, R H Sarma]",2,Biochemical and biophysical research communica...,1173-9,66,1975,4,"Bose KS, Sarma RH. Delineation of the intimate...","Bose, K. S., & Sarma, R. H. (1975). Delineatio...","Bose, K S, and R H Sarma. “Delineation of the ...","Bose KS, Sarma RH. Delineation of the intimate..."
2,Metal substitutions incarbonic anhydrase: a ha...,"[R J Smith, R G Bryant]",3,Biochemical and biophysical research communica...,1281-6,66,1975,4,"Smith RJ, Bryant RG. Metal substitutions incar...","Smith, R. J., & Bryant, R. G. (1975). Metal su...","Smith, R J, and R G Bryant. “Metal substitutio...","Smith RJ, Bryant RG. Metal substitutions incar..."
3,Effect of chloroquine on cultured fibroblasts:...,"[U N Wiesmann, S DiDonato, N N Herschkowitz]",4,Biochemical and biophysical research communica...,1338-43,66,1975,4,"Wiesmann UN, DiDonato S, Herschkowitz NN. Effe...","Wiesmann, U. N., DiDonato, S., & Herschkowitz,...","Wiesmann, U N et al. “Effect of chloroquine on...","Wiesmann UN, DiDonato S, Herschkowitz NN. Effe..."
4,Atomic models for the polypeptide backbones of...,"[W A Hendrickson, K B Ward]",5,Biochemical and biophysical research communica...,1349-56,66,1975,4,"Hendrickson WA, Ward KB. Atomic models for the...","Hendrickson, W. A., & Ward, K. B. (1975). Atom...","Hendrickson, W A, and K B Ward. “Atomic models...","Hendrickson WA, Ward KB. Atomic models for the..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1217,The role of fibrin formation in the pathogenes...,"[J W Hammon, J L Kitzmiller, R Virgillio, L G ...",24899,"Surgery, gynecology & obstetrics",535-40,146,1978,4,"Hammon JW Jr, Kitzmiller JL, Virgillio R, Getz...","Hammon, J. W., Jr, Kitzmiller, J. L., Virgilli...","Hammon, J W Jr et al. “The role of fibrin form...","Hammon JW Jr, Kitzmiller JL, Virgillio R, Getz..."
1218,Training and use of surgeon's assistants.,"[H L Laws, M K Kirklin, A G Diethelm, J Hall, ...",24901,Surgery,445-50,83,1978,4,"Laws HL, Kirklin MK, Diethelm AG, Hall J, Kirk...","Laws, H. L., Kirklin, M. K., Diethelm, A. G., ...","Laws, H L et al. “Training and use of surgeon'...","Laws HL, Kirklin MK, Diethelm AG, Hall J, Kirk..."
1219,"Droperidol, its alpha-adrenergic blocking acti...","[M Satoh, K Kaya, I Yamanaka, A Kasama, M Yana...",24902,The Tohoku journal of experimental medicine,65-72,124,1978,1,"Satoh M, Kaya K, Yamanaka I, Kasama A, Yanagis...","Satoh, M., Kaya, K., Yamanaka, I., Kasama, A.,...","Satoh, M et al. “Droperidol, its alpha-adrener...","Satoh M, Kaya K, Yamanaka I, Kasama A, Yanagis..."
1220,Modulation of GVHR and cell-mediated cytotoxic...,"[A Matossian-Rogers, H Festenstein]",24903,Transplantation proceedings,11-3,10,1978,1,"Matossian-Rogers A, Festenstein H. Modulation ...","Matossian-Rogers, A., & Festenstein, H. (1978)...","Matossian-Rogers, A, and H Festenstein. “Modul...","Matossian-Rogers A, Festenstein H. Modulation ..."


In [6]:
errorList

['583',
 '584',
 '585',
 '590',
 '595',
 '594',
 '599',
 '600',
 '597',
 '601',
 '602',
 '603',
 '608',
 '607',
 '615',
 '611',
 '612',
 '610',
 '614',
 '616',
 '618',
 '621',
 '622',
 '627',
 '624',
 '625',
 '626',
 '623',
 '628',
 '632',
 '629',
 '633',
 '639',
 '638',
 '641',
 '640',
 '642',
 '643',
 '645',
 '649',
 '648',
 '647',
 '650',
 '652',
 '651',
 '658',
 '659',
 '655',
 '656',
 '654',
 '657',
 '663',
 '662',
 '664',
 '666',
 '668',
 '667',
 '671',
 '673',
 '670',
 '672',
 '674',
 '669',
 '675',
 '676',
 '679',
 '680',
 '678',
 '677',
 '686',
 '681',
 '685',
 '682',
 '683',
 '687',
 '684',
 '690',
 '691',
 '688',
 '689',
 '692',
 '695',
 '694',
 '696',
 '693',
 '698',
 '697',
 '700',
 '699',
 '707',
 '701',
 '702',
 '704',
 '703',
 '705',
 '706',
 '709',
 '708',
 '710',
 '712',
 '711',
 '718',
 '719',
 '721',
 '713',
 '714',
 '717',
 '716',
 '715',
 '722',
 '720',
 '723',
 '726',
 '724',
 '725',
 '728',
 '727',
 '731',
 '729',
 '732',
 '730',
 '733',
 '741',
 '740',
 '736',


In [4]:
allArticleAuthors

['David L McCue', 'James M Kasper']

In [5]:
ForeName

'Ara'

In [6]:
LastName

In [7]:
links.index(link)

961

In [8]:
title

'Incubation of feeding behavior is regulated by neuromedin U receptor 2 in the paraventricular nucleus of the hypothalamus.'

In [12]:
print(type(authors.find('LastName').text))

<class 'NoneType'>
