In [77]:
import json
import os
import bs4 as bs
import sys
import re
import time
import requests
import datetime

from lxml import etree
from lxml import html
from unidecode import unidecode
from lxml.etree import tostring
from itertools import chain
from unidecode import unidecode
from urllib.request import urlopen
import pandas as pd
import numpy as np
from xml.etree import ElementTree as et
from pprint import pprint

from ftplib import FTP
import io 
import boto3
import glob
import shutil
destination = 'clinical_data'

In [2]:
from git import Repo
join = osp.join


ImportError: Bad git executable.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial warning can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|none|n|0: for no warning or exception
    - warn|w|warning|1: for a printed warning
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet


In [68]:
def parse_pubmed_abstract(path, return_xml=True):
    """
    Giving tree, return simple parsed abstract information from the tree
    """
    regexRandom = '\\W*\\brandom(?:i(s|z))?\\W*'
    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']
    
    #if tree.xpath('//abstracttext/text()') is not None:
    abstract_tree = tree.xpath('//abstract/abstracttext')
    #abstract_tree = tree.xpath('//abstracttext/text()')
    abstract = ' '.join([stringify_children(a).strip() for a in abstract_tree])
    #print(len(abstract))
    soup = bs.BeautifulSoup(abstract, "lxml")
        #soup = bs.BeautifulSoup(abstract_tree, "lxml")
        #abstract = soup.findAll(text=re.compile(regexRandom, re.I))

    #else:
    #    abstract = ''
        
    dict_out = {'pmid': pmid,
                'pmc': pmc,
                'abstract': abstract}
    return dict_out

def load_xml(database, articleid, sleep=None):
    """
    Load XML file from given pmid from eutils site
    return a dictionary for given pmid and xml string from the site
    sleep: how much time we want to wait until requesting new xml
    """
    link = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=" + database + "&retmode=xml&id=%s" % str(articleid)
    page = requests.get(link)
    tree = html.fromstring(page.content)
    if sleep is not None:
        time.sleep(sleep)
    return tree

def parse_pubmed_web_tree(tree):
    """
    Giving tree, return simple parsed information from the tree
    """

    if tree.xpath('//articletitle') is not None:
        title = ' '.join([title.text for title in tree.xpath('//articletitle')])
    else:
        title = ''

    abstract_tree = tree.xpath('//abstract/abstracttext')
    abstract = ' '.join([stringify_children(a).strip() for a in abstract_tree])

    if tree.xpath('//article//title') is not None:
        journal = ';'.join([t.text.strip() for t in tree.xpath('//article//title')])
    else:
        journal = ''

    pubdate = tree.xpath('//pubmeddata//history//pubmedpubdate[@pubstatus="medline"]')
    if len(pubdate) >= 1 and pubdate[0].find('year') is not None:
        year = pubdate[0].find('year').text
    else:
        year = ''

    affiliations = list()
    if tree.xpath('//affiliationinfo/affiliation') is not None:
        for affil in tree.xpath('//affiliationinfo/affiliation'):
            affiliations.append(affil.text)
    affiliations_text = '; '.join(affiliations)

    authors_tree = tree.xpath('//authorlist/author')
    authors = list()
    if authors_tree is not None:
        for a in authors_tree:
            firstname = a.find('forename').text if a.find('forename') is not None else ''
            lastname = a.find('lastname').text if a.find('forename') is not None else ''
            fullname = (firstname + ' ' + lastname).strip()
            if fullname == '':
                fullname = a.find('collectivename').text if a.find('collectivename') is not None else ''
            authors.append(fullname)
        authors_text = '; '.join(authors)
    else:
        authors_text = ''

    dict_out = {'title': title,
                'abstract': abstract,
                'journal': journal,
                'affiliation': affiliations_text,
                'authors': authors_text,
                'year': year}
    return dict_out

def parse_xml_web(pmid, sleep=None, save_xml=False):
    """
    Give pmid, load and parse xml from Pubmed eutils
    if save_xml is True, save xml output in dictionary
    """
    tree = load_xml(pmid, sleep=sleep)
    dict_out = parse_pubmed_web_tree(tree)
    dict_out['pmid'] = str(pmid)
    if save_xml:
        dict_out['xml'] = etree.tostring(tree)
    return dict_out

def read_xml(path):
    """
    Parse tree from given XML path
    """
    try:
        tree = etree.parse(path)
    except:
        try:
            tree = etree.fromstring(path)
        except Exception as e:
            print("Error: it was not able to read a path, a file-like object, or a string as an XML")
            raise
    if '.nxml' in path:
        remove_namespace(tree) # strip namespace for
    return tree

def remove_namespace(tree):
    """
    Strip namespace from parsed XML
    """
    for node in tree.iter():
        try:
            has_namespace = node.tag.startswith('{')
        except AttributeError:
            continue  # node.tag is not a string (node is a comment or similar)
        if has_namespace:
            node.tag = node.tag.split('}', 1)[1]
            
def stringify_children(node):
    """
    Filters and removes possible Nones in texts and tails
    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
    """
    parts = ([node.text] +
             list(chain(*([c.text, c.tail] for c in node.getchildren()))) +
             [node.tail])
    return ''.join(filter(None, parts))


def parse_article_meta(tree):
    """
    Parse PMID, PMC and DOI from given article tree
    """
    article_meta = tree.find('.//article-meta')
    pmid_node = article_meta.find('article-id[@pub-id-type="pmid"]')
    pmc_node = article_meta.find('article-id[@pub-id-type="pmc"]')
    pub_id_node = article_meta.find('article-id[@pub-id-type="publisher-id"]')
    doi_node = article_meta.find('article-id[@pub-id-type="doi"]')

    pmid = pmid_node.text if pmid_node is not None else ''
    pmc = pmc_node.text if pmc_node is not None else ''
    pub_id = pub_id_node.text if pub_id_node is not None else ''
    doi = doi_node.text if doi_node is not None else ''

    dict_article_meta = {'pmid': pmid,
                         'pmc': pmc,
                         'doi': doi,
                         'publisher_id': pub_id}

    return dict_article_meta

def table_to_df(table_text):
    """
    Function to transform plain xml text to list of row values and
    columns
    """
    table_tree = etree.fromstring(table_text)
    columns = []
    for tr in table_tree.xpath('thead/tr'):
        for c in tr.getchildren():
            columns.append(unidecode(stringify_children(c)))

    row_values = []
    len_rows = []
    for tr in table_tree.findall('tbody/tr'):
        es = tr.xpath('td')
        row_value = [unidecode(stringify_children(e)) for e in es]
        len_rows.append(len(es))
        row_values.append(row_value)
    if len(len_rows) >= 1:
        len_row = max(set(len_rows), key=len_rows.count)
        row_values = [r for r in row_values if len(r) == len_row] # remove row with different length
        return columns, row_values
    else:
        return None, None


def parse_pubmed_table(path, return_xml=True):
    """
    Parse table from given Pubmed Open-Access XML file
    """
    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    # parse table
    tables = tree.xpath('//body//sec//table-wrap')
    table_dicts = list()
    for table in tables:
        if table.find('label') is not None:
            label = unidecode(table.find('label').text or '')
        else:
            label = ''

        # table caption
        if table.find('caption/p') is not None:
            caption_node = table.find('caption/p')
        elif table.find('caption/title') is not None:
            caption_node = table.find('caption/title')
        else:
            caption_node = None
        if caption_node is not None:
            caption = unidecode(stringify_children(caption_node).strip())
        else:
            caption = ''

        # table content
        if table.find('table') is not None:
            table_tree = table.find('table')
        elif table.find('alternatives/table') is not None:
            table_tree = table.find('alternatives/table')
        else:
            table_tree = None

        if table_tree is not None:
            table_xml = etree.tostring(table_tree)
            columns, row_values = table_to_df(table_xml)
            if row_values is not None:
                table_dict = {'pmid': pmid,
                              'pmc': pmc,
                              'label': label,
                              'caption': caption,
                              'table_columns': columns,
                              'table_values': row_values}
                if return_xml:
                    table_dict['table_xml'] = table_xml
                table_dicts.append(table_dict)
    if len(table_dicts) >= 1:
        return table_dicts
    else:
        return None
    
def parse_html_table_to_df(table):
    n_columns = 0
    n_rows=0
    column_names = []

    # we find the column titles if we can
    for row in table.thead.find_all('tr'):
        # Handle column names if we find them
        td_tags = row.find_all('td') 
        if len(td_tags) > 0 and len(column_names) == 0:
            for td in td_tags:
                column_names.append(td.get_text())
        
    # Find number of rows
    for row in table.tbody.find_all('tr'):
    
        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows+=1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)
            
    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0,n_columns)
    df = pd.DataFrame(columns = columns,
                      index= range(0,n_rows))
    row_marker = 0
    for row in table.tbody.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1

    # Convert to float if possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df

In [2]:
#query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=300000&term=(Clinical%20Trial%5BPublication%20Type%5D)%20AND%20Free%20full%20text%5BFilter%5D"

#query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&retmax=300000&term=(Randomized%20Controlled%20Trial%5BPublication%20Type%5D)"
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=4659616&term=((randomized+controlled+trial%5Bpt%5D)+OR+(controlled+clinical+trial%5Bpt%5D)+OR+(randomized%5Btiab%5D+OR+randomised%5Btiab%5D)+OR+(placebo%5Btiab%5D)+OR+(drug+therapy%5Bsh%5D)+OR+(randomly%5Btiab%5D)+OR+(trial%5Btiab%5D)+OR+(groups%5Btiab%5D))+NOT+(animals%5Bmh%5D+NOT+humans%5Bmh%5D)"

resp = urlopen(query).read()#.decode('utf-8')

soup = bs.BeautifulSoup(resp,'lxml')
targetIDs = soup.find_all('id')
small_list = []
for docID in targetIDs:
    small_list.append(docID.get_text())
    #print(docID.get_text())

In [3]:
print(len(small_list))

3669870


In [187]:
openaccess_df = pd.read_csv('oa_file_list.csv')

print(openaccess_df.shape)

print(len(small_list))

openaccess_df.head(10)

(1913822, 6)
3669870


Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE
5,oa_package/3b/77/PMC13913.tar.gz,Breast Cancer Res. 1999 Aug 19; 1(1):73-80,PMC13913,2013-03-17 14:00:52,11056681.0,NO-CC CODE
6,oa_package/4b/13/PMC13914.tar.gz,Breast Cancer Res. 1999 Sep 1; 1(1):81-87,PMC13914,2013-03-17 14:00:52,11056682.0,NO-CC CODE
7,oa_package/cb/d1/PMC13915.tar.gz,Breast Cancer Res. 1999 Oct 7; 1(1):88-94,PMC13915,2013-03-17 14:00:52,11056683.0,NO-CC CODE
8,oa_package/1e/3b/PMC13916.tar.gz,Breast Cancer Res. 2000 Feb 21; 2(2):139-148,PMC13916,2014-02-18 06:06:35,11056686.0,NO-CC CODE
9,oa_package/0e/7e/PMC13917.tar.gz,Breast Cancer Res. 2000 Mar 7; 2(3):222-235,PMC13917,2013-05-17 12:53:06,11056687.0,NO-CC CODE


In [188]:
openaccess_df.count(axis=0)

File                                  1913822
Article Citation                      1913822
Accession ID                          1913822
Last Updated (YYYY-MM-DD HH:MM:SS)    1913822
PMID                                  1798776
License                               1913822
dtype: int64

In [189]:
openaccess_df['PMCID'] = openaccess_df['Accession ID'].str[3:]
print(openaccess_df.count(axis=0))
openaccess_df.head()

File                                  1913822
Article Citation                      1913822
Accession ID                          1913822
Last Updated (YYYY-MM-DD HH:MM:SS)    1913822
PMID                                  1798776
License                               1913822
PMCID                                 1913822
dtype: int64


Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License,PMCID
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE,13900
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE,13901
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE,13902
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE,13911
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE,13912


In [181]:
clinical_oa_df = openaccess_df[openaccess_df['PMID'].isin(small_list)]

In [7]:
print(clinical_oa_df.shape)

(296424, 6)


In [8]:
clinical_oa_df.head(10)

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
13,oa_package/17/bb/PMC13921.tar.gz,Breast Cancer Res. 2000 Aug 21; 2(6):438-443,PMC13921,2014-04-29 14:49:00,11056691.0,NO-CC CODE
15,oa_package/75/59/PMC13923.tar.gz,Breast Cancer Res. 2001 Dec 22; 3(2):122-133,PMC13923,2014-04-29 19:39:41,11250759.0,NO-CC CODE
24,oa_package/9c/ed/PMC15027.tar.gz,Genome Biol. 2000 Nov 6; 1(5):research0009.1-r...,PMC15027,2014-04-29 19:45:12,11178258.0,NO-CC CODE
29,oa_package/be/ab/PMC16145.tar.gz,Genome Biol. 2000 Dec 4; 1(6):research0014.1-1...,PMC16145,2014-04-29 19:46:09,11178268.0,NO-CC CODE
39,oa_package/70/ed/PMC17806.tar.gz,Arthritis Res. 2000 Dec 22; 2(1):75-84,PMC17806,2014-04-29 19:51:28,11219392.0,NO-CC CODE
67,oa_package/ad/9a/PMC28986.tar.gz,Crit Care. 1997 Aug 13; 1(1):25-39,PMC28986,2014-04-29 19:58:26,11056694.0,NO-CC CODE
70,oa_package/ad/08/PMC28989.tar.gz,Crit Care. 1997 Nov 26; 1(2):65-70,PMC28989,2014-04-29 19:58:26,11056697.0,NO-CC CODE
72,oa_package/08/2d/PMC28991.tar.gz,Crit Care. 1997 Nov 26; 1(2):75-77,PMC28991,2014-04-29 19:58:27,11056699.0,NO-CC CODE
76,oa_package/72/15/PMC28995.tar.gz,Crit Care. 1997 Jan 22; 1(3):105-110,PMC28995,2014-04-29 19:58:27,11056703.0,NO-CC CODE
81,oa_package/8a/9a/PMC29000.tar.gz,Crit Care. 1998 Mar 12; 2(1):35-39,PMC29000,2014-04-29 19:58:28,11056708.0,NO-CC CODE


In [11]:
clinical_oa_df.to_csv('clinical_oa_df.csv')
#clinical_oa_df_test = pd.read_csv('clinical_oa_df.csv', index_col=0)
#clinical_oa_df = clinical_oa_df.reset_index(drop=True)

In [13]:
#clinical_oa_df_test.head()

In [14]:
target_list = list(clinical_oa_df['Accession ID'])
target_list = [i[3:] for i  in target_list  ]
print(target_list[-10:])
print(len(target_list))

['5877394', '5877421', '5877828', '5877934', '5878245', '5878246', '5878248', '5878253', '5878256', '5878415']
296424


In [None]:
#ftp_string1 = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/%s \n"
copy_string1 = "cp %s ./targetarticles/%s.nxml\n"

f = open("copy_script.sh", "w")
f.write("#!/bin/bash\n")

for index, row in clinical_oa_df[:30].iterrows():
    fname = str(row['Accession ID'])
    print(fname)
    fpath = glob.glob('./data/*/'+fname+'.nxml')
    if (len(fpath) > 0): 
        print(fpath[0])
        f.write(copy_string1 % (fpath[0], str(row['Accession ID'] )))
f.close()

PMC13921
PMC13923
PMC15027


In [15]:
print(target_list[3])

16145


In [30]:
#s3 = boto3.client('s3')
#resp = s3.list_objects_v2(Bucket='pubmedcentral_oa')
#print(len(resp['Contents']))

s3 = boto3.resource('s3')
bucket = s3.Bucket('pubmedcentral_oa')
size = sum(1 for _ in bucket.objects.all())
print(size)

173213


In [47]:
#s3 = boto3.client('s3')
#resp = s3.list_objects_v2(Bucket='pubmedcentral_oa')
#print(len(resp['Contents']))
import time

start_time = time.time()

s3 = boto3.resource('s3')
bucket = s3.Bucket('pubmedcentral_oa')
inS3BucketIDs = [obj.key for obj in bucket.objects.all()]
inS3BucketIDs = [i[3:-5] for i  in inS3BucketIDs]

print(inS3BucketIDs[:20])
print(len(inS3BucketIDs))

elapsed_time = time.time() - start_time
print(elapsed_time/60.)

['100321', '100327', '100357', '100783', '101376', '101381', '101386', '101394', '101395', '101407', '101408', '101493', '102323', '102333', '102757', '1033642', '1036000', '1036002', '103662', '103664']
294019
2.0204272786776225


1. Register the email address with Amazon SES and verify the domain, and the add a TXT Record to the Domain's DNS Server
2. Create an S3 bucket and attach a policy to allow Amazon SES to putObject in the S3 bucket
3. 

In [21]:
targetarticles_files_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
targetarticles_files_list = [i[40:] for i  in targetarticles_files_list  ]
targetarticles_files_list = [i[:-5] for i  in targetarticles_files_list  ]

print(targetarticles_files_list[:10])
print(len(targetarticles_files_list))

to_be_downloaded_list = list(set(target_list) - set(targetarticles_files_list))
print(len(to_be_downloaded_list))

['3778263', '4473156', '5771285', '4264693', '4210730', '4578256', '5424460', '5541470', '4634690', '4548150']
294019
2564


In [61]:
import json

json = json.dumps(inS3BucketIDs)
f = open("inS3BucketIDs.json","w")
f.write(json)
f.close()

In [None]:
ondisk_files_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
ondisk_files_list = [i[40:] for i  in ondisk_files_list  ]
ondisk_files_list = [i[:-5] for i  in ondisk_files_list  ]
print(ondisk_files_list[:10])
print(len(ondisk_files_list))

targets_on_disk = list(set.intersection(set(file_list),set(ondisk_files_list)))
print(len(targets_on_disk))

Get the set of articles that contain random in the abstract
--


In [None]:
#regexRandom = '\\W*\\brandom(?:i(s|z))?\\W*'
regexRandom = '(?i)(?:mean|M?)?\W*age[d|s]?\W*?(?:mean|M)?\W*(?:year[s]|day[s]|week[s])\W*(?:mean|M)?\W*[?:-|+|±]?\W*(S[.])?(D[.])?'

doc_count = 0
file_count = 0
random_content = []
rct_with_table = []

#file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
#choice = np.random.randint(50000, size=1000)
#for f in range(len(file_list)): 
#for f in choice: 
for doc in targets_on_disk:
    #print(os.path.basename(file_list[f])[:-5])
    filename = "/home/dave/datapubmed/targetarticles/PMC"+ str(doc) + ".nxml"
    fh = open(filename, "r")
    #text =  fh.read()
    #random= re.findall(regexRandom,text)
    '''
    tree = read_xml(filename)
    if tree.xpath('//abstract') is not None:
        random = tree.xpath("//abstract[re:match(., regexRandom)]", 
                   namespaces={"re": "http://exslt.org/regular-expressions"})
        random = [unidecode(stringify_children(e)) for e in random]
    else:
        random =  ''
    '''
    
    #fh = open(file_list[f],'r')
    
    #tree = etree.parse(path)
    #xml = ET.fromstring(tree)
    
    
    #######
    #this works:
    doc_count += 1
    soup = bs.BeautifulSoup(fh,'lxml')
    alist = soup.find('abstract')
    if alist is not None:
        random = alist.find_all(text=re.compile(regexRandom, re.I))
    ######
    
        if len (random):
            file_count += 1
            if file_count % 1000 == 0:
                print("target files: %s " % str(doc_count))
                print("file count: %s " % str(file_count))
            rct_with_table.append(doc)
            table_dict = {'PMCID': "PMC" + str(doc), 'Text': random}
            random_content.append(table_dict)
    #print(soup.prettify())
    
    #tree = read_xml(file_list[f])
    #abstract_tree = tree.xpath('//abstract')
    #abstract = ' '.join([stringify_children(a).strip() for a in abstract_tree])
    #for element in tree.iter("abstract"):
    #    print(etree.tostring(element, method='text'))
    #    print("tag: %s - Text: %s" % (element.tag, element.text))
    
    #abstract_dict = parse_pubmed_abstract(file_list[f])
    #if len (abstract_dict['abstract'] ):
    #    file_count += 1
    #    print(abstract_dict['abstract'])
    #filecnt.append(abstract_dict)

    #print(abstract)

    
    #dict_article_meta = parse_article_meta(tree)
    #pmid = dict_article_meta['pmid'];# print (pmid)
    #pmc = dict_article_meta['pmc']; print (pmc)
    #alist = soup.findAll('a')#.findAll(text="age")
    #age = soup.findAll(text=re.compile(regexAge, re.I))
print(file_count)

In [None]:
import json
json = json.dumps(random_content)
f = open("random_content.json","w")
f.write(json)
f.close()

In [None]:
table1_list = ['1', 'T', 'TABLE', 'TABLE 1', 'TABLE 1 ', 'TABLE1', 'TABLE 1.', 'TABLE1.', 'TABLE I', 'TABLE I.',\
               'TAB. 1', 'TAB.1', 'Table', 'Table 1', 'Table 1 ', 'Table 1 -', 'Table 1.', 'Table 1:', \
              'Tab. 1', 'Tab.1', 'Table I', 'Table I.', 'Table No. 1', 'Table. 1', 'Tabela 1', 'Tableau 1']

table1_set = set(table1_list)
regexAge = '(?i)(?:mean|M?)?\W*age[d|s]?\W*?(?:mean|M)?\W*(?:year[s]|day[s]|week[s])\W*(?:mean|M)?\W*[?:-|+|±]?\W*(S[.])?(D[.])?'

table_count = 0
parts_count = 0
table1_count = 0
tablecnt = []
#targets_on_disk = ['3001701', '4223415', '3519475', '4640859', '5215651', '3945930']
for doc in rct_with_table:#[:4000]:
    filename = "/home/dave/datapubmed/targetarticles/PMC"+ str(doc) + ".nxml"
    fh = open(filename, "r")
    tree = read_xml(fh)
    if tree is not None:
        tables = tree.xpath('//body//sec//table-wrap')
        if not tables:
            #print(file_list[f], ":  0")
            table_dict = {'PMCID': "PMC" + str(doc), 'Table': 'no_tables', 'Id': '', 'Parts': '', 'Age': '', 'Sex': ''}
        else:
            for table in tables:
                #has_label = table.find('label') 
                if table.find('label') is not None:
                    label = unidecode(table.find('label').text or '')
                    tableid = unidecode(table.find('id') or '')
                    #has_label_text = has_label.text
                    if label in table1_set:
                        table1_count += 1
                        soup = bs.BeautifulSoup(etree.tostring(table), "lxml")
                        #print(soup.prettify())
                        parts = soup.findAll(text=re.compile('n=', re.I))
                        if (len(parts) == 1 | len(parts) == 2) :
                            parts_count += 1
                        age = soup.findAll(text=re.compile(regexAge, re.I))
                        sex = soup.findAll(text=re.compile('^Sex', re.I))
                        gender = soup.findAll(text=re.compile('^Gender', re.I))
                        sexgender = sex + gender
                        
                        if len(sexgender) > 0:
                            table_count += 1
                            
                        age = [unidecode(a) for a in age]
                        sexgender= [unidecode(a) for a in sexgender]

                        #if (len(age) > 0):
                        #    print(age)
                        #age = unidecode(soup.findAll(text=re.compile('^Age', re.I)).text or '')
                        #sex = unidecode(tablelist.findAll(text=re.compile('^Sex', re.I)).text or '')
                        table_dict = {'PMCID': "PMC" + str(doc), 'Table': label, 'Id': table.attrib['id'], 'Parts': parts, 'Age': age, 'Sex': sexgender}
                        break
                    else:
                        table_dict = {'PMCID': "PMC" + str(doc), 'Table': label, 'Id': table.attrib['id'], 'Parts': '', 'Age': '', 'Sex': ''}
                else:
                    table_dict = {'PMCID': "PMC" + str(doc), 'Table': 'no_label', 'Id': table.attrib['id'], 'Parts': '', 'Age': '', 'Sex': ''}
        tablecnt.append(table_dict)
print(table_count)
print(parts_count)

In [None]:
pprint(tablecnt)

In [309]:
import os
os.environ["TZ"]="UTC"
import boto3
import sys

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')

try:
    table = dynamodb.create_table(
        TableName='demographics',
        KeySchema=[
            {
                'AttributeName': 'pmcid', 
                'KeyType': 'HASH'
            }
        ], 
        AttributeDefinitions=[
            {
                'AttributeName': 'pmcid', 
                'AttributeType': 'S'
            }
        ], 
        ProvisionedThroughput={
            'ReadCapacityUnits': 1, 
            'WriteCapacityUnits': 1
        }
    )

    table.meta.client.get_waiter('table_exists').wait(TableName='demographics')
    print(table.item_count)

except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

0


In [308]:
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')

try:
    dynamodb.Table('demographics').delete()
    print('deleted table')
        
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

deleted table


In [None]:
f = open('demographics.json')
request_items = json.loads(f.read())
client = boto3.client('dynamodb')
response = client.batch_write_item(RequestItems=request_items)

In [None]:
import os
os.environ["TZ"]="UTC"
import boto3
import json
import decimal

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
input_file = "inS3BucketIDs.json"
try:
    table = dynamodb.Table('demographics')
    print("Instantiate a table: ",table.creation_date_time)
    print("Ready to load data\n")
    incr = 0
    with open(input_file) as json_file:
        itemset = json.load(json_file, parse_float = decimal.Decimal)
        for item in itemset:
            incr += 1
            #pmcid = item['pmcid']
            #title = item['title']
            #date_processed = movie['date_processed']
            #print("Adding record # ", incr," pmcid: ",pmcid," title: ",title,)
            if (incr % 10000 == 0):
                print("Adding record # ", incr)
            table.put_item(
               Item={
                    "pmcid": {"S": pmc},
                    "pmid": {"S": pmid},
                    "date_processed": {"S": now},
                    "title": {"S": title},
                }
            )
            
            
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

In [119]:
fh = open('/home/dave/datapubmed/targetarticles/PMC4210730.nxml','r')
soup = bs.BeautifulSoup(fh,'lxml')
#print(soup.prettify())

file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC4210730.nxml')
#fh = open(file_list[f],'r')
tree = read_xml(*file_list)
if tree is not None:
    print('tree')
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']; print (pmid)
    pmc = dict_article_meta['pmc']; print (pmc)
    tree_title = tree.find('//title-group/article-title')
    if tree_title is not None:
        title = [t for t in tree_title.itertext()]
        sub_title = tree.xpath('//title-group/subtitle/text()')
        title.extend(sub_title)
        title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
        full_title = ' '.join(title)
    else:
        full_title = ' '
    print(full_title)
#table = soup.find('table')

#print(table.thead.prettify())

#parse_html_table(table)
#tree = read_xml('/Volumes/Untitled/clinical_data/PMC1297588.nxml')

#print(etree.tostring(tree, pretty_print=True))



#[(table['id'], parse_html_table(table))  for table in soup.find_all('table')]  

tree
25352696
4210730
IgG Avidity Antibodies against  Toxoplasma gondii  in High Risk Females of Reproductive Age Group in India


Create a JSON file for the dynamodb database
---

In [82]:
import datetime
now = datetime.datetime.now().strftime("%Y-%m-%d")
print(now)

2018-04-06


In [164]:
import os
os.environ["TZ"]="UTC"
import boto3
import sys

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')

try:
    table = dynamodb.create_table(
        TableName='demographics',
        KeySchema=[
            {
                'AttributeName': 'pmcid', 
                'KeyType': 'HASH'
            }
        ], 
        AttributeDefinitions=[
            {
                'AttributeName': 'pmcid', 
                'AttributeType': 'S'
            }
        ], 
        ProvisionedThroughput={
            'ReadCapacityUnits': 1, 
            'WriteCapacityUnits': 1
        }
    )

    table.meta.client.get_waiter('table_exists').wait(TableName='demographics')
    print(table.item_count)

except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

0


In [165]:
item_count = 0
itemset_array = []
now = datetime.datetime.now().strftime("%Y-%m-%d")

client = boto3.client('dynamodb')

file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
#choice = np.random.randint(10000, size=1000)
for f in range(len(file_list)): 
#for f in choice: 
    item_count += 1
    tree = read_xml(file_list[f])
    if tree is not None:
        dict_article_meta = parse_article_meta(tree)
        pmid = dict_article_meta['pmid'];# print (pmid)
        if (pmid == ''):
            pmid = 'PMID missing'
        pmc = dict_article_meta['pmc']; #print (pmc)
        
        tree_title = tree.find('//title-group/article-title')
        if tree_title is not None:
            title = [t for t in tree_title.itertext()]
            sub_title = tree.xpath('//title-group/subtitle/text()')
            title.extend(sub_title)
            title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
            if len(title):
                full_title = ' '.join(title)
            else:
                full_title = 'Title missing'
        else:
            full_title = 'Title missing'
            
        item_dict = {
            "PutRequest": {
                "Item": {
                    "pmcid": {"S": pmc},
                    "pmid": {"S": pmid},
                    "date_processed": {"S": now},
                    "title": {"S": full_title},
                }
            }
        }

        itemset_array.append(item_dict)
        
    if (item_count % 25 == 0):
        response = client.batch_write_item(RequestItems={ "demographics": itemset_array})
        itemset_array = []
    if (item_count % 10000 == 0):
        print ("The item_count is: %s " % item_count)

The item_count is: 10000 
The item_count is: 20000 
The item_count is: 30000 
The item_count is: 40000 
The item_count is: 50000 
The item_count is: 60000 
The item_count is: 70000 
The item_count is: 80000 
The item_count is: 90000 
The item_count is: 100000 
The item_count is: 110000 
The item_count is: 120000 
The item_count is: 130000 
The item_count is: 140000 
The item_count is: 150000 
The item_count is: 160000 
The item_count is: 170000 
The item_count is: 180000 
The item_count is: 190000 
The item_count is: 200000 
The item_count is: 210000 
The item_count is: 220000 
The item_count is: 230000 
The item_count is: 240000 
The item_count is: 250000 
The item_count is: 260000 
The item_count is: 270000 
The item_count is: 280000 
The item_count is: 290000 


In [148]:
item_count = 0
itemset_array = []
now = datetime.datetime.now().strftime("%Y-%m-%d")

file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')

for f in range(len(file_list)): 

    tree = read_xml(file_list[f])
    if tree is not None:
        dict_article_meta = parse_article_meta(tree)
        pmid = dict_article_meta['pmid'];# print (pmid)
        if pmid == '':
            pmid = 'PMID missing'
        pmc = dict_article_meta['pmc']; #print (pmc)
        
        tree_title = tree.find('//title-group/article-title')
        if tree_title is not None:
            title = [t for t in tree_title.itertext()]
            sub_title = tree.xpath('//title-group/subtitle/text()')
            title.extend(sub_title)
            title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
            if len(title):
                full_title = ' '.join(title)
            else:
                full_title = 'Title missing'
        else:
            full_title = 'Title missing'
            
        item_dict = {
            "PutRequest": {
                "Item": {
                    "pmcid": {"S": pmc},
                    "pmid": {"S": pmid},
                    "date_processed": {"S": now},
                    "title": {"S": full_title},
                }
            }
        }

        itemset_array.append(item_dict)
output_dict = { "demographics": itemset_array}

In [167]:
PMCIDs_in_demographics = [d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]
jsondata = json.dumps(pmcid_in_demographics)
f = open("PMCIDs_in_demographics.json","w")
f.write(jsondata)
f.close()

In [None]:
[d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]

In [149]:
import json

jsondata = json.dumps(output_dict)
f = open("output_dict.json","w")
f.write(jsondata)
f.close()

#f = open('output_dict.json')
#request_items = json.loads(f.read())


In [169]:
pprint(Counter( [d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['pmid']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['date_processed']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['title']['S']  for d in output_dict['demographics']]).most_common(40))

[('4500842', 1),
 ('5366211', 1),
 ('5293559', 1),
 ('4845260', 1),
 ('3792899', 1),
 ('4332942', 1),
 ('5011471', 1),
 ('2845513', 1),
 ('5598241', 1),
 ('4369803', 1),
 ('5835343', 1),
 ('5315024', 1),
 ('2984425', 1),
 ('4329748', 1),
 ('4308794', 1),
 ('4018974', 1),
 ('2249589', 1),
 ('2890341', 1),
 ('3954800', 1),
 ('4494171', 1),
 ('2175320', 1),
 ('2631700', 1),
 ('3881939', 1),
 ('3791597', 1),
 ('5623425', 1),
 ('4804544', 1),
 ('5590032', 1),
 ('4706891', 1),
 ('3886572', 1),
 ('3142342', 1),
 ('4359699', 1),
 ('3227281', 1),
 ('3727980', 1),
 ('5571909', 1),
 ('3541191', 1),
 ('5644166', 1),
 ('2135761', 1),
 ('5123663', 1),
 ('5142261', 1),
 ('4123605', 1)]
[('', 90),
 ('27175291', 2),
 ('27365953', 1),
 ('24015130', 1),
 ('29499766', 1),
 ('28137763', 1),
 ('28776343', 1),
 ('23820255', 1),
 ('19113992', 1),
 ('18483569', 1),
 ('25642758', 1),
 ('23717389', 1),
 ('23735116', 1),
 ('26322082', 1),
 ('25206588', 1),
 ('29134106', 1),
 ('17129383', 1),
 ('26619915', 1),
 ('

In [173]:
openaccess_df.head()

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE


In [192]:
demographics_in_oa = openaccess_df[openaccess_df['PMCID'].isin(PMCIDs_in_demographics)]
print(len(demographics_in_oa))
print(demographics_in_oa[:10])

294018
                                File  \
13  oa_package/17/bb/PMC13921.tar.gz   
15  oa_package/75/59/PMC13923.tar.gz   
24  oa_package/9c/ed/PMC15027.tar.gz   
29  oa_package/be/ab/PMC16145.tar.gz   
39  oa_package/70/ed/PMC17806.tar.gz   
67  oa_package/ad/9a/PMC28986.tar.gz   
70  oa_package/ad/08/PMC28989.tar.gz   
72  oa_package/08/2d/PMC28991.tar.gz   
76  oa_package/72/15/PMC28995.tar.gz   
81  oa_package/8a/9a/PMC29000.tar.gz   

                                     Article Citation Accession ID  \
13       Breast Cancer Res. 2000 Aug 21; 2(6):438-443     PMC13921   
15       Breast Cancer Res. 2001 Dec 22; 3(2):122-133     PMC13923   
24  Genome Biol. 2000 Nov 6; 1(5):research0009.1-r...     PMC15027   
29  Genome Biol. 2000 Dec 4; 1(6):research0014.1-1...     PMC16145   
39             Arthritis Res. 2000 Dec 22; 2(1):75-84     PMC17806   
67                 Crit Care. 1997 Aug 13; 1(1):25-39     PMC28986   
70                 Crit Care. 1997 Nov 26; 1(2):65-70     PMC2

In [203]:
demographics_in_oa = demographics_in_oa.set_index('PMCID')

In [204]:
demographics_in_oa.head()

Unnamed: 0_level_0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
PMCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13921,oa_package/17/bb/PMC13921.tar.gz,Breast Cancer Res. 2000 Aug 21; 2(6):438-443,PMC13921,2014-04-29 14:49:00,11056691.0,NO-CC CODE
13923,oa_package/75/59/PMC13923.tar.gz,Breast Cancer Res. 2001 Dec 22; 3(2):122-133,PMC13923,2014-04-29 19:39:41,11250759.0,NO-CC CODE
15027,oa_package/9c/ed/PMC15027.tar.gz,Genome Biol. 2000 Nov 6; 1(5):research0009.1-r...,PMC15027,2014-04-29 19:45:12,11178258.0,NO-CC CODE
16145,oa_package/be/ab/PMC16145.tar.gz,Genome Biol. 2000 Dec 4; 1(6):research0014.1-1...,PMC16145,2014-04-29 19:46:09,11178268.0,NO-CC CODE
17806,oa_package/70/ed/PMC17806.tar.gz,Arthritis Res. 2000 Dec 22; 2(1):75-84,PMC17806,2014-04-29 19:51:28,11219392.0,NO-CC CODE


In [206]:
int(demographics_in_oa['PMID'].loc['13923'])

11250759

In [264]:
demographic_json = []
for d in output_dict['demographics']:
    itemdict = {
        "pmcid": {"S": d['PutRequest']['Item']['pmid']['S']},
        "pmid":  {"S": d['PutRequest']['Item']['pmcid']['S']},
        "date_processed":  {"S": d['PutRequest']['Item']['date_processed']['S']},
        "title":  {"S": d['PutRequest']['Item']['title']['S']}
        }
    demographic_json.append(itemdict)

In [311]:
now = datetime.datetime.now().strftime("%Y-%m-%d")
demographic_json = []
for d in output_dict['demographics']:
    itemdict = {
        "pmcid": str(d['PutRequest']['Item']['pmcid']['S']),
        "pmid":  str(d['PutRequest']['Item']['pmid']['S']),
        "date_processed":  str(now),
        "title":  unidecode(d['PutRequest']['Item']['title']['S'])
        }
    demographic_json.append(itemdict)

In [207]:
missing_pmid = []
for d in output_dict['demographics']:
    mykey = d['PutRequest']['Item']['pmcid']['S']
    if (d['PutRequest']['Item']['pmid']['S'] == ''):
        missing_pmid.append(mykey)
        d['PutRequest']['Item']['pmid']['S'] = int(demographics_in_oa['PMID'].loc[mykey])

In [216]:
print(len(missing_pmid))
print(missing_pmid[:10])

90
['5818702', '5806443', '5822973', '5806432', '5812181', '5822851', '5828420', '5829952', '5822983', '5812180']


In [293]:
print(len(demographic_json))
print(demographic_json[10]['pmcid'])

294019
28717714


In [269]:
resource = boto3.resource('dynamodb')
table = resource.Table('demographics')
mykey = demographic_json[100001]['pmcid']
response = table.get_item(Key={'pmcid': '24179788'})
print(type(response['ResponseMetadata']['HTTPStatusCode']))
print(response['Item'])

<class 'int'>
{'title': 'Catecholamine depletion in first-degree relatives of individuals with mood disorders: An [ 18 F]fluorodeoxyglucose positron emission tomography study ☆', 'pmcid': '24179788', 'date_processed': '2018-04-06', 'pmid': '3778263'}


## Batch put

In [276]:
resource = boto3.resource('dynamodb')
table = resource.Table('demographics')
with table.batch_writer() as batch:
    for i in range(100):#len(demographic_json)):
        mykey = demographic_json[i]['pmcid']['S']
        response = table.get_item(Key={'pmcid': mykey})
        if response['Item'] is None:
            batch.put_item(Item=demographic_json[i])
        if (i % 10000 == 0):
            print ("The item_count is: %s " % (i))

The item_count is: 0 


In [312]:
demographic_json[1500]

{'date_processed': '2018-04-07',
 'pmcid': '5799897',
 'pmid': '29402275',
 'title': 'Acute administration of beta-caryophyllene prevents endocannabinoid system activation during transient common carotid artery occlusion and reperfusion'}

In [313]:
resource = boto3.resource('dynamodb')
table = resource.Table('demographics')
with table.batch_writer() as batch:
    for i in range(len(demographic_json)):
        batch.put_item(
            Item=demographic_json[i]
        )
        if (i % 1000 == 0):
            print ("The item_count is: %s " % (i))

The item_count is: 0 
The item_count is: 1000 
The item_count is: 2000 
The item_count is: 3000 
The item_count is: 4000 
The item_count is: 5000 
The item_count is: 6000 
The item_count is: 7000 
The item_count is: 8000 
The item_count is: 9000 
The item_count is: 10000 
The item_count is: 11000 
The item_count is: 12000 
The item_count is: 13000 
The item_count is: 14000 
The item_count is: 15000 
The item_count is: 16000 
The item_count is: 17000 
The item_count is: 18000 
The item_count is: 19000 
The item_count is: 20000 
The item_count is: 21000 
The item_count is: 22000 
The item_count is: 23000 
The item_count is: 24000 
The item_count is: 25000 
The item_count is: 26000 
The item_count is: 27000 
The item_count is: 28000 
The item_count is: 29000 
The item_count is: 30000 
The item_count is: 31000 
The item_count is: 32000 
The item_count is: 33000 
The item_count is: 34000 
The item_count is: 35000 
The item_count is: 36000 
The item_count is: 37000 
The item_count is: 38000 

## Normal put

In [254]:
responses = []
resource = boto3.resource('dynamodb')
table = resource.Table('demographics_alt')
#with table.batch_writer() as batch:
for i in range(len(missing_pmid)):
    #mykey = demographic_json[i]['pmcid']
    mykey = missing_pmid[i]
    #response = table.get_item(Key={'pmcid': mykey})
    #if response['ResponseMetadata']['HTTPStatusCode'] == 200:
        #print('hit')
    table.update_item(Item=demographic_json[i])
        #responses.append({'pmcid': mykey, 'response': response}) 
    if (i % 10000 == 0):
        print ("The item_count is: %s " % (i))

The item_count is: 0 


In [257]:
responses = []
resource = boto3.resource('dynamodb')
table = resource.Table('demographics')
#with table.batch_writer() as batch:
for i in range(len(missing_pmid)):
    #mykey = demographic_json[i]['pmcid']
    mykey = missing_pmid[i]
    #response = table.get_item(Key={'pmcid': mykey})
    #if response['ResponseMetadata']['HTTPStatusCode'] == 200:
        #print('hit')
    table.update_item(
        Key={'pmcid': mykey},
        UpdateExpression="set pmid = :s",
        ExpressionAttributeValues={
            ':s': int(demographics_in_oa['PMID'].loc[mykey])
        },
    )

    #print ("The item_count is: %s " % (i))

In [302]:
for mykey in missing_pmid:
    print(int(demographics_in_oa['PMID'].loc[mykey]))

29497398
29497483
29491670
29497449
29491800
29497322
29497350
29497582
29491672
29491784
29491915
29497377
29497367
29491906
29491671
29497408
29497244
29497315
29491940
29497417
29497472
29497402
29497566
29491933
29497320
29497475
29497412
29497561
29491928
29497615
29497384
29491804
29497378
29491904
29491894
29491898
29497592
29491680
29491668
29497466
29497291
29497448
29497691
29491679
29497450
29491937
29497453
29491535
29497254
29491924
29497328
29497584
29497329
29497333
29497369
29497289
29497274
29497446
29497236
29497279
29491907
29497485
29491932
29497430
29497534
29497623
29497619
29497179
29497583
29497705
29599803
29491673
29497275
29491902
29497444
29497393
29497693
29497304
29497506
29491946
29497232
29497443
29497298
29491802
29497471
29497331
29491678
29497397
29491538
29497447


In [316]:
pmcidd_list = ['2064904']
#for i in range(len(missing_pmid)):
for i in range(len(pmcidd_list)):
    #mykey = demographic_json[i]['pmcid']
    mykey = pmcidd_list[i]
    response = table.get_item(Key={'pmcid': mykey})
    print(response)

{'ResponseMetadata': {'HTTPStatusCode': 200, 'RequestId': 'HOLRTNDS9QO2PG5A494PJUT0DVVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPHeaders': {'x-amz-crc32': '2132856803', 'content-type': 'application/x-amz-json-1.0', 'date': 'Sat, 07 Apr 2018 19:47:53 GMT', 'x-amzn-requestid': 'HOLRTNDS9QO2PG5A494PJUT0DVVV4KQNSO5AEMVJF66Q9ASUAAJG', 'content-length': '200', 'server': 'Server', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'Item': {'title': 'Evaluation of a peer counselling programme to sustain breastfeeding practice in Hong Kong', 'pmcid': '2064904', 'date_processed': '2018-04-07', 'pmid': '17883851'}}


In [317]:
demographics_metata = get_table_metadata('demographics')
pprint(demographics_metata)

{'bytes_size': 0,
 'global_secondary_indices': None,
 'num_items': 0,
 'primary_key_name': {'AttributeName': 'pmcid', 'KeyType': 'HASH'},
 'status': 'ACTIVE'}


In [275]:
print(responses)

[]


In [193]:
from boto3 import resource
from boto3.dynamodb.conditions import Key

# The boto3 dynamoDB resource
dynamodb_resource = resource('dynamodb')
table = dynamodb_resource.Table('demographics')

def get_table_metadata(table_name):
    """
    Get some metadata about chosen table.
    """
    table = dynamodb_resource.Table(table_name)

    return {
        'num_items': table.item_count,
        'primary_key_name': table.key_schema[0],
        'status': table.table_status,
        'bytes_size': table.table_size_bytes,
        'global_secondary_indices': table.global_secondary_indexes
    }

def read_table_item(table_name, pk_name, pk_value):
    """
    Return item read by primary key.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.get_item(Key={pk_name: pk_value})

    return response


def add_item(table_name, col_dict):
    """
    Add one item (row) to table. col_dict is a dictionary {col_name: value}.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.put_item(Item=col_dict)

    return response


def delete_item(table_name, pk_name, pk_value):
    """
    Delete an item (row) in table from its primary key.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.delete_item(Key={pk_name: pk_value})

    return

In [272]:
dynamodb = boto3.resource('dynamodb')

table = dynamodb.Table('demographics_alt')

response = table.scan()
data = response['Items']
demographic_loaded = [d['pmcid'] for d in data]

In [273]:
print(len(demographic_loaded))

100


In [274]:
data[:20]

[{'date_processed': '2018-04-06',
  'pmcid': '28717714',
  'pmid': '5493262',
  'title': 'Phenotype and Treatment of Breast Cancer in HIV-Positive and -Negative Women in Cape Town, South Africa'},
 {'date_processed': '2018-04-06',
  'pmcid': '26860192',
  'pmid': '4746920',
  'title': 'Strengthening national capacities for researching on Social Determinants of Health (SDH) towards informing and addressing health inequities in Tanzania'},
 {'date_processed': '2018-04-06',
  'pmcid': '28194335',
  'pmid': '5299802',
  'title': "Investigators' viewpoint of clinical trials in India: Past, present and future"},
 {'date_processed': '2018-04-06',
  'pmcid': '26629543',
  'pmid': '4634690',
  'title': 'HPV Prophylactic Vaccination in Males Improves the Clearance of Semen Infection ☆'},
 {'date_processed': '2018-04-06',
  'pmcid': '28104700',
  'pmid': '5395116',
  'title': 'Inhibition of 4EBP phosphorylation mediates the cytotoxic effect of mechanistic target of rapamycin kinase inhibitors in 

In [201]:
dynamoDBResource = boto3.resource('dynamodb')
table = dynamoDBResource.Table('demographics')
print(table.item_count)

34822


In [202]:
import boto3

dynamoDBClient = boto3.client('dynamodb')
table = dynamoDBClient.describe_table(
    TableName='demographics'
)
print(table)

{'Table': {'TableSizeBytes': 5928627, 'CreationDateTime': datetime.datetime(2018, 4, 7, 0, 5, 18, 425000, tzinfo=tzlocal()), 'ProvisionedThroughput': {'NumberOfDecreasesToday': 1, 'ReadCapacityUnits': 5, 'LastIncreaseDateTime': datetime.datetime(2018, 4, 7, 0, 49, 3, 805000, tzinfo=tzlocal()), 'WriteCapacityUnits': 5, 'LastDecreaseDateTime': datetime.datetime(2018, 4, 7, 2, 40, 22, 683000, tzinfo=tzlocal())}, 'ItemCount': 34822, 'KeySchema': [{'KeyType': 'HASH', 'AttributeName': 'pmcid'}], 'TableName': 'demographics', 'TableArn': 'arn:aws:dynamodb:us-east-1:627238834174:table/demographics', 'TableId': 'e00989bb-d71f-45f1-9b5f-fb73f93a5a44', 'AttributeDefinitions': [{'AttributeName': 'pmcid', 'AttributeType': 'S'}], 'TableStatus': 'ACTIVE'}, 'ResponseMetadata': {'HTTPStatusCode': 200, 'RequestId': 'EAG99VKJOLNTKKKSB6LBEQD5MNVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPHeaders': {'x-amz-crc32': '129227739', 'content-type': 'application/x-amz-json-1.0', 'date': 'Sat, 07 Apr 2018 11:16:49 GMT', 'x-am

In [150]:
common_pmid = Counter(output_dict['pmid'])
print(common_pmid.most_common(10))

common_date_processed = Counter(output_dict['date_processed'])
print(common_date_processed.most_common(10))

common_title = Counter(output_dict['title'])
print(common_title.most_common(10))

KeyError: 'pmcid'

In [126]:
client = boto3.client('dynamodb')
response = client.batch_write_item(RequestItems=output_dict)

ClientError: An error occurred (ValidationException) when calling the BatchWriteItem operation: 1 validation error detected: Value '{demographics=[com.amazonaws.dynamodb.v20120810.WriteRequest@bd10590c, com.amazonaws.dynamodb.v20120810.WriteRequest@1152f6b9, com.amazonaws.dynamodb.v20120810.WriteRequest@ce212bf, com.amazonaws.dynamodb.v20120810.WriteRequest@12850f52, com.amazonaws.dynamodb.v20120810.WriteRequest@6400f025, com.amazonaws.dynamodb.v20120810.WriteRequest@96023c8e, com.amazonaws.dynamodb.v20120810.WriteRequest@45e66424, com.amazonaws.dynamodb.v20120810.WriteRequest@3e14c440, com.amazonaws.dynamodb.v20120810.WriteRequest@1b3bba90, com.amazonaws.dynamodb.v20120810.WriteRequest@8c6f2fa9, com.amazonaws.dynamodb.v20120810.WriteRequest@c1eb95d, com.amazonaws.dynamodb.v20120810.WriteRequest@c7f7ef85, com.amazonaws.dynamodb.v20120810.WriteRequest@b6e22677, com.amazonaws.dynamodb.v20120810.WriteRequest@5b6d9730, com.amazonaws.dynamodb.v20120810.WriteRequest@dd7e6194, com.amazonaws.dynamodb.v20120810.WriteRequest@b6c3a3a5, com.amazonaws.dynamodb.v20120810.WriteRequest@b6bad1cf, com.amazonaws.dynamodb.v20120810.WriteRequest@dcf93267, com.amazonaws.dynamodb.v20120810.WriteRequest@eec09072, com.amazonaws.dynamodb.v20120810.WriteRequest@b3b04e9d, com.amazonaws.dynamodb.v20120810.WriteRequest@20170ba5, com.amazonaws.dynamodb.v20120810.WriteRequest@564da404, com.amazonaws.dynamodb.v20120810.WriteRequest@eb89993, com.amazonaws.dynamodb.v20120810.WriteRequest@9329d749, com.amazonaws.dynamodb.v20120810.WriteRequest@bed04095, com.amazonaws.dynamodb.v20120810.WriteRequest@fee76fb4, com.amazonaws.dynamodb.v20120810.WriteRequest@2d636303, com.amazonaws.dynamodb.v20120810.WriteRequest@914d810e, com.amazonaws.dynamodb.v20120810.WriteRequest@34869323, com.amazonaws.dynamodb.v20120810.WriteRequest@1e95a7f2, com.amazonaws.dynamodb.v20120810.WriteRequest@8a78267, com.amazonaws.dynamodb.v20120810.WriteRequest@e6dd058f, com.amazonaws.dynamodb.v20120810.WriteRequest@6c555c02, com.amazonaws.dynamodb.v20120810.WriteRequest@b96cd19b, com.amazonaws.dynamodb.v20120810.WriteRequest@2db2fe52, com.amazonaws.dynamodb.v20120810.WriteRequest@d4ed5a25, com.amazonaws.dynamodb.v20120810.WriteRequest@fed61297, com.amazonaws.dynamodb.v20120810.WriteRequest@f0ee6e42, com.amazonaws.dynamodb.v20120810.WriteRequest@eeb9de58, com.amazonaws.dynamodb.v20120810.WriteRequest@680ce603, com.amazonaws.dynamodb.v20120810.WriteRequest@a32624d, com.amazonaws.dynamodb.v20120810.WriteRequest@49d0033a, com.amazonaws.dynamodb.v20120810.WriteRequest@c42078b6, com.amazonaws.dynamodb.v20120810.WriteRequest@70f93b40, com.amazonaws.dynamodb.v20120810.WriteRequest@381e4a60, com.amazonaws.dynamodb.v20120810.WriteRequest@5b747253, com.amazonaws.dynamodb.v20120810.WriteRequest@725e2473, com.amazonaws.dynamodb.v20120810.WriteRequest@7585593f, com.amazonaws.dynamodb.v20120810.WriteRequest@167c4d92, com.amazonaws.dynamodb.v20120810.WriteRequest@aa9fab9d, com.amazonaws.dynamodb.v20120810.WriteRequest@bcc1d839, com.amazonaws.dynamodb.v20120810.WriteRequest@e1791807, com.amazonaws.dynamodb.v20120810.WriteRequest@de074fd8, com.amazonaws.dynamodb.v20120810.WriteRequest@b0a6e025, com.amazonaws.dynamodb.v20120810.WriteRequest@b06faebf, com.amazonaws.dynamodb.v20120810.WriteRequest@329df746, com.amazonaws.dynamodb.v20120810.WriteRequest@b1e3b302, com.amazonaws.dynamodb.v20120810.WriteRequest@256ae7c8, com.amazonaws.dynamodb.v20120810.WriteRequest@53b02ed5, com.amazonaws.dynamodb.v20120810.WriteRequest@56a876f5, com.amazonaws.dynamodb.v20120810.WriteRequest@c88470c9, com.amazonaws.dynamodb.v20120810.WriteRequest@91001ad0, com.amazonaws.dynamodb.v20120810.WriteRequest@156e28fa, com.amazonaws.dynamodb.v20120810.WriteRequest@a60d5e79, com.amazonaws.dynamodb.v20120810.WriteRequest@dba4240c, com.amazonaws.dynamodb.v20120810.WriteRequest@8402ed21, com.amazonaws.dynamodb.v20120810.WriteRequest@e4d99c2e, com.amazonaws.dynamodb.v20120810.WriteRequest@42927f45, com.amazonaws.dynamodb.v20120810.WriteRequest@5dfa76f7, com.amazonaws.dynamodb.v20120810.WriteRequest@220be9fd, com.amazonaws.dynamodb.v20120810.WriteRequest@9c4272a, com.amazonaws.dynamodb.v20120810.WriteRequest@e50eb3ec, com.amazonaws.dynamodb.v20120810.WriteRequest@3fcdb9e1, com.amazonaws.dynamodb.v20120810.WriteRequest@4c3f569f, com.amazonaws.dynamodb.v20120810.WriteRequest@f23d2b, com.amazonaws.dynamodb.v20120810.WriteRequest@86b2609e, com.amazonaws.dynamodb.v20120810.WriteRequest@74e971fb, com.amazonaws.dynamodb.v20120810.WriteRequest@beb8fa3b, com.amazonaws.dynamodb.v20120810.WriteRequest@b02c9c8f, com.amazonaws.dynamodb.v20120810.WriteRequest@909a7c00, com.amazonaws.dynamodb.v20120810.WriteRequest@db028f99, com.amazonaws.dynamodb.v20120810.WriteRequest@4adde12d, com.amazonaws.dynamodb.v20120810.WriteRequest@65024c4b, com.amazonaws.dynamodb.v20120810.WriteRequest@6281a988, com.amazonaws.dynamodb.v20120810.WriteRequest@27a5ecb6, com.amazonaws.dynamodb.v20120810.WriteRequest@a88a3164, com.amazonaws.dynamodb.v20120810.WriteRequest@c9f643e6, com.amazonaws.dynamodb.v20120810.WriteRequest@48b25fd5, com.amazonaws.dynamodb.v20120810.WriteRequest@c6958f5a, com.amazonaws.dynamodb.v20120810.WriteRequest@165b0f44, com.amazonaws.dynamodb.v20120810.WriteRequest@c11bbd26, com.amazonaws.dynamodb.v20120810.WriteRequest@8f3cba63, com.amazonaws.dynamodb.v20120810.WriteRequest@29daed50, com.amazonaws.dynamodb.v20120810.WriteRequest@7bd6d05e, com.amazonaws.dynamodb.v20120810.WriteRequest@f208dbdd, com.amazonaws.dynamodb.v20120810.WriteRequest@761ee955, com.amazonaws.dynamodb.v20120810.WriteRequest@39aefa0f, com.amazonaws.dynamodb.v20120810.WriteRequest@6955d083, com.amazonaws.dynamodb.v20120810.WriteRequest@b96f24ff, com.amazonaws.dynamodb.v20120810.WriteRequest@832ced37]}' at 'requestItems' failed to satisfy constraint: Map value must satisfy constraint: [Member must have length less than or equal to 25, Member must have length greater than or equal to 1]

Delete the table `demographics` from dynamodb
---

In [161]:
import os
os.environ["TZ"]="UTC"
import boto3
import sys

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')

try:
    dynamodb.Table('demographics').delete()
    print('deleted table')
        
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

deleted table


In [None]:
"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids="

In [10]:
print(glob.glob('./data/*/' + target_list[8] +'.nxml'))
print(glob.glob('./data/*/'+'PMC5388656'+'.nxml'))

[]
['./data/3_Biotech/PMC5388656.nxml']


In [21]:
downloaded_files_list = glob.glob('./downloaded/PMC*.nxml')
downloaded_files_list = [i[16:] for i  in downloaded_files_list  ]
downloaded_files_list = [i[:-5] for i  in downloaded_files_list  ]

print(downloaded_files_list[:10])
print(len(downloaded_files_list))

['1002252', '1002265', '1002290', '1002414', '1002417', '1002474', '1002838', '1002949', '1002951', '1002983']
159782


In [84]:
import os
number = 0

def utf8len(s):
    return len(s.encode('utf-8'))

downloaded_files_list = glob.glob('./downloaded/PMC*.nxml')
print(len(downloaded_files_list))
for fh  in downloaded_files_list: 
    b = os.path.getsize(fh)
    if (str(b) > "20000L"):
        number += 1
print(number)
print("Percentage above: %.0f " % (number / len(downloaded_files_list) *100))

134312
97932
Percentage above: 73 


In [18]:
clinical_data_files_list = glob.glob('./clinical_data/PMC*.nxml')
clinical_data_files_list = [i[19:] for i  in clinical_data_files_list  ]
clinical_data_files_list = [i[:-5] for i  in clinical_data_files_list  ]

print(clinical_data_files_list[:10])
print(len(clinical_data_files_list))

['ed/targetarticles/PMC3778263', 'ed/targetarticles/PMC4473156', 'ed/targetarticles/PMC5771285', 'ed/targetarticles/PMC4264693', 'ed/targetarticles/PMC4210730', 'ed/targetarticles/PMC4578256', 'ed/targetarticles/PMC5424460', 'ed/targetarticles/PMC5541470', 'ed/targetarticles/PMC4634690', 'ed/targetarticles/PMC4548150']
291151


In [90]:
not_downloaded_list = list(set(small_list) - set(clinical_data_files_list))
print(len(not_downloaded_list))
to_be_downloaded_list = list(set(not_downloaded_list) - set(downloaded_files_list))
print(len(to_be_downloaded_list))

3657333
3657333


In [15]:
in_clinical_data_list = list(set(target_list) - set(clinical_data_files_list))
print(len(in_clinical_data_list))
to_be_downloaded_list = list(set(target_list) - set(in_clinical_data_list))
print(len(to_be_downloaded_list))

291059
0


In [49]:
common_downloaded_list = list(set(clinical_data_in_small_list) - set(downloaded_files_list))
print(len(common_downloaded_list))


0


In [50]:
clinical_data_in_small_list = list(set.intersection(set(clinical_data_files_list),set(small_list)))
print(len(clinical_data_in_small_list))


13938


In [27]:
clinical_data_in_target_list = list(set.intersection(set(clinical_data_files_list),set(target_list)))
print(len(clinical_data_in_target_list))


22833


In [28]:
downloaded_files_in_target_list = list(set.intersection(set(downloaded_files_list),set(target_list)))
print(len(downloaded_files_in_target_list))


48991


In [29]:
common_downloaded_list = list(set.intersection(set(downloaded_files_in_target_list),set(clinical_data_in_target_list)))
print(len(common_downloaded_list))


13575


In [31]:
from shutil import copyfile

#source = 'clinical_data'
source = 'downloaded'
destination = 'targetarticles'

for fh in additional_downloaded_list:
    copyfile('./'+ source +'/PMC' + fh +'.nxml', './'+ destination +'/PMC' + fh +'.nxml')

In [11]:
glob.glob('./data/*/'+'PMC368156'+'.nxml')

['./data/PLoS_Biol/PMC368156.nxml']

In [26]:
noncomm_files_list = glob.glob('/media/dave/NAS Disk1 4TB/pubmed/non_comm_use_data/PMC*.nxml')
noncomm_files_list = [i[54:] for i  in noncomm_files_list  ]
noncomm_files_list = [i[:-5] for i  in noncomm_files_list  ]

print(noncomm_files_list[:10])
print(len(noncomm_files_list))

['100321', '100322', '100323', '100324', '100325', '100326', '100327', '100357', '100780', '100781']
822090


In [27]:
noncomm_files_files_in_target_list = list(set.intersection(set(noncomm_files_list),set(target_list)))
print(len(noncomm_files_files_in_target_list))

122093


In [28]:
additional_list = list(set(noncomm_files_files_in_target_list) - set(targetarticles_files_list))
print(len(additional_list))

105315


In [29]:
print(len(list(set.intersection(set(noncomm_files_files_in_target_list),set(targetarticles_files_list)))))

16778


In [31]:
comm_files_list = glob.glob('/media/dave/NAS Disk1 4TB/pubmed/comm_use_data/PMC*.nxml')
comm_files_list = [i[50:] for i  in comm_files_list  ]
comm_files_list = [i[:-5] for i  in comm_files_list  ]

print(comm_files_list[:10])
print(len(comm_files_list))

['925', '931', '430', '557', '627', '859', '860', '861', '862', '830']
1051288


In [27]:
comm_files_files_in_target_list = list(set.intersection(set(comm_files_list),set(target_list)))
print(len(comm_files_files_in_target_list))

122093


In [28]:
additional_list = list(set(comm_files_files_in_target_list) - set(targetarticles_files_list))
print(len(additional_list))

105315


In [29]:
print(len(list(set.intersection(set(comm_files_files_in_target_list),set(targetarticles_files_list)))))

16778


In [30]:
from shutil import copyfile

#source = 'clinical_data'
source = 'downloaded'
destination = 'targetarticles'

for fh in additional_list:
    copyfile('/media/dave/NAS Disk1 4TB/pubmed/comm_use_data/PMC' + fh +'.nxml', '/home/dave/datapubmed/'+ destination +'/PMC' + fh +'.nxml')

In [46]:
additional_list = list(set(target_list) - set(targetarticles_files_list))
print(len(additional_list))

0


In [32]:
163564+126917

290481

In [4]:
clinical_oa_df = pd.read_csv('clinical_oa_df.csv')

In [5]:
clinical_oa_df.head(20)

Unnamed: 0.1,Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,48,oa_package/94/ba/PMC17815.tar.gz,Arthritis Res. 2000 Jun 8; 2(4):327-336,PMC17815,2014-04-29 19:51:33,11056673.0,NO-CC CODE
1,115,oa_package/44/ab/PMC29034.tar.gz,Crit Care. 2000 Jan 24; 4(1):40-44,PMC29034,2014-04-29 19:58:37,11056743.0,NO-CC CODE
2,123,oa_package/ad/d7/PMC29042.tar.gz,Crit Care. 2000 Apr 13; 4(3):188-192,PMC29042,2014-04-29 19:58:41,11056751.0,NO-CC CODE
3,126,oa_package/e7/5a/PMC29045.tar.gz,Crit Care. 2000 Jun 27; 4(4):249-254,PMC29045,2014-04-29 19:58:41,11056754.0,NO-CC CODE
4,128,oa_package/ed/00/PMC29047.tar.gz,Crit Care. 2000 Jul 31; 4(5):302-308,PMC29047,2014-04-29 19:58:41,11056756.0,NO-CC CODE
5,131,oa_package/2f/ef/PMC29050.tar.gz,Crit Care. 2000 Sep 8; 4(5):319-326,PMC29050,2014-04-29 19:58:42,11056759.0,NO-CC CODE
6,134,oa_package/37/95/PMC29053.tar.gz,Crit Care. 2001 Dec 8; 5(1):24-30,PMC29053,2014-04-29 19:58:42,11178222.0,NO-CC CODE
7,194,oa_package/fd/45/PMC30713.tar.gz,Crit Care. 2001 Jan 29; 5(2):81-87,PMC30713,2014-04-29 19:59:50,11299066.0,NO-CC CODE
8,237,oa_package/27/87/PMC31578.tar.gz,Crit Care. 2001 Apr 6; 5(3):145-150,PMC31578,2013-03-19 16:18:04,11353931.0,NO-CC CODE
9,238,oa_package/ec/70/PMC31579.tar.gz,Crit Care. 2001 Apr 20; 5(3):151-157,PMC31579,2013-03-19 16:18:04,11353932.0,NO-CC CODE


In [6]:
clinical_oa_df.shape

(42886, 7)

In [5]:
df_table1 = pd.read_csv('clinical_oa_df_table1.csv')

In [8]:
df_table1.shape

(42886, 11)

In [4]:
import numpy as np
choice = np.random.randint(2000, size=20)
print(choice)
small_list = clinical_oa_df.iloc[choice]['PMID'].astype(np.int)
print(small_list)

[ 657  112 1781 1007  923 1495 1155  401 1693 1983  898 1973  978  824  192
 1085  300  574  355 1167]
657     16689997
112     12857352
1781     8624266
1007    17328809
923     17485816
1495     2871860
1155    17370096
401     15963239
1693    17942387
1983    18031577
898     17169156
1973    17900336
978     17184516
824     17032439
192     15196307
1085    17370032
300     15667660
574     16426449
355     15850486
1167    17340137
Name: PMID, dtype: int64


In [13]:
sample_tree = load_xml('pmc', 3147286)
print(parse_pubmed_web_tree(sample_tree))

{'title': '', 'abstract': '', 'journal': 'Background;Methods;Results;Conclusion', 'affiliation': '', 'authors': '', 'year': ''}


In [67]:
def load_xml(database, articleid, sleep=None):
    """
    Load XML file from given pmid from eutils site
    return a dictionary for given pmid and xml string from the site
    sleep: how much time we want to wait until requesting new xml
    """
    link = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=" + database + "&retmode=xml&id=%s" % str(articleid)
    page = requests.get(link)
    tree = html.fromstring(page.content)
    if sleep is not None:
        time.sleep(sleep)
    return tree

def parse_pubmed_web_tree(tree):
    """
    Giving tree, return simple parsed information from the tree
    """

    if tree.xpath('//articletitle') is not None:
        title = ' '.join([title.text for title in tree.xpath('//articletitle')])
    else:
        title = ''

    abstract_tree = tree.xpath('//abstract/abstracttext')
    abstract = ' '.join([stringify_children(a).strip() for a in abstract_tree])

    if tree.xpath('//article//title') is not None:
        journal = ';'.join([t.text.strip() for t in tree.xpath('//article//title')])
    else:
        journal = ''

    pubdate = tree.xpath('//pubmeddata//history//pubmedpubdate[@pubstatus="medline"]')
    if len(pubdate) >= 1 and pubdate[0].find('year') is not None:
        year = pubdate[0].find('year').text
    else:
        year = ''

    affiliations = list()
    if tree.xpath('//affiliationinfo/affiliation') is not None:
        for affil in tree.xpath('//affiliationinfo/affiliation'):
            affiliations.append(affil.text)
    affiliations_text = '; '.join(affiliations)

    authors_tree = tree.xpath('//authorlist/author')
    authors = list()
    if authors_tree is not None:
        for a in authors_tree:
            firstname = a.find('forename').text if a.find('forename') is not None else ''
            lastname = a.find('lastname').text if a.find('forename') is not None else ''
            fullname = (firstname + ' ' + lastname).strip()
            if fullname == '':
                fullname = a.find('collectivename').text if a.find('collectivename') is not None else ''
            authors.append(fullname)
        authors_text = '; '.join(authors)
    else:
        authors_text = ''

    dict_out = {'title': title,
                'abstract': abstract,
                'journal': journal,
                'affiliation': affiliations_text,
                'authors': authors_text,
                'year': year}
    return dict_out

def parse_xml_web(pmid, sleep=None, save_xml=False):
    """
    Give pmid, load and parse xml from Pubmed eutils
    if save_xml is True, save xml output in dictionary
    """
    tree = load_xml(pmid, sleep=sleep)
    dict_out = parse_pubmed_web_tree(tree)
    dict_out['pmid'] = str(pmid)
    if save_xml:
        dict_out['xml'] = etree.tostring(tree)
    return dict_out

def read_xml(path):
    """
    Parse tree from given XML path
    """
    try:
        tree = etree.parse(path)
    except:
        try:
            tree = etree.fromstring(path)
        except Exception as e:
            print("Error: it was not able to read a path, a file-like object, or a string as an XML")
            raise
    if '.nxml' in path:
        remove_namespace(tree) # strip namespace for
    return tree

def remove_namespace(tree):
    """
    Strip namespace from parsed XML
    """
    for node in tree.iter():
        try:
            has_namespace = node.tag.startswith('{')
        except AttributeError:
            continue  # node.tag is not a string (node is a comment or similar)
        if has_namespace:
            node.tag = node.tag.split('}', 1)[1]
            
def stringify_children(node):
    """
    Filters and removes possible Nones in texts and tails
    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
    """
    parts = ([node.text] +
             list(chain(*([c.text, c.tail] for c in node.getchildren()))) +
             [node.tail])
    return ''.join(filter(None, parts))


def parse_article_meta(tree):
    """
    Parse PMID, PMC and DOI from given article tree
    """
    article_meta = tree.find('.//article-meta')
    pmid_node = article_meta.find('article-id[@pub-id-type="pmid"]')
    pmc_node = article_meta.find('article-id[@pub-id-type="pmc"]')
    pub_id_node = article_meta.find('article-id[@pub-id-type="publisher-id"]')
    doi_node = article_meta.find('article-id[@pub-id-type="doi"]')

    pmid = pmid_node.text if pmid_node is not None else ''
    pmc = pmc_node.text if pmc_node is not None else ''
    pub_id = pub_id_node.text if pub_id_node is not None else ''
    doi = doi_node.text if doi_node is not None else ''

    dict_article_meta = {'pmid': pmid,
                         'pmc': pmc,
                         'doi': doi,
                         'publisher_id': pub_id}

    return dict_article_meta

def table_to_df(table_text):
    """
    Function to transform plain xml text to list of row values and
    columns
    """
    table_tree = etree.fromstring(table_text)
    columns = []
    for tr in table_tree.xpath('thead/tr'):
        for c in tr.getchildren():
            columns.append(unidecode(stringify_children(c)))

    row_values = []
    len_rows = []
    for tr in table_tree.findall('tbody/tr'):
        es = tr.xpath('td')
        row_value = [unidecode(stringify_children(e)) for e in es]
        len_rows.append(len(es))
        row_values.append(row_value)
    if len(len_rows) >= 1:
        len_row = max(set(len_rows), key=len_rows.count)
        row_values = [r for r in row_values if len(r) == len_row] # remove row with different length
        return columns, row_values
    else:
        return None, None


def parse_pubmed_table(path, return_xml=True):
    """
    Parse table from given Pubmed Open-Access XML file
    """
    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    # parse table
    tables = tree.xpath('//body//sec//table-wrap')
    table_dicts = list()
    for table in tables:
        if table.find('label') is not None:
            label = unidecode(table.find('label').text or '')
        else:
            label = ''

        # table caption
        if table.find('caption/p') is not None:
            caption_node = table.find('caption/p')
        elif table.find('caption/title') is not None:
            caption_node = table.find('caption/title')
        else:
            caption_node = None
        if caption_node is not None:
            caption = unidecode(stringify_children(caption_node).strip())
        else:
            caption = ''

        # table content
        if table.find('table') is not None:
            table_tree = table.find('table')
        elif table.find('alternatives/table') is not None:
            table_tree = table.find('alternatives/table')
        else:
            table_tree = None

        if table_tree is not None:
            table_xml = etree.tostring(table_tree)
            columns, row_values = table_to_df(table_xml)
            if row_values is not None:
                table_dict = {'pmid': pmid,
                              'pmc': pmc,
                              'label': label,
                              'caption': caption,
                              'table_columns': columns,
                              'table_values': row_values}
                if return_xml:
                    table_dict['table_xml'] = table_xml
                table_dicts.append(table_dict)
    if len(table_dicts) >= 1:
        return table_dicts
    else:
        return None
    
def parse_html_table_to_df(table):
    n_columns = 0
    n_rows=0
    column_names = []

    # we find the column titles if we can
    for row in table.thead.find_all('tr'):
        # Handle column names if we find them
        td_tags = row.find_all('td') 
        if len(td_tags) > 0 and len(column_names) == 0:
            for td in td_tags:
                column_names.append(td.get_text())
        
    # Find number of rows
    for row in table.tbody.find_all('tr'):
    
        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows+=1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)
            
    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0,n_columns)
    df = pd.DataFrame(columns = columns,
                      index= range(0,n_rows))
    row_marker = 0
    for row in table.tbody.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1

    # Convert to float if possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df

In [9]:
import pandas as pd
from bs4 import BeautifulSoup

html_string = '''
  <table>
        <tr>
            <td> Hello! </td>
            <td> Table </td>
        </tr>
    </table>
'''

soup = BeautifulSoup(html_string, 'lxml') # Parse the HTML as a string

table = soup.find_all('table')[0] # Grab the first table

new_table = pd.DataFrame(columns=range(0,2), index = [0]) # I know the size 

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        new_table.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1

new_table

Unnamed: 0,0,1
0,Hello!,Table


In [24]:
fh = open('./targetarticles/PMC4210730.nxml','r')
soup = bs.BeautifulSoup(fh,'lxml')
print(soup.prettify())

#table = soup.find('table')

#print(table.thead.prettify())

#parse_html_table(table)
#tree = read_xml('/Volumes/Untitled/clinical_data/PMC1297588.nxml')

#print(etree.tostring(tree, pretty_print=True))



#[(table['id'], parse_html_table(table))  for table in soup.find_all('table')]  



<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<html>
 <body>
  <article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
   <?properties open_access?>
   <front>
    <journal-meta>
     <journal-id journal-id-type="nlm-ta">
      Korean J Parasitol
     </journal-id>
     <journal-id journal-id-type="iso-abbrev">
      Korean J. Parasitol
     </journal-id>
     <journal-id journal-id-type="publisher-id">
      KJP
     </journal-id>
     <journal-title-group>
      <journal-title>
       The Korean Journal of Parasitology
      </journal-title>
     </journal-title-group>
     <issn pub-type="ppub">
      0023-4001
     </issn>
     <issn pub-type="epub">
      1738-0006
     </issn>
     <publisher>
      <publisher-name>
       The Korean Society for Parasitology and Tropical Medicine
      </publisher-name>
     </publi

In [22]:
def parse_html_table(table):
    n_columns = 0
    n_rows=0
    column_names = []

    # we find the column titles if we can
    for row in table.thead.find_all('tr'):
        # Handle column names if we find them
        td_tags = row.find_all('td') 
        if len(td_tags) > 0 and len(column_names) == 0:
            for td in td_tags:
                column_names.append(td.get_text())
        
    # Find number of rows
    for row in table.tbody.find_all('tr'):
    
        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows+=1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)
            
    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0,n_columns)
    df = pd.DataFrame(columns = columns,
                      index= range(0,n_rows))
    row_marker = 0
    for row in table.tbody.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1

    # Convert to float if possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df

In [130]:
idlist = ['1297588', '3320544', '2426698','1308868','1199622', '3221992', '1550628', '2234423', '2677666', '3214849', '3102623', '3090332']
for idee in idlist: 
    fname = glob.glob('./clinical_data/PMC%s.nxml' % idee)[0]
    print(fname)
    fh = open(fname,'r')
    soup = bs.BeautifulSoup(fh,'lxml')
    table = soup.find('table')
    print(pd.DataFrame(parse_html_table(table)).iloc[:6])
    

./clinical_data/PMC1297588.nxml
                               Placebo group (n = 14)  \
0                          Age            57.5 (11.0)   
1                    Sex (F:M)                    9:5   
2                  Height (cm)           168.4 (10.3)   
3       Methotrexate (yes/all)                   7/14   
4  Methotrexate dose (mg/week)             7.5 (10.1)   
5         Prednisone (yes/all)                   9/14   

  Vitamin B6 group (n = 14)  
0               53.9 (12.6)  
1                      12:2  
2               164.7 (9.1)  
3                      8/14  
4               10.2 (11.7)  
5                     11/14  
./clinical_data/PMC3320544.nxml
                  0           1           2           3           4  \
0                 N         583         297         374         121   
1           Age (y)  48.2 ± 6.7  50.3 ± 8.9  48.6 ± 9.2  45.1 ± 7.1   
2    Sex (% female)        47.2        51.2        57.7        52.9   
3          Race (%)                       

In [46]:
targetarticles_files_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
targetarticles_files_list = [i[40:-5] for i  in targetarticles_files_list]

print(targetarticles_files_list[:10])
print(len(targetarticles_files_list))

['3778263', '4473156', '5771285', '4264693', '4210730', '4578256', '5424460', '5541470', '4634690', '4548150']
294019


In [38]:
json = json.dumps(target_list)
f = open("target_PMCs.json","w")
f.write(json)
f.close()

In [93]:
#regexAge = '(?:mean|M)?\\W*age[d|s]?\\W*(?:range|from)?\\W+\\d+\\W*(?:to)?\\W*\\d*\\W*(?:year|day|week)'# or \\d+[\\s\\-\\d]*(?:year[s]?|month[s]?|week[s]?)[\\s\\-]*old'
regexRandom = '\\W*\\brandom(?:i(s|z))?\\W*'

file_count = 0
filecnt = []
file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
choice = np.random.randint(5000, size=200)
#for f in range(len(file_list)): 
for f in choice: 
    #print(os.path.basename(file_list[f])[:-5])
    #fh = open(file_list[f],'r')
    #soup = bs.BeautifulSoup(fh,'lxml')
    #print(soup.prettify())
    #tree = read_xml(file_list[f])
    abstract_dict = parse_pubmed_abstract(file_list[f])
    if (abstract_dict['abstract'] is not None):
        file_count += 1
        print(abstract_dict['abstract'])
    filecnt.append(abstract_dict)
    #dict_article_meta = parse_article_meta(tree)
    #pmid = dict_article_meta['pmid'];# print (pmid)
    #pmc = dict_article_meta['pmc']; #print (pmc)
    #alist = soup.findAll('a')#.findAll(text="age")
    #age = soup.findAll(text=re.compile(regexAge, re.I))
print(file_count)
filecnt









































































































































































































200


[{'abstract': '', 'pmc': '3033846', 'pmid': '21244695'},
 {'abstract': '', 'pmc': '5402023', 'pmid': '28451084'},
 {'abstract': '', 'pmc': '4010819', 'pmid': '24760348'},
 {'abstract': '', 'pmc': '5755335', 'pmid': '29301511'},
 {'abstract': '', 'pmc': '2701428', 'pmid': '19473511'},
 {'abstract': '', 'pmc': '4583395', 'pmid': '26406248'},
 {'abstract': '', 'pmc': '3651223', 'pmid': '23663486'},
 {'abstract': '', 'pmc': '3778266', 'pmid': '24179779'},
 {'abstract': '', 'pmc': '4728502', 'pmid': '26712769'},
 {'abstract': '', 'pmc': '5079100', 'pmid': '22889980'},
 {'abstract': '', 'pmc': '5256416', 'pmid': '27402733'},
 {'abstract': '', 'pmc': '2943681', 'pmid': '20882155'},
 {'abstract': '', 'pmc': '4657337', 'pmid': '26597908'},
 {'abstract': '', 'pmc': '3462125', 'pmid': '22731680'},
 {'abstract': '', 'pmc': '2448420', 'pmid': '18629257'},
 {'abstract': '', 'pmc': '2375305', 'pmid': '11875725'},
 {'abstract': '', 'pmc': '4676237', 'pmid': '26702205'},
 {'abstract': '', 'pmc': '41642

## Get the Table 1 from a list of articles and save the list of PMCIDs

In [59]:
table1_list = ['1', 'T', 'TABLE', 'TABLE 1', 'TABLE 1 ', 'TABLE1', 'TABLE 1.', 'TABLE1.', 'TABLE I', 'TABLE I.',\
               'TAB. 1', 'TAB.1', 'Table', 'Table 1', 'Table 1 ', 'Table 1 -', 'Table 1.', 'Table 1:', \
              'Tab. 1', 'Tab.1', 'Table I', 'Table I.', 'Table No. 1', 'Table. 1', 'Tabela 1', 'Tableau 1']

table1_set = set(table1_list)

table_count = 0
table1_count = 0
tablelist = []
labelslist = []
tableidslist = []
file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
choice = np.random.randint(2000, size=100)
for f in range(len(file_list)): 
#for f in choice: 
    filename = file_list[f]
    fh = open(filename,'r')
    filename = filename[40:-5]
    #print(filename)
    tree = read_xml(file_list[f])
    if tree is not None:
        tables = tree.xpath('//body//sec//table-wrap')
        if tables:
            table_count += 1
            for table in tables:
                if table.find('label') is not None:
                    label = unidecode(table.find('label').text or '')
                    labelslist.append(label)
                    tableid = unidecode(table.attrib['id'] or '')
                    tableidslist.append(tableid)
                    if label in table1_set:
                        table1_count += 1
                        tablelist.append(filename)
    

In [60]:
from collections import Counter


print(tablelist[:10])
print(len(tablelist))


common_labels = Counter(labelslist)
print(common_labels.most_common(40))

common_tableids = Counter(tableidslist)
print(common_tableids.most_common(40))

['4473156', '4264693', '5424460', '5541470', '5493262', '4778173', '4485884', '5379535', '4485694', '5079054']
176287
[('Table 1', 157680), ('Table 2', 127717), ('Table 3', 90904), ('Table 4', 54308), ('Table 5', 27844), ('Table 6', 13220), ('Table 1.', 10901), ('Table 2.', 8697), ('Table 7', 6148), ('Table 3.', 5778), ('TABLE 1', 3374), ('Table 4.', 3227), ('Table 8', 3105), ('TABLE 2', 2859), ('TABLE 3', 2087), ('Table I', 1782), ('Table 9', 1687), ('Table 5.', 1564), ('Table II', 1495), ('TABLE 4', 1267), ('', 1246), ('Table III', 1012), ('Table 10', 947), ('Table 6.', 669), ('TABLE 5', 603), ('Table 11', 586), ('Table 1. ', 570), ('Table IV', 557), ('Table-I', 453), ('Table S1', 447), ('Table I.', 427), ('Table 2. ', 427), ('Table-II', 404), ('Table 12', 376), ('Table 1:', 374), ('T', 361), ('Table', 356), ('Table II.', 324), ('Table 3. ', 308), ('Table 2:', 302)]
[('T1', 72983), ('T2', 58518), ('T3', 40742), ('Tab1', 38176), ('Tab2', 31129), ('T4', 23916), ('Tab3', 22613), ('Tab4'

In [47]:
table1_list = ['1', 'T', 'TABLE', 'TABLE 1', 'TABLE 1 ', 'TABLE1', 'TABLE 1.', 'TABLE1.', 'TABLE I', 'TABLE I.',\
               'TAB. 1', 'TAB.1', 'Table', 'Table 1', 'Table 1 ', 'Table 1 -', 'Table 1.', 'Table 1:', \
              'Tab. 1', 'Tab.1', 'Table I', 'Table I.', 'Table No. 1', 'Table. 1', 'Tabela 1', 'Tableau 1']

table1_set = set(table1_list)

#regexAge = '(?:mean|M|median|Med[.])?\\W*age[d|s]?\\W*(?:mean|M)?\\W*(?:SD)?\\W*(?:range|from)?\\W+\\d+\\W*(?:to)?\\W*\\d*\\W*(?:year|day|week)'# or '\\d+[\\s\\-\\d]*(?:year[s]?|month[s]?|week[s]?)[\\s\\-]*old?'
regexAge = '(?i)(?:mean|M?)?\W*age[d|s]?\W*?(?:mean|M)?\W*(?:year[s]|day[s]|week[s])\W*(?:mean|M)?\W*[?:-|+|±]?\W*(S[.])?(D[.])?'

table_count = 0
table1_count = 0
tablecnt = []
file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
choice = np.random.randint(2000, size=100)
for f in range(len(file_list)): 
#for f in choice: 

    #print(os.path.basename(file_list[f])[:-5])
    fh = open(file_list[f],'r')
    #soup = bs.BeautifulSoup(fh,'lxml')
    #print(soup.prettify())
    tree = read_xml(file_list[f])
    if tree is not None:
        dict_article_meta = parse_article_meta(tree)
        pmid = dict_article_meta['pmid'];# print (pmid)
        pmc = dict_article_meta['pmc']; #print (pmc)
        #tablelist = soup.find('table')#.findAll(text="age")
        #print(tablelist)
        tables = tree.xpath('//body//sec//table-wrap')
        if not tables:
            #print(file_list[f], ":  0")
            table_dict = {'PMCID': pmc, 'Table': 'no_tables', 'Id': '', 'Age': '', 'Sex': '', 'Gender': ''}
        else:
            table_count += 1
            #print(tables[0].attrib['id'])
            for table in tables:
                #has_label = table.find('label') 
                if table.find('label') is not None:
                    label = unidecode(table.find('label').text or '')
                    tableid = unidecode(table.find('id') or '')
                    #has_label_text = has_label.text
                    if label in table1_set:
                        table1_count += 1
                        soup = bs.BeautifulSoup(etree.tostring(table), "lxml")
                        #print(soup.prettify())
                        age = soup.findAll(text=re.compile(regexAge, re.I))
                        #sex = soup.findAll(text=re.compile('^Sex', re.I))
                        #gender = soup.findAll(text=re.compile('^Gender', re.I))

                        #if (len(age) > 0):
                        #    print(age)
                        #age = unidecode(soup.findAll(text=re.compile('^Age', re.I)).text or '')
                        #sex = unidecode(tablelist.findAll(text=re.compile('^Sex', re.I)).text or '')
                        table_dict = {'PMCID': pmc, 'Table': label, 'Id': table.attrib['id'], 'Age': age, 'Sex': '', 'Gender': ''}
                        break
                    else:
                        table_dict = {'PMCID': pmc, 'Table': label, 'Id': table.attrib['id'], 'Age': '', 'Sex': '', 'Gender': ''}
                else:
                    table_dict = {'PMCID': pmc, 'Table': 'no_label', 'Id': table.attrib['id'], 'Age': '', 'Sex': '', 'Gender': ''}
            #out_dict = parse_pubmed_table(file_list[f], return_xml=True)
            #print(out_dict)
            #print(file_list[f], ": ", len(tablelist))
            #agelist=tablelist.findAll(text=re.compile('Age'))
            #print(agelist)
            #sexlist=tablelist.findAll(text=re.compile('Sex'))
            #print(sexlist)
            #genderlist=tablelist.findAll(text=re.compile('Gender'))
            #print(genderlist)

            #table = soup.find('table')
            #print(table.findprettify())
            #print(parse_html_table(table).head(4))
        tablecnt.append(table_dict)
#print(tablecount)

In [17]:
file_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
for f in range(len(file_list)): 
    fh = open(file_list[f],'r')
    tree = read_xml(file_list[f])
    article_meta = tree.find('.//article-meta')
    if article_meta is None:
        print(f)
    

180322
216817


In [17]:
getDocString3 = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id="
#really_small_list = [small_list[i] for i in range(30)]
#for doc in ['216817']:
for doc in to_be_downloaded_list:    
    query = getDocString3 + str(doc)
    resp = urlopen(query).read().decode('utf-8')
    f = open("/home/dave/datapubmed/targetarticles/PMC"+ str(doc) + ".nxml", "w")
    f.write(resp)
    f.close()
#soup_doc = bs.BeautifulSoup(resp,'lxml')

HTTPError: HTTP Error 502: Bad Gateway

In [48]:
print(len(tablecnt))

291151


In [85]:
tablecnt[-1]

{'Age': [],
 'Gender': '',
 'Id': 'Tab1',
 'PMCID': '4264322',
 'Sex': '',
 'Table': 'Table 1'}

In [42]:
tree = read_xml(file_list[1091])
print(tree.find('.//article-meta'))
print(file_list[1091])

None
/home/dave/datapubmed/targetarticles/PMC216817.nxml


In [49]:
print(table_count)
print(table1_count)

178606
174007


In [50]:
withAge = [d for d in tablecnt if (d['Age'] !=  '') and (d['Age'] !=  [])]
len(withAge)

27830

In [89]:
withAgeAndSex = [d for d in withAge if (d['Sex'] !=  '') and (d['Sex'] !=  [])]
len(withAgeAndSex)

0

In [62]:
withAgeAndGender = [d for d in withAge if (d['Gender'] !=  '') and (d['Gender'] !=  [])]
len(withAgeAndGender)

2267

In [51]:
PMCIDwithAge = [d['PMCID'] for d in withAge]
AgewithAge = [d['Age'] for d in withAge]
AgewithAgeList = []
[AgewithAgeList.append(d[0]) for d in AgewithAge]
AgewithAgeList

['Age (years), median (range)',
 'Age (years)',
 'Mean age (years)\u2009±\u2009SD',
 'Mean Age (years)',
 'Age (years)',
 'Median age, years (range)',
 'Age, years',
 'Age years',
 'Age (years, SD)',
 'Age (years) mean (SD)',
 'Age (years)',
 'Age (years)',
 'Age (years)',
 'Age [years], mean (SD)',
 'Age (years)*',
 ' Age (years)',
 'Age: mean, Years±SD',
 'Age (years)',
 'Age, years',
 'Age (years)',
 'Age (years)',
 'Age, years',
 'Age, years (mean\xa0±\xa0SD)',
 'Age [years]',
 'Age, years',
 'Mean age (years)',
 'Mean (SD) age (years)',
 'Age (years)',
 'Age (years)',
 'Age, mean (years)',
 'Age (years)',
 'Age (years)',
 'Age (years)',
 'Age [years]',
 'Age (years)',
 'Age (years)',
 'Age, years (IQR)',
 'Age (years)',
 '\xa0Body age difference from chronological age (years), mean (SD)',
 'Age (years)',
 'Age, years',
 'Age (Years) (%)',
 'Age (years), ',
 'Age (years)',
 'Age (years)',
 'Age (years)',
 '\xa0Age (years)',
 'Mean age (years)',
 'Age (years)',
 'Age (years)',
 'Age

In [52]:
from collections import Counter

ages = Counter(AgewithAgeList)
ages.most_common(40)


[('Age (years)', 13496),
 ('Age, years', 3379),
 ('Mean age (years)', 547),
 ('Age (years), mean (SD)', 409),
 ('Age [years]', 370),
 ('\u2003Age (years)', 303),
 ('Age (Years)', 226),
 ('Age, years, mean (SD)', 217),
 ('Median age, years (range)', 207),
 ('Age(years)', 204),
 ('Mean age, years (SD)', 167),
 ('Mean (SD) age (years)', 165),
 ('\u2003Age, years', 161),
 ('Maternal age (years)', 160),
 ('Median age (years)', 133),
 ('Age (years)*', 116),
 ('Gestational age (weeks)', 114),
 ('Age (years) ', 107),
 ('Mean age, years', 105),
 ('Age (years), mean ± SD', 91),
 ('\xa0Age (years)', 88),
 ('Age (years), median (IQR)', 88),
 ('Age, years (SD)', 81),
 ('Mean (SD) age, years', 81),
 ('Age years', 76),
 ('Age (years), median (range)', 73),
 ('Age (years old)', 71),
 ('Age (years):', 70),
 ('age (years)', 59),
 ('Mean age, years (range)', 57),
 ('Age (years, mean ± SD)', 57),
 ('Age, mean years (SD)', 56),
 ('Age, years, median (range)', 56),
 ('Age, years (mean ± SD)', 55),
 ('Age, y

In [31]:
from shutil import copyfile

source = 'targetarticles'
destination = 'ForTableDistentangler'

for fh in PMCIDwithAge[2000:3001]:
    copyfile('/home/dave/datapubmed/'+ source +'/PMC' + fh +'.nxml', '/home/dave/datapubmed/'+ destination +'/PMC' + fh +'.nxml')

In [70]:
idlist = ['5249075']
for idee in idlist: 
    fname = glob.glob('./clinical_data/PMC%s.nxml' % idee)[0]
    print(fname)
    fh = open(fname,'r')
    soup = bs.BeautifulSoup(fh,'lxml')
    table = soup.find('table')
    dframe = pd.DataFrame(parse_html_table(table))
dframe

./clinical_data/PMC5249075.nxml


Unnamed: 0,0,1,2,3,4
0,"Age, years (mean SD)",78.1 (7.2),78.1 (7.2),77.7 (7.0),77.6 (7.0)
1,"Gender, n (%)\nMale",190 (38.5),189 (39.1),210 (40.6),207 (40.8)
2,Female,303 (61.5),295 (61.0),307 (59.4),300 (59.2)
3,Body Mass Index (mean SD),27.6 (5.3),27.6 (5.3),27.7 (5.4),27.7 (5.4)
4,"Ethnic group (White British), n (%)",492 (99.8),483 (99.8),510 (98.7),500 (98.6)
5,"Self-reported arthritis, n (%)",292 (59.2),286 (59.1),300 (58),290 (57.2)
6,"Live alone, n (%)",236 (47.9),230 (47.5),220 (42.6),214 (42.2)
7,">4 prescribed medications, n (%)",313 (63.5),305 (63.0),304 (58.8),297 (58.6)
8,"Current use of foot orthoses, n (%)",191 (38.7),189 (39.1),163 (31.5),161 (31.8)
9,"1+ falls in previous 12 months, n (%)",325 (65.9),319 (65.9),332 (64.2),323 (63.7)


In [198]:
def parse_html_table_to_json(table):
    
    rows = table.find_all("tr")

    headers = {}
    
    thead = table.find("thead")
    if thead is not None:
        ths = thead.find_all("th")

        for i in range(len(ths)):
            headers[i] = ths[i].text.strip().lower()

    data = []

    for row in rows:
        if row is not None:
            cells = row.find_all("td")

            item = {}
            if len(cells) == len(headers):

                for index in headers:
                    item[headers[index]] = unidecode(cells[index].text)

                data.append(item)

    #return json.dumps(data, indent=4)
    return data

In [37]:
with open('target_PMCs.json', 'wb') as outfile:
    json.dumps(target_list, outfile)

TypeError: dumps() takes 1 positional argument but 2 were given

In [38]:
json = json.dumps(target_list)
f = open("target_PMCs.json","w")
f.write(json)
f.close()

In [203]:
table1_list = ['1', 'T', 'TABLE', 'TABLE 1', 'TABLE 1 ', 'TABLE1', 'TABLE 1.', 'TABLE1.', 'TABLE I', 'TABLE I.',\
               'TAB. 1', 'TAB.1', 'Table', 'Table 1', 'Table 1 ', 'Table 1 -', 'Table 1.', 'Table 1:', \
              'Tab. 1', 'Tab.1', 'Table I', 'Table I.', 'Table No. 1', 'Table. 1', 'Tabela 1', 'Tableau 1']

table1_set = set(table1_list)

tablecnt = []
file_list = glob.glob('./clinical_data/PMC3104488.nxml')
choice = np.random.randint(2000, size=30)
for f in range(len(file_list)): 
#for f in choice: 

    
    fh = open(file_list[f],'r')
    soup = bs.BeautifulSoup(fh,'lxml')
    tablelist = soup.find("table-wrap", id="T1")
    if tablelist is not None:
        #for table in tablelist.find_all('table'):
        #    if table is not None:
            #table = tree.xpath('/table')
                #agelist = unidecode(tablelist.findAll(text=re.compile('^Age', re.I)).text or '')
        table = tablelist.table
        if table is not None:
            out_json = parse_html_table_to_json(table)
            
            if (len(out_json) > 0 ):
                pmcid = os.path.basename(file_list[f])[:-5]
                table_dict = {'PMCID': pmcid, 'Table': out_json}
print(json.dumps(table_dict))         
f = open("table1.json","w")
f.write(json.dumps(table_dict))
f.close()
tablecnt.append(table_dict)

{"PMCID": "PMC3104488", "Table": [{"patients": "Age", "4t(n = 708)": "", "qualitative sample (n = 45)": ""}, {"patients": " Mean age (+- SD)", "4t(n = 708)": "61.7 (+-9.8)*", "qualitative sample (n = 45)": "64.7 (+- 8.5)**"}, {"patients": "", "4t(n = 708)": "", "qualitative sample (n = 45)": ""}, {"patients": "Sex", "4t(n = 708)": "", "qualitative sample (n = 45)": ""}, {"patients": " Male (%)", "4t(n = 708)": "456 (64)", "qualitative sample (n = 45)": "29 (64)"}, {"patients": " Female (%)", "4t(n = 708)": "252 (36)", "qualitative sample (n = 45)": "16 (36)"}, {"patients": "", "4t(n = 708)": "", "qualitative sample (n = 45)": ""}, {"patients": "Randomisation", "4t(n = 708)": "", "qualitative sample (n = 45)": ""}, {"patients": " Biphasic (%)", "4t(n = 708)": "235 (33)", "qualitative sample (n = 45)": "15 (33)"}, {"patients": " Prandial (%)", "4t(n = 708)": "239 (34)", "qualitative sample (n = 45)": "15 (33)"}, {"patients": " Basal (%)", "4t(n = 708)": "234 (33)", "qualitative sample (n

In [191]:
f = open("table1.json","w")
f.write(tablecnt)
f.close()

TypeError: write() argument must be str, not list

In [144]:
no_tables

63871

In [142]:
file_list = glob.glob('./clinical_data/PMC*.nxml')
len(file_list)

no_tables = 0
for f in range(len(file_list)): 

    tree = read_xml(file_list[f])
    tables = tree.xpath('//body//sec//table-wrap')
    for table in tables:
            no_tables += 1


In [120]:
out_dict = parse_pubmed_table('./clinical_data/PMC3104488.nxml', return_xml=True)
print(out_dict[0])

{'pmid': '21542916', 'pmc': '3104488', 'label': 'Table 1', 'caption': 'Patient and staff characteristics', 'table_columns': ['Patients', '4T(n = 708)', 'Qualitative sample (n = 45)'], 'table_values': [['Age', '', ''], [' Mean age (+- SD)', '61.7 (+-9.8)*', '64.7 (+- 8.5)**'], ['', '', ''], ['Sex', '', ''], [' Male (%)', '456 (64)', '29 (64)'], [' Female (%)', '252 (36)', '16 (36)'], ['', '', ''], ['Randomisation', '', ''], [' Biphasic (%)', '235 (33)', '15 (33)'], [' Prandial (%)', '239 (34)', '15 (33)'], [' Basal (%)', '234 (33)', '15 (33)'], ['', '', ''], ['Glycaemic control at Yr 3', '', ''], [' Median HbA1c', '6.9%', '6.9%'], [' Number (%) of patients with HbA1c PS 6.5%', '283 (40)', '19 (42)'], [' Number (%) of patients with HbA1c PS 7.0%', '425 (60)', '26 (58)'], ['', '', ''], ['Health Care Professionals', '4T ', 'Qualitative sample (n = 21)'], [' Role', '', ''], [' Physician (Phy)', '-', '9'], [' Research Nurse (RNs)', '-', '12'], ['', '', ''], ['Experience in diabetic medicine'

In [69]:
json = json.dumps(out_dict[0])
f = open("out_dict.json","w")
f.write(json)
f.close()

TypeError: Object of type 'bytes' is not JSON serializable

In [49]:
print(len(tablecnt))

1000


In [51]:
withAge = [d for d in tablecnt if (d['Age'] !=  '') and (d['Age'] !=  [])]
len(withAge)

305

In [52]:
withAge[:20]

[{'Age': ['Age, yr'],
  'Id': 'T1',
  'PMCID': '1351193',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age'],
  'Id': 'T1',
  'PMCID': '2584023',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age, y'],
  'Id': 'tbl1',
  'PMCID': '2409178',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age (years) (SD)'],
  'Id': 'T1',
  'PMCID': '2528008',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age (years)', 'Age at Onset of Depression'],
  'Id': 'T1',
  'PMCID': '1373611',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age (yr)'],
  'Id': 'T1',
  'PMCID': '2689488',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age in years: median (interquartile range)'],
  'Id': 'T1',
  'PMCID': '1872030',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age [months, mean (SD)]', 'Age categories ['],
  'Id': 'T1',
  'PMCID': '2684117',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['Age (years)'],
  'Id': 'T1',
  'PMCID': '2467430',
  'Sex': 'sex',
  'Table': 'Table 1'},
 {'Age': ['

In [57]:
from collections import Counter
Counter(k['Id'] for k in tablecnt if k.get('Id'))

Counter({'pmed-0020078-t001': 1,
         'T1': 8286,
         'pmed-0020092-t001': 1,
         'Tab1': 5693,
         'tab1': 31,
         'tbl1': 481,
         'pntd.0005197.t001': 1,
         't1': 123,
         'pone.0170277.t001': 1,
         'T4': 5,
         'birt12241-tbl-0001': 1,
         'pone.0168712.t001': 1,
         'pntd.0005389.t001': 1,
         'table1': 450,
         'sms12725-tbl-0001': 1,
         'pone.0179866.t001': 1,
         'Table1': 18,
         'pone.0179600.t001': 1,
         'pmed-0020135-t001': 1,
         'pmed-0020190-t001': 1,
         'pmed-0020233-t001': 1,
         'pmed-0020265-t001': 1,
         'pmed-0020295-t001': 1,
         'pmed-0020298-t001': 1,
         'pmed-0020345-t001': 1,
         'pmed-0030162-t001': 1,
         'pmed-0030134-t001': 1,
         'pmed-0030238-t001': 1,
         'T7': 1,
         'ppat-0020092-t001': 1,
         'pone-0000098-t001': 1,
         'pone-0000097-t001': 1,
         'pone-0000089-t001': 1,
         'pmed-00

In [17]:
tablecount

[{'Age': '', 'PMCID': '1064097', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1065065', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1065067', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1065097', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1065317', 'Sex': '', 'Table': 'no_tables'},
 {'Age': 'age', 'PMCID': '1069669', 'Sex': 'sex', 'Table': 'Table 1'},
 {'Age': '', 'PMCID': '1079830', 'Sex': '', 'Table': 'Table 10'},
 {'Age': '', 'PMCID': '1079844', 'Sex': '', 'Table': 'Table 3'},
 {'Age': 'age', 'PMCID': '1079862', 'Sex': 'sex', 'Table': 'Table 1'},
 {'Age': '', 'PMCID': '1079897', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1079899', 'Sex': '', 'Table': 'no_tables'},
 {'Age': '', 'PMCID': '1079941', 'Sex': '', 'Table': 'Table 7'},
 {'Age': '', 'PMCID': '1079947', 'Sex': '', 'Table': 'Table 2'},
 {'Age': 'age', 'PMCID': '1083417', 'Sex': 'sex', 'Table': 'Table 1'},
 {'Age': '', 'PMCID': '1084249', 'Sex': '', 'Table': 'Tab

In [50]:
newtablecount = [dict(t) for t in set([tuple(d.items()) for d in tablecount])]

In [58]:
len(newtablecount)

23700

In [59]:
withTable1 = [d for d in newtablecount if d['Table'] == 'Table 1']
len(withTable1)

20091

In [107]:
withAge = [d for d in tablecount if (d['Age'] !=  '') and (d['Age'] !=  [])]
len(withAge)

988

In [109]:
withAgeAndSex = [d for d in withAge if (d['Sex'] !=  '') and (d['Sex'] !=  [])]
len(withAgeAndSex)

214

In [111]:
withAgeAndSex[30:40]

[{'Age': ['Age (years, mean ± SD)'],
  'PMCID': '3039748',
  'Sex': ['Sex'],
  'Table': 'Table 1'},
 {'Age': ['Age (mean, SD)'],
  'PMCID': '3090332',
  'Sex': ['Sex male/female (male%)'],
  'Table': 'Table 1'},
 {'Age': ['Age (years)', 'Age at disease onset'],
  'PMCID': '3102623',
  'Sex': ['Sex (fem/male)'],
  'Table': 'Table 1'},
 {'Age': ['Age'], 'PMCID': '3104488', 'Sex': ['Sex'], 'Table': 'Table 1'},
 {'Age': ['Age'], 'PMCID': '3135553', 'Sex': ['Sex'], 'Table': 'Table 1'},
 {'Age': ['Age'],
  'PMCID': '3212926',
  'Sex': ['Sex (Female/Male)'],
  'Table': 'Table 1'},
 {'Age': ['Age'], 'PMCID': '3214849', 'Sex': ['Sex'], 'Table': 'Table 1'},
 {'Age': ['Age, years'],
  'PMCID': '3221992',
  'Sex': ['Sex, male'],
  'Table': 'Table 1'},
 {'Age': ['Age'], 'PMCID': '3274446', 'Sex': ['Sex'], 'Table': 'Table 1'},
 {'Age': ['Age (y)'],
  'PMCID': '3320544',
  'Sex': ['Sex (% female)'],
  'Table': 'Table 1'}]

In [99]:
def parse_html_table(table):
    n_columns = 0
    n_rows=0
    column_names = []

    # we find the column titles if we can
    for row in table.thead.find_all('tr'):
        # Handle column names if we find them
        td_tags = row.find_all('td') 
        if len(td_tags) > 0 and len(column_names) == 0:
            for td in td_tags:
                column_names.append(td.get_text())
        
    # Find number of rows
    for row in table.tbody.find_all('tr'):
    
        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows+=1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)
            
    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0,n_columns)
    df = pd.DataFrame(columns = columns,
                      index= range(0,n_rows))
    row_marker = 0
    for row in table.tbody.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1

    # Convert to float if possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df

tablecount = []
file_list = glob.glob('./clinical_data/PMC*.nxml')
choice = np.random.randint(2000, size=100)
for f in range(len(file_list)): 
#for f in choice: 

    #print(os.path.basename(file_list[f])[:-5])
    fh = open(file_list[f],'r')
    soup = bs.BeautifulSoup(fh,'lxml')
    #print(soup.prettify())
    tree = read_xml(file_list[f])
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid'];# print (pmid)
    pmc = dict_article_meta['pmc']; #print (pmc)
    tablelist = soup.find('table')#.findAll(text="age")
    #print(tablelist)
    if not tablelist:
        #print(file_list[f], ":  0")
        table_dict = {'PMCID': pmc, 'Table': '', 'Age': '', 'Sex': ''}
    else:
        tables = tree.xpath('//body//sec//table-wrap')
        for table in tables:
            #has_label = table.find('label') 
            if table.find('label') is not None:
                #has_label_text = has_label.text
                if table.find('label').text == 'Table 1':
                    label = unidecode(table.find('label').text or '')
                    age = unidecode(tablelist.findAll(text=re.compile('^Age', re.I)).text or '')
                    sex = unidecode(tablelist.findAll(text=re.compile('^Sex', re.I)).text or '')
                    table_dict = {'PMCID': pmc, 'Table': label, 'Age': age, 'Sex': sex}
                else:
                    table_dict = {'PMCID': pmc, 'Table': '', 'Age': '', 'Sex': ''}
            else:
                table_dict = {'PMCID': pmc, 'Table': '', 'Age': '', 'Sex': ''}
        #out_dict = parse_pubmed_table(file_list[f], return_xml=True)
        #print(out_dict)
        #print(file_list[f], ": ", len(tablelist))
        #agelist=tablelist.findAll(text=re.compile('Age'))
        #print(agelist)
        #sexlist=tablelist.findAll(text=re.compile('Sex'))
        #print(sexlist)
        #genderlist=tablelist.findAll(text=re.compile('Gender'))
        #print(genderlist)
        
        #table = soup.find('table')
        #print(table.findprettify())
        #print(parse_html_table(table).head(4))
    tablecount.append(table_dict)
#print(tablecount)

In [40]:
parse_html_table(table)

Unnamed: 0,Unnamed: 1,Placebo group (n = 14),Vitamin B6 group (n = 14)
0,Age,57.5 (11.0),53.9 (12.6)
1,Sex (F:M),9:5,12:2
2,Height (cm),168.4 (10.3),164.7 (9.1)
3,Methotrexate (yes/all),7/14,8/14
4,Methotrexate dose (mg/week),7.5 (10.1),10.2 (11.7)
5,Prednisone (yes/all),9/14,11/14
6,Prednisone dose (mg/week),3.1 (3.5),4.3 (4.0)
7,NSAIDs use (yes/all),10/14,11/14
8,Duration of disease (years),11.6 (8.2),8.5 (5.6)
9,Number of painful joints,5.1 (5.1),7.9 (8.9)


In [50]:
hp = HTMLTableParser()
table = hp.parse_url(soup)[0][1] # Grabbing the table from the tuple
table.head()

KeyError: 'id'

In [44]:
fh = f = open('./clinical_data/PMC514553.nxml','r')
soup = bs.BeautifulSoup(fh,'lxml')

table = soup.find_all('table')[0] # Grab the first table

new_table = pd.DataFrame(columns=range(0,2), index = [0]) # I know the size 

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        new_table.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1

new_table
    
    
targetIDs = soup.find_all('table')
small_list = []
for docID in targetIDs[:30]:
    small_list.append(docID.get_text())
    print(docID.get_text())

print(soup.prettify())

IndexError: index 2 is out of bounds for axis 0 with size 2

In [54]:
out_dict = parse_pubmed_table('./clinical_data/PMC1069669.nxml', return_xml=True)
print(out_dict)

None


In [139]:
parse_xml_web(small_list[5], sleep=None, save_xml=True)

{'abstract': 'We conducted two studies of circadian misalignment in non-Hispanic African and European-Americans. In the first, the sleep/wake (light/dark) schedule was advanced 9 h, similar to flying east, and in the second these schedules were delayed 9 h, similar to flying west or sleeping during the day after night work. We confirmed that the free-running circadian period is shorter in African-Americans compared to European-Americans, and found differences in the magnitude and direction of circadian rhythm phase shifts which were related to the circadian period. The sleep and cognitive performance data from the first study (published in this journal) documented the impairment in both ancestry groups due to this extreme circadian misalignment. African-Americans slept less and performed slightly worse during advanced/misaligned days than European-Americans. The current analysis is of sleep and cognitive performance from the second study. Participants were 23 African-Americans and 22 E

In [11]:
def getPMCIDs(query_string):
    resp = urlopen(query_string).read().decode('utf-8')
    tree = etree.ElementTree('IdList')
    #root = tree.getroot()
    return tree

output = getPMCIDs(query)

TypeError: Argument 'element' has incorrect type (expected lxml.etree._Element, got str)

In [10]:
print(etree.tostring(output, pretty_print=True))

b'<IdList/>\n'


In [48]:
class HTMLTableParser:

    def parse_url(self, soup):
        #response = requests.get(url)
        #soup = BeautifulSoup(response.text, 'lxml')
        return [(table['id'],self.parse_html_table(table))\
                for table in soup.find_all('table')]  

    def parse_html_table(self, table):
        n_columns = 0
        n_rows=0
        column_names = []

        # Find number of rows and columns
        # we also find the column titles if we can
        for row in table.find_all('tr'):

            # Determine the number of rows in the table
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)

            # Handle column names if we find them
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        df = pd.DataFrame(columns = columns,
                          index= range(0,n_rows))
        row_marker = 0
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

In [123]:
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=300000&term=(Clinical%20Trial%5BPublication%20Type%5D)%20AND%20Free%20full%20text%5BFilter%5D"

resp = urlopen(query).read()#.decode('utf-8')


soup = bs.BeautifulSoup(resp,'lxml')
targetIDs = soup.find_all('id')
small_list = []
for docID in targetIDs[300:350]:
    small_list.append(docID.get_text())
    print(docID.get_text())

29074640
29073584
29073275
29073252
29073200
29073187
29073180
29073174
29071414
29070564
29069260
29069224
29069220
29069156
29069153
29069152
29069041
29069016
29069003
29068995
29068977
29068592
29068578
29067906
29067428
29065426
29065142
29062346
29062324
29061270
29059635
29059492
29059253
29059197
29059190
29059186
29058817
29055956
29055056
29052344
29052340
29049415
29049336
29049303
29049282
29049251
29049247
29049235
29049211
29049194


In [104]:
print(soup.prettify()[0:1000])

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<html>
 <body>
  <esearchresult>
   <count>
    289345
   </count>
   <retmax>
    289345
   </retmax>
   <retstart>
    0
   </retstart>
   <idlist>
    <id>
     5832349
    </id>
    <id>
     5831366
    </id>
    <id>
     5831364
    </id>
    <id>
     5831331
    </id>
    <id>
     5831252
    </id>
    <id>
     5831244
    </id>
    <id>
     5831176
    </id>
    <id>
     5831170
    </id>
    <id>
     5831129
    </id>
    <id>
     5831118
    </id>
    <id>
     5829951
    </id>
    <id>
     5829948
    </id>
    <id>
     5829926
    </id>
    <id>
     5829872
    </id>
    <id>
     5829859
    </id>
    <id>
     5829675
    </id>
    <id>
     5829673
    </id>
    <id>
     5829602
    </id>
    <id>
     5829454
    </id>
    <id>
     5829280
    </id>
    <id>
     5829128
    </id>
    <

In [101]:
len(soup)

3

In [74]:
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&retmax=300000&term=(Randomized%20Controlled%20Trial%5BPublication%20Type%5D)"

resp = urlopen(query).read()#.decode('utf-8')

soup = bs.BeautifulSoup(resp,'lxml')
targetIDs = soup.find_all('id')
small_list = []
for docID in targetIDs:
    small_list.append(docID.get_text())
    #print(docID.get_text())

In [75]:
print(len(small_list))

240540


In [92]:
getDocString = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:4304705&metadataPrefix=pmc"
getDocString1 = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:"
getDocString2 = "&metadataPrefix=pmc"

In [93]:
print(small_list[5])
print(getDocString1 + small_list[5] + getDocString2)

5831244
https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:5831244&metadataPrefix=pmc


In [94]:
query = getDocString1 + small_list[5] + getDocString2
resp = urlopen(query).read()#.decode('utf-8')
soup_doc = bs.BeautifulSoup(resp,'lxml')

In [95]:
print(soup_doc.prettify()[0:1000])

<?xml version="1.0" encoding="UTF-8"?>
<html>
 <body>
  <oai-pmh xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
   <responsedate>
    2018-03-04T10:56:37Z
   </responsedate>
   <request>
    https://www.ncbi.nlm.nih.gov/oai/oai.cgi
   </request>
   <error code="cannotDisseminateFormat">
    The metadata format 'pmc' is not supported by the item or by the repository.
   </error>
  </oai-pmh>
 </body>
</html>


In [3]:
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=300000&term=(Clinical%20Trial%5BPublication%20Type%5D)%20AND%20Free%20full%20text%5BFilter%5D"
resp = urlopen(query).read()#.decode('utf-8')


soup = bs.BeautifulSoup(resp,'lxml')
targetIDs = soup.find_all('id')
small_list = []
for docID in targetIDs:
    small_list.append(docID.get_text())
print(small_list[20:40])

['29390493', '29390476', '29390463', '29390461', '29390433', '29390421', '29390308', '29390298', '29390284', '29390278', '29390272', '29385990', '29385178', '29385147', '29384933', '29384927', '29384903', '29384859', '29384854', '29384848']


In [105]:
query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&retmax=300000&term=(Clinical%20Trial%5BPublication%20Type%5D)%20AND%20Free%20full%20text%5BFilter%5D"

resp = urlopen(query).read()#.decode('utf-8')


soup = bs.BeautifulSoup(resp,'lxml')
targetIDs = soup.find_all('id')
small_list = []
for docID in targetIDs[300:320]:
    small_list.append(docID.get_text())
    print(docID.get_text())

5819660
5819638
5819303
5819299
5819168
5818491
5818378
5818369
5818080
5817246
5817244
5817232
5815471
5814509
5813709
5813065
5813050
5811418
5809626
5808011


In [76]:
getDocString3 = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id="

In [92]:
[i for i,x in enumerate(small_list) if x=='5700662'] 

[5706]

In [93]:
len(small_list[5705:])

234835

In [94]:
getDocString3 = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id="
#really_small_list = [small_list[i] for i in range(30)]
for doc in small_list[5705:]:
    query = getDocString3 + str(doc)
    resp = urlopen(query).read().decode('utf-8')
    f = open("./downloaded/PMC"+ str(doc) + ".nxml", "w")
    f.write(resp)
    f.close()
#soup_doc = bs.BeautifulSoup(resp,'lxml')

HTTPError: HTTP Error 502: Bad Gateway

In [78]:
print(soup_doc.prettify()[0:1000])

<?xml version="1.0" ?>
<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">
<html>
 <body>
  <pmc-articleset>
   <article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
    <?properties open_access?>
    <front>
     <journal-meta>
      <journal-id journal-id-type="nlm-ta">
       Addict Behav Rep
      </journal-id>
      <journal-id journal-id-type="iso-abbrev">
       Addict Behav Rep
      </journal-id>
      <journal-title-group>
       <journal-title>
        Addictive Behaviors Reports
       </journal-title>
      </journal-title-group>
      <issn pub-type="epub">
       2352-8532
      </issn>
      <publisher>
       <publisher-name>
        Elsevier
       </publisher-name>
      </publisher>
     </journal-meta>
     <article-meta>
      <article-id pub-id-type="pmc">
       5845947
      </article-id>
      <articl

In [91]:
import pandas as pd
openaccess_df = pd.read_csv('oa_file_list.csv')

print(openaccess_df.shape)

print(len(small_list))

openaccess_df.head(10)

In [95]:
clinical_oa_df = openaccess_df[openaccess_df['PMID'].isin(small_list)]

In [96]:
clinical_oa_df.shape

(291047, 6)

In [97]:
clinical_oa_df.to_csv('clinical_oa_df.csv')

In [98]:
clinical_oa_df.head()

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
13,oa_package/17/bb/PMC13921.tar.gz,Breast Cancer Res. 2000 Aug 21; 2(6):438-443,PMC13921,2014-04-29 14:49:00,11056691.0,NO-CC CODE
15,oa_package/75/59/PMC13923.tar.gz,Breast Cancer Res. 2001 Dec 22; 3(2):122-133,PMC13923,2014-04-29 19:39:41,11250759.0,NO-CC CODE
24,oa_package/9c/ed/PMC15027.tar.gz,Genome Biol. 2000 Nov 6; 1(5):research0009.1-r...,PMC15027,2014-04-29 19:45:12,11178258.0,NO-CC CODE
29,oa_package/be/ab/PMC16145.tar.gz,Genome Biol. 2000 Dec 4; 1(6):research0014.1-1...,PMC16145,2014-04-29 19:46:09,11178268.0,NO-CC CODE
39,oa_package/70/ed/PMC17806.tar.gz,Arthritis Res. 2000 Dec 22; 2(1):75-84,PMC17806,2014-04-29 19:51:28,11219392.0,NO-CC CODE


In [105]:
#ftp_string1 = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/%s \n"
ftp_string1 = "get %s ./%s.tar.gz\n"

f = open("wget_script.sh", "w")
f.write("#!/bin/bash\n")
f.write("HOST=\"ftp.ncbi.nlm.nih.gov\"\n")
f.write("USER=\"anonymous\"\n")
f.write("PASSWORD=\"ddowey@hotmail.com\"\n")
f.write("ftp -inv $HOST <<EOT\n")
f.write("ascii\n")
f.write("user $USER $PASSWORD\n")
f.write("prompt\n")
f.write("cd /pub/pmc\n")
for index, row in clinical_oa_df.iterrows():
        f.write(ftp_string1 % (str(row['File']), str(row['Accession ID'] )))
f.write("bye\n")
f.write("EOT\n")
f.close()

In [None]:
        #file_string = ftp_string1 + row['File'] +" -O temp.zip; tar -xzf temp.zip; rm temp.zip"
        #!echo ftp_string1 + row['File'] +" -O temp.zip; tar -xzf temp.zip; rm temp.zip" > cmd.temp
        !wget -i cmd.temp -O temp.zip; tar -xzf temp.zip; rm temp.zip
        #!tar -xzf temp.zip
        #!rm temp.zip
        filename_in = glob.glob('./'+str(row['Accession ID']+'/*.nxml'))[0]
        print(filename_in)
        #filename_in =  filename_out
        filename_out = str(row['Accession ID'])+ '.nxml'
        s3.upload_file(filename_in, bucket_name, filename_out)
        !rm -rf PMC*

--2018-03-04 15:46:32--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/94/ba/PMC17815.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/94/ba ... done.
==> SIZE PMC17815.tar.gz ... 129469
==> PASV ... done.    ==> RETR PMC17815.tar.gz ... done.
Length: 129469 (126K) (unauthoritative)


2018-03-04 15:46:33 (414 KB/s) - ‘temp.zip’ saved [129469]

FINISHED --2018-03-04 15:46:33--
Total wall clock time: 1,8s
Downloaded: 1 files, 126K in 0,3s (414 KB/s)
./PMC17815/ar-2-4-327.nxml
--2018-03-04 15:46:35--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/ab/PMC29034.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connectin

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/47/ab ... done.
==> SIZE PMC57003.tar.gz ... 276766
==> PASV ... done.    ==> RETR PMC57003.tar.gz ... done.
Length: 276766 (270K) (unauthoritative)


2018-03-04 15:47:23 (551 KB/s) - ‘temp.zip’ saved [276766]

FINISHED --2018-03-04 15:47:23--
Total wall clock time: 1,9s
Downloaded: 1 files, 270K in 0,5s (551 KB/s)
./PMC57003/1471-2431-1-2.nxml
--2018-03-04 15:47:24--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a4/20/PMC57006.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pac

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/94/50 ... done.
==> SIZE PMC65542.tar.gz ... 166479
==> PASV ... done.    ==> RETR PMC65542.tar.gz ... done.
Length: 166479 (163K) (unauthoritative)


2018-03-04 15:48:20 (380 KB/s) - ‘temp.zip’ saved [166479]

FINISHED --2018-03-04 15:48:20--
Total wall clock time: 1,9s
Downloaded: 1 files, 163K in 0,4s (380 KB/s)
./PMC65542/1471-2407-2-2.nxml
--2018-03-04 15:48:20--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e2/5a/PMC65674.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pac

Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/e5/8a ... done.
==> SIZE PMC122063.tar.gz ... 335625
==> PASV ... done.    ==> RETR PMC122063.tar.gz ... done.
Length: 335625 (328K) (unauthoritative)


2018-03-04 15:49:12 (725 KB/s) - ‘temp.zip’ saved [335625]

FINISHED --2018-03-04 15:49:12--
Total wall clock time: 2,0s
Downloaded: 1 files, 328K in 0,5s (725 KB/s)
./PMC122063/1471-2296-3-14.nxml
--2018-03-04 15:49:12--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f2/e7/PMC122084.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... L

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/76/0c ... done.
==> SIZE PMC139995.tar.gz ... 237491
==> PASV ... done.    ==> RETR PMC139995.tar.gz ... done.
Length: 237491 (232K) (unauthoritative)


2018-03-04 15:50:02 (534 KB/s) - ‘temp.zip’ saved [237491]

FINISHED --2018-03-04 15:50:02--
Total wall clock time: 1,8s
Downloaded: 1 files, 232K in 0,4s (534 KB/s)
./PMC139995/1472-6882-2-11.nxml
--2018-03-04 15:50:02--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/aa/0a/PMC140320.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/5c/e6 ... done.
==> SIZE PMC161807.tar.gz ... 297917
==> PASV ... done.    ==> RETR PMC161807.tar.gz ... done.
Length: 297917 (291K) (unauthoritative)


2018-03-04 15:50:54 (664 KB/s) - ‘temp.zip’ saved [297917]

FINISHED --2018-03-04 15:50:54--
Total wall clock time: 1,8s
Downloaded: 1 files, 291K in 0,4s (664 KB/s)
./PMC161807/1476-7120-1-7.nxml
--2018-03-04 15:50:55--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c2/de/PMC161814.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/24/b4 ... done.
==> SIZE PMC201020.tar.gz ... 297704
==> PASV ... done.    ==> RETR PMC201020.tar.gz ... done.
Length: 297704 (291K) (unauthoritative)


2018-03-04 15:51:45 (656 KB/s) - ‘temp.zip’ saved [297704]

FINISHED --2018-03-04 15:51:45--
Total wall clock time: 1,9s
Downloaded: 1 files, 291K in 0,4s (656 KB/s)
./PMC201020/1471-2261-3-9.nxml
--2018-03-04 15:51:46--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/80/c4/PMC212466.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::7
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/a6/77 ... done.
==> SIZE PMC270680.tar.gz ... 450674
==> PASV ... done.    ==> RETR PMC270680.tar.gz ... done.
Length: 450674 (440K) (unauthoritative)


2018-03-04 15:52:35 (769 KB/s) - ‘temp.zip’ saved [450674]

FINISHED --2018-03-04 15:52:35--
Total wall clock time: 1,9s
Downloaded: 1 files, 440K in 0,6s (769 KB/s)
./PMC270680/cc2185.nxml
--2018-03-04 15:52:36--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/78/86/PMC270720.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 2607:f220:41e:250::11
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_packag

Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/e5/28 ... done.
==> SIZE PMC341454.tar.gz ... 263932
==> PASV ... done.    ==> RETR PMC341454.tar.gz ... done.
Length: 263932 (258K) (unauthoritative)


2018-03-04 15:53:26 (524 KB/s) - ‘temp.zip’ saved [263932]

FINISHED --2018-03-04 15:53:26--
Total wall clock time: 2,0s
Downloaded: 1 files, 258K in 0,5s (524 KB/s)
./PMC341454/1471-2288-4-2.nxml
--2018-03-04 15:53:27--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/fa/77/PMC343278.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Lo

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/f4/15 ... done.
==> SIZE PMC415550.tar.gz ... 240078
==> PASV ... done.    ==> RETR PMC415550.tar.gz ... done.
Length: 240078 (234K) (unauthoritative)


2018-03-04 15:54:20 (461 KB/s) - ‘temp.zip’ saved [240078]

FINISHED --2018-03-04 15:54:20--
Total wall clock time: 2,4s
Downloaded: 1 files, 234K in 0,5s (461 KB/s)
./PMC415550/1471-2369-5-3.nxml
--2018-03-04 15:54:21--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e1/69/PMC415557.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/ab/85 ... done.
==> SIZE PMC446197.tar.gz ... 420180
==> PASV ... done.    ==> RETR PMC446197.tar.gz ... done.
Length: 420180 (410K) (unauthoritative)


2018-03-04 15:55:17 (866 KB/s) - ‘temp.zip’ saved [420180]

FINISHED --2018-03-04 15:55:17--
Total wall clock time: 1,9s
Downloaded: 1 files, 410K in 0,5s (866 KB/s)
./PMC446197/1471-2377-4-8.nxml
--2018-03-04 15:55:17--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a5/36/PMC449711.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/88/22 ... done.
==> SIZE PMC514553.tar.gz ... 268176
==> PASV ... done.    ==> RETR PMC514553.tar.gz ... done.
Length: 268176 (262K) (unauthoritative)


2018-03-04 15:56:08 (539 KB/s) - ‘temp.zip’ saved [268176]

FINISHED --2018-03-04 15:56:08--
Total wall clock time: 1,9s
Downloaded: 1 files, 262K in 0,5s (539 KB/s)
./PMC514553/1471-244X-4-23.nxml
--2018-03-04 15:56:09--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ea/97/PMC514561.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/c3/bf ... done.
==> SIZE PMC521491.tar.gz ... 291146
==> PASV ... done.    ==> RETR PMC521491.tar.gz ... done.
Length: 291146 (284K) (unauthoritative)


2018-03-04 15:57:01 (588 KB/s) - ‘temp.zip’ saved [291146]

FINISHED --2018-03-04 15:57:01--
Total wall clock time: 1,9s
Downloaded: 1 files, 284K in 0,5s (588 KB/s)
./PMC521491/1472-6920-4-17.nxml
--2018-03-04 15:57:01--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/aa/07/PMC521689.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

In [80]:
s3 = boto3.client('s3')


bucket_name = 'pubmedcentral_oa'
filename_in = './data/Calcif_Tissue_Int/PMC4491344.nxml'
filename_out = 'PMC4491344.nxml'

s3.upload_file(filename_in, bucket_name, filename_out)

In [85]:
clinical_oa_df['File'][:5]

48     oa_package/94/ba/PMC17815.tar.gz
115    oa_package/44/ab/PMC29034.tar.gz
123    oa_package/ad/d7/PMC29042.tar.gz
126    oa_package/e7/5a/PMC29045.tar.gz
128    oa_package/ed/00/PMC29047.tar.gz
Name: File, dtype: object

In [None]:
ftp_string1 = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/%s"
bucket_name = 'pubmedcentral_oa'

for index, row in clinical_oa_df.iterrows():
    #if index < 5:
        f = open("cmd.temp", "w")
        f.write(ftp_string1 % str(row['File']))
        f.close()
        #file_string = ftp_string1 + row['File'] +" -O temp.zip; tar -xzf temp.zip; rm temp.zip"
        #!echo ftp_string1 + row['File'] +" -O temp.zip; tar -xzf temp.zip; rm temp.zip" > cmd.temp
        !wget -i cmd.temp -O temp.zip; tar -xzf temp.zip; rm temp.zip
        #!tar -xzf temp.zip
        #!rm temp.zip
        filename_in = glob.glob('./'+str(row['Accession ID']+'/*.nxml'))[0]
        print(filename_in)
        #filename_in =  filename_out
        filename_out = str(row['Accession ID'])+ '.nxml'
        s3.upload_file(filename_in, bucket_name, filename_out)
        !rm -rf PMC*

--2018-03-04 15:46:32--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/94/ba/PMC17815.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/94/ba ... done.
==> SIZE PMC17815.tar.gz ... 129469
==> PASV ... done.    ==> RETR PMC17815.tar.gz ... done.
Length: 129469 (126K) (unauthoritative)


2018-03-04 15:46:33 (414 KB/s) - ‘temp.zip’ saved [129469]

FINISHED --2018-03-04 15:46:33--
Total wall clock time: 1,8s
Downloaded: 1 files, 126K in 0,3s (414 KB/s)
./PMC17815/ar-2-4-327.nxml
--2018-03-04 15:46:35--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/ab/PMC29034.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connectin

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/47/ab ... done.
==> SIZE PMC57003.tar.gz ... 276766
==> PASV ... done.    ==> RETR PMC57003.tar.gz ... done.
Length: 276766 (270K) (unauthoritative)


2018-03-04 15:47:23 (551 KB/s) - ‘temp.zip’ saved [276766]

FINISHED --2018-03-04 15:47:23--
Total wall clock time: 1,9s
Downloaded: 1 files, 270K in 0,5s (551 KB/s)
./PMC57003/1471-2431-1-2.nxml
--2018-03-04 15:47:24--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a4/20/PMC57006.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pac

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/94/50 ... done.
==> SIZE PMC65542.tar.gz ... 166479
==> PASV ... done.    ==> RETR PMC65542.tar.gz ... done.
Length: 166479 (163K) (unauthoritative)


2018-03-04 15:48:20 (380 KB/s) - ‘temp.zip’ saved [166479]

FINISHED --2018-03-04 15:48:20--
Total wall clock time: 1,9s
Downloaded: 1 files, 163K in 0,4s (380 KB/s)
./PMC65542/1471-2407-2-2.nxml
--2018-03-04 15:48:20--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e2/5a/PMC65674.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pac

Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/e5/8a ... done.
==> SIZE PMC122063.tar.gz ... 335625
==> PASV ... done.    ==> RETR PMC122063.tar.gz ... done.
Length: 335625 (328K) (unauthoritative)


2018-03-04 15:49:12 (725 KB/s) - ‘temp.zip’ saved [335625]

FINISHED --2018-03-04 15:49:12--
Total wall clock time: 2,0s
Downloaded: 1 files, 328K in 0,5s (725 KB/s)
./PMC122063/1471-2296-3-14.nxml
--2018-03-04 15:49:12--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f2/e7/PMC122084.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... L

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/76/0c ... done.
==> SIZE PMC139995.tar.gz ... 237491
==> PASV ... done.    ==> RETR PMC139995.tar.gz ... done.
Length: 237491 (232K) (unauthoritative)


2018-03-04 15:50:02 (534 KB/s) - ‘temp.zip’ saved [237491]

FINISHED --2018-03-04 15:50:02--
Total wall clock time: 1,8s
Downloaded: 1 files, 232K in 0,4s (534 KB/s)
./PMC139995/1472-6882-2-11.nxml
--2018-03-04 15:50:02--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/aa/0a/PMC140320.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/5c/e6 ... done.
==> SIZE PMC161807.tar.gz ... 297917
==> PASV ... done.    ==> RETR PMC161807.tar.gz ... done.
Length: 297917 (291K) (unauthoritative)


2018-03-04 15:50:54 (664 KB/s) - ‘temp.zip’ saved [297917]

FINISHED --2018-03-04 15:50:54--
Total wall clock time: 1,8s
Downloaded: 1 files, 291K in 0,4s (664 KB/s)
./PMC161807/1476-7120-1-7.nxml
--2018-03-04 15:50:55--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c2/de/PMC161814.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/24/b4 ... done.
==> SIZE PMC201020.tar.gz ... 297704
==> PASV ... done.    ==> RETR PMC201020.tar.gz ... done.
Length: 297704 (291K) (unauthoritative)


2018-03-04 15:51:45 (656 KB/s) - ‘temp.zip’ saved [297704]

FINISHED --2018-03-04 15:51:45--
Total wall clock time: 1,9s
Downloaded: 1 files, 291K in 0,4s (656 KB/s)
./PMC201020/1471-2261-3-9.nxml
--2018-03-04 15:51:46--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/80/c4/PMC212466.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::7
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/a6/77 ... done.
==> SIZE PMC270680.tar.gz ... 450674
==> PASV ... done.    ==> RETR PMC270680.tar.gz ... done.
Length: 450674 (440K) (unauthoritative)


2018-03-04 15:52:35 (769 KB/s) - ‘temp.zip’ saved [450674]

FINISHED --2018-03-04 15:52:35--
Total wall clock time: 1,9s
Downloaded: 1 files, 440K in 0,6s (769 KB/s)
./PMC270680/cc2185.nxml
--2018-03-04 15:52:36--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/78/86/PMC270720.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 2607:f220:41e:250::11
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_packag

Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/e5/28 ... done.
==> SIZE PMC341454.tar.gz ... 263932
==> PASV ... done.    ==> RETR PMC341454.tar.gz ... done.
Length: 263932 (258K) (unauthoritative)


2018-03-04 15:53:26 (524 KB/s) - ‘temp.zip’ saved [263932]

FINISHED --2018-03-04 15:53:26--
Total wall clock time: 2,0s
Downloaded: 1 files, 258K in 0,5s (524 KB/s)
./PMC341454/1471-2288-4-2.nxml
--2018-03-04 15:53:27--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/fa/77/PMC343278.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Lo

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/f4/15 ... done.
==> SIZE PMC415550.tar.gz ... 240078
==> PASV ... done.    ==> RETR PMC415550.tar.gz ... done.
Length: 240078 (234K) (unauthoritative)


2018-03-04 15:54:20 (461 KB/s) - ‘temp.zip’ saved [240078]

FINISHED --2018-03-04 15:54:20--
Total wall clock time: 2,4s
Downloaded: 1 files, 234K in 0,5s (461 KB/s)
./PMC415550/1471-2369-5-3.nxml
--2018-03-04 15:54:21--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e1/69/PMC415557.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/ab/85 ... done.
==> SIZE PMC446197.tar.gz ... 420180
==> PASV ... done.    ==> RETR PMC446197.tar.gz ... done.
Length: 420180 (410K) (unauthoritative)


2018-03-04 15:55:17 (866 KB/s) - ‘temp.zip’ saved [420180]

FINISHED --2018-03-04 15:55:17--
Total wall clock time: 1,9s
Downloaded: 1 files, 410K in 0,5s (866 KB/s)
./PMC446197/1471-2377-4-8.nxml
--2018-03-04 15:55:17--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a5/36/PMC449711.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_pa

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/88/22 ... done.
==> SIZE PMC514553.tar.gz ... 268176
==> PASV ... done.    ==> RETR PMC514553.tar.gz ... done.
Length: 268176 (262K) (unauthoritative)


2018-03-04 15:56:08 (539 KB/s) - ‘temp.zip’ saved [268176]

FINISHED --2018-03-04 15:56:08--
Total wall clock time: 1,9s
Downloaded: 1 files, 262K in 0,5s (539 KB/s)
./PMC514553/1471-244X-4-23.nxml
--2018-03-04 15:56:09--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ea/97/PMC514561.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/oa_package/c3/bf ... done.
==> SIZE PMC521491.tar.gz ... 291146
==> PASV ... done.    ==> RETR PMC521491.tar.gz ... done.
Length: 291146 (284K) (unauthoritative)


2018-03-04 15:57:01 (588 KB/s) - ‘temp.zip’ saved [291146]

FINISHED --2018-03-04 15:57:01--
Total wall clock time: 1,9s
Downloaded: 1 files, 284K in 0,5s (588 KB/s)
./PMC521491/1472-6920-4-17.nxml
--2018-03-04 15:57:01--  ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/aa/07/PMC521689.tar.gz
           => ‘temp.zip’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/pmc/o

In [114]:
glob.glob('./data/*/'+'PMC5388656'+'.nxml')

['./data/3_Biotech/PMC5388656.nxml']

In [None]:
import glob
glob.glob('./data/*/PMC17815.nxml')