In [None]:
# Code based heavily on the PubMed XML parser by Titipat Achakulvisut (https://github.com/titipata)

In [2]:
import calendar
import collections
from time import strptime
from six import string_types
from lxml import etree
from itertools import chain


def remove_namespace(tree):
    """
    Drop the '{namespace}' prefix from every element tag, in place.

    Walks ``tree.iter()`` and rewrites each tag that starts with '{' to
    the part after the first '}'. Nodes whose tag is not a string
    (comments and similar) are left untouched.
    """
    for element in tree.iter():
        tag = element.tag
        # Comment-like nodes expose a non-string tag without startswith(); skip them.
        if not hasattr(tag, 'startswith'):
            continue
        if not tag.startswith('{'):
            continue
        element.tag = tag.split('}', 1)[1]


def read_xml(path, nxml=False):
    """
    Parse an XML tree from a path, a file-like object, or an XML string

    Parameters
    ----------
    path: str, os.PathLike, file-like object, or XML string/bytes
        first tried with ``etree.parse``; if that fails the value is
        handed to ``etree.fromstring`` as raw XML.
    nxml: bool
        if True, strip namespaces from the parsed tree. Namespaces are
        also stripped whenever ``path`` is a string containing '.nxml'.

    Returns
    -------
    tree: the object returned by ``etree.parse`` or ``etree.fromstring``

    Raises
    ------
    Re-raises the ``etree.fromstring`` error (after printing a message)
    when the input can be read neither as a file nor as XML text.
    """
    import os  # local import: the cell defining this helper does not import os

    # Normalize pathlib-style paths so the '.nxml' check below works on them.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    try:
        tree = etree.parse(path)
    except Exception:
        # Not a readable path / file object: fall back to treating it as XML text.
        try:
            tree = etree.fromstring(path)
        except Exception:
            print("Error: it was not able to read a path, a file-like object, or a string as an XML")
            raise

    # Only strings can be searched for the extension; bytes and file objects
    # used to raise TypeError here.
    looks_like_nxml = isinstance(path, str) and '.nxml' in path
    if nxml or looks_like_nxml:
        remove_namespace(tree)  # strip namespaces so plain tag names can be queried
    return tree


def stringify_children(node):
    """
    Concatenate the text of a node and of its direct children

    Collects, in order, ``node.text``, each direct child's ``text`` and
    ``tail``, and finally ``node.tail``; empty/None pieces are dropped and
    the rest is joined without a separator.
    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
    """
    pieces = [node.text]
    for child in node.getchildren():
        pieces.append(child.text)
        pieces.append(child.tail)
    pieces.append(node.tail)
    return ''.join(piece for piece in pieces if piece)


def stringify_affiliation(node):
    """
    Join the text of an affiliation node and its direct children with spaces

    Works like ``stringify_children`` but the text of ``label`` and ``sup``
    children is left out (their tail is kept), and the remaining non-empty
    pieces are joined with a single space.
    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
    """
    marker_tags = ('label', 'sup')
    pieces = [node.text]
    for child in node.getchildren():
        if child.tag in marker_tags:
            pieces.append('')  # marker text is dropped
        else:
            pieces.append(child.text)
        pieces.append(child.tail)
    pieces.append(node.tail)
    return ' '.join(piece for piece in pieces if piece)


def stringify_affiliation_rec(node):
    """
    Collect the text of a node recursively and return it as one string

    The nested pieces produced by ``_recur_children`` are flattened with
    ``_flatten``, joined with spaces and stripped of surrounding whitespace.
    ref: http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
    """
    nested_pieces = _recur_children(node)
    flat_pieces = [piece for piece in _flatten(nested_pieces)]
    joined = ' '.join(flat_pieces)
    return joined.strip()


def _flatten(l):
    """
    Flatten list into one dimensional
    """
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
            for sub in _flatten(el):
                yield sub
        else:
            yield el


def _recur_children(node):
    """
    Recursive through node to when it has multiple children
    """
    if len(node.getchildren()) == 0:
        parts = ([node.text or ''] + [node.tail or '']) if (node.tag != 'label' and node.tag !='sup') else ([node.tail or ''])
        return parts
    else:
        parts = ([node.text or ''] +
                 [_recur_children(c) for c in node.getchildren()] +
                 [node.tail or ''])
        return parts


def month_or_day_formater(month_or_day):
    """
    Parameters
    ----------
    month_or_day: str or int
        must be one of the following:
            (i)  month: a three letter month abbreviation, e.g., 'Jan'
                 (periods are ignored, so 'Jan.' also works).
            (ii) day: an integer, or a string holding only digits.
        Surrounding whitespace is ignored.
    Returns
    -------
    numeric: str
        a month of the form 'MM' or a day of the form 'DD'.
        Note: returns None if:
            (a) the input could not be mapped to a known month abbreviation OR
            (b) the input was not an integer (i.e., a day).
    """
    # Coerce first: the documented int input has no .replace()/.strip().
    raw = str(month_or_day).strip()
    without_periods = raw.replace(".", "")

    # calendar.month_abbr[0] is '', so drop empty entries before matching.
    month_names = [abbr for abbr in calendar.month_abbr if abbr]
    if without_periods in month_names:
        to_format = month_names.index(without_periods) + 1
    elif raw.isdigit():
        # isdigit() is False for anything containing '.', so '1.5' is rejected here.
        to_format = int(raw)
    else:
        return None

    return '{:02d}'.format(to_format)


def pretty_print(node):
    """
    Print an indented, human-readable serialization of an lxml node
    """
    serialized = etree.tostring(node, pretty_print=True)
    print(serialized.decode('utf-8'))

In [125]:
import os
from lxml import etree
from itertools import chain
# from .utils import *
from unidecode import unidecode


# Names exported by the Open-Access parser section of this notebook.
# parse_pubmed_table is defined below as a public entry point as well,
# so it is listed alongside the other parsers.
__all__ = [
    'list_xml_path',
    'parse_pubmed_xml',
    'parse_pubmed_paragraph',
    'parse_pubmed_references',
    'parse_pubmed_caption',
    'parse_pubmed_table'
]


def list_xml_path(path_dir):
    """
    Recursively collect the XML files found below a directory
    Parameters
    ----------
    path_dir: str, path to directory that contains xml or nxml file
        ('~' is expanded)
    Returns
    -------
    path_list: list, full paths of every file under path_dir whose
        extension is exactly '.nxml' or '.xml', in os.walk order
    """
    wanted_extensions = ('.nxml', '.xml')
    root = os.path.expanduser(path_dir)

    path_list = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            candidate = os.path.join(dirpath, filename)
            if os.path.splitext(candidate)[-1] in wanted_extensions:
                path_list.append(candidate)
    return path_list


def zip_author(author):
    """
    Expand one author entry into one row per affiliation key

    Given ``[name_a, name_b, [key1, key2]]`` return
    ``[[name_a, name_b, key1], [name_a, name_b, key2]]``.
    An entry whose last item is empty produces an empty list.
    """
    name_fields = [author[0], author[1]]
    affiliation_keys = author[-1]
    return [name_fields + [key] for key in affiliation_keys]


def flatten_zip_author(author_list):
    """
    Run zip_author on every entry and concatenate the resulting rows
    into a single flat list
    """
    flattened = []
    for author in author_list:
        flattened.extend(zip_author(author))
    return flattened


def parse_article_meta(tree):
    """
    Parse PMID, PMC, DOI and publisher id from given article tree

    Parameters
    ----------
    tree: parsed XML tree (or element) that may contain an
        ``article-meta`` element with ``article-id`` children

    Returns
    -------
    dict_article_meta: dict with keys 'pmid', 'pmc', 'doi' and
        'publisher_id'. A value is '' when the ``article-meta`` element,
        the matching ``article-id`` element, or its text is missing.
    """
    article_meta = tree.find('.//article-meta')

    def _article_id(pub_id_type):
        # A document without <article-meta> used to crash here; report "unknown" instead.
        if article_meta is None:
            return ''
        node = article_meta.find('article-id[@pub-id-type="{}"]'.format(pub_id_type))
        if node is None:
            return ''
        return node.text or ''  # an empty element has text None

    dict_article_meta = {'pmid': _article_id('pmid'),
                         'pmc': _article_id('pmc'),
                         'doi': _article_id('doi'),
                         'publisher_id': _article_id('publisher-id')}

    return dict_article_meta


def parse_pubmed_xml(path, include_path=False, nxml=False):
    """
    Given single xml path, extract information from xml file
    and return parsed xml file in dictionary format.

    Parameters
    ----------
    path: input accepted by ``read_xml``
    include_path: bool, if True add ``path`` to the result under
        'path_to_file'
    nxml: bool, forwarded to ``read_xml`` to force namespace stripping

    Returns
    -------
    dict_out: dict with keys 'full_title', 'abstract', 'journal', 'pmid',
        'pmc', 'doi', 'publisher_id', 'author_list', 'affiliation_list',
        'publication_year', 'publication_date' and 'subjects'.
        'author_list' holds one ``[surname, given_names, affiliation_id]``
        row per author/affiliation pair; 'affiliation_list' holds
        ``[affiliation_id, affiliation_text]`` pairs; 'publication_date'
        is formatted as day-month-year, with '01' standing in for a
        missing day or month.
    """
    def _flatten_whitespace(text):
        return text.replace('\n', ' ').replace('\t', ' ')

    def _text_or(node, default=''):
        # Missing element and empty element are treated alike.
        if node is None:
            return default
        return node.text or default

    tree = read_xml(path, nxml)

    tree_title = tree.find('.//title-group/article-title')
    if tree_title is not None:
        title = list(tree_title.itertext())
        title.extend(tree.xpath('.//title-group/subtitle/text()'))
        full_title = ' '.join(_flatten_whitespace(t) for t in title)
    else:
        full_title = ''

    # findall() returns a (possibly empty) list, so no error handling is needed.
    abstract = ' '.join(_flatten_whitespace(t).strip()
                        for a in tree.findall('.//abstract')
                        for t in a.itertext())

    # An empty <journal-title/> has text None, which str.join cannot handle.
    journal = ' '.join(j.text or '' for j in tree.findall('.//journal-title'))

    dict_article_meta = parse_article_meta(tree)
    pub_year = _text_or(tree.find('.//pub-date/year'))
    pub_month = _text_or(tree.find('.//pub-date/month'), '01')
    pub_day = _text_or(tree.find('.//pub-date/day'), '01')

    # The previous path read './/article-categories.//subj-group/subject';
    # the stray '.' became part of the tag name, so no subject ever matched.
    subjects_node = tree.findall('.//article-categories//subj-group/subject')
    subjects = '; '.join(' '.join(s_.strip() for s_ in s.itertext()).strip()
                         for s in subjects_node)

    # create affiliation list: one [id, text] pair per <aff id="...">
    affiliation_list = list()
    for aff in tree.xpath('.//aff[@id]'):
        name = stringify_affiliation_rec(aff)
        name = name.strip().replace('\n', ' ')
        affiliation_list.append([str(aff.get('id')), name])

    author_list = list()
    for author in tree.xpath('.//contrib-group/contrib[@contrib-type="author"]'):
        # Keep every affiliation reference that actually carries an id;
        # one malformed <xref> no longer discards the others.
        ref_id_list = [str(a.attrib['rid'])
                       for a in author.findall('xref[@ref-type="aff"]')
                       if 'rid' in a.attrib]
        author_list.append([_text_or(author.find('name/surname')),
                            _text_or(author.find('name/given-names')),
                            ref_id_list])
    # One row per (author, affiliation id); authors without any id yield no row.
    author_list = flatten_zip_author(author_list)

    dict_out = {'full_title': full_title.strip(),
                'abstract': abstract,
                'journal': journal,
                'pmid': dict_article_meta['pmid'],
                'pmc': dict_article_meta['pmc'],
                'doi': dict_article_meta['doi'],
                'publisher_id': dict_article_meta['publisher_id'],
                'author_list': author_list,
                'affiliation_list': affiliation_list,
                'publication_year': pub_year,
                'publication_date': '{}-{}-{}'.format(pub_day, pub_month, pub_year),
                'subjects': subjects}
    if include_path:
        dict_out['path_to_file'] = path

    return dict_out


def parse_pubmed_references(path):
    """
    Given path to xml file, parse references articles
    to list of dictionary

    Parameters
    ----------
    path: input accepted by ``read_xml``

    Returns
    -------
    dict_refs: list of dict, or None when no reference was parsed.
        Each dict has keys 'pmid', 'pmc' (ids of the citing article),
        'ref_id', 'pmid_cited', 'doi_cited', 'article_title', 'name'
        (author names joined with '; '), 'year', 'journal' and
        'journal_type'. Only ``ref`` elements with an ``id`` whose
        ``mixed-citation`` / ``element-citation`` child declares a
        ``publication-type`` attribute are included.
    """
    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    dict_refs = list()
    for reference in tree.xpath('.//ref-list/ref[@id]'):
        ref_id = reference.attrib['id']

        ref = reference.find('mixed-citation')
        if ref is None:
            ref = reference.find('element-citation')
        if ref is None or 'publication-type' not in ref.attrib:
            continue

        # Read the attribute by name; taking the first attribute value
        # returned the wrong field whenever another attribute came first.
        journal_type = ref.attrib['publication-type']

        names = list()
        if ref.find('name') is not None:
            for n in ref.findall('name'):
                # children are reversed before joining
                name = ' '.join([t.text or '' for t in n.getchildren()][::-1])
                names.append(name)
        elif ref.find('person-group') is not None:
            for n in ref.find('person-group'):
                name = ' '.join(n.xpath('given-names/text()') + n.xpath('surname/text()'))
                names.append(name)

        title_node = ref.find('article-title')
        if title_node is not None:
            article_title = stringify_children(title_node) or ''
            article_title = article_title.replace('\n', ' ').strip()
        else:
            article_title = ''

        source_node = ref.find('source')
        journal = (source_node.text or '') if source_node is not None else ''

        year_node = ref.find('year')
        year = (year_node.text or '') if year_node is not None else ''

        # Start empty and fill in each id as it is found. The old loop reset
        # both values on every <pub-id>, so only the last one survived.
        doi_cited = ''
        pmid_cited = ''
        for pubid in ref.findall('pub-id'):
            pub_id_type = pubid.get('pub-id-type')
            if pub_id_type == 'doi':
                doi_cited = pubid.text or ''
            elif pub_id_type == 'pmid':
                pmid_cited = pubid.text or ''

        dict_ref = {'pmid': pmid,
                    'pmc': pmc,
                    'ref_id': ref_id,
                    'pmid_cited': pmid_cited,
                    'doi_cited': doi_cited,
                    'article_title': article_title,
                    'name': '; '.join(names),
                    'year': year,
                    'journal': journal,
                    'journal_type': journal_type}
        dict_refs.append(dict_ref)

    if len(dict_refs) == 0:
        dict_refs = None
    return dict_refs


def parse_pubmed_paragraph(path, all_paragraph=False):
    """
    Given path to xml file, parse the paragraphs of the article body

    Returns a list of dictionaries, one per ``p`` element under ``body``,
    with keys 'pmc', 'pmid', 'reference_ids' (the ``rid`` attribute of
    each direct child that has one), 'section' (text of the sibling
    ``title`` element, '' if absent) and 'text'. Paragraphs without any
    reference id are skipped unless ``all_paragraph`` is True.
    """
    tree = read_xml(path)
    meta = parse_article_meta(tree)
    pmid = meta['pmid']
    pmc = meta['pmc']

    dict_pars = []
    for par_node in tree.xpath('//body//p'):
        par_text = stringify_children(par_node)

        title_node = par_node.find('../title')
        if title_node is None:
            section = ''
        else:
            section = stringify_children(title_node).strip()

        ref_ids = [child.attrib['rid']
                   for child in par_node.getchildren()
                   if 'rid' in child.attrib.keys()]

        if not ref_ids and not all_paragraph:
            continue
        dict_pars.append({'pmc': pmc,
                          'pmid': pmid,
                          'reference_ids': ref_ids,
                          'section': section,
                          'text': par_text})

    return dict_pars


def parse_pubmed_caption(path):
    """
    Given single xml path, extract figure caption and
    reference id back to that figure

    Parameters
    ----------
    path: input accepted by ``read_xml``

    Returns
    -------
    dict_captions: list of dict, or None when the document has no ``fig``
        element. Each dict has keys 'pmid', 'pmc', 'fig_caption',
        'fig_id', 'fig_label' and 'graphic_ref'. Any of the figure fields
        is '' when the corresponding attribute or child element is
        missing.
    """
    xlink_href = '{http://www.w3.org/1999/xlink}href'

    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    dict_captions = list()
    for fig in tree.findall('.//fig'):
        fig_id = fig.attrib.get('id', '')

        label_node = fig.find('label')
        fig_label = stringify_children(label_node) if label_node is not None else ''

        caption_node = fig.find('caption')
        if caption_node is not None:
            caption = ' '.join([stringify_children(c) for c in caption_node.getchildren()])
        else:
            caption = ''

        # Reset per figure: previously a figure without <graphic> either raised
        # UnboundLocalError or silently reused the previous figure's reference.
        graphic_ref = ''
        graphic = fig.find('graphic')
        if graphic is not None:
            # Prefer the xlink:href attribute; fall back to the first attribute
            # value (the former behaviour) when it is absent.
            graphic_ref = graphic.get(xlink_href)
            if graphic_ref is None:
                attribute_values = list(graphic.attrib.values())
                graphic_ref = attribute_values[0] if attribute_values else ''

        dict_caption = {'pmid': pmid,
                        'pmc': pmc,
                        'fig_caption': caption,
                        'fig_id': fig_id,
                        'fig_label': fig_label,
                        'graphic_ref': graphic_ref}
        dict_captions.append(dict_caption)

    if not dict_captions:
        dict_captions = None
    return dict_captions


def table_to_df(table_text):
    """
    Turn the XML text of a table into a list of column names and a
    list of row values

    Column names come from every cell of the ``thead/tr`` rows; rows come
    from the ``td`` cells of each ``tbody/tr``. Only rows whose cell count
    equals the most common cell count are kept. Returns ``(None, None)``
    when the table has no body rows.
    """
    table_root = etree.fromstring(table_text)

    columns = [unidecode(stringify_children(cell))
               for header_row in table_root.xpath('thead/tr')
               for cell in header_row.getchildren()]

    row_values = []
    len_rows = []
    for body_row in table_root.findall('tbody/tr'):
        cells = body_row.xpath('td')
        len_rows.append(len(cells))
        row_values.append([unidecode(stringify_children(cell)) for cell in cells])

    if not len_rows:
        return None, None

    # most frequent row length; rows of any other length are discarded
    len_row = max(set(len_rows), key=len_rows.count)
    kept_rows = [row for row in row_values if len(row) == len_row]
    return columns, kept_rows


def parse_pubmed_table(path, return_xml=True):
    """
    Parse table from given Pubmed Open-Access XML file

    Parameters
    ----------
    path: input accepted by ``read_xml``
    return_xml: bool, if True include the serialized table under
        'table_xml'

    Returns
    -------
    table_dicts: list of dict, or None when no table could be parsed.
        Each dict has keys 'pmid', 'pmc', 'label', 'caption',
        'table_columns' and 'table_values' (plus 'table_xml' when
        requested).
    """
    tree = read_xml(path)
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    # parse table
    # The previous expression, './/body.//sec.//table-wrap', made the stray
    # dots part of the element names ('body.', 'sec.'), so it matched nothing.
    tables = tree.xpath('.//body//sec//table-wrap')
    table_dicts = list()
    for table in tables:
        label_node = table.find('label')
        label = unidecode(label_node.text or '') if label_node is not None else ''

        # table caption: prefer the paragraph, fall back to the title
        caption_node = table.find('caption/p')
        if caption_node is None:
            caption_node = table.find('caption/title')
        if caption_node is not None:
            caption = unidecode(stringify_children(caption_node).strip())
        else:
            caption = ''

        # table content: direct <table> child, or one wrapped in <alternatives>
        table_tree = table.find('table')
        if table_tree is None:
            table_tree = table.find('alternatives/table')
        if table_tree is None:
            continue

        # with_tail=False keeps text that follows </table> out of the
        # serialization, which is re-parsed as a standalone document.
        table_xml = etree.tostring(table_tree, with_tail=False)
        columns, row_values = table_to_df(table_xml)
        if row_values is None:
            continue

        table_dict = {'pmid': pmid,
                      'pmc': pmc,
                      'label': label,
                      'caption': caption,
                      'table_columns': columns,
                      'table_values': row_values}
        if return_xml:
            table_dict['table_xml'] = table_xml
        table_dicts.append(table_dict)

    if len(table_dicts) >= 1:
        return table_dicts
    else:
        return None

In [13]:
from fastai import *
from fastai.utils import *

In [203]:
from pathlib import Path

# Root folder holding the downloaded PubMed Open-Access files.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
Path_pub = Path('D:/pubmed/xml')
# Plain pathlib listing: Path.ls() only exists through fastai's monkeypatch
# pulled in by the star imports above.
folder_list = list(Path_pub.iterdir())

In [204]:
# Display the entries found directly under Path_pub.
folder_list

[WindowsPath('D:/pubmed/xml/comm_use.A-B.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/comm_use.C-H.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/comm_use.I-N.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/comm_use.O-Z.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/non_comm_use.A-B.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/non_comm_use.C-H.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/non_comm_use.I-N.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/non_comm_use.O-Z.xml.tar.gz'),
 WindowsPath('D:/pubmed/xml/Alzheimers_Res_Ther'),
 WindowsPath('D:/pubmed/xml/Biol_Direct'),
 WindowsPath('D:/pubmed/xml/BMC_Neurosci'),
 WindowsPath('D:/pubmed/xml/BMC_Biochem'),
 WindowsPath('D:/pubmed/xml/BMC_Infect_Dis'),
 WindowsPath('D:/pubmed/xml/Basic_Res_Cardiol'),
 WindowsPath('D:/pubmed/xml/Appl_Math_(Irvine)'),
 WindowsPath('D:/pubmed/xml/Acta_Vet_Scand'),
 WindowsPath('D:/pubmed/xml/BMC_Nephrol'),
 WindowsPath('D:/pubmed/xml/BMC_Genomics'),
 WindowsPath('D:/pubmed/xml/BMC_Med_Inform_Decis_Mak'),
 WindowsPath('D:/pubmed/xml/B

In [None]:
# NOTE(review): `path_dir` is not defined anywhere in this notebook; assign it
# to a directory path before running this cell.
list_xml_path(path_dir)

In [16]:

test_path = Path_pub
# Standard-library listing instead of the fastai-patched Path.ls().
test_list = list(test_path.iterdir())
# parse_pubmed_xml(path, include_path=False, nxml=False)

In [205]:
# Map a running index to the XML files of each journal folder.
# Only directories are scanned: the previous `folder_list[8:]` slice skipped
# the .tar.gz archives by position, which depends on the listing order.
journal_folders = [f for f in folder_list if f.is_dir()]
path_list = {idx: list_xml_path(folder) for idx, folder in enumerate(journal_folders)}

In [206]:
# Show the integer keys assigned while path_list was built.
path_list.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28])

In [208]:
# Inspect the XML file paths stored under key 0.
path_list[0]

['D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4571139.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4571137.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4517508.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4513634.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4255417.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4940880.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC4942967.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC2874259.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3226270.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC2874261.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC2919700.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3226274.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3226277.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3226311.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3308021.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3506931.nxml',
 'D:\\pubmed\\xml\\Alzheimers_Res_Ther\\PMC3506945.nxml',
 'D:\\pubmed\\

In [214]:
# import concurrent.futures
# with concurrent.futures.ProcessPoolExecutor() as executor:

# Parse the articles of the first journal folder (path_list[0]) into
#   pubmed_db_full: article index -> metadata / text record
#   texts_only:     article index -> [abstract, paragraph texts..., figure captions...]
# The original loop stopped once its counter exceeded 400, i.e. after 401 files.
MAX_ARTICLES = 401

pubmed_db_full = {}
texts_only = {}

for counter, file in enumerate(path_list[0][:MAX_ARTICLES]):

    article_dict = parse_pubmed_xml(file, nxml=True)
    text_only_temp = [article_dict['abstract']]

    # Group paragraphs by section title. A section normally holds several
    # paragraphs; assigning them one by one to the same key kept only the last.
    paragraphs_by_section = {}
    for par in parse_pubmed_paragraph(file, all_paragraph=True) or []:
        paragraphs_by_section.setdefault(par['section'], []).append(par['text'])
        text_only_temp.append(par['text'])
    paragraph_dict = {section: '\n'.join(texts)
                      for section, texts in paragraphs_by_section.items()}

#-------------------------------------------------

    # parse_pubmed_caption returns None when the article has no figures
    caption_dict = {}
    for cap in parse_pubmed_caption(file) or []:
        caption_dict[cap['graphic_ref']] = cap['fig_caption']
        text_only_temp.append(cap['fig_caption'])

#-------------------------------------------------

    reference_list = parse_pubmed_references(file)

    article_record = {'pmid': article_dict['pmid'],
                      'pmc': article_dict['pmc'],
                      'journal': article_dict['journal'],
                      'full_title': article_dict['full_title'],
                      'subjects': article_dict['subjects'],
                      'abstract': article_dict['abstract'],
                      'text': paragraph_dict,
                      'publication_date': article_dict['publication_date'],
                      'fig_caption': caption_dict,
                      'publisher_id': article_dict['publisher_id'],
                      'doi': article_dict['doi'],
                      'author_list': article_dict['author_list'],
                      'affiliation_list': article_dict['affiliation_list'],
                      'publication_year': article_dict['publication_year'],
                      'references': reference_list}

    pubmed_db_full[counter] = article_record
    texts_only[counter] = text_only_temp

In [None]:
    #     if counter>10:
    #         break

    #  'references': reference_dict['pmid_cited'],
    #                        'ref_doi_cited': reference_dict['doi_cited'],
    #                         'ref_name': reference_dict['name'],
    #                         'ref_year': reference_dict['year'],
    #                         'ref_journal_type': reference_dict['journal_type'],
    #                         'reference_ids': reference_dict['reference_ids'],

In [217]:
# pubmed_db_full
# pubmed_db_full[0]['text']

In [None]:
# listss={}
# for i in caption_list:
#     listss[i['graphic_ref']] = i['fig_caption']

In [169]:
# from bs4 import BeautifulSoup

# soup = BeautifulSoup(html_doc, 'xml.parser')
# html_doc
# print(soup.prettify())

In [None]:
# remove_namespace(tree)
# read_xml(files, nxml=True)
# stringify_children(node)
# stringify_affiliation(node)
# stringify_affiliation_rec(node)
# month_or_day_formater(month_or_day)
# pretty_print(node)

In [168]:
# dict_out

In [None]:
# from .pubmed_oa_parser import list_xml_path, \
#                               parse_pubmed_xml, \
#                               parse_pubmed_references, \
#                               parse_pubmed_paragraph, \
#                               parse_pubmed_caption, \
#                               parse_pubmed_table

# from .pubmed_web_parser import parse_xml_web, \
#                                parse_citation_web, \
#                                parse_outgoing_citation_web

In [167]:
# import os
# import re
# from glob import glob
# from datetime import datetime
# import random
# import subprocess
# # import pubmed_parser as pp
# from pyspark.sql import Row, SQLContext
# from pyspark import SparkConf, SparkContext
# from utils import get_update_date

# # directory
# home_dir = os.path.expanduser('~')
# download_dir = os.path.join(home_dir, 'Downloads')
# unzip_dir = os.path.join(download_dir, 'pubmed_oa') # path to unzip tar file
# save_dir = os.path.join(home_dir, 'Desktop')

# def parse_name(p):
#     """Turn dataframe from pubmed_parser to list of Spark Row"""
#     author_list = p.author_list
#     author_table = list()
#     if len(author_list) >= 1:
#         for author in author_list:
#             r = Row(pmc=p.pmc, pmid=p.pmid, last_name=author[0],
#                     first_name=author[1], affiliation_id=author[2])
#             author_table.append(r)
#         return author_table
#     else:
#         return None

# def parse_affiliation(p):
#     """Turn dataframe from pubmed_parser to list of Spark Row"""
#     affiliation_list = p.affiliation_list
#     affiliation_table = list()
#     if len(affiliation_list) >= 1:
#         for affil in affiliation_list:
#             r = Row(pmc=p.pmc, pmid=p.pmid,
#                     affiliation_id=affil[0], affiliation=affil[1])
#             affiliation_table.append(r)
#         return affiliation_table
#     else:
#         return None

# def update():
#     """Download and update file"""
#     save_file = os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet')
#     file_list = list(filter(os.path.isdir, glob(save_file)))
#     if file_list:
#         d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
#         date_file = datetime.strptime(d, '%Y_%m_%d')
#         date_update = get_update_date(option='oa')
#         # if update is newer
#         is_update = date_update > date_file
#         if is_update:
#             print("MEDLINE update available!")
#             subprocess.call(['rm', '-rf', os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet')]) # remove
#             subprocess.call(['rm', '-rf', download_dir, 'pubmed_oa'])
#             subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir])
#             if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir)
#             subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir])
#         else:
#             print("No update available")
#     else:
#         print("Download Pubmed Open-Access for the first time")
#         is_update = True
#         date_update = get_update_date(option='oa')
#         subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir])
#         if not os.path.isdir(unzip_dir): os.mkdir(unzip_dir)
#         subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir])
#     return is_update, date_update

# def process_file(date_update, fraction=0.01):
#     """Process unzipped Pubmed Open-Access folder to parquet file"""
#     print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction))
#     date_update_str = date_update.strftime("%Y_%m_%d")
#     if glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')):
#         subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet']) # remove if folder still exist

#     path_all = pp.list_xml_path(unzip_dir)
#     if fraction < 1:
#         n_sample = int(fraction * len(path_all))
#         rand_index = random.sample(range(len(path_all)), n_sample)
#         rand_index.sort()
#         path_sample = [path_all[i] for i in rand_index]
#     else:
#         path_sample = path_all

#     path_rdd = sc.parallelize(path_sample, numSlices=10000) # use only example path
#     parse_results_rdd = path_rdd.map(lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
#     pubmed_oa_df = parse_results_rdd.toDF()
#     pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
#                                      'file_name', 'pmc', 'pmid',
#                                      'publication_year', 'publisher_id',
#                                      'journal', 'subjects']]
#     pubmed_oa_df_sel.write.parquet(os.path.join(save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
#                                    mode='overwrite')

#     parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
#         filter(lambda x: x is not None).\
#         flatMap(lambda xs: [x for x in xs])
#     parse_name_df = parse_name_rdd.toDF()
#     parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
#                                 mode='overwrite')

#     parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
#         filter(lambda x: x is not None).\
#         flatMap(lambda xs: [x for x in xs])
#     parse_affil_df = parse_affil_rdd.toDF()
#     parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
#                                 mode='overwrite')
#     print('Finished parsing Pubmed Open-Access subset')

# conf = SparkConf().setAppName('pubmed_oa_spark')\
#     .setMaster('local[8]')\
#     .set('executor.memory', '8g')\
#     .set('driver.memory', '8g')\
#     .set('spark.driver.maxResultSize', '0')

# if __name__ == '__main__':
#     sc = SparkContext(conf=conf)
#     sqlContext = SQLContext(sc)
#     is_update, date_update = update()
#     if is_update:
#         process_file(date_update)
#     sc.stop()

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# import os
# from lxml import etree
# from itertools import chain
# # from .utils import *
# from unidecode import unidecode

# __all__ = [
#     'list_xml_path',
#     'parse_pubmed_xml',
#     'parse_pubmed_paragraph',
#     'parse_pubmed_references',
#     'parse_pubmed_caption'
# ]


# def list_xml_path(path_dir):
#     """
#     List full xml path under given directory
#     Parameters
#     ----------
#     path_dir: str, path to directory that contains xml or nxml file
#     Returns
#     -------
#     path_list: list, list of xml or nxml file from given path
#     """
#     fullpath = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(path_dir)) for f in fn]
#     path_list = [folder for folder in fullpath if os.path.splitext(folder)[-1] in ('.nxml', '.xml')]
#     return path_list


# def zip_author(author):
#     """
#     Give a list of author and its affiliation keys
#     in this following format
#     [first_name, last_name, [key1, key2]]
#     return [[first_name, last_name, key1], [first_name, last_name, key2]] instead
#     """
#     author_zipped = list(zip([[author[0], author[1]]] * len(author[-1]), author[-1]))
#     return list(map(lambda x: x[0] + [x[-1]], author_zipped))


# def flatten_zip_author(author_list):
#     """
#     Apply zip_author to author_list and flatten it
#     """
#     author_zipped_list = map(zip_author, author_list)
#     return list(chain.from_iterable(author_zipped_list))


# def parse_article_meta(tree):
#     """
#     Parse PMID, PMC, DOI and publisher id from the given article tree
#     """
#     article_meta = tree.find('.//article-meta')
#     pmid_node = article_meta.find('article-id[@pub-id-type="pmid"]')
#     pmc_node = article_meta.find('article-id[@pub-id-type="pmc"]')
#     pub_id_node = article_meta.find('article-id[@pub-id-type="publisher-id"]')
#     doi_node = article_meta.find('article-id[@pub-id-type="doi"]')

#     pmid = pmid_node.text if pmid_node is not None else ''
#     pmc = pmc_node.text if pmc_node is not None else ''
#     pub_id = pub_id_node.text if pub_id_node is not None else ''
#     doi = doi_node.text if doi_node is not None else ''

#     dict_article_meta = {'pmid': pmid,
#                          'pmc': pmc,
#                          'doi': doi,
#                          'publisher_id': pub_id}

#     return dict_article_meta


# def parse_pubmed_xml(path, include_path=False, nxml=False):
#     """
#     Given a single XML path, extract information from the XML file
#     and return it as a dictionary, together with an article id
#     (the PMID and PMC id concatenated as strings).
#     """
#     tree = read_xml(path, nxml)

#     tree_title = tree.find('.//title-group/article-title')
#     if tree_title is not None:
#         title = [t for t in tree_title.itertext()]
#         sub_title = tree.xpath('.//title-group/subtitle/text()')
#         title.extend(sub_title)
#         title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
#         full_title = ' '.join(title)
#     else:
#         full_title = ''

#     try:
#         abstracts = list()
#         abstract_tree = tree.findall('.//abstract')
#         for a in abstract_tree:
#             for t in a.itertext():
#                 text = t.replace('\n', ' ').replace('\t', ' ').strip()
#                 abstracts.append(text)
#         abstract = ' '.join(abstracts)
#     except:
#         abstract = ''

#     journal_node = tree.findall('.//journal-title')
#     if journal_node is not None:
#         journal = ' '.join([j.text for j in journal_node])
#     else:
#         journal = ''

#     dict_article_meta = parse_article_meta(tree)
#     pub_year_node = tree.find('.//pub-date/year')
#     pub_year = pub_year_node.text if pub_year_node is not None else ''
#     pub_month_node = tree.find('.//pub-date/month')
#     pub_month = pub_month_node.text if pub_month_node is not None else '01'
#     pub_day_node = tree.find('.//pub-date/day')
#     pub_day = pub_day_node.text if pub_day_node is not None else '01'

#     subjects_node = tree.findall('.//article-categories.//subj-group/subject')
#     subjects = list()
#     if subjects_node is not None:
#         for s in subjects_node:
#             subject = ' '.join([s_.strip() for s_ in s.itertext()]).strip()
#             subjects.append(subject)
#         subjects = '; '.join(subjects)
#     else:
#         subjects = ''

#     # create affiliation dictionary
#     affil_id = tree.xpath('.//aff[@id]/@id')
#     if len(affil_id) > 0:
#         affil_id = list(map(str, affil_id))
#     else:
#         affil_id = ['']  # no ids found: fall back to a list holding one empty string

#     affil_name = tree.xpath('.//aff[@id]')
#     affil_name_list = list()
#     for e in affil_name:
#         name = stringify_affiliation_rec(e)
#         name = name.strip().replace('\n', ' ')
#         affil_name_list.append(name)
#     affiliation_list = [[idx, name] for idx, name in zip(affil_id, affil_name_list)]

#     tree_author = tree.xpath('.//contrib-group/contrib[@contrib-type="author"]')
#     author_list = list()
#     for author in tree_author:
#         author_aff = author.findall('xref[@ref-type="aff"]')
#         try:
#             ref_id_list = [str(a.attrib['rid']) for a in author_aff]
#         except:
#             ref_id_list = ''
#         try:
#             author_list.append([author.find('name/surname').text,
#                                 author.find('name/given-names').text,
#                                 ref_id_list])
#         except:
#             author_list.append(['', '', ref_id_list])
#     author_list = flatten_zip_author(author_list)

#     dict_out = {'full_title': full_title.strip(),
#                 'abstract': abstract,
#                 'journal': journal,
#                 'pmid': dict_article_meta['pmid'],
#                 'pmc': dict_article_meta['pmc'],
#                 'doi': dict_article_meta['doi'],
#                 'publisher_id': dict_article_meta['publisher_id'],
#                 'author_list': author_list,
#                 'affiliation_list': affiliation_list,
#                 'publication_year': pub_year,
#                 'publication_date': '{}-{}-{}'.format(pub_day, pub_month, pub_year),
#                 'subjects': subjects}
#     if include_path:
#         dict_out['path_to_file'] = path
        
#     article_id = str(dict_article_meta['pmid'])+str(dict_article_meta['pmc'])
#     return dict_out, article_id


# def parse_pubmed_references(path):
#     """
#     Given the path to an XML file, parse the referenced articles
#     into a list of dictionaries
#     """
#     tree = read_xml(path)
#     dict_article_meta = parse_article_meta(tree)
#     pmid = dict_article_meta['pmid']
#     pmc = dict_article_meta['pmc']

#     references = tree.xpath('.//ref-list/ref[@id]')
#     dict_refs = list()
#     for reference in references:
#         ref_id = reference.attrib['id']

#         if reference.find('mixed-citation') is not None:
#             ref = reference.find('mixed-citation')
#         elif reference.find('element-citation') is not None:
#             ref = reference.find('element-citation')
#         else:
#             ref = None

#         if ref is not None:
#             if 'publication-type' in ref.attrib.keys() and ref is not None:
#                 if ref.attrib.values() is not None:
#                     journal_type = ref.attrib.values()[0]
#                 else:
#                     journal_type = ''
#                 names = list()
#                 if ref.find('name') is not None:
#                     for n in ref.findall('name'):
#                         name = ' '.join([t.text or '' for t in n.getchildren()][::-1])
#                         names.append(name)
#                 elif ref.find('person-group') is not None:
#                     for n in ref.find('person-group'):
#                         name = ' '.join(n.xpath('given-names/text()') + n.xpath('surname/text()'))
#                         names.append(name)
#                 if ref.find('article-title') is not None:
#                     article_title = stringify_children(ref.find('article-title')) or ''
#                     article_title = article_title.replace('\n', ' ').strip()
#                 else:

#                     article_title = ''
#                 if ref.find('source') is not None:
#                     journal = ref.find('source').text or ''
#                 else:
#                     journal = ''
#                 if ref.find('year') is not None:
#                     year = ref.find('year').text or ''
#                 else:
#                     year = ''
#                 if len(ref.findall('pub-id')) >= 1:
#                     for pubid in ref.findall('pub-id'):
#                         if 'doi' in pubid.attrib.values():
#                             doi_cited = pubid.text
#                         else:
#                             doi_cited = ''
#                         if 'pmid' in pubid.attrib.values():
#                             pmid_cited = pubid.text
#                         else:
#                             pmid_cited = ''
#                 else:
#                     doi_cited = ''
#                     pmid_cited = ''
#                 dict_ref = {'pmid': pmid,
#                             'pmc': pmc,
#                             'ref_id': ref_id,
#                             'pmid_cited': pmid_cited,
#                             'doi_cited': doi_cited,
#                             'article_title': article_title,
#                             'name': '; '.join(names),
#                             'year': year,
#                             'journal': journal,
#                             'journal_type': journal_type}
#                 dict_refs.append(dict_ref)
#     if len(dict_refs) == 0:
#         dict_refs = None


#     """
#     Given a tree and a reference dictionary,
#     return a dictionary of each referenced paragraph, the section it belongs to,
#     and its cited PMID
#     """
#     tree = read_xml(path)
#     dict_article_meta = parse_article_meta(tree)
#     pmid = dict_article_meta['pmid']
#     pmc = dict_article_meta['pmc']

#     paragraphs = tree.xpath('//body//p')
#     dict_pars = list()
#     for paragraph in paragraphs:
#         paragraph_text = stringify_children(paragraph)
#         section = paragraph.find('../title')
#         if section is not None:
#             section = stringify_children(section).strip()
#         else:
#             section = ''

#         ref_ids = list()
#         for reference in paragraph.getchildren():
#             if 'rid' in reference.attrib.keys():
#                 ref_id = reference.attrib['rid']
#                 ref_ids.append(ref_id)

#         dict_par = {'pmc': pmc,
#                     'pmid': pmid,
#                     'reference_ids': ref_ids,
#                     'section': section,
#                     'text': paragraph_text}
#         if len(ref_ids) >= 1 or all_paragraph:
#             dict_pars.append(dict_par)

#     """
#     Given a single XML path, extract each figure caption and
#     the reference id pointing back to that figure
#     """
#     tree = read_xml(path)
#     dict_article_meta = parse_article_meta(tree)
#     pmid = dict_article_meta['pmid']
#     pmc = dict_article_meta['pmc']

#     figs = tree.findall('.//fig')
#     dict_captions = list()
#     if figs is not None:
#         for fig in figs:
#             fig_id = fig.attrib['id']
#             fig_label = stringify_children(fig.find('label'))
#             fig_captions = fig.find('caption').getchildren()
#             caption = ' '.join([stringify_children(c) for c in fig_captions])
#             graphic = fig.find('graphic')
#             if graphic is not None:
#                 graphic_ref = graphic.attrib.values()[0]
#             dict_caption = {'pmid': pmid,
#                             'pmc': pmc,
#                             'fig_caption': caption,
#                             'fig_id': fig_id,
#                             'fig_label': fig_label,
#                             'graphic_ref': graphic_ref}
#             dict_captions.append(dict_caption)
#     if not dict_captions:
#         dict_captions = None


# # def table_to_df(table_text):
# #     """
# #     Function to transform plain xml text to list of row values and
# #     columns
# #     """
# #     table_tree = etree.fromstring(table_text)
# #     columns = []
# #     for tr in table_tree.xpath('thead/tr'):
# #         for c in tr.getchildren():
# #             columns.append(unidecode(stringify_children(c)))

# #     row_values = []
# #     len_rows = []
# #     for tr in table_tree.findall('tbody/tr'):
# #         es = tr.xpath('td')
# #         row_value = [unidecode(stringify_children(e)) for e in es]
# #         len_rows.append(len(es))
# #         row_values.append(row_value)
# #     if len(len_rows) >= 1:
# #         len_row = max(set(len_rows), key=len_rows.count)
# #         row_values = [r for r in row_values if len(r) == len_row] # remove row with different length
# #         return columns, row_values
# #     else:
# #         return None, None


# # def parse_pubmed_table(path, return_xml=True):
#     """
#     Parse table from given Pubmed Open-Access XML file
#     """
#     tree = read_xml(path)
#     dict_article_meta = parse_article_meta(tree)
#     pmid = dict_article_meta['pmid']
#     pmc = dict_article_meta['pmc']

#     # parse table
#     tables = tree.xpath('.//body.//sec.//table-wrap')
#     table_dicts = list()
#     for table in tables:
#         if table.find('label') is not None:
#             label = unidecode(table.find('label').text or '')
#         else:
#             label = ''

#         # table caption
#         if table.find('caption/p') is not None:
#             caption_node = table.find('caption/p')
#         elif table.find('caption/title') is not None:
#             caption_node = table.find('caption/title')
#         else:
#             caption_node = None
#         if caption_node is not None:
#             caption = unidecode(stringify_children(caption_node).strip())
#         else:
#             caption = ''

#         # table content
#         if table.find('table') is not None:
#             table_tree = table.find('table')
#         elif table.find('alternatives/table') is not None:
#             table_tree = table.find('alternatives/table')
#         else:
#             table_tree = None

#         if table_tree is not None:
#             table_xml = etree.tostring(table_tree)
#             columns, row_values = table_to_df(table_xml)
#             if row_values is not None:
                
#         table_dict =  {'pmid': pmid,
#                                'pmc': pmc,
#                                'article_title': article_title,
#                                'full_title': full_title.strip(),
#                                'subjects': subjects
#                                'abstract': abstract,
#                                'journal': journal,                              
#                                'text': paragraph_text,
#                                'fig_caption': caption,
#                                'fig_id': fig_id,
#                                'fig_label': fig_label,
#                                'graphic_ref': graphic_ref,
#                                'label': label,
#                                'table_columns': columns,
#                                'table_values': row_values,
#                                'pmid_cited': pmid_cited,
#                                'doi_cited': doi_cited,
#                                 'name': '; '.join(names),
#                                 'year': year,
#                                 'journal': journal,
#                                 'journal_type': journal_type,
#                                 'reference_ids': ref_ids,
#                                 'section': section,
#                                 'publisher_id': dict_article_meta['publisher_id'],
#                                 'doi': dict_article_meta['doi'],
#                                 'author_list': author_list,
#                                 'affiliation_list': affiliation_list,
#                                 'publication_year': pub_year,
#                                 'publication_date': '{}-{}-{}'.format(pub_day, pub_month, pub_year)}
                 
#                 if return_xml:
#                     table_dict['table_xml'] = table_xml
#                 table_dicts.append(table_dict)
#     if len(table_dicts) >= 1:
#         return table_dicts
#     else:
#         return None

In [119]:
#    dict_out = {'full_title': full_title.strip(),----
#                 'abstract': abstract,------
#                 'journal': journal,------
#                 'pmid': dict_article_meta['pmid'],----
#                 'pmc': dict_article_meta['pmc'],-----
#                 'doi': dict_article_meta['doi'],-------
#                 'publisher_id': dict_article_meta['publisher_id'],-----------
#                 'author_list': author_list,---------------
#                 'affiliation_list': affiliation_list,--------------
#                 'publication_year': pub_year,----------------
#                 'publication_date': '{}-{}-{}'.format(pub_day, pub_month, pub_year),----------------
#                 'subjects': subjects}---------------
    
#     {'pmid': pmid,-------
#     'pmc': pmc,-----
#     'fig_caption': caption,---------
#     'fig_id': fig_id,-----------
#     'fig_label': fig_label,-----------
#     'graphic_ref': graphic_ref------------
    
     
     
#         dict_par = {'pmc': pmc,-------------
#                     'pmid': pmid,-----------------
#                     'reference_ids': ref_ids,-----------
#                     'section': section,-----------------
#                     'text': paragraph_text}-----------
     
     
#      dict_ref = {'pmid': pmid,------------
#                             'pmc': pmc,------
#                             'ref_id': ref_id,
#                             'pmid_cited': pmid_cited,
#                             'doi_cited': doi_cited,
#                             'article_title': article_title,
#                             'name': '; '.join(names),
#                             'year': year,
#                             'journal': journal,
#                             'journal_type': journal_type}
     
     
#          dict_caption = {'pmid': pmid,-----
#                             'pmc': pmc,----
#                             'fig_caption': caption,
#                             'fig_id': fig_id,
#                             'fig_label': fig_label,
#                             'graphic_ref': graphic_ref}

SyntaxError: invalid syntax (<ipython-input-119-77d3e5d1fbfc>, line 12)