In [2]:
# walk_to_json.py post 2to3 refactoring

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to walk AWOL backup and create json resource files.
"""


import argparse
import errno
import fileinput
from functools import wraps
import hashlib
import json
import logging
import os
import pprint
import re
import sys
import traceback

# from pyzotero import zotero
from isaw.awol import awol_article, resource
from isaw.awol.parse.awol_parsers import AwolParsers

RX_URLFLAT = re.compile(r'[=+\?\{\}\{\}\(\)\\\-_&%#/,\.;:]+')
RX_DEDUPEH = re.compile(r'[-]+')
DEFAULTLOGLEVEL = logging.WARNING

def arglogger(func):
    """
    Decorator that logs the arguments of every call to *func*.

    Each call emits one DEBUG record on the logger named after the wrapped
    function (``func.__name__``) and then returns whatever *func* returns.
    The wrapped function keeps its name and docstring via ``functools.wraps``.
    """
    logger = logging.getLogger(func.__name__)

    @wraps(func)
    def inner(*args, **kwargs):
        # Hand the arguments to the logging call instead of pre-formatting
        # the message: they are only rendered when DEBUG is enabled, so a
        # costly or failing __repr__ cannot affect the wrapped call.
        logger.debug("called with arguments: %s, %s", args, kwargs)
        return func(*args, **kwargs)
    return inner


def _sha1_hex(text):
    """Return the hexadecimal SHA-1 digest of the str *text*, hashed as UTF-8."""
    return hashlib.sha1(text.encode('utf-8')).hexdigest()


def _resource_key(url, domain):
    """Derive the file-name key for the resource at *url* within *domain*."""
    # Everything after the domain, minus the first character that follows it.
    stub = url.split(domain)[-1][1:]
    if stub in ('', '/'):
        stub = domain.replace('.', '-')
    if stub.endswith('/'):
        stub = stub[:-1]
    # Long stubs, or stubs with query/escape characters, are hashed instead
    # of flattened.  The length is measured on the UTF-8 bytes, which is what
    # the pre-2to3 code measured.
    if len(stub.encode('utf-8')) > 80 or any(c in stub for c in '?&% '):
        return _sha1_hex(stub)
    return RX_DEDUPEH.sub('-', RX_URLFLAT.sub('-', stub))


def _store_resource(r, article_url, dest_dir, index, logger):
    """Write resource *r* as JSON below *dest_dir* and record it in *index*."""
    logger.info('\n-----------------------------------------------------------------------------------------\nRESOURCE\n')
    logger.info('url: {0}'.format(r.url))
    logger.info('title: {0}'.format(r.title))
    domain = r.domain
    this_dir = os.path.join(dest_dir, domain)
    # Still raises if the path exists but is not a directory.
    os.makedirs(this_dir, exist_ok=True)
    domain_index = index.setdefault(domain, {})
    resource_key = _resource_key(r.url, domain)
    this_path = os.path.join(this_dir, resource_key + '.json')
    if resource_key in domain_index:
        # collision! load earlier version from disk and merge
        logger.warning('collision in {0}: {1}/{2}'.format(article_url, r.domain, resource_key))
        r_earlier = resource.Resource()
        r_earlier.json_load(this_path)
        try:
            r = resource.merge(r_earlier, r)
        except ValueError as e:
            logger.error(str(e) + ' while trying to merge; saving separately')
            resource_key = _sha1_hex(r.url)
            this_path = os.path.join(this_dir, resource_key + '.json')
    r.resource_key = resource_key
    r.json_dump(this_path, formatted=True)
    logger.info('filename: {0}'.format(this_path))
    try:
        resource_title = r.extended_title
    except AttributeError:
        resource_title = r.title
    resource_package = {
        'title_full': resource_title,
        'url': r.url,
        'key': resource_key,
    }
    if resource_title != r.title:
        resource_package['title'] = r.title
    domain_index.setdefault(resource_key, []).append(resource_package)


def _log_index_summary(index, logger):
    """Log the per-domain resource listing and the overall statistics."""
    logger.info('sorting domain list')
    domain_list = sorted(index.keys())
    resource_count = 0
    record_count = 0
    max_collisions = 0
    total_collisions = 0
    redundant_resources = 0
    logger.info("FULL INDEX OF RESOURCES")
    logger.info("=======================")
    for domain in domain_list:
        logger.info(domain)
        logger.info('-' * len(domain))
        logger.info('sorting resource list for domain {0}'.format(domain))
        resource_list = sorted(index[domain].keys())
        logger.info('{0} unique resources in this domain'.format(len(resource_list)))
        resource_count = resource_count + len(resource_list)
        for resource_key in resource_list:
            records = index[domain][resource_key]
            logger.info('    {0}'.format(records[0]['title_full']))
            record_count = record_count + len(records)
            if len(records) > 1:
                logger.info ('        multiple records: {0}'.format(len(records)))
                total_collisions = total_collisions + len(records)
                redundant_resources = redundant_resources + 1
                max_collisions = max(max_collisions, len(records))
    # Guard the percentage against an empty index (nothing was parsed).
    if resource_count:
        redundancy = float(redundant_resources) / float(resource_count) * 100.0
    else:
        redundancy = 0.0
    logger.info("=======================")
    logger.info("Total {0} domains".format(len(domain_list)))
    logger.info("Total {0} unique resources recorded".format(resource_count))
    logger.info("Total number of records: {0}".format(record_count))
    logger.info("Highest number of redundancies (collisions): {0}".format(max_collisions))
    logger.info("Total number of redundant records: {0}".format(total_collisions))
    logger.info("Percentage of redundantly recorded resources: {0:.2f}".format(redundancy))


@arglogger
def main(args):
    """
    Walk an AWOL backup tree and dump every parsed resource as a JSON file.

    *args* is the parsed command line; this function reads ``args.whence[0]``
    (directory to walk), ``args.thence[0]`` (directory that receives one
    sub-directory per resource domain) and ``args.progress`` (print a
    progress banner every 50 posts).

    Only files whose name contains ``post-`` and ends in ``.xml`` are
    processed.  Articles that fail to load with ValueError/RuntimeError, and
    articles the parsers reject with NotImplementedError, are logged as
    warnings and skipped.  When two resources map to the same key within a
    domain, the earlier JSON file is loaded and merged with the new resource;
    if the merge raises ValueError the new resource is saved separately under
    the SHA-1 of its URL.  A summary of the index is logged at the end.
    """
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    root_dir = args.whence[0]
    dest_dir = args.thence[0]
    walk_count = 0
    index = {}
    parsers = AwolParsers()
    for dir_name, sub_dir_list, file_list in os.walk(root_dir):
        for file_name in file_list:
            if not ('post-' in file_name and file_name[-4:] == '.xml'):
                logger.debug('skipping {0}'.format(file_name))
                continue
            walk_count = walk_count + 1
            if args.progress and walk_count % 50 == 1:
                # NOTE: 4261 is a hard-coded count of posts in the backup
                # this script was written against.
                print('\n*****************************\nPERCENT COMPLETE: {0:.0f}\n'.format(float(walk_count)/4261.0*100.0))
            logger.info('\n=========================================================================================\nARTICLE:\n')
            target = os.path.join(dir_name, file_name)
            try:
                a = awol_article.AwolArticle(atom_file_name=target)
            except (ValueError, RuntimeError) as e:
                logger.warning(e)
                continue
            logger.info('article title: {0}'.format(a.title))
            logger.info('url: {0}'.format(a.url))
            awol_id = '-'.join(('awol', a.id.split('.')[-1]))
            logger.info('awol_id: {0}'.format(awol_id))
            try:
                resources = parsers.parse(a)
            except NotImplementedError as e:
                logger.warning(e)
                continue
            # The parser result may not support len() (e.g. None); treat
            # that as "no resources".
            try:
                length = len(resources)
            except TypeError:
                length = 0
            if length <= 0:
                continue
            for r in resources:
                _store_resource(r, a.url, dest_dir, index, logger)
        # Prune version-control directories in place so os.walk skips them.
        for ignore_dir in ['.git', '.svn', '.hg']:
            if ignore_dir in sub_dir_list:
                sub_dir_list.remove(ignore_dir)

    _log_index_summary(index, logger)
if __name__ == "__main__":
    # Command-line entry point: configure logging from the options, then run
    # main().  Exits 0 on success and 1 on any unexpected exception.
    log_level = DEFAULTLOGLEVEL
    log_level_name = logging.getLevelName(log_level)
    logging.basicConfig(level=log_level)

    try:
        parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument ("-l", "--loglevel", type=str, help="desired logging level (case-insensitive string: DEBUG, INFO, WARNING, ERROR" )
        parser.add_argument ("-v", "--verbose", action="store_true", default=False, help="verbose output (logging level == INFO")
        parser.add_argument ("-vv", "--veryverbose", action="store_true", default=False, help="very verbose output (logging level == DEBUG")
        parser.add_argument ("--progress", action="store_true", default=False, help="show progress")
        # NOTE: 'credfile' is required on the command line but is not read
        # anywhere in this script.
        parser.add_argument('credfile', type=str, nargs=1, help='path to credential file')
        #parser.add_argument('postfile', type=str, nargs='?', help='filename containing list of post files to process')
        parser.add_argument('whence', type=str, nargs=1, help='path to directory to read and process')
        parser.add_argument('thence', type=str, nargs=1, help='path to directory where you want the json-serialized resources dumped')
        args = parser.parse_args()
        if args.loglevel is not None:
            args_log_level = re.sub(r'\s+', '', args.loglevel.strip().upper())
            # Only accept names that resolve to a numeric level; other
            # attributes of the logging module (functions, strings) would
            # make setLevel() fail later.
            requested_level = getattr(logging, args_log_level, None)
            if isinstance(requested_level, int):
                log_level = requested_level
            else:
                logging.error("command line option to set log_level failed because '%s' is not a valid level name; using %s" % (args_log_level, log_level_name))
        # -vv / -v take precedence over --loglevel.
        if args.veryverbose:
            log_level = logging.DEBUG
        elif args.verbose:
            log_level = logging.INFO
        log_level_name = logging.getLevelName(log_level)
        logging.getLogger().setLevel(log_level)
        if log_level != DEFAULTLOGLEVEL:
            logging.warning("logging level changed to %s via command line option" % log_level_name)
        else:
            logging.info("using default logging level: %s" % log_level_name)
        logging.debug("command line: '%s'" % ' '.join(sys.argv))
        main(args)
        sys.exit(0)
    except (KeyboardInterrupt, SystemExit):  # Ctrl-C or sys.exit()
        raise
    except Exception as e:
        print("ERROR, UNEXPECTED EXCEPTION")
        print(str(e))
        traceback.print_exc()
        # sys.exit (rather than os._exit) so buffered output is flushed
        # before the process ends.
        sys.exit(1)


ModuleNotFoundError: No module named 'isaw'

In [None]:
# awol_article.py post 2to3 refactoring

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Work with an Atom entry representing an AWOL blog post.

This module defines the following classes:
 
 * AwolArticle: represents key information about the entry.

"""

from importlib import import_module
import logging
import os
import pkg_resources
import re
import sys

from bs4 import BeautifulSoup
import langid
import requests
import unicodecsv

from isaw.awol.article import Article
from isaw.awol.normalize_space import normalize_space
from isaw.awol.resource import Resource


PATH_CURRENT = os.path.dirname(os.path.abspath(__file__))
# Build a dictionary of format {<colon prefix>: [omit_post, strip_title, mul_res]}
# from the packaged CSV.  Keys are space-normalized and lower-cased; when a
# prefix occurs more than once the last row wins.
COLON_PREFIXES = dict()
colon_prefix_csv = pkg_resources.resource_stream('isaw.awol', 'awol_colon_prefixes.csv')
try:
    dreader = unicodecsv.DictReader(
        colon_prefix_csv,
        fieldnames = [
            'col_pre',
            'omit_post',
            'strip_title',
            'mul_res'
        ],
        delimiter = ',',
        quotechar = '"')
    for row in dreader:
        COLON_PREFIXES[normalize_space(row['col_pre']).lower()] = [
            row['omit_post'],
            row['strip_title'],
            row['mul_res']
        ]
    del dreader
finally:
    # Release the packaged resource once it has been read.
    colon_prefix_csv.close()
DOMAINS_TO_IGNORE = [
    'draft.blogger.com'
]
DOMAINS_SECONDARY = [
    'ancientworldonline.blogspot.com'
]
LANGID_THRESHOLD = 0.95
# Runs of punctuation, quotation marks and whitespace.
RX_CANARY = re.compile(r'[\.,:!\"“„\;\-\s]+', re.IGNORECASE)
# Strings made only of digits, punctuation, brackets and whitespace,
# optionally preceded by (any part of) the word "and".
RX_NUMERICISH = re.compile(r'^a?n?d?\s*[\.,:!\"“„\;\-\s\d\(\)\[\]]+$', re.IGNORECASE)
# Captures the host part of an http(s) URL.
RX_MATCH_DOMAIN = re.compile(r'^https?://([^/#]+)')
# Patterns for spotting and extracting ISSNs/ISBNs in free text.
#   'electronic' - identifier accompanied by an "electronic/online/digital" cue
#   'generic'    - identifier without such a cue
#   'extract'    - pulls the identifier value out of a matched string
# In every character class the hyphen-minus is placed last so that it is a
# literal; written as "X-‒" it would form a range from "X" up to the figure
# dash and would leave the plain hyphen out of the class.
RX_IDENTIFIERS = {
    'issn': {
        'electronic': [
            re.compile(r'(electronic|e-|e‒|e–|e—|e|online|on-line|digital)([\s:]*issn[^\d]*[\dX‒–—-]{4}[-‒–—\s]?[\dX]{4})', re.IGNORECASE),
            re.compile(r'(issn[\s\(]*)(electrónico|électronique|online|on-line|digital)([^\d]*[\dX‒–—-]{4}[-‒–—\s]?[\dX]{4})', re.IGNORECASE),
            re.compile(r'(issn[^\d]*[\dX‒–—-]{4}[-‒–—\s]?[\dX]{4}[\s\(]*)(electrónico|électronique|online|on-line|digital)', re.IGNORECASE),
        ],
        'generic': [
            re.compile(r'(issn[^\d]*[\dX‒–—-]{4}[-‒–—\s]?[\dX]{4})', re.IGNORECASE),
            # {8,9} is a repetition count; "{8-9}" would be matched literally.
            re.compile(r'(issn[^\d]*[\dX‒–—-]{8,9})', re.IGNORECASE)
        ],
        'extract': {
            'precise': re.compile(r'^[^\d]*([\dX]{4}[-‒–—\s]?[\dX]{4}).*$', re.IGNORECASE),
            'fallback': re.compile(r'^[^\d]*([\dX‒–—\s-]+).*$', re.IGNORECASE)
        }
    },
    'isbn': {
        'electronic': [
            re.compile(r'(electronic|e-|e‒|e–|e—|online|on-line|digital)([\s:]*isbn[^\d]*[\dX‒–—-]+)', re.IGNORECASE),
            re.compile(r'(isbn[\s\(]*)(electrónico|électronique|online|on-line|digital)([^\d]*[\dX‒–—-]+)', re.IGNORECASE),
            re.compile(r'(isbn[^\d]*[\dX‒–—-]+[\s\(]*)(electrónico|électronique|online|on-line|digital)', re.IGNORECASE),
        ],
        'generic': [
            re.compile(r'isbn[^\d]*[\dX‒–—-]+', re.IGNORECASE),
        ],
        'extract': {
            'precise': re.compile(r'^[^\d]*([\dX‒–—-]+).*$', re.IGNORECASE),
        }
    }
}

# Map title strings to tags, as listed in the packaged CSV (columns:
# 'titles', 'tags').  When a title occurs more than once the last row wins.
TITLE_SUBSTRING_TAGS = dict()
title_strings_csv = pkg_resources.resource_stream('isaw.awol', 'awol_title_strings.csv')
try:
    dreader = unicodecsv.DictReader(
        title_strings_csv,
        fieldnames = [
            'titles',
            'tags'
        ],
        delimiter = ',',
        quotechar = '"')
    for row in dreader:
        TITLE_SUBSTRING_TAGS[row['titles']] = row['tags']
    del dreader
finally:
    # Release the packaged resource once it has been read.
    title_strings_csv.close()
# Single-word entries (no space in the key) ...
TITLE_SUBSTRING_TERMS = {k:v for (k,v) in TITLE_SUBSTRING_TAGS.items() if ' ' not in k}
TITLE_SUBSTRING_TERMS['boğazköy'] = 'Boğazköy'
# ... and everything else, i.e. every key that is not already a term.
TITLE_SUBSTRING_PHRASES = {k:v for (k,v) in TITLE_SUBSTRING_TAGS.items() if k not in TITLE_SUBSTRING_TERMS}
# Hand-maintained lookup tables.  Their consumers are not visible in this
# part of the file, so the notes below describe only the data itself.

# Host names (no scheme, no path).
AGGREGATORS = [
    'www.jstor.org',
    'oi.uchicago.edu',
    'www.persee.fr',
    'dialnet.unirioja.es',
    'amar.hsclib.sunysb.edu',
    'hrcak.srce.hr',
    'www.griffith.ox.ac.uk'
]
# Full URLs on two of the AGGREGATORS hosts (www.jstor.org, oi.uchicago.edu).
AGGREGATOR_IGNORE = [
    'http://www.jstor.org/page/info/about/archives/collections.jsp',
    'https://oi.uchicago.edu/getinvolved/',
    'http://oi.uchicago.edu/news/'
]
# Blog post URL -> list of integers.
# NOTE(review): presumably indexes of the resources to keep for that post;
# confirm against the code that reads this table.
POST_SELECTIVE = {
    'http://ancientworldonline.blogspot.com/2012/07/chicago-demotic-dictionary-t.html': [0,],
    'http://ancientworldonline.blogspot.com/2013/01/new-issues-of-asor-journals.html': [0,1,]
}
# Lower-case text strings; every entry also appears in SUPPRESS_RESOURCE.
SUBORDINATE_FLAGS = [
    'terms of use',
    'download pdf',
    'download',
]
# Blog post URLs.
NO_FORCING = [
    'http://ancientworldonline.blogspot.com/2011/03/ancient-world-in-persee.html',
    'http://ancientworldonline.blogspot.com/2009/09/open-access-journals-in-ancient-studies.html',
    'http://ancientworldonline.blogspot.com/2011/05/open-access-journal-bsaa-arqueologia.html',
]
# Blog post URLs; the "ancient-world-in-persee" post is also in NO_FORCING.
NO_SUBORDINATES = [
    'http://ancientworldonline.blogspot.com/2012/12/newly-online-from-ecole-francaise-de.html',
    'http://ancientworldonline.blogspot.com/2011/03/ancient-world-in-persee.html'
]
# Mostly URLs, plus one lower-case text string
# ('oriental institute news & notes').
# NOTE(review): '.../pubs/catalog/as/' is listed twice.
FORCE_AS_SUBORDINATE_AFTER = [
    'http://oi.uchicago.edu/research/library/acquisitions.html',
    'http://oi.uchicago.edu/research/pubs/ar/10-11/',
    'http://oi.uchicago.edu/research/pubs/ar/28-59/',
    'http://oi.uchicago.edu/research/pubs/catalog/as/',
    'http://oi.uchicago.edu/research/pubs/catalog/as/',
    'http://oi.uchicago.edu/research/pubs/catalog/saoc/',
    'http://www.persee.fr/web/ouvrages/home/prescript/fond/befar',
    'http://www.persee.fr/web/ouvrages/home/prescript/issue/mom_0184-1785_2011_act_45_1#',
    'https://oi.uchicago.edu/research/pubs/ar/11-20/11-12/',
    'https://oi.uchicago.edu/research/pubs/catalog/oip/',
    'oriental institute news & notes',
    'http://amar.hsclib.sunysb.edu/amar/',
    'http://www.persee.fr/web/revues/home/prescript/issue/litt_0047-4800_2001_num_122_2',
    'http://oi.uchicago.edu/research/pubs/nn/',
    'http://ancientworldonline.blogspot.com/2010/04/open-access-journal-oriental-institute.html'
]
# Lower-case text strings; 'membership' also appears in SUPPRESS_RESOURCE.
RELATED_FLAGS = [
    'list of volumes in print',
    'membership'
]
# URLs plus one text string that is also in RELATED_FLAGS.
FORCE_AS_RELATED_AFTER = [
    'http://oi.uchicago.edu/research/library/dissertation/nolan.html',
    'http://oi.uchicago.edu/research/pubs/ar/28-59',
    'https://oi.uchicago.edu/research/pubs/archeological/',
    'list of volumes in print',
]
# Lower-case text strings: all of SUBORDINATE_FLAGS, plus 'membership' and
# 'here'.
SUPPRESS_RESOURCE = [
    'terms of use',
    'download pdf',
    'download',
    'membership',
    'here'
]


# One or more consecutive dash characters: figure dash, en dash, em dash or
# hyphen-minus.
RX_DASHES = re.compile(r'[‒–—-]+')


def clean_title(raw):
    """Return a shortened, de-punctuated version of the title string *raw*.

    The text is first passed through ``normalize_space``.  If it contains more
    than two period-separated segments, only the first two are kept, and
    further segments are re-attached one at a time while the result is still
    shorter than 40 characters.  Then, in a fixed order, one layer of
    enclosing brackets/quotes and one leading and one trailing punctuation
    character are removed; after each such step a leading ``'and '`` is
    dropped and surrounding whitespace is stripped.

    Titles that are empty, or that become empty while being stripped (for
    example ``'()'`` or ``'-'``), yield ``''`` instead of raising IndexError.
    """
    prepped = normalize_space(raw)
    chopped = prepped.split('.')
    if len(chopped) > 2:
        cooked = '.'.join(chopped[:2])
        for segment in chopped[2:]:
            if len(cooked) >= 40:
                break
            cooked = cooked + '.' + segment
    else:
        cooked = prepped

    # (opening, closing) pairs removed only when both ends match.
    wrappers = (
        ('(', ')'),
        ('[', ']'),
        ('{', '}'),
        ('"', '"'),
        ("'", "'"),
        ('<', '>'),
        ('«', '»'),
        ('‘', '’'),
        ('‚', '‛'),
        ('“', '”'),
        ('‟', '„'),
        ('‹', '›'),
        ('〟', '＂'),
    )
    # Single characters removed independently from the start and the end.
    edge_chars = ('\\', '/', '|', ',', ';', '-', '.', '_')

    def tidy(text):
        # Applied after every stripping step, as in the original loop.
        if text[0:4] == 'and ':
            text = text[4:]
        return text.strip()

    # startswith/endswith are used instead of indexing so that an empty
    # string simply fails every test.
    for opener, closer in wrappers:
        if cooked.startswith(opener) and cooked.endswith(closer):
            cooked = cooked[1:-1]
        cooked = tidy(cooked)
    for char in edge_chars:
        if cooked.startswith(char):
            cooked = cooked[1:]
        if cooked.endswith(char):
            cooked = cooked[:-1]
        cooked = tidy(cooked)
    return cooked

class AwolArticle(Article):
    """Manipulate and extract data from an AWOL blog post."""

    def __init__(self, atom_file_name=None, json_file_name=None):
        """Load the post through ``Article`` and look up its title prefix flags.

        After the base initializer has run, the lower-cased title is looked
        up in COLON_PREFIXES; initialization returns at once when the first
        flag stored for that title is ``'yes'``.
        """
        super().__init__(atom_file_name, json_file_name)
        prefix_flags = COLON_PREFIXES.get(self.title.lower())
        if prefix_flags is not None and prefix_flags[0] == 'yes':
            return None






In [None]:
# article.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Working with blog posts.

This module defines the following classes:
 
 * Article: represents key information about the post.
"""

import logging
import os
import sys
import unicodedata

# The Beautiful Soup distribution is named "beautifulsoup4" on PyPI, but the
# importable package is "bs4".
from bs4 import BeautifulSoup
from lxml import etree as exml
from lxml.etree import XMLSyntaxError as XMLSyntaxError

from isaw.awol.normalize_space import normalize_space
from isaw.awol.clean_string import purify_text, purify_html
from isaw.awol.tools import urls

# Strict parser used first; the lenient (recover=True) one is the fallback.
XML_PARSER = exml.XMLParser(recover=False)
XML_PARSER_LENIENT = exml.XMLParser(recover=True)
# Stylesheet expected next to this module; parsed once at import time.
XSL_CLEANUP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cleanup.xsl')
XSL_CLEANUP = exml.parse(XSL_CLEANUP_PATH)

class Article():
    """Manipulate and extract data from a blog post."""

    def __init__(self, atom_file_name=None, json_file_name=None):
        """Load post from Atom entry or JSON and extract basic info.

        If *atom_file_name* is given, the Atom entry is loaded (see
        ``_load_atom`` for the attributes that are set) and any
        *json_file_name* is ignored with a warning.  If only
        *json_file_name* is given, ``_load_json`` is called, which currently
        raises NotImplementedError.  With neither argument nothing is loaded.
        """

        logger = logging.getLogger(sys._getframe().f_code.co_name)

        if atom_file_name is not None:
            if json_file_name is not None:
                logger.warning(
                    'Filenames for both Atom and JSON were specified'
                    + ' in Article constructor. JSON filename ignored.')

            self._load_atom(atom_file_name)
        elif json_file_name is not None:
            # todo
            # Single underscore: a double-underscore call would be
            # name-mangled and never find _load_json.
            self._load_json(json_file_name)

    @staticmethod
    def _normalize_markup(soup):
        """Serialize *soup* and return it NFC-normalized, space-normalized and purified."""
        content = unicodedata.normalize('NFC', str(soup))
        content = normalize_space(content)
        return purify_html(content)  # get rid of all manner of evil, stupid stuff

    def _load_atom(self, atom_file_name):
        """Open atom file and parse for basic info.

        Sets the following attributes on the instance:

         * doc, root: the parsed lxml tree and its root element
         * id (string): stripped text of the entry's Atom ``id`` element
         * title (string): title of the original blog post
         * url (string): url for the original blog post (the ``href`` of the
           first ``link`` element with ``rel="alternate"``)
         * categories (list of dictionaries) with the following keys:
           * 'vocabulary' (string): captures "scheme" from the entry categories
           * 'term' (string): taken from the entry categories
         * content (string): normalized string containing everything
           that was in the entry content (see normalization comments below)
         * soup (bs4 BeautifulSoup object): html-parsed version of content
           after the cleanup XSL transform

        Title, url and category terms are converted to Unicode Normalization
        Form "C" and space normalized via ``normalize_space``.

        Raises RuntimeError when the title cannot be normalized or when the
        post URL is missing, cannot be normalized, or is rejected by
        ``urls.valid``.  If the content cannot be parsed even by the lenient
        parser, the error is logged and the process exits via ``sys.exit``.
        """

        logger = logging.getLogger(sys._getframe().f_code.co_name)

        # Binary mode: the XML parser decodes the bytes itself instead of
        # relying on the locale's default text encoding.
        with open(atom_file_name, 'rb') as file_object:
            self.doc = exml.parse(file_object)
        self.root = self.doc.getroot()
        root = self.root
        self.id = root.find('{http://www.w3.org/2005/Atom}id').text.strip()
        #logger.debug('article id: "{0}"'.format(self.id))

        # title of blog post should be same as title of atom entry
        raw_title = str(root.find('{http://www.w3.org/2005/Atom}title').text)
        try:
            self.title = purify_text(normalize_space(unicodedata.normalize('NFC', raw_title)))
        except TypeError:
            msg = 'could not extract blog post title for article with id: "{0}"'.format(self.id)
            # RuntimeError, like the URL failures below; RuntimeWarning is
            # not a RuntimeError, so "except RuntimeError" would miss it.
            raise RuntimeError(msg)

        # get url of blog post (html alternate)
        try:
            raw_url = str(root.xpath("//*[local-name()='link' and @rel='alternate']")[0].get('href'))
        except IndexError:
            msg = 'could not extract blog post URL for article with id: "{0}"'.format(self.id)
            raise RuntimeError(msg)
        try:
            raw_url = normalize_space(unicodedata.normalize('NFC', raw_url))
        except TypeError:
            msg = 'could not normalize blog post URL for article with id: "{0}"'.format(self.id)
            raise RuntimeError(msg)
        if not urls.valid(raw_url):
            msg = 'invalid blog post URL ({0}) for article with id: "{1}"'.format(raw_url, self.id)
            raise RuntimeError(msg)
        self.url = raw_url

        # capture categories as vocabulary terms
        self.categories = [{'vocabulary' : c.get('scheme'), 'term' : normalize_space(unicodedata.normalize('NFC', str(c.get('term'))))} for c in root.findall('{http://www.w3.org/2005/Atom}category')]

        # extract content, normalize, and parse as HTML for later use
        raw_content = root.find('{http://www.w3.org/2005/Atom}content').text
        # mainly to convert character entities to unicode
        content = self._normalize_markup(BeautifulSoup(raw_content))
        self.content = content
        try:
            html = exml.fromstring(content, XML_PARSER)
        except XMLSyntaxError:
            msg = 'XMLSyntaxError while trying to parse content of {0}; trying html5lib parser with BeautifulSoup and then lxml parser with recover=True'.format(atom_file_name)
            logger.warning(msg)
            content = self._normalize_markup(BeautifulSoup(raw_content, 'html5lib'))
            self.content = content
            try:
                html = exml.fromstring(content, XML_PARSER_LENIENT)
            except XMLSyntaxError:
                msg = 'XMLSyntaxError while trying to re-parse content of {0} using html5lib parser with BeautifulSoup'.format(atom_file_name)
                logger.error(msg)
                logger.error(content)
                sys.exit(-1000)

        #logger.debug('normalized html:\n\n' + exml.tostring(html, pretty_print=True))
        transform = exml.XSLT(XSL_CLEANUP)
        clean_html = transform(html)
        #logger.debug('cleaned html:\n\n' + exml.tostring(clean_html, pretty_print=True))
        self.soup = BeautifulSoup(exml.tostring(clean_html))


    def _load_json(self, json_file_name):
        """Open JSON file and parse for basic info (not implemented yet)."""
        emsg = 'Article constructor does not yet support JSON.'
        raise NotImplementedError(emsg)

    def __str__(self):
        """Return the article's data as one '|'-separated string.

        Fields, in order: id, title, tags, content, url, blogUrl, template,
        issn.  This class itself sets only some of them, so any attribute
        that is missing is rendered as an empty field instead of raising
        AttributeError.
        """
        def field(name):
            return getattr(self, name, '')

        return '|'.join([
            field('id'),
            field('title'),
            str(field('tags')),
            field('content'),
            field('url'),
            field('blogUrl'),
            field('template'),
            field('issn'),
        ])




In [None]:
# awol_parsers.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Bank of parsers to use for extracting AWOL blog content.

This module defines the following classes:

 * AwolParsers: parse AWOL blog post content for resources
"""

import logging
from importlib import import_module
import pkgutil
import sys

class AwolParsers():
    """Pluggable framework for parsing content from an AwolArticle."""

    def __init__(self):
        """Load available parsers.

        Every module in the ``isaw.awol.parse`` package whose name contains
        'parse' (except this module and the base-class modules) is imported,
        its ``Parser`` class is instantiated, and the instance is stored in
        ``self.parsers`` under its ``domain`` attribute.
        """

        self.parsers = {}

        ignore_parsers = {
            'awol_parsers',             # self
            'awol_parse',               # superclass
            'awol_parse_domain',        # superclass
        }
        # Search the package's own __path__ rather than the relative
        # directory 'isaw/awol/parse', which only resolves when the process
        # happens to run from the repository root.
        package_name = 'isaw.awol.parse'
        package = import_module(package_name)
        for _, parser_name, _ in pkgutil.iter_modules(package.__path__):
            if 'parse' not in parser_name or parser_name in ignore_parsers:
                continue
            parser_path = '.'.join((package_name, parser_name))
            #logger.debug('importing module "{0}"'.format(parser_path))
            mod = import_module(parser_path)
            parser = mod.Parser()
            self.parsers[parser.domain] = parser

    def parse(self, article):
        """Choose a parser for *article* and return the result of its ``parse``.

        Selection rules, in order:

         * no domains found in the article's soup: NotImplementedError
         * 'journal:' in the lower-cased title: the 'generic-single' parser
         * exactly one domain: the parser registered for that domain; if
           there is none, 'generic-single' for a short list of known hosts
           and 'generic' otherwise
         * more than one domain: NotImplementedError
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)

        self.reset()
        self.content_soup = article.soup
        domains = self.get_domains()
        length = len(domains)
        logger.debug(
            '\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nparsing '
            + article.url
            + '\n')
        logger.debug('domains: {0}'.format(repr(domains)))
        if length == 0:
            raise NotImplementedError('awol_parsers does not know what to do with no domains in article: {0}'.format(article.id))

        tlow = article.title.lower()
        if 'journal:' in tlow:
            parser = self.parsers['generic-single']
        elif length == 1:
            try:
                parser = self.parsers[domains[0]]
            except KeyError:
                if domains[0] in ['www.egyptpro.sci.waseda.ac.jp',]:
                    parser = self.parsers['generic-single']
                else:
                    parser = self.parsers['generic']
        else:
            raise NotImplementedError('awol_parsers does not know what to do with multiple domains in article: {0}\n    {1}'.format(article.id, '\n    '.join(domains)))
        logger.info('using "{0}" parser'.format(parser.domain))
        return parser.parse(article)


    def reset(self):
        """Forget the content soup of the previously parsed article."""
        self.content_soup = None

        #for parser in self.parsers:
        #    parser.reset()


    def get_domains(self, content_soup=None):
        """Find valid resource domains in content.

        If *content_soup* is given it replaces the stored soup; otherwise the
        soup stored by a previous call is used.  The work is delegated to the
        'generic' parser's ``get_domains``.  Raises AttributeError when no
        soup is available at all.
        """

        if content_soup is None and self.content_soup is None:
            raise AttributeError('No content soup has been fed to parsers.')

        if content_soup is not None:
            self.reset()
            self.content_soup = content_soup

        return self.parsers['generic'].get_domains(self.content_soup)



In [None]:
# awol_parse_generic.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse HTML content for resources generically.

This module defines the following classes:

 * Parser
"""

import logging
import sys

from isaw.awol.parse.awol_parse import AwolBaseParser

class Parser(AwolBaseParser):
    """Domain-agnostic parser: extracts data from an AWOL blog post
    regardless of which domain the resource lives on."""

    def __init__(self):
        """Label this parser 'generic', then run the base-class initializer."""
        # The label is assigned before delegating to the base class.
        self.domain = 'generic'
        super().__init__()



In [None]:
# awol_parse.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse HTML content for resources.

This module defines the following classes:

 * AwolParser: parse AWOL blog post content for resources
"""

from copy import copy, deepcopy
import logging
import pkg_resources
import pprint
import regex as re
import requests
import sys

from bs4 import BeautifulSoup
from bs4.element import NavigableString
from langid.langid import LanguageIdentifier, model
from lxml import etree
import unicodecsv

from isaw.awol.clean_string import *
from isaw.awol.normalize_space import normalize_space
from isaw.awol.resource import Resource
from isaw.awol.tools import mods

# Shared langid classifier, built once at import time from langid's bundled model.
# NOTE(review): norm_probs=True presumably makes classify() report a normalized
# 0-1 score, which is what LANGID_THRESHOLD is compared against -- confirm
# against the langid documentation.
LANGUAGE_IDENTIFIER = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# Minimum classifier score required before _get_language() reports a language.
LANGID_THRESHOLD = 0.98

# Link domains that are never treated as resources.
DOMAINS_IGNORE = [
    'draft.blogger.com',
    'bobcat.library.nyu.edu',
    'www.addthis.com',
    'cientworldonline.blogspot.com',  # sic: a link somewhere in the blog carries this typo
]
# Domains of the blog itself.
DOMAINS_SELF = [
    'ancientworldonline.blogspot.com',
]
# Recipes for fetching structured bibliographic records, keyed by link domain.
BIBLIO_SOURCES = {
    'zenon.dainst.org': {
        # Raw string (no invalid "\/" or "\d" escapes in a plain literal) and
        # escaped dots so that "." only matches a literal dot in the host name.
        'url_pattern': re.compile(r'^https?://zenon\.dainst\.org/Record/\d+/?$'),
        # Suffix appended to a record URL to request the structured version.
        'url_append': '/RDF',
        # Content type the response must declare.
        'type': 'application/rdf+xml',
        'namespaces': {
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'mods': 'http://www.loc.gov/mods/v3'
        },
        # Location and type of the bibliographic payload inside the response.
        'payload_xpath': '//rdf:Description[1]/mods:mods[1]',
        'payload_type': 'application/mods+xml',
        # Splits a compact YYYYMMDDhhmmss[.fff] timestamp into named parts.
        'date_fixer': re.compile(r'^(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>[\d\.]+)$')
    }
}
DOMAINS_BIBLIOGRAPHIC = list(BIBLIO_SOURCES)
# MODS field name -> keyword used when building a resource from MODS data.
MODS2RESOURCES = {
    'publisher': 'publishers',
    'language': 'languages',
    'statement_of_responsibility': 'responsibility',
    'place': 'places',
    'issued_date': 'issued_dates',
    'uri': 'identifiers'
}
# Anchors whose text equals one of these strings are not considered.
ANCHOR_TEXT_IGNORE = [
    'contact us',
]
# Anchors whose href equals one of these strings are not considered.
ANCHOR_URLS_IGNORE = [
]
# Load the colon-prefix rules shipped with the package. Each entry maps a
# lower-cased, space-normalized title prefix to the list
# [omit_post, strip_title, mul_res]; check_colon() reads index 1 and
# allow_by_title() reads index 0, comparing each against the string 'yes'.
colon_prefix_csv = pkg_resources.resource_stream('isaw.awol', 'awol_colon_prefixes.csv')
try:
    dreader = unicodecsv.DictReader(
        colon_prefix_csv,
        fieldnames=[
            'col_pre',
            'omit_post',
            'strip_title',
            'mul_res'
        ],
        delimiter=',',
        quotechar='"')
    COLON_PREFIXES = dict()
    for row in dreader:
        COLON_PREFIXES[normalize_space(row['col_pre']).lower()] = [
            row['omit_post'],
            row['strip_title'],
            row['mul_res']
        ]
    del dreader
finally:
    # The stream was previously left open for the life of the process.
    colon_prefix_csv.close()
def check_colon(title):
    """Drop a known leading ``prefix:`` from *title* when its rule says to.

    The text before the first colon is lower-cased and looked up in
    COLON_PREFIXES. If the prefix is known and its second rule field is
    ``'yes'``, everything after the first colon is returned (passed through
    clean_string); in every other case the title is returned unchanged.
    """
    if ':' not in title:
        return title
    prefix, _, remainder = title.partition(':')
    rule = COLON_PREFIXES.get(prefix.lower())
    if rule is not None and rule[1] == 'yes':
        return clean_string(remainder)
    return title
# Titles (compared lower-cased) of posts that never yield resources.
OMIT_TITLES = [
    'administrative',
    'administrative note'
]
def allow_by_title(title):
    """Return False for titles that should not be turned into resources.

    A title is rejected when its lower-cased form is listed in OMIT_TITLES,
    or when the lower-cased text before its first colon is a key of
    COLON_PREFIXES whose first rule field is ``'yes'``. Everything else is
    allowed.
    """
    if title.lower() in OMIT_TITLES:
        return False
    if ':' not in title:
        return True
    rule = COLON_PREFIXES.get(title.partition(':')[0].lower())
    return not (rule is not None and rule[0] == 'yes')

# Patterns for spotting ISSN/ISBN mentions in free text. For each identifier
# type there are 'electronic' patterns (an explicit electronic/online marker
# next to the number), 'generic' patterns (no marker), and 'extract' patterns
# that isolate the number itself from a matched span.
RX_IDENTIFIERS = {
    'issn': {
        'electronic': [
            re.compile(r'(e-|e)(issn[\s:\-]*[\dX\-]{4}[\-\s]+[\dX]{4})', re.IGNORECASE),
            re.compile(r'(electronic|online|on-line|digital|internet)([\s:]*issn[^\d]*[\dX]{4}[\-\s]+[\dX]{4})', re.IGNORECASE),
            re.compile(r'(issn[\s\(\-]*)(electrónico|électronique|online|on-line|digital|internet)([^\d]*[\dX]{4}[\-\s]+[\dX]{4})', re.IGNORECASE),
            re.compile(r'(issn[^\d]*[\dX]{4}[\-\s]+[\dX]{4}[\s\(]*)(electrónico|électronique|online|on-line|digital)', re.IGNORECASE),
        ],
        'generic': [
            re.compile(r'(issn[^\d]*[\dX]{4}[\-\s]+[\dX]{4})', re.IGNORECASE),
            # Loose fallback: a run of 8 to 11 digit/X/hyphen/space characters.
            # The bounds must be written {8,11}; the former "{8-11}" is not a
            # quantifier and only matched the literal text "{8-11}".
            re.compile(r'(issn[^\d]*[\dX\-\s]{8,11})', re.IGNORECASE)
        ],
        'extract': {
            'precise': re.compile(r'^[^\d]*([\dX]{4}[\-\s]+[\dX]{4}).*$', re.IGNORECASE),
            'fallback': re.compile(r'^[^\d]*([\dX\-\s]+).*$', re.IGNORECASE)
        }
    },
    'isbn': {
        'electronic': [
            re.compile(r'(electronic|e-|online|on-line|digital)([\s:]*isbn[^\d]*[\dX\-]+)', re.IGNORECASE),
            re.compile(r'(isbn[\s\(]*)(electrónico|électronique|online|on-line|digital)([^\d]*[\dX\-]+)', re.IGNORECASE),
            re.compile(r'(isbn[^\d]*[\dX\-]+[\s\(]*)(electrónico|électronique|online|on-line|digital)', re.IGNORECASE),
        ],
        'generic': [
            re.compile(r'isbn[^\d]*[\dX\-]+', re.IGNORECASE),
        ],
        'extract': {
            'precise': re.compile(r'^[^\d]*([\dX\-]+).*$', re.IGNORECASE),
        }
    }
}

# Group 2 of each pattern captures the names that follow the cue phrase, up to
# (not including) the next full stop.
RX_AUTHORS = [
    re.compile(r'(compiled by |assembled by |created by |written by |authors?):?\s*([^\.]+)', re.IGNORECASE)
]
RX_EDITORS = [
    re.compile(r'(edited by |editors?):?\s*([^\.]+)', re.IGNORECASE)
]

# Load the title-substring -> tags table shipped with the package.
title_strings_csv = pkg_resources.resource_stream('isaw.awol', 'awol_title_strings.csv')
try:
    dreader = unicodecsv.DictReader(
        title_strings_csv,
        fieldnames=[
            'titles',
            'tags'
        ],
        delimiter=',',
        quotechar='"')
    TITLE_SUBSTRING_TAGS = dict()
    for row in dreader:
        TITLE_SUBSTRING_TAGS[row['titles']] = row['tags']
    del dreader
finally:
    # The stream was previously left open for the life of the process.
    title_strings_csv.close()
# Single-word entries (no space in the key) ...
TITLE_SUBSTRING_TERMS = {k: v for (k, v) in TITLE_SUBSTRING_TAGS.items() if ' ' not in k}
# ... and everything else. Membership is tested against the dict itself rather
# than a key list rebuilt for every item.
TITLE_SUBSTRING_PHRASES = {k: v for (k, v) in TITLE_SUBSTRING_TAGS.items() if k not in TITLE_SUBSTRING_TERMS}
RX_ANALYTIC_TITLES = [
    # volume, issue, year (e.g. Bd. 52, Nr. 1 (2005))
    {
        'rx': re.compile(r'^Bd\.\s+(\d+),?\s+Nr\.\s+(\d+)\s+\(?(\d{4})\)?$', re.IGNORECASE),
        'volume': 1,
        'issue': 2,
        'year': 3
    },
    # year, blah blah blah, then volume (e.g. u'1888 Mitteilungen des Deutschen Arch\xe4ologischen Instituts / R\xf6mische Abteilung Band 3')
    {
        'rx': re.compile(r'^(\d{4})[^\d]+Band (\d+)$', re.IGNORECASE),
        'volume': 2,
        'year': 1
    },
    # vol slash year (e.g. University Museums and Collections Journal 4/2011)
    {
        'rx': re.compile(r'^[^\d]*(\d+)\/(\d{4})[^\d]*$', re.IGNORECASE),
        'volume': 1,
        'year': 2,
    },
    # year, then volume
    {
        'rx': re.compile(r'^[^\d]*(\d{4})\W*([\d\-]+)[^\d]*$', re.IGNORECASE),
        'volume': 2,
        'year': 1
    },
    # volume, then year
    {
        'rx': re.compile(r'^[^\d]*([\d\-]{1-4})\W*(\d{4})[^\d]*$', re.IGNORECASE),
        'volume': 1,
        'year': 2
    },
    # year only
    {
        'rx': re.compile(r'^[^\d]*(\d{4})[^\d]*$', re.IGNORECASE),
        'year': 1,
    },
    # volume only
    {
        'rx': re.compile(r'^[^\d]*([\d\-]+)[^\d]*$', re.IGNORECASE),
        'volume': 1,
    },


]
RX_PUNCT_FIX = re.compile(r'\s+([\.,:;]{1})')
RX_PUNCT_DEDUPE = re.compile(r'([\.,:;]{1})([\.,:;]{1})')

def domain_from_url(url):
    """Return the text of *url* before its first ``/``, ignoring http(s) schemes.

    Every occurrence of ``http://`` and ``https://`` is removed from the
    string first; no other scheme is recognized.
    """
    stripped = url
    for scheme in ('http://', 'https://'):
        stripped = stripped.replace(scheme, '')
    host, _, _ = stripped.partition('/')
    return host

class AwolBaseParser:
    """Superclass to extract resource data from an AwolArticle."""

    # constructor
    def __init__(self):
        """Create a parser with empty per-article state (see reset())."""
        self.reset()

    # public methods
    def get_domains(self, content_soup=None):
        """Determine domains of resources linked in content."""

        #logger = logging.getLogger(sys._getframe().f_code.co_name)

        if content_soup is not None:
            self.reset(content_soup)

        c = self.content
        if c['domains'] is None:
            soup = c['soup']
            anchors = [a for a in soup.find_all('a')]
            urls = [a.get('href') for a in anchors if a.get('href') is not None]
            urls = list(set(urls))
            domains = [domain_from_url(url) for url in urls]
            domains = list(set(domains))
            domains = [domain for domain in domains if domain not in self.skip_domains]
            if len(domains) > 1:
                domains = [domain for domain in domains if domain not in self.bibliographic_domains]
            c['domains'] = domains
        return c['domains']

    def parse(self, article):
        """Reset state around ``article.soup`` and return the parsed resources.

        Returns whatever ``_get_resources`` returns for the article: a list of
        resources, or None when the article title is not allowed.
        """
        self.reset(article.soup)
        return self._get_resources(article)

    def reset(self, content_soup=None):
        """Discard cached per-article state and restore the default skip lists.

        ``self.content`` is replaced by a fresh dict whose ``'anchors'`` and
        ``'domains'`` caches are empty (None). The ``'soup'`` key is only
        present when ``content_soup`` is supplied. The skip lists are rebuilt
        as new lists from the module-level defaults.
        """
        state = {}
        if content_soup is not None:
            state['soup'] = content_soup
        state.update(anchors=None, domains=None)
        self.content = state

        # Fresh list objects, so later per-instance edits never touch the
        # module-level defaults.
        self.skip_domains = DOMAINS_IGNORE + DOMAINS_SELF
        self.bibliographic_domains = list(DOMAINS_BIBLIOGRAPHIC)
        self.skip_text = list(ANCHOR_TEXT_IGNORE)
        self.skip_urls = list(ANCHOR_URLS_IGNORE)

    # private methods
    def _consider_anchor(self, a):
        url = a.get('href')
        if url is not None:
            text = a.get_text()
            if len(text) > 0:
                domain = domain_from_url(url)
                if (domain in self.skip_domains
                or url in self.skip_urls
                or text in self.skip_text):
                    pass
                else:
                    return True
            else:
                pass
        else:
            pass
        return False

    def _filter_anchors(self, anchors):
        filtered = [a for a in anchors if self._consider_anchor(a)]
        return filtered

    def _get_anchor_ancestor_for_title(self, anchor):

        a = anchor
        url = a.get('href')
        parent = a.find_parent('li')
        if parent is not None:
            anchor_ancestor = parent
        else:
            previous_parent = a
            parent = a.parent
            while parent is not None and len([a for a in parent.find_all('a') if a.get('href') != url]) > 0:
                prevous_parent = parent
                parent = parent.parent
            if previous_parent.name == 'body':
                anchor_ancestor = anchor
            else:
                anchor_ancestor = previous_parent
        return anchor_ancestor

    def _get_anchors(self):
        c = self.content
        if c['anchors'] is not None:
            return c['anchors']
        soup = c['soup']
        raw_anchors = [a for a in soup.find_all('a') if a.find_previous('a', href=a.get('href')) is None]
        anchors = self._filter_anchors(raw_anchors)
        c['anchors'] = anchors
        return anchors

    def _get_description(self, context=None, title=''):
        """Collect descriptive text that starts at a given node.

        Args:
            context: node at which collection starts. When None, collection
                starts at the first child of the soup's ``body`` and
                ``skip_first_anchor`` is set to True.
            title: resource title; only used to detect a first-pass result
                that merely repeats the title (compared through ``ukey``).

        Returns:
            The cleaned-up description, always ending in ``'.'``, or None when
            no text could be collected.
        """
        if context is None:
            c = self.content
            soup = c['soup']
            first_node = soup.body.contents[0]
            skip_first_anchor = True
        else:
            first_node = context
            skip_first_anchor = False

        def digdigdig(this_node, first_node, stop_tags, skip_first_anchor, previous_urls):
            """Recursively collect text fragments from this_node and its children.

            Returns a ``(stop, fragments)`` tuple; ``stop`` is True once a
            stop tag has been reached, which tells the caller to stop
            collecting. ``previous_urls`` is appended to in place whenever an
            anchor is visited.
            """
            node_type = type(this_node)
            node_name = this_node.name
            try:
                node_url = this_node.get('href')
            except AttributeError:
                node_url = ''
            if node_url is None:
                node_url = ''
            # Drop a trailing index page or empty last path segment so that
            # equivalent URLs compare equal below.
            if '/' in node_url:
                chunks = node_url.split('/')
                if chunks[-1] in ['index.html', 'index.php', '', None]:
                    node_url = '/'.join(chunks[:-1])
            results = []
            # Stop at any stop tag other than the starting node. Anchors are a
            # special case: with skip_first_anchor an anchor stops collection
            # only when no anchor has been visited yet; without it, only when
            # it points somewhere other than the most recently visited anchor.
            # NOTE(review): skiptomalou() below applies the opposite
            # skip_first_anchor sense to sibling anchors -- confirm that the
            # asymmetry is intentional.
            if (
                this_node != first_node
                and node_name in stop_tags
                and (
                    node_name != 'a'
                    or (
                        'a' in stop_tags
                        and node_name == 'a'
                        and (
                                (
                                skip_first_anchor
                                and len(previous_urls) == 0
                                )
                            or (
                                not(skip_first_anchor)
                                and len(previous_urls) > 0
                                and node_url != previous_urls[-1]
                                )
                            )
                        )
                    )
                ):
                return (True, results)
            if node_name == 'a':
                previous_urls.append(node_url)
            try:
                previous_text = normalize_space(this_node.previous_sibling.get_text())
            except AttributeError:
                previous_text = ''
            # Last character of the preceding sibling's text ('' if it has none).
            try:
                previous_last = previous_text[-1]
            except IndexError:
                previous_last = previous_text
            # A line break becomes a sentence break unless the preceding text
            # already ends with a full stop.
            if node_name == 'br' and previous_last != '.':
                results.append('. ')
            # Exact-type check: only plain NavigableString nodes contribute text.
            if node_type == NavigableString:
                results.append(str(this_node))
            else:
                try:
                    descendants = this_node.descendants
                except AttributeError:
                    pass
                else:
                    if descendants is not None:
                        for child in this_node.children:
                            stop, child_results = digdigdig(child, first_node, stop_tags, skip_first_anchor, previous_urls)
                            results.extend(child_results)
                            if stop:
                                return (stop, results)
            return (False, results)

        def skiptomalou(first_node, stop_tags, skip_first_anchor):
            """Collect fragments from first_node and then from its following siblings.

            Collection ends at the last sibling, at a sibling that is itself a
            stop tag (subject to the anchor rules below), or as soon as
            digdigdig() reports a stop inside a sibling.
            """
            previous_urls = []
            stop, desc_lines = digdigdig(first_node, first_node, stop_tags, skip_first_anchor, previous_urls)
            node = first_node
            while True:
                previous_node = node
                node = node.next_sibling
                if node is None:
                    break
                try:
                    node_name = node.name
                except AttributeError:
                    node_name = type(node)
                try:
                    node_url = node.get('href')
                except AttributeError:
                    node_url = ''
                if node_url is None:
                    node_url = ''
                # Same trailing-segment normalization as in digdigdig().
                if '/' in node_url:
                    chunks = node_url.split('/')
                    if chunks[-1] in ['index.html', 'index.php', '', None]:
                        node_url = '/'.join(chunks[:-1])
                # Sibling-level stop test; note that the skip_first_anchor
                # sense is the reverse of the one used inside digdigdig().
                if (
                    node_name in stop_tags
                    and (
                        node_name != 'a'
                        or (
                            'a' in stop_tags
                            and node_name == 'a'
                            and (
                                    (
                                    not(skip_first_anchor)
                                    and len(previous_urls) == 0
                                    )
                                or (
                                    skip_first_anchor
                                    and len(previous_urls) > 0
                                    and node_url != previous_urls[-1]
                                    )
                                )
                            )
                        )
                    ):
                    break
                if node_name == 'a':
                    previous_urls.append(node_url)
                stop, results = digdigdig(node, first_node, stop_tags, skip_first_anchor, previous_urls)
                desc_lines.extend(results)
                if stop:
                    break
            return desc_lines

        # First pass: stop at anchors, headings and block-level containers.
        stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'ol', 'ul', 'dl', 'dt', 'li', 'table']
        desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)

        # Second pass, with anchors and headings as the only stop tags: run
        # when the first pass found nothing (skip_first_anchor forced off) or
        # when its result compares equal to the title under ukey().
        # NOTE(review): ukey() is not defined in this module (presumably it
        # comes from the clean_string star import) and is given a list here
        # but a string for the title -- confirm it accepts both.
        stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        if len(desc_lines) == 0:
            desc_lines = skiptomalou(first_node, stop_tags, False)
        elif ukey(desc_lines) == ukey(title):
            desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)
        if len(desc_lines) == 0:
            desc_text = None
        else:
            # NOTE(review): the deduplicate_lines() result is overwritten by
            # the very next statement, so it never reaches the output; decide
            # which of the two assignments is the intended one.
            desc_text = deduplicate_lines('\n'.join(desc_lines))
            desc_text = ''.join(desc_lines)
            if len(desc_text) == 0:
                desc_text = None
            else:
                desc_text = desc_text.replace('%IMAGEREPLACED%', '').strip()
                # Remove whitespace in front of punctuation.
                desc_text = RX_PUNCT_FIX.sub(r'\1', desc_text)
                desc_text = deduplicate_sentences(desc_text)
                # Collapse a pair of punctuation marks to the first of the two.
                desc_text = RX_PUNCT_DEDUPE.sub(r'\1', desc_text)
                desc_text = normalize_space(desc_text)
                if len(desc_text) == 0:
                    desc_text = None
                elif desc_text[-1] != '.':
                    desc_text += '.'

        return desc_text

    def _get_language(self, *args):
        """Guess the language of the given text chunks.

        The non-None arguments are joined with spaces and space-normalized;
        the classifier's answer is returned only when its score reaches
        LANGID_THRESHOLD. Returns None for empty text or a low score.
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)
        text = normalize_space(' '.join(part for part in args if part is not None))
        logger.debug('s: \n"{}\n'.format(text.encode('utf-8')))
        if text == '':
            return None
        guess = LANGUAGE_IDENTIFIER.classify(text)
        logger.debug(repr(guess))
        # guess is indexed as (language, score).
        if guess[1] >= LANGID_THRESHOLD:
            return guess[0]
        return None

    def _get_next_valid_url(self, anchor):
        logger = logging.getLogger(sys._getframe().f_code.co_name)
        a = anchor
        while a is not None:
            logger.debug('anchor text: {0}'.format(repr(a.get_text())))
            if a.get_text() != '':
                try:
                    url = a.get('href')
                except AttributeError:
                    url = None
                else:
                    domain = domain_from_url(url)
                    if domain not in self.skip_domains:
                        break
            a = a.find_next('a')
        if a is None:
            raise ValueError('could not find valid self-or-subsequent resource anchor')
        return (anchor, a, url, domain)

    def _get_primary_anchor(self):
        anchors = self._get_anchors()
        try:
            a = self._get_anchors()[0]
        except IndexError:
            msg = 'failed to parse primary anchor from {0}'.format(self.content['soup'])
            raise RuntimeError(msg)
        return a

    def _get_primary_resource(self, article):
        """Build the resource described by the article's primary anchor.

        The title comes from reconciling the anchor text with the article
        title; the description falls back to the title when none can be
        extracted. Optional fields are passed to ``_make_resource`` only when
        they carry a value, and provenance is recorded against the article.

        Raises:
            IndexError: title reconciliation produced no title.
        """
        anchor = self._get_primary_anchor()

        # title (and optional extended title)
        anchor_title = clean_string(anchor.get_text())
        titles = self._reconcile_titles(anchor_title, article.title)
        try:
            title = titles[0]
        except IndexError:
            raise IndexError('could not extract resource title')
        title_extended = titles[1] if len(titles) > 1 else None

        # description, falling back to the title
        desc_text = self._get_description(title=title)
        if desc_text is None:
            desc_text = title

        # everything derived from the texts gathered above
        authors = self._parse_authors(desc_text)
        identifiers = self._parse_identifiers(desc_text)
        language = self._get_language(title, title_extended, desc_text)
        keywords = self._parse_keywords(article.title, titles[-1], article.categories)

        # mandatory fields first, then each optional field that has a value
        href = anchor.get('href')
        params = {
            'url': href,
            'domain': domain_from_url(href),
            'title': title
        }
        optional = (
            ('description', desc_text, desc_text is not None),
            ('authors', authors, len(authors) > 0),
            ('identifiers', identifiers, len(identifiers.keys()) > 0),
            ('title_extended', title_extended, title_extended is not None),
            ('languages', language, language is not None),
            ('keywords', keywords, len(keywords) > 0),
        )
        for key, value, wanted in optional:
            if wanted:
                params[key] = value
        resource = self._make_resource(**params)

        # provenance
        self._set_provenance(resource, article)

        return resource

    def _get_related_resources(self):
        resources = []
        anchors = self._get_anchors()[1:]
        anchors = [a for a in anchors if domain_from_url(a.get('href')) in DOMAINS_SELF]
        for a in anchors:
            # title
            title_context = self._get_anchor_ancestor_for_title(a)
            title = clean_string(title_context.get_text())

            # description
            next_node = title_context.next_element
            desc_text = self._get_description(next_node, title=title)

            # parse identifiers
            identifiers = self._parse_identifiers(desc_text)

            # language
            language = self._get_language(title, desc_text)

            # determine keywords
            keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

            # create and populate the resource object
            r = Resource()
            params = {
                'url': a.get('href'),
                'domain': a.get('href').replace('http://', '').replace('https://', '').split('/')[0],
                'title': title
            }
            if desc_text is not None:
                params['description'] = desc_text
            if len(list(identifiers.keys())) > 0:
                params['identifiers'] = identifiers
            if language is not None:
                params['languages'] = language
            if len(keywords) > 0:
                params['keywords'] = keywords
            resource = self._make_resource(**params)
            resources.append(resource)
        return resources

    def _nodesplain(self, node, gloss='', include_source=False):
        """Provide copious information about this XML node.

        Args:
            node: the node to describe.
            gloss: free-text remark included in the report.
            include_source: when True, the report also contains prettified
                markup of an ancestor of the node.

        Returns:
            A multi-line report: node type, name, positional path, attributes,
            text, gloss and (optionally) source.
        """
        template = """
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    >>> NODESPLANATION <<<
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    type: {node_type}
    name: {name}
    xpath: /{xpath}
    attributes: {attributes}
    text: {text}
    gloss: {gloss}
    source: {source}
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        """

        def step(element):
            # "name[n]", where n is the 1-based position among the preceding
            # siblings that share the element's name.
            same_named = [t for t in element.previous_siblings if t.name == element.name]
            return '{name}[{count}]'.format(name=element.name, count=str(1 + len(same_named)))

        name = node.name
        try:
            text = normalize_space(' '.join([string for string in node.stripped_strings]))
        except AttributeError:
            text = 'None'
        try:
            attributes = pprint.pformat(node.attrs)
        except AttributeError:
            attributes = 'None'

        ancestors = list(node.parents)
        path = [step(parent) for parent in reversed(ancestors)] + [step(node)]

        # Only touch the ancestors for source when it was asked for; the
        # lookup used to run unconditionally and raised IndexError for nodes
        # with fewer than two ancestors.
        # NOTE(review): index 1 (the second-nearest ancestor) is kept from the
        # original code, where the variable was called "root" -- confirm
        # whether the outermost ancestor was intended.
        source = ''
        if include_source and ancestors:
            source_node = ancestors[1] if len(ancestors) > 1 else ancestors[0]
            source = source_node.prettify()

        params = {
            'node_type': type(node),
            'name': name,
            'xpath': '/'.join(path),
            'attributes': attributes,
            'text': text,
            'gloss': gloss,
            'source': source
        }
        return template.format(**params)

    def _get_resources(self, article):
        """Extract all resources from an article.

        Returns:
            None when the article title is rejected by ``allow_by_title``;
            otherwise a list holding the primary resource followed by its
            subordinate and related resources. The primary resource also
            receives the packaged form of each subordinate and related
            resource.
        """
        if not allow_by_title(article.title):
            return None

        primary_resource = self._get_primary_resource(article)
        parent = primary_resource.package()
        identifiers = primary_resource.identifiers
        if len(identifiers.keys()) > 0:
            # Give the parent package one identifier, preferring an
            # electronic ISSN, then a generic ISSN, then an ISBN.
            try:
                parent['issn'] = identifiers['issn']['electronic'][0]
            except KeyError:
                try:
                    parent['issn'] = identifiers['issn']['generic'][0]
                except KeyError:
                    try:
                        parent['isbn'] = identifiers['isbn'][0]
                    except KeyError:
                        pass

        subs = self._get_subordinate_resources(article, primary_resource.package())
        for sr in subs:
            sr.is_part_of = parent
            primary_resource.subordinate_resources.append(sr.package())

        rels = self._get_related_resources()
        for rr in rels:
            # Was "related_resources(append(...))", which raised NameError on
            # the undefined name "append" as soon as a related resource existed.
            primary_resource.related_resources.append(rr.package())

        return [primary_resource,] + subs + rels

    def _get_resource_from_article(self, article, anchor, context=None):
        """Build a resource for ``anchor`` using the article's own content.

        The title comes from reconciling the anchor text with the article
        title. The description is collected starting at ``context`` (see
        ``_get_description``) and falls back to the title, with a warning,
        when nothing can be extracted. Optional fields are passed to
        ``_make_resource`` only when they carry a value, and provenance is
        recorded against the article.

        Raises:
            IndexError: title reconciliation produced no title.
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)

        # title (and optional extended title)
        anchor_title = clean_string(anchor.get_text())
        titles = self._reconcile_titles(anchor_title, article.title)
        try:
            title = titles[0]
        except IndexError:
            raise IndexError('could not extract resource title')
        title_extended = titles[1] if len(titles) > 1 else None

        # description, falling back to the title
        desc_text = self._get_description(context, title=title)
        if desc_text is None:
            logger.warning('could not extract primary resource description from {0}; using title'.format(article.url))
            desc_text = title

        # everything derived from the texts gathered above
        authors = self._parse_authors(desc_text)
        identifiers = self._parse_identifiers(desc_text)
        language = self._get_language(title, title_extended, desc_text)
        keywords = self._parse_keywords(article.title, titles[-1], article.categories)

        # mandatory fields first, then each optional field that has a value
        href = anchor.get('href')
        params = {
            'url': href,
            'domain': domain_from_url(href),
            'title': title
        }
        optional = (
            ('description', desc_text, desc_text is not None),
            ('identifiers', identifiers, len(identifiers.keys()) > 0),
            ('authors', authors, len(authors) > 0),
            ('title_extended', title_extended, title_extended is not None),
            ('languages', language, language is not None),
            ('keywords', keywords, len(keywords) > 0),
        )
        for key, value, wanted in optional:
            if wanted:
                params[key] = value
        resource = self._make_resource(**params)

        # provenance
        self._set_provenance(resource, article)

        return resource

    def _get_resource_from_external_biblio(self, url):
        """Attempt to get third-party structured bibliographic data.

        The recipe for the URL's domain is looked up in BIBLIO_SOURCES, the
        structured record is fetched over HTTP, its payload is extracted and
        converted into a resource, and provenance is recorded against the
        fetched URL.

        Args:
            url: link to a bibliographic record on a supported domain.

        Returns:
            The resource built from the fetched record.

        Raises:
            NotImplementedError: the domain has no recipe, the URL does not
                look like a record URL for its domain, or the recipe's payload
                type cannot be parsed.
            IOError: the request did not return status 200, or the response
                has an unexpected or unsupported content type.
            Exception: the record carries more than one URL.
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)
        domain = domain_from_url(url)

        try:
            biblio_howto = BIBLIO_SOURCES[domain]
        except KeyError:
            msg = 'parsing structured bibliographic data from {0} is not supported.'.format(domain)
            raise NotImplementedError(msg)

        # A URL on a supported domain that is not a record URL used to fall
        # through to the final return and fail with UnboundLocalError.
        if not biblio_howto['url_pattern'].match(url):
            msg = 'parsing structured bibliographic data from {0} is not supported.'.format(url)
            raise NotImplementedError(msg)

        # NOTE: in every message below the literal pieces are joined before
        # .format() is applied; previously .format() bound to the last piece
        # only, leaving raw "{...}" placeholders in the first piece.
        biblio_url = url + biblio_howto['url_append']
        biblio_req = requests.get(biblio_url)
        if biblio_req.status_code != 200:
            raise IOError(
                'unsuccessfull attempt (status code {0}) '
                'to get bibliograhic data from {1}'.format(
                    biblio_req.status_code, biblio_url))

        actual_type = biblio_req.headers['content-type']
        if actual_type != biblio_howto['type']:
            raise IOError(
                'got {actualtype} from {biblurl} when '
                '{soughttype} was expected'.format(
                    actualtype=actual_type,
                    biblurl=biblio_url,
                    soughttype=biblio_howto['type']))
        if actual_type != 'application/rdf+xml':
            raise IOError(
                'parsing content of type {actualtype} '
                'is not supported'.format(
                    actualtype=actual_type))
        root = etree.fromstring(biblio_req.content)
        payload_element = root.xpath(
            biblio_howto['payload_xpath'],
            namespaces=biblio_howto['namespaces'])[0]
        payload = etree.tostring(payload_element, encoding='unicode')

        payload_type = biblio_howto['payload_type']
        if payload_type != 'application/mods+xml':
            raise NotImplementedError(
                'parsing payload of type {payloadtype} '
                'is not supported'.format(
                    payloadtype=payload_type))
        biblio_data = mods.extract(payload)

        # Translate the extracted fields into resource keyword arguments.
        record_only = ('record_change_date', 'record_creation_date', 'name')
        params = {}
        for k in biblio_data.keys():
            if k in record_only:
                continue
            if k == 'uri':
                value = (k, biblio_data[k])
            elif k == 'language':
                value = [lang[0] for lang in biblio_data[k]]
            elif k == 'url':
                value = biblio_data[k][0]
                if len(biblio_data[k]) > 1:
                    raise Exception(
                        'expected a single url in bibliographic data from {0}, got {1}'.format(
                            biblio_url, len(biblio_data[k])))
            else:
                value = biblio_data[k]
            params[MODS2RESOURCES.get(k, k)] = value
        params['domain'] = domain_from_url(biblio_data['url'][0])
        top_resource = self._make_resource(**params)

        # Provenance timestamp: prefer the record's change date.
        try:
            updated = biblio_data['record_change_date'][0]
        except KeyError:
            updated = biblio_data['record_creation_date'][0]
        rx = biblio_howto.get('date_fixer')
        if rx is not None:
            m = rx.match(updated)
            if m:
                d = {}
                for k in ['year', 'month', 'day', 'hour', 'minute', 'second']:
                    d[k] = m.group(k)
                logger.debug(d)
                updated = '{year}-{month}-{day}T{hour}:{minute}:{second}'.format(**d)
        resource_fields = sorted([k for k in params.keys() if '_' != k[0]])
        top_resource.set_provenance(biblio_url, 'citesAsDataSource', updated, resource_fields)
        if domain == 'zenon.dainst.org':
            top_resource.zenon_id = url.split('/')[-1]
        return top_resource

    def _get_subordinate_resources(self, article, parent_package, start_anchor=None):
        """Build one resource per anchor that points into the parent's domain.

        Args:
            article: the article being parsed; handed to ``_set_provenance``.
            parent_package: summary dict of the parent resource. Its ``'url'``
                entry determines the domain filter, and the whole dict is
                stored as ``is_part_of`` on every resource created here.
            start_anchor: optional anchor at which to begin. Anchors before
                its first occurrence are ignored; if it never occurs, every
                anchor is considered.

        Returns:
            A list of resources, in anchor order.
        """
        anchors = self._get_anchors()
        if start_anchor is not None:
            for i, a in enumerate(anchors):
                if a == start_anchor:
                    anchors = anchors[i:]
                    break

        parent_domain = domain_from_url(parent_package['url'])
        # Anchors without an href (e.g. named anchors) cannot point into the
        # parent domain, and testing ``in None`` would raise TypeError.
        anchors = [
            a for a in anchors
            if a.get('href') is not None and parent_domain in a.get('href')
        ]

        resources = []
        for a in anchors:
            href = a.get('href')

            # title
            title_context = self._get_anchor_ancestor_for_title(a)
            title = clean_string(title_context.get_text(' '))

            # try to extract volume and year
            try:
                volume, issue, year = self._grok_analytic_title(title)
            except TypeError:
                # _grok_analytic_title() returned None: nothing recognised
                volume = year = issue = None
            if volume is not None and year is None and issue is not None:
                # sometimes more than one volume falls in a single list item b/c same year or parts
                try:
                    parent_li = a.find_parents('li')[0]
                except IndexError:
                    # the anchor is not inside a list item
                    pass
                else:
                    raw = parent_li.get_text().strip()[0:4]
                    try:
                        cooked = str(int(raw))
                    except ValueError:
                        pass
                    else:
                        # only accept a clean four-character integer
                        # (rejects e.g. ' 12' or '+123')
                        if cooked == raw:
                            year = cooked

            # description
            next_node = title_context.next_sibling
            desc_text = self._get_description(next_node, title=title)

            # parse identifiers
            identifiers = self._parse_identifiers(desc_text)

            # language
            language = self._get_language(title, desc_text)

            # determine keywords
            keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

            # create and populate the resource object
            params = {
                'url': href,
                'domain': href.replace('http://', '').replace('https://', '').split('/')[0],
                'title': title,
                'is_part_of': parent_package
            }
            if desc_text is not None:
                params['description'] = desc_text
            if identifiers:
                params['identifiers'] = identifiers
            if language is not None:
                params['languages'] = language
            if keywords:
                params['keywords'] = keywords
            if volume is not None:
                params['volume'] = volume
            if year is not None:
                params['year'] = year
            if issue is not None:
                params['issue'] = issue
            resource = self._make_resource(**params)

            self._set_provenance(resource, article)

            resources.append(resource)
        return resources

    def _get_unique_urls(self):
        c = self.content
        if c['unique_urls'] is not None:
            return c['unique_urls']
        else:
            anchors = self._get_anchors()
        urls = [a.get('href') for a in anchors if a.get('href') is not None]
        unique_urls = list(set(urls))
        c['unique_urls'] = unique_urls
        return unique_urls

    def _grok_analytic_title(self, title):
        """Extract volume, issue, and year from an analytic title.

        Each entry of RX_ANALYTIC_TITLES is tried in order; the first whose
        ``'rx'`` pattern matches at the start of ``title`` is used. The
        entry's ``'volume'``, ``'issue'`` and ``'year'`` items name the match
        groups to read; an entry that lacks one of those items yields None
        for that component.

        Args:
            title: the title string to analyse.

        Returns:
            A ``(volume, issue, year)`` tuple, or None when no pattern
            matches. Callers in this class unpack the result and treat the
            resulting TypeError as "nothing recognised".
        """
        for spec in RX_ANALYTIC_TITLES:
            m = spec['rx'].match(title)
            if m is not None:
                break
        else:
            # nothing matched, or there are no patterns at all
            return None

        def group_or_none(field):
            # a spec without this field does not capture that component
            try:
                return m.group(spec[field])
            except KeyError:
                return None

        return (group_or_none('volume'), group_or_none('issue'), group_or_none('year'))

    # keyword methods
    def _mine_keywords(self, *args):
        tags = []
        for s in args:
            if s is not None:
                lower_s = s.lower()
                # mine for terms (i.e., single-word keys)
                lower_list = list(set(lower_s.split()))
                for k in list(TITLE_SUBSTRING_TERMS.keys()):
                    if k in lower_list:
                        tag = TITLE_SUBSTRING_TERMS[k]
                        tags.append(tag)
                if 'open' in lower_list and 'access' in lower_list:
                    if 'partial' in lower_list:
                        if 'partial open access' in lower_s:
                            tags.append('mixed access')
                    else:
                        if 'open access' in lower_s:
                            tags.append('open access')
                if 'series' in lower_list and 'lecture' not in lower_list:
                    tags.append('series')
                # mine for phrases
                for k in list(TITLE_SUBSTRING_PHRASES.keys()):
                    if k in lower_s:
                        tag = TITLE_SUBSTRING_PHRASES[k]
                        tags.append(tag)
        return tags

    def _parse_keywords(self, post_title=None, resource_title=None, post_categories=[], resource_text=None):
        """Infer and normalize resource tags."""

        logger = logging.getLogger(sys._getframe().f_code.co_name)

        # mine keywords from content
        tags = self._mine_keywords(post_title, resource_title)

        # convert post categories to tags
        for c in post_categories:
            tag = c['term'].lower()
            if 'kind#post' not in tag:
                if tag in list(TITLE_SUBSTRING_TAGS.keys()):
                    tag = TITLE_SUBSTRING_TAGS[tag]
                else:
                    logger.error('unexpected category tag "{0}" in post with title "{1}"'.format(c['term'], post_title))
                    raise Exception
                tags.append(tag)
        return self._clean_keywords(tags)

    def _clean_keywords(self, raw_tags):
        tags = list(set(raw_tags))
        keywords = []
        for tag in tags:
            if tag == '':
                pass
            elif ',' in tag:
                keywords.extend(tag.split(','))
            else:
                keywords.append(tag)
        keywords = sorted([normalize_space(kw) for kw in list(set(keywords))], key=lambda s: s.lower())
        for tag in keywords:
            if tag == tag.upper():
                pass
            elif tag.lower() in list(TITLE_SUBSTRING_TAGS.keys()):
                pass
            elif tag != tag.lower():
                raise ValueError('keyword "{0}" lacks an appropriate entry in awol_title_strings.csv'.format(tag))
        return list(set(keywords))

    def _make_resource(self, **kwargs):
        """Create a Resource and populate it from keyword arguments.

        Each keyword must name an existing Resource attribute; arguments
        whose value is None are skipped. How a value is stored depends on the
        attribute's current (default) value:

        * None: a dict, or the ``url`` string, is stored as given; any other
          value is treated as a sequence whose first item is stored.
        * list: the incoming items are appended to a copy of the list.
        * dict: a ``(key, value)`` tuple sets one entry; a dict is merged in.

        Strings other than ``url`` are wrapped in a one-item list first, and
        values that are not exactly list/str/tuple/dict are converted with
        ``list()``.

        Raises:
            AttributeError: if a keyword is not a Resource attribute.
            Exception: if more than one value is supplied for a scalar
                attribute (the first value has already been stored).
            RuntimeError: for any combination not covered above.
        """
        res = Resource()
        for field, raw in kwargs.items():
            if raw is None:
                continue

            # coerce the incoming value into the shape handled below
            kind = type(raw)
            if kind is str and field != 'url':
                incoming = [raw, ]
            elif kind in (list, str, tuple, dict):
                incoming = raw
            else:
                incoming = list(raw)

            try:
                current = getattr(res, field)
            except AttributeError:
                raise AttributeError('{k} is not a valid attribute for a resource'.format(k=field))

            if current is None:
                if type(incoming) in (str, dict):
                    setattr(res, field, incoming)
                else:
                    setattr(res, field, incoming[0])
                    if len(incoming) > 1:
                        raise Exception('rats')
            elif type(current) is list:
                extended = deepcopy(current)
                extended.extend(incoming)
                setattr(res, field, extended)
            elif type(current) is dict and type(incoming) is tuple:
                updated = deepcopy(current)
                updated[incoming[0]] = incoming[1]
                setattr(res, field, updated)
            elif type(current) is dict and type(incoming) is dict:
                updated = deepcopy(current)
                for key in incoming:
                    updated[key] = incoming[key]
                setattr(res, field, updated)
            else:
                raise RuntimeError('undefined error in _make_resource()')

        return res

    def _parse_authors(self, content_text):
        """Return the strings _parse_peeps() extracts using the RX_AUTHORS patterns.

        RX_AUTHORS is defined elsewhere in this module; presumably the
        patterns capture author names.
        """
        return self._parse_peeps(RX_AUTHORS, content_text)

    def _parse_editors(self, content_text):
        """Return the strings _parse_peeps() extracts using the RX_EDITORS patterns.

        RX_EDITORS is defined elsewhere in this module; presumably the
        patterns capture editor names.
        """
        return self._parse_peeps(RX_EDITORS, content_text)

    def _parse_identifiers(self, content_text):
        """Parse identifying strings of interest from an AWOL blog post."""

        logger = logging.getLogger(sys._getframe().f_code.co_name)

        identifiers = {}
        if content_text == None:
            return identifiers
        text = content_text.lower()
        words = list(set(text.split()))

        def get_candidates(k, kk, text):
            candidates = []
            rexx = RX_IDENTIFIERS[k]
            for rx in rexx[kk]:
                candidates.extend([''.join(groups) for groups in rx.findall(text)])
            if len(candidates) > 1:
                candidates = list(set(candidates))
            return candidates

        def extract(k, text):
            m = RX_IDENTIFIERS[k]['extract']['precise'].match(text)
            if m is not None:
                if len(m.groups()) == 1:
                    return m.groups()[0]
            else:
                try:
                    m = RX_IDENTIFIERS[k]['extract']['fallback'].match(text)
                except KeyError:
                    pass
                else:
                    if m is not None:
                        if len(m.groups()) == 1:
                            return m.groups()[0]
            raise Exception

        for k in list(RX_IDENTIFIERS.keys()):
            if k in ' '.join(words):
                if k not in list(identifiers.keys()):
                    identifiers[k] = {}
                for kk in ['electronic', 'generic']:
                    candidates = get_candidates(k, kk, text)
                    if len(candidates) > 0:
                        identifiers[k][kk] = []
                        for candidate in candidates:
                            extraction = extract(k, candidate)
                            identifiers[k][kk].append(extraction)
                        if len(identifiers[k][kk]) > 1:
                            identifiers[k][kk] = list(set(identifiers[k][kk]))
                if len(list(identifiers[k].keys())) == 0:
                    logger.error('expected but failed to match valid issn in {0}'.format(text))
                # regularize presentation form and deduplicate issns
                if k == 'issn':
                    try:
                        identifiers[k]['electronic'] = [issn.replace(' ', '-').upper() for issn in identifiers[k]['electronic']]
                    except KeyError:
                        pass
                    try:
                        identifiers[k]['generic'] = [issn.replace(' ', '-').upper() for issn in identifiers[k]['generic']]
                    except KeyError:
                        pass
                    if 'electronic' in list(identifiers[k].keys()) and 'generic' in list(identifiers[k].keys()):
                        for ident in identifiers[k]['generic']:
                            if ident in identifiers[k]['electronic']:
                                identifiers[k]['generic'].remove(ident)
                        if len(identifiers[k]['generic']) == 0:
                            del identifiers[k]['generic']
        return identifiers

    def _parse_peeps(self, rx_list, content_text):

        cooked = []
        raw = ''
        for rx in rx_list:
            m = rx.search(content_text)
            if m:
                raw = m.groups()[-1]
                break
        if len(raw) > 0:
            if ',' in raw:
                cracked = raw.split(',')
            else:
                cracked = [raw,]
            for chunk in cracked:
                if ' and ' in chunk:
                    cooked.extend(chunk.split(' and '))
                else:
                    cooked.append(chunk)
            cooked = [normalize_space(peep) for peep in cooked if len(normalize_space(peep)) > 0]
        return cooked

    def _reconcile_titles(self, anchor_title=None, article_title=None):

        if anchor_title is None and article_title is None:
            return None
        if anchor_title is None:
            return (check_colon(article_title),)
        if article_title is None:
            return (check_colon,)
        anchor_lower = anchor_title.lower()
        article_lower = article_title.lower()
        if anchor_lower == article_lower:
            return (article_title,)
        clean_article_title = check_colon(article_title)
        clean_article_lower = clean_article_title.lower()
        if clean_article_lower == anchor_lower:
            return (anchor_title,)
        elif clean_article_lower in anchor_lower:
            return (clean_article_title, anchor_title)
        else:
            return (anchor_title,)

    def _set_provenance(self, resource, article, fields=None):
        updated = article.root.xpath("//*[local-name()='updated']")[0].text.strip()
        if fields is None:
            resource_fields = sorted([k for k in list(resource.__dict__.keys()) if '_' != k[0]])
        else:
            resource_fields = fields
        resource.set_provenance(article.id, 'citesAsDataSource', updated, resource_fields)
        resource.set_provenance(article.url, 'citesAsMetadataDocument', updated)



In [None]:
# awol_parse_oi.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse HTML content for resources from the OI content aggregator.

This module defines the following classes:

 * Parser: parse AWOL blog post content for resources on OI
"""

import logging
import sys

from isaw.awol.parse.awol_parse_domain import AwolDomainParser
from isaw.awol.parse.awol_parse import domain_from_url

# Domains whose anchors Parser._get_primary_anchor() passes over while
# searching for the primary anchor.
NEVER_PRIMARY_DOMAINS = [
    'www.oxbowbooks.com',
]
# Exact URLs that Parser._get_primary_anchor() passes over while searching
# for the primary anchor.
MY_SKIP_URLS = [
    'http://oi.uchicago.edu/news/',
]

class Parser(AwolDomainParser):
    """Extract data from an AWOL blog post about content on OI."""

    def __init__(self):
        """Set the domain this parser handles, then initialise the base parser."""
        self.domain = 'oi.uchicago.edu'
        AwolDomainParser.__init__(self)

    def _get_primary_anchor(self):
        """Deal with OI peculiarities.

        Returns the first anchor whose domain is not listed in
        NEVER_PRIMARY_DOMAINS and whose URL is not listed in MY_SKIP_URLS.

        Returns:
            The selected anchor, or None when the post has no anchors.

        NOTE(review): when every anchor is excluded, the last anchor is
        returned anyway. That matches the existing behaviour; confirm
        whether it is intended.
        """
        # Bound up front so that a post without anchors yields None instead
        # of raising UnboundLocalError at the return statement.
        pa = None
        for pa in AwolDomainParser._get_anchors(self):
            url = pa.get('href')
            domain = domain_from_url(url)
            if domain not in NEVER_PRIMARY_DOMAINS and url not in MY_SKIP_URLS:
                break
        return pa


In [None]:
# resource.py post 2to3 refactoring
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Define classes and methods for working with resources extracted from blog.

This module defines the following classes:

 * Resource: Extracts and represents key information about a web resource.
"""

import copy
import datetime
import io
import json
import logging
import pprint
import sys

from wikidata_suggest import suggest

# Short names mapped to full term URIs. Resource.set_provenance() looks up
# its ``verb`` argument here and stores the URI as the entry's 'term'.
PROVENANCE_VERBS = {
    'citesAsMetadataDocument': 'http://purl.org/spar/cito/citesAsMetadataDocument',
    'citesAsDataSource': 'http://purl.org/spar/cito/citesAsDataSource',
    'hasWorkflowMotif': 'http://purl.org/net/wf-motifs#hasWorkflowMotif',
    'Combine': 'http://purl.org/net/wf-motifs#Combine'
}

class Resource:
    """Store, manipulate, and export data about a single information resource."""

    def __init__(self):
        """Set all attributes to default values."""

        self.authors = []
        self.contributors = []
        self.description = None
        self.domain = None
        self.editors = []
        self.end_date = None
        self.extent = None
        self.form = None
        self.frequency = None
        self.identifiers = {}
        self.is_part_of = None
        self.issue = None
        self.issuance = None
        self.issued_dates = None
        self.keywords = []
        self.languages = []
        self.places = []
        self.provenance = []
        self.publishers = []
        self.related_resources = []
        self.responsibility = []
        self.start_date = None
        self.subordinate_resources = []
        self.title = None
        self.title_alternates = []
        self.title_extended = None
        self.type = None
        self.url = None
        self.url_alternates = []
        self.volume = None
        self.year = None
        self.zenon_id = None
        self.zotero_id = None

    def json_dumps(self, formatted=False):
        """Dump resource to JSON as UTF-8 encoded bytes.

        Args:
            formatted: if true, indent by four spaces and sort the keys.

        Returns:
            The JSON document as ``bytes``.
        """

        logger = logging.getLogger(sys._getframe().f_code.co_name)
        dump = self.__dict__.copy()
        for k, v in dump.items():
            logger.debug("{0} ({1})".format(k, type(v)))
        if formatted:
            text = json.dumps(dump, indent=4, sort_keys=True, ensure_ascii=False)
        else:
            text = json.dumps(dump, ensure_ascii=False)
        return text.encode('utf8')

    def json_dump(self, filename, formatted=False):
        """Dump resource as JSON to a UTF-8 encoded file."""
        dumps = self.json_dumps(formatted)  # UTF-8 encoded bytes
        # binary mode: writing bytes to a text-mode file raises TypeError
        with open(filename, 'wb') as f:
            f.write(dumps)

    def json_loads(self, s):
        """Parse resource from a JSON string or UTF-8 encoded bytes.

        Replaces all instance attributes with the parsed content.
        """
        if isinstance(s, (bytes, bytearray)):
            # str() on bytes would yield the "b'...'" repr, which is not JSON
            s = s.decode('utf8')
        self.__dict__ = json.loads(str(s))

    def json_load(self, filename):
        """Parse resource from a UTF-8 encoded JSON file.

        Replaces all instance attributes with the parsed content.
        """
        with io.open(filename, 'r', encoding='utf8') as f:
            self.__dict__ = json.load(f)

    def package(self):
        """Return a summary package of resource information.

        Returns:
            A dict with 'title_full' and 'url', plus 'title' when it differs
            from 'title_full'.
        """
        pkg = {}
        # NOTE(review): __init__ defines ``title_extended``, not
        # ``extended_title``, so this normally falls back to ``self.title``.
        # Behaviour is left unchanged; confirm which attribute is intended.
        try:
            title = self.extended_title
        except AttributeError:
            title = self.title
        pkg['title_full'] = title
        pkg['url'] = self.url
        if title != self.title:
            pkg['title'] = self.title
        return pkg

    def zotero_add(self, zot, creds, extras=None):
        """Upload as a record to Zotero.

        Args:
            zot: client object offering ``item_template()`` and
                ``create_items()``.
            creds: dict with 'libraryType' and 'libraryID'.
            extras: optional dict rendered into the item's 'extra' field as
                comma-separated ``key:"value"`` pairs.

        Raises:
            KeyError: if the response does not report a successful upload.

        On success ``self.zotero_id`` is set to a dict holding the library
        type, the library ID and the new item ID.
        """

        logger = logging.getLogger(sys._getframe().f_code.co_name)
        extras = {} if extras is None else extras

        has_issn = 'issn' in self.identifiers
        if has_issn or 'journal' in self.keywords:
            zot_type = 'journalArticle'
        else:
            zot_type = 'webpage'
        template = zot.item_template(zot_type)
        template['abstractNote'] = self.description
        if has_issn:
            template['issn'] = self.identifiers['issn']
        template['tags'] = self.keywords
        template['extra'] = ', '.join([':'.join((k, '"{0}"'.format(v))) for k, v in extras.items()])
        try:
            # the attribute is ``languages``; an empty or missing list simply
            # leaves the template's language untouched
            template['language'] = self.languages[0]
        except (IndexError, TypeError):
            pass
        template['title'] = self.title
        template['url'] = self.url
        resp = zot.create_items([template])
        try:
            zot_id = resp['success']['0']
            logger.debug("zot_id: {0}".format(zot_id))
        except KeyError:
            logger.error('Zotero upload appears to have failed with {0}'.format(repr(resp)))
            raise
        else:
            self.zotero_id = {
                'libraryType': creds['libraryType'],
                'libraryID': creds['libraryID'],
                'itemID': zot_id
            }
            logger.debug(repr(self.zotero_id))

    def wikidata_suggest(self, resource_title):
        """Return the 'id' of the wikidata suggestion for a title, or None."""
        wikidata = suggest(resource_title)
        if wikidata:
            return wikidata['id']
        else:
            return None

    def set_provenance(self, object, verb='citesAsMetadataDocument', object_date=None, fields=None):
        """Add an entry to the provenance list.

        Args:
            object: the thing cited; stored under 'resource'.
            verb: a key of PROVENANCE_VERBS; its URI is stored under 'term'.
            object_date: optional date of the cited thing ('resource_date').
            fields: optional iterable of field names; a copy is stored under
                'fields'.

        Raises:
            KeyError: if ``verb`` is not a key of PROVENANCE_VERBS.
        """

        d = {
            'term': PROVENANCE_VERBS[verb],
            # naive UTC timestamp
            'when': datetime.datetime.utcnow().isoformat(),
            'resource': object
        }
        if object_date is not None:
            d['resource_date'] = object_date
        if fields is not None:
            # always store a list copy so that later changes to the caller's
            # sequence do not leak into the provenance record
            d['fields'] = list(fields)
        self.provenance.append(d)

    def __str__(self):
        """Return a pretty-printed rendering of all attributes."""
        return pprint.pformat(self.__dict__, indent=4, width=120)


def merge(r1, r2):
    """Merge two resources into oneness.

    Every attribute present on either resource is reconciled field by field:

    * ``url``: equal values are kept; if one URL is a prefix of the other the
      shorter one wins and the longer one is recorded in ``url_alternates``;
      URLs that differ only in protocol are merged to ``https`` when one of
      them uses it; a missing URL yields the other one.
    * a value present on only one side is taken as-is.
    * ``is_part_of``: prefers the package whose URL contains ``r1.domain``,
      then the one that has an 'issn' entry; otherwise None.
    * ``volume``, ``year``, ``zenon_id``, ``issue``, ``zotero_id``: must agree.
    * ``languages`` and other lists: union of both (order unspecified).
    * ``identifiers``: merged per family and, for dict-valued families, per
      identifier type.
    * ``subordinate_resources`` / ``related_resources``: concatenated, then
      deduplicated by 'url' (first occurrence wins).
    * ``provenance``: concatenated.
    * strings: the containing string or, failing that, the longer one wins.

    Args:
        r1: first resource; its ``domain`` is used for ``is_part_of``.
        r2: second resource.

    Returns:
        A new Resource with a 'hasWorkflowMotif' provenance entry that lists
        the fields which had to be combined.

    Raises:
        ValueError: if the URLs or one of the must-agree fields cannot be
            reconciled.
        Exception: if a field holds a type this function cannot merge.
    """
    r3 = Resource()
    modified_fields = []
    # Applied after the loop: the loop also assigns 'url_alternates' itself
    # and would otherwise overwrite anything appended while handling 'url'.
    extra_url_alternates = []
    domain = r1.domain
    # sorted so that the recorded field list is deterministic
    all_keys = sorted(set(r1.__dict__) | set(r2.__dict__))
    for k in all_keys:
        modified = False
        v3 = None
        v1 = copy.deepcopy(r1.__dict__.get(k))
        v2 = copy.deepcopy(r2.__dict__.get(k))

        if k == 'url':
            if v1 == v2:
                v3 = v1
            elif v1 is None or v2 is None:
                # prefer some data over no data
                v3 = v2 if v1 is None else v1
            elif v1.startswith(v2):
                v3 = v2
                extra_url_alternates.append(v1)
            elif v2.startswith(v1):
                v3 = v1
                extra_url_alternates.append(v2)
            else:
                protocol1, _, path1 = v1.partition('://')
                protocol2, _, path2 = v2.partition('://')
                if path1 == path2 and (protocol1 == 'https' or protocol2 == 'https'):
                    v3 = 'https://' + path1
                else:
                    raise ValueError('could not reconcile url mismatch in merge: {1} vs. {2}'.format(k, v1, v2))
        else:
            modified = True
            if v1 is None and v2 is None:
                v3 = None
                modified = False
            # prefer some data over no data
            elif v1 is None:
                v3 = v2
            elif v2 is None:
                v3 = v1
            elif k == 'is_part_of':
                if v1 == v2:
                    v3 = v1
                    modified = False
                elif domain is not None and domain in v1['url']:
                    v3 = v1
                elif domain is not None and domain in v2['url']:
                    v3 = v2
                elif 'issn' in v1 and 'issn' not in v2:
                    v3 = v1
                elif 'issn' in v2 and 'issn' not in v1:
                    v3 = v2
                else:
                    v3 = None
            elif k in ['volume', 'year', 'zenon_id', 'issue', 'zotero_id']:
                # both values are known to be set here, so they must agree
                if v1 == v2:
                    v3 = v1
                    modified = False
                else:
                    raise ValueError('cannot merge two resources in which the {0} field differs: "{1}" vs. "{2}"'.format(k, v1, v2))
            elif k == 'languages':
                if len(v1) > 0 and len(v2) > 0:
                    v3 = list(set(v1 + v2))
                elif len(v1) > 0:
                    v3 = v1
                elif len(v2) > 0:
                    v3 = v2
                else:
                    v3 = []
            elif k == 'identifiers':
                if len(v1) > 0 and len(v2) > 0:
                    v3 = {}
                    for idfam in set(v1) | set(v2):
                        thisval1 = v1.get(idfam)
                        thisval2 = v2.get(idfam)
                        if isinstance(thisval1, list) or isinstance(thisval2, list):
                            combined = []
                            if thisval1 is not None:
                                combined.extend(thisval1)
                            if thisval2 is not None:
                                combined.extend(thisval2)
                            v3[idfam] = list(set(combined))
                        elif isinstance(thisval1, dict) or isinstance(thisval2, dict):
                            if thisval1 is None:
                                # store under this family only; do not
                                # replace the whole merged dict
                                v3[idfam] = thisval2
                            elif thisval2 is None:
                                v3[idfam] = thisval1
                            else:
                                v3[idfam] = {}
                                for idtype in set(thisval1) | set(thisval2):
                                    combined = []
                                    if thisval1.get(idtype) is not None:
                                        combined.extend(thisval1[idtype])
                                    if thisval2.get(idtype) is not None:
                                        combined.extend(thisval2[idtype])
                                    v3[idfam][idtype] = list(set(combined))
                        else:
                            # neither list nor dict: keep whatever is there
                            # rather than dropping the family
                            v3[idfam] = thisval1 if thisval1 is not None else thisval2
                elif len(v1) > 0:
                    v3 = v1
                elif len(v2) > 0:
                    v3 = v2
                else:
                    v3 = {}

            elif k in ['subordinate_resources', 'related_resources']:
                if len(v1) == 0 and len(v2) == 0:
                    modified = False
                # keep the first entry seen for every url
                seen = set()
                v3 = []
                for child in v1 + v2:
                    if child['url'] not in seen:
                        seen.add(child['url'])
                        v3.append(child)
            elif k == 'provenance':
                modified = False
                v3 = v1 + v2
            elif type(v1) == list and type(v2) == list:
                if len(v1) == 0 and len(v2) == 0:
                    modified = False
                    v3 = []
                elif len(v1) == 0:
                    v3 = v2
                elif len(v2) == 0:
                    v3 = v1
                else:
                    v3 = list(set(v1 + v2))
            elif type(v1) is str:
                if len(v1) == 0 and len(v2) == 0:
                    modified = False
                    v3 = v1
                elif v1 == v2:
                    modified = False
                    v3 = v1
                # if one contains the other, prefer the container
                elif v1 in v2:
                    v3 = v2
                elif v2 in v1:
                    v3 = v1
                # prefer the longer of the two
                elif len(v1) > len(v2):
                    v3 = v1
                else:
                    v3 = v2
            else:
                raise Exception('do not know how to merge field "{0}" with values of type {1} and {2}'.format(k, type(v1), type(v2)))
        r3.__dict__[k] = v3
        if modified:
            modified_fields.append(k)

    if extra_url_alternates:
        alternates = r3.__dict__.get('url_alternates') or []
        for alternate in extra_url_alternates:
            if alternate not in alternates:
                alternates.append(alternate)
        r3.__dict__['url_alternates'] = alternates
        if 'url_alternates' not in modified_fields:
            modified_fields.append('url_alternates')
            modified_fields.sort()

    r3.set_provenance('http://purl.org/net/wf-motifs#Combine', 'hasWorkflowMotif', fields=modified_fields)
    return r3


def scriptinfo():
    """Describe the running top-level Python script.

    Returns a dictionary with three entries:

    * ``dir``: directory containing the script or compiled executable
    * ``name``: name of the script or executable
    * ``source``: name of the source code file

    ``name`` and ``source`` are identical if and only if running interpreted
    code. When running code compiled by py2exe or cx_freeze, ``source``
    contains the name of the originating Python script. If compiled by
    PyInstaller, ``source`` contains no meaningful information.
    """
    import inspect
    import os
    import sys

    # Walk the whole call stack; the last frame that is neither a pseudo
    # file ("<...>") nor located under the interpreter's exec prefix is
    # taken as the top-level script.
    exec_prefix = sys.exec_prefix.upper()
    for frame_record in inspect.stack():
        filename = frame_record[1]
        if filename.startswith("<") or filename.upper().startswith(exec_prefix):
            continue
        trc = filename

    # compiled (frozen) executables report the executable's own location
    if getattr(sys, 'frozen', False):
        exe_dir, exe_name = os.path.split(sys.executable)
        return {"dir": exe_dir,
                "name": exe_name,
                "source": trc}

    # interpreted case: without directory information in the file name,
    # the current working directory is what we need
    script_dir, script_name = os.path.split(trc)
    return {"name": script_name,
            "source": script_name,
            "dir": script_dir or os.getcwd()}

