Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions scrapi/base/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

# 'http://' or 'https://', then non-whitespace up to a '.', then any run of
# characters that are not whitespace or one of [ ] < > { } ^
URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
# The literal prefix 'doi:10.' followed by any run of non-whitespace
DOI_REGEX = re.compile(r'(doi:10\.\S*)')
# Optional leading whitespace, then a '[...]' section; the single group
# captures the (non-greedy) text between the brackets
DOE_AFFILIATIONS_REGEX = re.compile(r'\s*\[(.*?)\]')
# Group 1: the whole span, including an optional ', Email:'-style label.
# Group 2: just the whitespace-free token that contains the '@'
DOE_EMAIL_REGEX = re.compile(r'((?:,? (?:Email|email|E-mail|e-mail):\s*)?(\S*@\S*))')
# Group 1: the whole '(ORCID: <id>)' section. Group 2: the <id> token alone
DOE_ORCID_REGEX = re.compile(r'(\(ORCID:\s*(\S*)\))')


def CONSTANT(x):
Expand Down Expand Up @@ -377,6 +380,65 @@ def datetime_formatter(datetime_string):
return date_time.isoformat()


def doe_name_parser(name):
    ''' Parse a single DOE creator entry into a contributor dict.

    The ORCID section, the email section and any '[affiliation]' sections
    are cut out of ``name`` (in that order) and the remaining text is handed
    to ``maybe_parse_name``. Whatever was extracted is then attached to the
    parsed result under the 'sameAs', 'email' and 'affiliation' keys.

    :param name: raw text of one creator entry
    :return: ``{'name': ''}`` when the entry is the literal string 'None'
        (ignoring surrounding whitespace); otherwise the value returned by
        ``maybe_parse_name`` (presumably a dict -- it is assigned into by
        key below) with the optional keys added
    '''
    if name.strip() == 'None':
        return {'name': ''}
    name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)
    name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)
    name, affiliations = doe_extract_affiliations(name)

    # The email pattern grabs every non-whitespace character around the '@',
    # so a separator that directly follows the address (as in
    # 'Email: a@b.c, (ORCID: ...)') ends up inside the capture. It is still
    # removed from the name above; here it is dropped from the stored value.
    if email:
        email = email.rstrip(',;')

    parsed_name = maybe_parse_name(name)
    if affiliations:
        parsed_name['affiliation'] = [
            doe_parse_affiliation(affiliation) for affiliation in affiliations
        ]
    if orcid:
        parsed_name['sameAs'] = ['https://orcid.org/{}'.format(orcid)]
    if email:
        parsed_name['email'] = email
    return parsed_name


def extract_and_replace_one(text, pattern):
    ''' Cut one match of a two-group pattern out of text

    Works with compiled regexes that define (at least) two groups: the span
    covered by the first group is removed from the text and the text of the
    second group is returned alongside the shortened text.

    Nothing is changed unless the pattern matches exactly once; with zero
    matches or more than one match the original text is returned together
    with None.

    Only the matched span itself is removed. Other places in the text that
    happen to contain the same characters are left untouched.

    In the case where there is a match:
    >>> text = 'I feelvery happy'
    >>> pattern = re.compile(r'.*(very\\s*(\\S*)).*')
    >>> modified_text, match = extract_and_replace_one(text, pattern)
    >>> print(modified_text)
    I feel
    >>> print(match)
    happy

    In the case where there is not a match:
    >>> text = 'I feel happy'
    >>> modified_text, match = extract_and_replace_one(text, pattern)
    >>> modified_text == text
    True
    >>> match is None
    True

    :param text: the string to search
    :param pattern: a compiled regular expression with two groups
    :return: a ``(text, extracted)`` tuple; ``extracted`` is None when the
        pattern did not match exactly once
    '''
    found = list(pattern.finditer(text))
    if len(found) != 1:
        return text, None

    match = found[0]
    # A group that took no part in the match is reported as '' here, the
    # same way re.findall reports it.
    extracted = match.group(2) or ''
    start, end = match.span(1)
    if start < 0:
        # Group 1 did not participate in the match, so there is no span
        # to remove.
        return text, extracted
    return text[:start] + text[end:], extracted


def doe_extract_affiliations(name):
    ''' Pull every '[...]' section out of a creator entry

    :param name: raw creator text, possibly containing bracketed sections
    :return: a ``(name, affiliations)`` tuple: the text with each bracketed
        section removed, and the list of bracket contents as found by
        DOE_AFFILIATIONS_REGEX
    '''
    found = DOE_AFFILIATIONS_REGEX.findall(name)
    remaining = name
    for inner in found:
        # Rebuild the bracketed form from the captured inner text and drop
        # it wherever it occurs.
        bracketed = '[{}]'.format(inner)
        remaining = remaining.replace(bracketed, '')
    return remaining, found


def doe_parse_affiliation(affiliation):
    ''' Wrap an affiliation value in a dict under the 'name' key '''
    # TODO: Maybe parse out address?
    parsed = {}
    parsed['name'] = affiliation
    return parsed


def doe_process_contributors(names):
    ''' Apply doe_name_parser to each entry of names

    :param names: iterable of raw creator strings
    :return: list with one doe_name_parser result per entry, in input order
    '''
    return [doe_name_parser(entry) for entry in names]


def xml_text_only_list(elems):
'''Return inner text of all elements in list'''
return [xml_text_only(elem) for elem in elems]
Expand Down
4 changes: 2 additions & 2 deletions scrapi/base/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@
oai_process_uris,
build_properties,
datetime_formatter,
default_name_parser,
doe_process_contributors,
oai_process_contributors,
dif_process_contributors
)


DOESCHEMA = {
"description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
"contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
"contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),
"title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
"providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
"uris": {
Expand Down
9 changes: 7 additions & 2 deletions scrapi/harvesters/nih.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,13 @@
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.schemas import default_name_parser
from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
from scrapi.base.helpers import (
compose,
single_result,
build_properties,
datetime_formatter,
default_name_parser
)


logger = logging.getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion tests/vcr/doepages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ interactions:
\ xmlns:dcq=\"http://purl.org/dc/terms/\"><records count=\"4\" morepages=\"\
true\" start=\"1\" end=\"1\"><record rownumber=\"1\"><dc:title>Reliable Energy\
\ Level Alignment at Physisorbed Molecule\u2013Metal Interfaces from Density\
\ Functional Theory</dc:title><dc:creator>Egger, David A.; Liu, Zhen-Fei;\
\ Functional Theory</dc:title><dc:creator>Egger, David A. Email: egger@ergger.egger, (ORCID: 123123123123) [The suburbs]; Liu, Zhen-Fei;\
\ Neaton, Jeffrey B.; Kronik, Leeor</dc:creator><dc:subject/><dc:subjectRelated/><dc:description/><dcq:publisher>American\
\ Chemical Society</dcq:publisher><dcq:publisherAvailability/><dcq:publisherResearch>None</dcq:publisherResearch><dcq:publisherSponsor>USDOE\
\ Office of Science (SC), Basic Energy Sciences (BES) (SC-22)</dcq:publisherSponsor><dcq:publisherCountry>United\
Expand Down
4 changes: 2 additions & 2 deletions tests/vcr/scitech.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ interactions:
\ xmlns:dcq=\"http://purl.org/dc/terms/\"><records count=\"123\" morepages=\"\
true\" start=\"1\" end=\"100\"><record rownumber=\"1\"><dc:title>Capacity\
\ Specification for Hybrid Energy Storage System to Accommodate Fast PV Fluctuations</dc:title><dc:creator>Wang\
\ X.; Yue, M</dc:creator><dc:subject>25 ENERGY STORAGE</dc:subject><dc:subjectRelated/><dc:description>N/A</dc:description><dcq:publisher/><dcq:publisherAvailability/><dcq:publisherResearch>Brookhaven\
\ X.; Yue, E-mail: Yue@gmail.com, (ORCID: 12313123) [The streets] M</dc:creator><dc:subject>25 ENERGY STORAGE</dc:subject><dc:subjectRelated/><dc:description>N/A</dc:description><dcq:publisher/><dcq:publisherAvailability/><dcq:publisherResearch>Brookhaven\
\ National Laboratory (BNL)</dcq:publisherResearch><dcq:publisherSponsor>USDOE\
\ SC OFFICE OF SCIENCE (SC)</dcq:publisherSponsor><dcq:publisherCountry>United\
\ States</dcq:publisherCountry><dc:date>2015-07-26</dc:date><dc:language>English</dc:language><dc:type>Conference</dc:type><dcq:typeQualifier/><dc:relation>Conference:\
Expand Down Expand Up @@ -75,7 +75,7 @@ interactions:
\ ED</dc:format><dc:identifier>OSTI ID: 1165046, Legacy ID: OSTI ID: 1165046</dc:identifier><dc:identifierReport>DOE-UCSC-46232</dc:identifierReport><dcq:identifierDOEcontract>FG02-05ER46232</dcq:identifierDOEcontract><dc:identifierOther/><dc:doi>10.2172/1165046</dc:doi><dc:rights/><dc:dateEntry>2014-12-25</dc:dateEntry><dc:dateAdded>2014-12-24</dc:dateAdded><dc:ostiId>1165046</dc:ostiId><dcq:identifier-purl\
\ type=\"application/pdf\">http://www.osti.gov/scitech/servlets/purl/1165046</dcq:identifier-purl><dcq:identifier-citation>http://www.osti.gov/scitech/biblio/1165046</dcq:identifier-citation></record><record\
\ rownumber=\"5\"><dc:title>Soft X-ray Self-Seeding simulation methods and\
\ their application for the LCLS</dc:title><dc:creator>Serkez, Svitozar; /DESY;\
\ their application for the LCLS</dc:title><dc:creator>None; /DESY;\
\ Krzywinski, Jacek; Ding, Yuantao; Huang, Zhirong; /SLAC; , </dc:creator><dc:subject>ACCPHY,\
\ XFEL</dc:subject><dc:subjectRelated>ACCPHY, XFEL</dc:subjectRelated><dc:description>Abstract\
\ Not Provided</dc:description><dcq:publisher/><dcq:publisherAvailability>http://www-public.slac.stanford.edu/SciDoc/docMeta.aspx?slacPubNumber=slac-pub-16163</dcq:publisherAvailability><dcq:publisherResearch>SLAC\
Expand Down