From c896ee3daef30491acf81d7978a5d0deefd0dedb Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:36:30 -0400
Subject: [PATCH 1/9] add new method for parsing affiliations out of doe services

---
 scrapi/base/helpers.py | 20 ++++++++++++++++++++
 scrapi/base/schemas.py |  3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index b8fcc333..2a20b87f 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,6 +17,7 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
+DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')


 def CONSTANT(x):
@@ -359,3 +360,22 @@ def datetime_formatter(datetime_string):
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
+
+
+def doe_name_parser(name):
+    if name == 'None':
+        return {'name': ''}
+    affiliations = DOE_AFFILIATIONS.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    parsed_name = maybe_parse_name(name)
+    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    return parsed_name
+
+
+def doe_parse_affiliation(affiliation):
+    return {'name': affiliation}  # TODO: Maybe parse out address?
+
+
+def doe_process_contributors(names):
+    return list(map(doe_name_parser, names))
diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index e9772d60..37deba9f 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -10,12 +10,13 @@
     build_properties,
     default_name_parser,
     oai_process_contributors,
+    doe_process_contributors,
 )


 DOESCHEMA = {
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
-    "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
+    "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
     "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {

From 0dbee89846bf16cd6e88a79efd621ac0cbd19feb Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:59:03 -0400
Subject: [PATCH 2/9] flake8

---
 scrapi/base/schemas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index 37deba9f..1b1861fc 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -8,7 +8,6 @@
     datetime_formatter,
     oai_process_uris,
     build_properties,
-    default_name_parser,
     oai_process_contributors,
     doe_process_contributors,
 )

From ca179ad3c60f258ee3d46d3d18378da8f605bd24 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:36:30 -0400
Subject: [PATCH 3/9] add new method for parsing affiliations out of doe services

---
 scrapi/base/helpers.py | 20 ++++++++++++++++++++
 scrapi/base/schemas.py |  3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index b8fcc333..2a20b87f 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,6 +17,7 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
+DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')


 def CONSTANT(x):
@@ -359,3 +360,22 @@ def datetime_formatter(datetime_string):
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
+
+
+def doe_name_parser(name):
+    if name == 'None':
+        return {'name': ''}
+    affiliations = DOE_AFFILIATIONS.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    parsed_name = maybe_parse_name(name)
+    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    return parsed_name
+
+
+def doe_parse_affiliation(affiliation):
+    return {'name': affiliation}  # TODO: Maybe parse out address?
+
+
+def doe_process_contributors(names):
+    return list(map(doe_name_parser, names))
diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index e9772d60..37deba9f 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -10,12 +10,13 @@
     build_properties,
     default_name_parser,
     oai_process_contributors,
+    doe_process_contributors,
 )


 DOESCHEMA = {
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
-    "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
+    "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
     "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {

From 088957be2b41a85f57aa81a61d19b0c7fe39a86f Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:59:03 -0400
Subject: [PATCH 4/9] flake8

---
 scrapi/base/schemas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index 37deba9f..1b1861fc 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -8,7 +8,6 @@
     datetime_formatter,
     oai_process_uris,
     build_properties,
-    default_name_parser,
     oai_process_contributors,
     doe_process_contributors,
 )

From bfe9811167027f41b3ed3b7f6ae2a35726be8cb4 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:24:31 -0500
Subject: [PATCH 5/9] Fix broken import in NIH

---
 scrapi/harvesters/nih.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/scrapi/harvesters/nih.py b/scrapi/harvesters/nih.py
index b59de332..038f7c24 100644
--- a/scrapi/harvesters/nih.py
+++ b/scrapi/harvesters/nih.py
@@ -27,8 +27,13 @@
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
-from scrapi.base.schemas import default_name_parser
-from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
+from scrapi.base.helpers import (
+    compose,
+    single_result,
+    build_properties,
+    datetime_formatter,
+    default_name_parser
+)

 logger = logging.getLogger(__name__)

From 4a9973aa66dca7bff0101f2019c22d4d7b3e5b96 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:25:06 -0500
Subject: [PATCH 6/9] Parse out orcid, email, and affiliations from DOE sources

---
 scrapi/base/helpers.py | 47 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 2a20b87f..894c4e9b 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,7 +17,9 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
-DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')
+DOE_AFFILIATIONS_REGEX = re.compile(r'\s*\[(.*?)\]')
+DOE_EMAIL_REGEX = re.compile(r'((?:,? (?:Email|email|E-mail|e-mail):\s*)?(\S*@\S*))')
+DOE_ORCID_REGEX = re.compile(r'(\(ORCID:\s*(\S*)\))')


 def CONSTANT(x):
@@ -365,14 +367,51 @@ def datetime_formatter(datetime_string):

 def doe_name_parser(name):
     if name == 'None':
         return {'name': ''}
-    affiliations = DOE_AFFILIATIONS.findall(name)
-    for affiliation in affiliations:
-        name = name.replace('[{}]'.format(affiliation), '')
+    name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)
+    name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)
+    name, affiliations = doe_extract_affiliations(name)
+
     parsed_name = maybe_parse_name(name)
     parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    parsed_name['sameAs'] = [orcid] if orcid else []
+    parsed_name['email'] = email
     return parsed_name


+def extract_and_replace_one(text, pattern):
+    ''' Works with regexes with two matches, where the text of the first match
+    is replaced and the text of the second is returned
+
+    In the case where there is a match:
+    >>> text = 'I feelvery happy'
+    >>> pattern = re.compile(r'.*(very\s*(\S*)).*')
+    >>> modified_text, match = extract_and_replace_one(text, pattern)
+    >>> print(modified_text)
+    I feel
+    >>> print(match)
+    happy
+
+    In the case where there is not a match:
+    >>> text = 'I feel happy'
+    >>> modified_text, match = extract_and_replace_one(text, pattern)
+    >>> modified_text == text
+    True
+    >>> match is None
+    True
+    '''
+    matches = pattern.findall(text)
+    if matches and len(matches) == 1:
+        return text.replace(matches[0][0], ''), matches[0][1]
+    return text, None
+
+
+def doe_extract_affiliations(name):
+    affiliations = DOE_AFFILIATIONS_REGEX.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    return name, affiliations
+
+
 def doe_parse_affiliation(affiliation):
     return {'name': affiliation}  # TODO: Maybe parse out address?
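Note on PATCH 6/9 above (not part of the patch series): the three regexes and extract_and_replace_one are easiest to follow on a concrete creator string. The sketch below is illustrative only -- the sample name, email, ORCID value, and "Some University" affiliation are made up, and the real doe_name_parser goes on to strip the bracketed affiliation via doe_extract_affiliations and hand the remaining string to maybe_parse_name.

    import re

    DOE_AFFILIATIONS_REGEX = re.compile(r'\s*\[(.*?)\]')
    DOE_EMAIL_REGEX = re.compile(r'((?:,? (?:Email|email|E-mail|e-mail):\s*)?(\S*@\S*))')
    DOE_ORCID_REGEX = re.compile(r'(\(ORCID:\s*(\S*)\))')

    def extract_and_replace_one(text, pattern):
        # Same logic as the patched helper: on exactly one match, strip the
        # outer group from the text and hand back the inner group.
        matches = pattern.findall(text)
        if matches and len(matches) == 1:
            return text.replace(matches[0][0], ''), matches[0][1]
        return text, None

    name = 'Egger, David A. Email: egger@example.com (ORCID: 0000-0001-2345-6789) [Some University]'
    name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)   # orcid -> '0000-0001-2345-6789'
    name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)   # email -> 'egger@example.com'
    affiliations = DOE_AFFILIATIONS_REGEX.findall(name)            # -> ['Some University']
    # name is now 'Egger, David A.' plus leftover whitespace and the bracketed
    # affiliation, which doe_extract_affiliations removes before maybe_parse_name
    # sees it.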
From a92aed8aab1124bb14a40d0337b7b35c47842cfa Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:56:22 -0500
Subject: [PATCH 7/9] make sure to conditionally add fields

---
 scrapi/base/helpers.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 894c4e9b..e991ede8 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -372,9 +372,12 @@ def doe_name_parser(name):
     name, affiliations = doe_extract_affiliations(name)

     parsed_name = maybe_parse_name(name)
-    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
-    parsed_name['sameAs'] = [orcid] if orcid else []
-    parsed_name['email'] = email
+    if affiliations:
+        parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    if orcid:
+        parsed_name['sameAs'] = [orcid]
+    if email:
+        parsed_name['email'] = email
     return parsed_name


From 912543d0d1d2ea28beb0fbb00b8cac28b7dfbc21 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 17:18:05 -0500
Subject: [PATCH 8/9] sameAs should only contain URLs

---
 scrapi/base/helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index e991ede8..8cc3ea89 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -365,7 +365,7 @@ def datetime_formatter(datetime_string):

 def doe_name_parser(name):
-    if name == 'None':
+    if name.strip() == 'None':
         return {'name': ''}
     name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)
     name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)
     name, affiliations = doe_extract_affiliations(name)
@@ -375,7 +375,7 @@ def doe_name_parser(name):
     if affiliations:
         parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
     if orcid:
-        parsed_name['sameAs'] = [orcid]
+        parsed_name['sameAs'] = ['https://orcid.org/{}'.format(orcid)]
     if email:
         parsed_name['email'] = email
     return parsed_name

From fa58f934f06b8b3440dbd7c8e7110fe594a50322 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 17:18:31 -0500
Subject: [PATCH 9/9] add some more cases to the VCRs, to thoroughly test normalization

---
 tests/vcr/doepages.yaml | 2 +-
 tests/vcr/scitech.yaml  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/vcr/doepages.yaml b/tests/vcr/doepages.yaml
index 2d21cd19..2194fd68 100644
--- a/tests/vcr/doepages.yaml
+++ b/tests/vcr/doepages.yaml
@@ -14,7 +14,7 @@ interactions:
 \ xmlns:dcq=\"http://purl.org/dc/terms/\">Reliable Energy\
 \ Level Alignment at Physisorbed Molecule\u2013Metal Interfaces from Density\
 \ Functional TheoryEgger, David A. Email: egger@ergger.egger, (ORCID: 123123123123) [The suburbs]; Liu, Zhen-Fei;\
 \ Neaton, Jeffrey B.; Kronik, LeeorAmerican\
 \ Chemical SocietyNoneUSDOE\
 \ Office of Science (SC), Basic Energy Sciences (BES) (SC-22)United\
diff --git a/tests/vcr/scitech.yaml b/tests/vcr/scitech.yaml
index 56f38610..7a8171a9 100644
--- a/tests/vcr/scitech.yaml
+++ b/tests/vcr/scitech.yaml
@@ -14,7 +14,7 @@ interactions:
 \ xmlns:dcq=\"http://purl.org/dc/terms/\">Capacity\
 \ Specification for Hybrid Energy Storage System to Accommodate Fast PV FluctuationsWang\
- \ X.; Yue, M25 ENERGY STORAGEN/ABrookhaven\
+ \ X.; Yue, E-mail: Yue@gmail.com, (ORCID: 12313123) [The streets] M25 ENERGY STORAGEN/ABrookhaven\
 \ National Laboratory (BNL)USDOE\
 \ SC OFFICE OF SCIENCE (SC)United\
 \ States2015-07-26EnglishConferenceConference:\
@@ -75,7 +75,7 @@ interactions:
 \ EDOSTI ID: 1165046, Legacy ID: OSTI ID: 1165046DOE-UCSC-46232FG02-05ER4623210.2172/11650462014-12-252014-12-241165046http://www.osti.gov/scitech/servlets/purl/1165046http://www.osti.gov/scitech/biblio/1165046Soft X-ray Self-Seeding simulation methods and\
- \ their application for the LCLSSerkez, Svitozar; /DESY;\
+ \ their application for the LCLSNone; /DESY;\
 \ Krzywinski, Jacek; Ding, Yuantao; Huang, Zhirong; /SLAC; , ACCPHY,\
 \ XFELACCPHY, XFELAbstract\
 \ Not Providedhttp://www-public.slac.stanford.edu/SciDoc/docMeta.aspx?slacPubNumber=slac-pub-16163SLAC\
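End-of-series note (not part of the patches): the cassette cases above exercise the conditional fields from PATCH 7/9 and the ORCID-to-URL rule from PATCH 8/9. Below is a minimal sketch of the resulting contributor shape, using made-up values and a stand-in helper name -- attach_doe_extras is not a function in the repo, it just mirrors the tail of the patched doe_name_parser.

    def attach_doe_extras(parsed_name, affiliations, orcid, email):
        # PATCH 7/9: only attach keys for things that were actually extracted.
        if affiliations:
            parsed_name['affiliation'] = [{'name': a} for a in affiliations]
        if orcid:
            # PATCH 8/9: sameAs should only contain URLs, so wrap the bare id.
            parsed_name['sameAs'] = ['https://orcid.org/{}'.format(orcid)]
        if email:
            parsed_name['email'] = email
        return parsed_name

    print(attach_doe_extras({'name': 'Egger, David A.'}, ['Some University'],
                            '0000-0001-2345-6789', 'egger@example.com'))
    # {'name': 'Egger, David A.', 'affiliation': [{'name': 'Some University'}],
    #  'sameAs': ['https://orcid.org/0000-0001-2345-6789'], 'email': 'egger@example.com'}
    print(attach_doe_extras({'name': 'Kronik, Leeor'}, [], None, None))
    # {'name': 'Kronik, Leeor'}  -- bare names gain no empty affiliation/sameAs/email keys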