From c896ee3daef30491acf81d7978a5d0deefd0dedb Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:36:30 -0400
Subject: [PATCH 1/9] add new method for parsing affiliations out of doe services

---
 scrapi/base/helpers.py | 20 ++++++++++++++++++++
 scrapi/base/schemas.py |  3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index b8fcc333..2a20b87f 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,6 +17,7 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
+DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')


 def CONSTANT(x):
@@ -359,3 +360,22 @@ def datetime_formatter(datetime_string):
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
+
+
+def doe_name_parser(name):
+    if name == 'None':
+        return {'name': ''}
+    affiliations = DOE_AFFILIATIONS.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    parsed_name = maybe_parse_name(name)
+    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    return parsed_name
+
+
+def doe_parse_affiliation(affiliation):
+    return {'name': affiliation}  # TODO: Maybe parse out address?
+
+
+def doe_process_contributors(names):
+    return list(map(doe_name_parser, names))
diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index e9772d60..37deba9f 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -10,12 +10,13 @@
     build_properties,
     default_name_parser,
     oai_process_contributors,
+    doe_process_contributors,
 )


 DOESCHEMA = {
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
-    "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
+    "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
     "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {

From 0dbee89846bf16cd6e88a79efd621ac0cbd19feb Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:59:03 -0400
Subject: [PATCH 2/9] flake8

---
 scrapi/base/schemas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index 37deba9f..1b1861fc 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -8,7 +8,6 @@
     datetime_formatter,
     oai_process_uris,
     build_properties,
-    default_name_parser,
     oai_process_contributors,
     doe_process_contributors,
 )

From ca179ad3c60f258ee3d46d3d18378da8f605bd24 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:36:30 -0400
Subject: [PATCH 3/9] add new method for parsing affiliations out of doe services

---
 scrapi/base/helpers.py | 20 ++++++++++++++++++++
 scrapi/base/schemas.py |  3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index b8fcc333..2a20b87f 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,6 +17,7 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
+DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')


 def CONSTANT(x):
@@ -359,3 +360,22 @@ def datetime_formatter(datetime_string):
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
+
+
+def doe_name_parser(name):
+    if name == 'None':
+        return {'name': ''}
+    affiliations = DOE_AFFILIATIONS.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    parsed_name = maybe_parse_name(name)
+    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    return parsed_name
+
+
+def doe_parse_affiliation(affiliation):
+    return {'name': affiliation}  # TODO: Maybe parse out address?
+
+
+def doe_process_contributors(names):
+    return list(map(doe_name_parser, names))
diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index e9772d60..37deba9f 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -10,12 +10,13 @@
     build_properties,
     default_name_parser,
     oai_process_contributors,
+    doe_process_contributors,
 )


 DOESCHEMA = {
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
-    "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
+    "contributors": ('//dc:creator/node()', compose(doe_process_contributors, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
     "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {

From 088957be2b41a85f57aa81a61d19b0c7fe39a86f Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 21 Oct 2015 16:59:03 -0400
Subject: [PATCH 4/9] flake8

---
 scrapi/base/schemas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
index 37deba9f..1b1861fc 100644
--- a/scrapi/base/schemas.py
+++ b/scrapi/base/schemas.py
@@ -8,7 +8,6 @@
     datetime_formatter,
     oai_process_uris,
     build_properties,
-    default_name_parser,
     oai_process_contributors,
     doe_process_contributors,
 )

From bfe9811167027f41b3ed3b7f6ae2a35726be8cb4 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:24:31 -0500
Subject: [PATCH 5/9] Fix broken import in NIH

---
 scrapi/harvesters/nih.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/scrapi/harvesters/nih.py b/scrapi/harvesters/nih.py
index b59de332..038f7c24 100644
--- a/scrapi/harvesters/nih.py
+++ b/scrapi/harvesters/nih.py
@@ -27,8 +27,13 @@
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
-from scrapi.base.schemas import default_name_parser
-from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
+from scrapi.base.helpers import (
+    compose,
+    single_result,
+    build_properties,
+    datetime_formatter,
+    default_name_parser
+)

 logger = logging.getLogger(__name__)

From 4a9973aa66dca7bff0101f2019c22d4d7b3e5b96 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:25:06 -0500
Subject: [PATCH 6/9] Parse out orcid, email, and affiliations from DOE sources

---
 scrapi/base/helpers.py | 47 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 2a20b87f..894c4e9b 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -17,7 +17,9 @@

 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
-DOE_AFFILIATIONS = re.compile(r'\[(.*)\]')
+DOE_AFFILIATIONS_REGEX = re.compile(r'\s*\[(.*?)\]')
+DOE_EMAIL_REGEX = re.compile(r'((?:,? (?:Email|email|E-mail|e-mail):\s*)?(\S*@\S*))')
+DOE_ORCID_REGEX = re.compile(r'(\(ORCID:\s*(\S*)\))')


 def CONSTANT(x):
@@ -365,14 +367,51 @@ def datetime_formatter(datetime_string):

 def doe_name_parser(name):
     if name == 'None':
         return {'name': ''}
-    affiliations = DOE_AFFILIATIONS.findall(name)
-    for affiliation in affiliations:
-        name = name.replace('[{}]'.format(affiliation), '')
+    name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)
+    name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)
+    name, affiliations = doe_extract_affiliations(name)
+
     parsed_name = maybe_parse_name(name)
     parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    parsed_name['sameAs'] = [orcid] if orcid else []
+    parsed_name['email'] = email
     return parsed_name


+def extract_and_replace_one(text, pattern):
+    ''' Works with regexes with two matches, where the text of the first match
+    is replaced and the text of the second is returned
+
+    In the case where there is a match:
+    >>> text = 'I feelvery happy'
+    >>> pattern = re.compile(r'.*(very\s*(\S*)).*')
+    >>> modified_text, match = extract_and_replace_one(text, pattern)
+    >>> print(modified_text)
+    I feel
+    >>> print(match)
+    happy
+
+    In the case where there is not a match:
+    >>> text = 'I feel happy'
+    >>> modified_text, match = extract_and_replace_one(text, pattern)
+    >>> modified_text == text
+    True
+    >>> match is None
+    True
+    '''
+    matches = pattern.findall(text)
+    if matches and len(matches) == 1:
+        return text.replace(matches[0][0], ''), matches[0][1]
+    return text, None
+
+
+def doe_extract_affiliations(name):
+    affiliations = DOE_AFFILIATIONS_REGEX.findall(name)
+    for affiliation in affiliations:
+        name = name.replace('[{}]'.format(affiliation), '')
+    return name, affiliations
+
+
 def doe_parse_affiliation(affiliation):
     return {'name': affiliation}  # TODO: Maybe parse out address?
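Note on PATCH 6/9 above (not part of the patch series): the three regexes and extract_and_replace_one are easiest to follow on a concrete creator string. The sketch below is illustrative only -- the sample name, email, ORCID value, and "Some University" affiliation are made up, and the real doe_name_parser goes on to strip the bracketed affiliation via doe_extract_affiliations and hand the remaining string to maybe_parse_name.

    import re

    DOE_AFFILIATIONS_REGEX = re.compile(r'\s*\[(.*?)\]')
    DOE_EMAIL_REGEX = re.compile(r'((?:,? (?:Email|email|E-mail|e-mail):\s*)?(\S*@\S*))')
    DOE_ORCID_REGEX = re.compile(r'(\(ORCID:\s*(\S*)\))')

    def extract_and_replace_one(text, pattern):
        # Same logic as the patched helper: on exactly one match, strip the
        # outer group from the text and hand back the inner group.
        matches = pattern.findall(text)
        if matches and len(matches) == 1:
            return text.replace(matches[0][0], ''), matches[0][1]
        return text, None

    name = 'Egger, David A. Email: egger@example.com (ORCID: 0000-0001-2345-6789) [Some University]'
    name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)   # orcid -> '0000-0001-2345-6789'
    name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)   # email -> 'egger@example.com'
    affiliations = DOE_AFFILIATIONS_REGEX.findall(name)            # -> ['Some University']
    # name is now 'Egger, David A.' plus leftover whitespace and the bracketed
    # affiliation, which doe_extract_affiliations removes before maybe_parse_name
    # sees it.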
From a92aed8aab1124bb14a40d0337b7b35c47842cfa Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 16:56:22 -0500
Subject: [PATCH 7/9] make sure to conditionally add fields

---
 scrapi/base/helpers.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 894c4e9b..e991ede8 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -372,9 +372,12 @@ def doe_name_parser(name):
     name, affiliations = doe_extract_affiliations(name)

     parsed_name = maybe_parse_name(name)
-    parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
-    parsed_name['sameAs'] = [orcid] if orcid else []
-    parsed_name['email'] = email
+    if affiliations:
+        parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
+    if orcid:
+        parsed_name['sameAs'] = [orcid]
+    if email:
+        parsed_name['email'] = email
     return parsed_name


From 912543d0d1d2ea28beb0fbb00b8cac28b7dfbc21 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 17:18:05 -0500
Subject: [PATCH 8/9] sameAs should only contain URLs

---
 scrapi/base/helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index e991ede8..8cc3ea89 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -365,7 +365,7 @@ def datetime_formatter(datetime_string):

 def doe_name_parser(name):
-    if name == 'None':
+    if name.strip() == 'None':
         return {'name': ''}
     name, orcid = extract_and_replace_one(name, DOE_ORCID_REGEX)
     name, email = extract_and_replace_one(name, DOE_EMAIL_REGEX)
     name, affiliations = doe_extract_affiliations(name)
@@ -375,7 +375,7 @@ def doe_name_parser(name):
     if affiliations:
         parsed_name['affiliation'] = list(map(doe_parse_affiliation, affiliations))
     if orcid:
-        parsed_name['sameAs'] = [orcid]
+        parsed_name['sameAs'] = ['https://orcid.org/{}'.format(orcid)]
     if email:
         parsed_name['email'] = email
     return parsed_name

From fa58f934f06b8b3440dbd7c8e7110fe594a50322 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 18 Nov 2015 17:18:31 -0500
Subject: [PATCH 9/9] add some more cases to the VCRs, to thoroughly test normalization

---
 tests/vcr/doepages.yaml | 2 +-
 tests/vcr/scitech.yaml  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/vcr/doepages.yaml b/tests/vcr/doepages.yaml
index 2d21cd19..2194fd68 100644
--- a/tests/vcr/doepages.yaml
+++ b/tests/vcr/doepages.yaml
@@ -14,7 +14,7 @@ interactions:
 \ xmlns:dcq=\"http://purl.org/dc/terms/\">Reliable Energy\
 \ Level Alignment at Physisorbed Molecule\u2013Metal Interfaces from Density\
 \ Functional TheoryEgger, David A. Email: egger@ergger.egger, (ORCID: 123123123123) [The suburbs]; Liu, Zhen-Fei;\
 \ Neaton, Jeffrey B.; Kronik, LeeorAmerican\
 \ Chemical SocietyNoneUSDOE\
 \ Office of Science (SC), Basic Energy Sciences (BES) (SC-22)United\
diff --git a/tests/vcr/scitech.yaml b/tests/vcr/scitech.yaml
index 56f38610..7a8171a9 100644
--- a/tests/vcr/scitech.yaml
+++ b/tests/vcr/scitech.yaml
@@ -14,7 +14,7 @@ interactions:
 \ xmlns:dcq=\"http://purl.org/dc/terms/\">Capacity\
 \ Specification for Hybrid Energy Storage System to Accommodate Fast PV FluctuationsWang\
- \ X.; Yue, M25 ENERGY STORAGEN/ABrookhaven\
+ \ X.; Yue, E-mail: Yue@gmail.com, (ORCID: 12313123) [The streets] M25 ENERGY STORAGEN/ABrookhaven\
 \ National Laboratory (BNL)USDOE\
 \ SC OFFICE OF SCIENCE (SC)United\
 \ States2015-07-26EnglishConferenceConference:\
@@ -75,7 +75,7 @@ interactions:
 \ EDOSTI ID: 1165046, Legacy ID: OSTI ID: 1165046DOE-UCSC-46232FG02-05ER4623210.2172/11650462014-12-252014-12-241165046http://www.osti.gov/scitech/servlets/purl/1165046http://www.osti.gov/scitech/biblio/1165046Soft X-ray Self-Seeding simulation methods and\
- \ their application for the LCLSSerkez, Svitozar; /DESY;\
+ \ their application for the LCLSNone; /DESY;\
 \ Krzywinski, Jacek; Ding, Yuantao; Huang, Zhirong; /SLAC; , ACCPHY,\
 \ XFELACCPHY, XFELAbstract\
 \ Not Providedhttp://www-public.slac.stanford.edu/SciDoc/docMeta.aspx?slacPubNumber=slac-pub-16163SLAC\
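End-of-series note (not part of the patches): the cassette cases above exercise the conditional fields from PATCH 7/9 and the ORCID-to-URL rule from PATCH 8/9. Below is a minimal sketch of the resulting contributor shape, using made-up values and a stand-in helper name -- attach_doe_extras is not a function in the repo, it just mirrors the tail of the patched doe_name_parser.

    def attach_doe_extras(parsed_name, affiliations, orcid, email):
        # PATCH 7/9: only attach keys for things that were actually extracted.
        if affiliations:
            parsed_name['affiliation'] = [{'name': a} for a in affiliations]
        if orcid:
            # PATCH 8/9: sameAs should only contain URLs, so wrap the bare id.
            parsed_name['sameAs'] = ['https://orcid.org/{}'.format(orcid)]
        if email:
            parsed_name['email'] = email
        return parsed_name

    print(attach_doe_extras({'name': 'Egger, David A.'}, ['Some University'],
                            '0000-0001-2345-6789', 'egger@example.com'))
    # {'name': 'Egger, David A.', 'affiliation': [{'name': 'Some University'}],
    #  'sameAs': ['https://orcid.org/0000-0001-2345-6789'], 'email': 'egger@example.com'}
    print(attach_doe_extras({'name': 'Kronik, Leeor'}, [], None, None))
    # {'name': 'Kronik, Leeor'}  -- bare names gain no empty affiliation/sameAs/email keys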