Skip to content

Commit

Permalink
stop munging up names
Browse files (browse the repository at this point in the history)
Skip parsing out name parts: we don't need them, and the parsing is slow.
  • Loading branch information
aaxelb committed Jan 12, 2021
1 parent 6b58d31 commit 0c80cd0
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 54 deletions.
2 changes: 1 addition & 1 deletion project/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw):
'NODE_STEPS': [
'tokenize_tags',
'whitespace',
# 'normalize_agent_names',
'normalize_agent_names',
'cited_as',
('normalize_iris', {
'node_types': ['workidentifier'],
Expand Down
47 changes: 11 additions & 36 deletions share/regulate/steps/normalize_agent_names.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import re
import collections

from share.regulate.steps import NodeStep
from share.transform.chain.links import GuessAgentTypeLink
from share.schema import ShareV2Schema
from share.util import strip_whitespace
from share.util import nameparser


class NormalizeAgentNames(NodeStep):
Expand All @@ -18,12 +16,6 @@ class NormalizeAgentNames(NodeStep):
```
"""
NULL_RE = re.compile(r'^(?:\s*(none|null|empty)\s*)?$', re.I)
NAME_PARTS = collections.OrderedDict([
('first', 'given_name'),
('middle', 'additional_name'),
('last', 'family_name'),
('suffix', 'suffix'),
])

def valid_target(self, node):
return node.concrete_type == 'abstractagent'
Expand All @@ -35,45 +27,28 @@ def regulate_node(self, node):
self._normalize_non_person(node)

def _normalize_person(self, node):
attrs = node.attrs()
name = max(
' '.join(filter(None, (
attrs.get(x, '')
for x in self.NAME_PARTS.values()
))),
attrs.get('name', ''),
key=len
)
name = strip_whitespace(node['name'] or '')
if not name:
name = strip_whitespace(' '.join((
node['given_name'] or '',
node['additional_name'] or '',
node['family_name'] or '',
node['suffix'] or '',
)))

if not name or self.NULL_RE.match(name):
self.info('Discarding unnamed person', node.id)
node.delete()
return

human = nameparser.HumanName(name)
for part_name, field_name in self.NAME_PARTS.items():
part = human[part_name]
if part:
node[field_name] = part.title()

node['name'] = ' '.join(filter(None, (
node[k] for k in self.NAME_PARTS.values()
)))
else:
node['name'] = name

def _normalize_non_person(self, node):
# TODO reevaluate everything in this method

attrs = node.attrs()
name = attrs.get('name')

name = node['name']
if not name or self.NULL_RE.match(name):
self.info('Discarding unnamed agent', node.id)
node.delete()
return

# Slightly more intelligent title casing
name = re.sub(r'(?!for|and|the)\b[a-z]\w{2,}', lambda x: x.group().title(), name)

maybe_type_name = GuessAgentTypeLink(default=node.type).execute(name)
maybe_type = ShareV2Schema().get_type(maybe_type_name)
# If the new type is MORE specific, upgrade. Otherwise ignore
Expand Down
2 changes: 1 addition & 1 deletion tests/share/disambiguation/test_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def ingest_initial(self, Graph, ingest):
([Publication(identifiers=[WorkIdentifier(5)])], models.Publication, 0),
([Publication(identifiers=[WorkIdentifier(3)])], models.Publication, 1),
([Organization(name='Aperture Science')], models.Organization, 0),
([Organization(name='Aperture science')], models.Organization, 0),
([Organization(name='Aperture science')], models.Organization, 1),
])
def test_disambiguate(self, input, model, delta, Graph, ingest_initial, ingest):
Graph.reseed()
Expand Down
6 changes: 0 additions & 6 deletions tests/share/normalize/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from share import models
from share.transform.chain.links import IRILink
from share.util import nameparser
from share.util import TopologicalSorter
from share.util.graph import MutableGraph, MutableNode

Expand Down Expand Up @@ -311,11 +310,6 @@ def parse(self, _, parse, **kwargs):
self[k]
for k in ['given_name', 'additional_name', 'family_name', 'suffix']
)))
else:
human = nameparser.HumanName(name)
for hk, sk in [('first', 'given_name'), ('middle', 'additional_name'), ('last', 'family_name'), ('suffix', 'suffix')]:
if human[hk]:
self[sk] = human[hk]


class TagFactory(GraphNodeFactory):
Expand Down
30 changes: 20 additions & 10 deletions tests/share/normalize/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,24 @@ def test_normalize_tags_on_work(self, input, output, Graph, ExpectedGraph):
@pytest.mark.parametrize('input, output', [(i, o) for input, o in [
([
Person(name='Smith, J'),
Person(name='J Smith '),
Person(name='Smith, J'),
], Person(name='Smith, J')),
([
Person(name='J Smith '),
], Person(name='J Smith')),
([
Person(given_name='J', family_name='Smith'),
Person(given_name=' J', family_name='\n\nSmith'),
], Person(name='J Smith', family_name='Smith', given_name='J')),
([
Person(name='Johnathan James Doe'),
], Person(name='Johnathan James Doe')),
([
Person(name='johnathan james doe'),
], Person(name='Johnathan James Doe', family_name='Doe', given_name='Johnathan', additional_name='James')),
], Person(name='johnathan james doe')),
([
Person(name='johnathan james doe JR'),
], Person(name='Johnathan James Doe Jr', family_name='Doe', given_name='Johnathan', additional_name='James', suffix='Jr')),
], Person(name='johnathan james doe JR')),
([
Person(name='none'),
Person(name=''),
Expand Down Expand Up @@ -169,7 +175,7 @@ def test_normalize_person(self, input, output, Graph, ExpectedGraph):
Person(name='Dylan, B', identifiers=[AgentIdentifier(1)]),
Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
Person(name='B. D. Dylan', identifiers=[AgentIdentifier(1)])
], [Person(name='Barb D. Dylan', identifiers=[AgentIdentifier(1)])]),
], [Person(name='B. D. Dylan', identifiers=[AgentIdentifier(1)])]),
])
def test_normalize_person_relation(self, input, output, Graph, ExpectedGraph):
graph = Graph(*input)
Expand All @@ -190,7 +196,7 @@ def test_normalize_person_relation(self, input, output, Graph, ExpectedGraph):
(Agent(name='DPTA'), Organization(name='DPTA')),
(Agent(name='B. Verkin Institute for Low Temperatures Physics & Engineering, Kharkov, Ukraine'), Institution(name='B. Verkin Institute for Low Temperatures Physics & Engineering', location='Kharkov, Ukraine', type='institution')),
(Agent(name='Physikalisches Institut, University Wuerzburg, Germany'), Agent(name='Physikalisches Institut', location='University Wuerzburg, Germany', type='institution')),
(Agent(name='Centro de Biotecnologia e Departamento de Biofísica; UFRGS; Av Bento Goncalves 9500, Predio 43431 sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'), Agent(name='UFRGS - Centro de Biotecnologia e Departamento de Biofísica', location='Av Bento Goncalves 9500, Predio 43431 Sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi')),
(Agent(name='Centro de Biotecnologia e Departamento de Biofísica; UFRGS; Av Bento Goncalves 9500, Predio 43431 sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'), Agent(name='UFRGS - Centro de Biotecnologia e Departamento de Biofísica', location='Av Bento Goncalves 9500, Predio 43431 sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi')),
(Agent(name='Department of Chemistry; ZheJiang University; HangZhou ZheJiang CHINA'), Institution(name='ZheJiang University - Department of Chemistry', location='HangZhou ZheJiang CHINA')),
(Agent(name='Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences; University of Groningen; Nijenborgh 7, 9747 AG Groningen The Netherlands'), Institution(name='University of Groningen - Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences', location='Nijenborgh 7, 9747 AG Groningen The Netherlands')),
(Agent(name='Institute of Marine Research; PO Box 1870 Nordnes, 5817 Bergen Norway'), Institution(name='Institute of Marine Research', location='PO Box 1870 Nordnes, 5817 Bergen Norway')),
Expand Down Expand Up @@ -353,14 +359,14 @@ def test_normalize_mixed_agent_relation(self, input, output, Graph, ExpectedGrap
Creator(cited_as='Bob Dylan', agent=Person(id=0, name='Bob Dylan', identifiers=[AgentIdentifier(1, id=0)])),
Contributor(cited_as='Bob Dylan', agent=Person(id=1, name='Bob Dylan', identifiers=[AgentIdentifier(1, id=1)])),
], [
Creator(cited_as='Bob Dylan', agent=Person(id=0, name='Bob Dylan', given_name='Bob', family_name='Dylan', identifiers=[AgentIdentifier(1, id=1)]))
Creator(cited_as='Bob Dylan', agent=Person(id=0, name='Bob Dylan', identifiers=[AgentIdentifier(1, id=1)]))
]),
# same identifier, different name, different type
([
Creator(cited_as='B. Dylan', agent=Person(id=0, name='B. Dylan', identifiers=[AgentIdentifier(1, id=0)])),
Contributor(cited_as='Bob Dylan', agent=Person(id=1, name='Bob Dylan', identifiers=[AgentIdentifier(1, id=1)])),
], [
Creator(cited_as='Bob Dylan', agent=Person(id=0, name='Bob Dylan', given_name='Bob', family_name='Dylan', identifiers=[AgentIdentifier(1, id=1)]))
Creator(cited_as='Bob Dylan', agent=Person(id=0, name='Bob Dylan', identifiers=[AgentIdentifier(1, id=1)]))
]),
# same name, one identifier, add identifier
([
Expand Down Expand Up @@ -443,19 +449,23 @@ def test_normalize_creativework(self, input, output, Graph, ExpectedGraph):
assert graph == ExpectedGraph(CreativeWork(**output))

@pytest.mark.parametrize('input, output', [
(input, Creator(cited_as='James Bond', agent=Person(name='James Bond', family_name='Bond', given_name='James')),)
(input, Creator(cited_as='James Bond', agent=Person(name='James Bond')),)
for input in [
Creator(cited_as=' \t James\n Bond \t ', agent=Person(name='James Bond')),
Creator(cited_as='', agent=Person(name='James Bond')),
Creator(cited_as='', agent=Person(name='James Bond')),
Creator(cited_as='', agent=Person(given_name='James', family_name='Bond')),
]
] + [
(input, Contributor(cited_as='James Bond', agent=Person(name='James Bond', family_name='Bond', given_name='James')),)
(input, Contributor(cited_as='James Bond', agent=Person(name='James Bond')),)
for input in [
Contributor(cited_as=' \t James\n Bond \t ', agent=Person(name='James Bond')),
Contributor(cited_as='', agent=Person(name='James Bond')),
]
] + [
(
Creator(cited_as='', agent=Person(given_name='James', family_name='Bond')),
Creator(cited_as='James Bond', agent=Person(name='James Bond', given_name='James', family_name='Bond')),
),
])
def test_normalize_agentworkrelation(self, input, output, Graph, ExpectedGraph):
graph = Graph(input)
Expand Down

0 comments on commit 0c80cd0

Please sign in to comment.