Skip to content

Commit

Permalink
books: fixed arXiv eprints and collaboration rules
Browse files Browse the repository at this point in the history
* add ignored keys
* tweak record match queries
  • Loading branch information
kpsherva committed Jul 21, 2020
1 parent 4b54fba commit 1806a5b
Show file tree
Hide file tree
Showing 16 changed files with 150 additions and 80 deletions.
95 changes: 56 additions & 39 deletions cds_dojson/marc21/fields/books/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

from cds_dojson.marc21.fields.books.errors import MissingRequiredField, \
UnexpectedValue
from cds_dojson.marc21.fields.books.utils import extract_volume_number
from cds_dojson.marc21.fields.books.values_mapping import ACQUISITION_METHOD, \
ARXIV_CATEGORIES, COLLECTION, DOCUMENT_TYPE, EXTERNAL_SYSTEM_IDENTIFIERS, \
EXTERNAL_SYSTEM_IDENTIFIERS_TO_IGNORE, MATERIALS, MEDIUM_TYPES, \
Expand All @@ -39,7 +40,6 @@
build_contributor_books, clean_email, clean_pages_range, clean_val, \
filter_list_values, get_week_start, out_strip, related_url, \
replace_in_result
from cds_dojson.marc21.fields.books.utils import extract_volume_number
from cds_dojson.marc21.models.books.base import model

from .utils import extract_parts, is_excluded
Expand Down Expand Up @@ -200,8 +200,8 @@ def alt_authors(self, key, value):
return _authors


@model.over('corporate_authors', '(^110)|(^710_[a_]+)')
@out_strip
@model.over('authors', '(^110)|(^710_[a_]+)')
@filter_list_values
def corporate_authors(self, key, value):
"""Translates the corporate authors field."""
_corporate_authors = self.get('authors', [])
Expand All @@ -212,28 +212,28 @@ def corporate_authors(self, key, value):
_corporate_authors.append({'full_name': clean_val('a', v, str),
'type': 'ORGANISATION'})
else:
self['collaborations'] = collaborations(self, key, value)
self['authors'] = collaborations(self, key, value)
raise IgnoreKey('corporate_authors')
else:
_corporate_authors.append({'full_name': clean_val('a', v, str),
'type': 'ORGANISATION'})
self['authors'] = _corporate_authors
raise IgnoreKey('corporate_authors')
return _corporate_authors


# TODO waiting for verification
@model.over('collaborations', '^710__')
@replace_in_result('Collaboration', '', key='value')
@model.over('authors', '^710__')
@replace_in_result('Collaboration', '', key='full_name')
@filter_list_values
def collaborations(self, key, value):
"""Translates collaborations."""
_collaborations = self.get('collaborations', [])
_authors = self.get('authors', [])
for v in force_list(value):
if 'g' in v:
_collaborations.append({'value': clean_val('g', v, str)})
_authors.append({'full_name': clean_val('g', v, str),
'type': 'ORGANISATION'})
elif '5' in v:
_collaborations.append({'value': clean_val('5', v, str)})
return _collaborations
_authors.append({'full_name': clean_val('5', v, str),
'type': 'ORGANISATION'})
return _authors


@model.over('publication_info', '(^773__)')
Expand Down Expand Up @@ -513,9 +513,8 @@ def get_value_rn(f_a, f_z, f_9, rn_obj):
raise MissingRequiredField(subfield='9 or a or z')

if sub_9 == 'arXiv':
# Todo arxiv under verification
self['arxiv_eprints'] = arxiv_eprints(self, key, value)
raise IgnoreKey('report_numbers')
arxiv_eprints(self, key, value)
raise IgnoreKey('identifiers')
else:
get_value_rn(sub_a, sub_z, sub_9, entry)
_identifiers.append(entry)
Expand Down Expand Up @@ -553,11 +552,17 @@ def barcodes(self, key, value):
raise IgnoreKey('barcodes')


# TODO verify if needed
@model.over('arxiv_eprints', '(^037__)|(^695__)')
@model.over('subjects', '(^037__)|(^695__)')
@filter_list_values
def arxiv_eprints(self, key, value):
"""Translates arxiv_eprints fields."""
"""Translates arxiv_eprints fields.
output:
{
'alternative_identifiers': [{'scheme': 'arXiv', 'value': `037__a`}],
'subjects': [{'scheme': 'arXiv', 'value': `037__c | 695__a`}]
}
"""

def check_category(field, val):
category = clean_val(field, val, str)
Expand All @@ -567,36 +572,47 @@ def check_category(field, val):
raise UnexpectedValue(subfield=field)

if key == '037__':
_arxiv_eprints = self.get('arxiv_eprints', [])
_alternative_identifiers = self.get('alternative_identifiers', [])
for v in force_list(value):
eprint_id = clean_val('a', v, str, req=True)
duplicated = [
elem
for i, elem in enumerate(_arxiv_eprints)
if elem['value'] == eprint_id
for i, elem in enumerate(_alternative_identifiers)
if elem['value'] == eprint_id and elem['scheme'] == 'arXiv'
]
category = check_category('c', v)
if not duplicated:
eprint = {'value': eprint_id}
if category:
eprint.update({'categories': [category]})
_arxiv_eprints.append(eprint)
else:
duplicated[0]['categories'].append(category)
return _arxiv_eprints
eprint = {'value': eprint_id, 'scheme': 'arXiv'}
_alternative_identifiers.append(eprint)
self['alternative_identifiers'] = _alternative_identifiers
if category:
_subjects = self.get('subjects', [])
subject = {'scheme': 'arXiv', 'value': category}
_subjects.append(subject) if subject not in _subjects else None
self['subjects'] = _subjects
raise IgnoreKey('subjects')

if key == '695__':
_arxiv_eprints = self.get('arxiv_eprints', [])
_alternative_identifiers = self.get('alternative_identifiers', [])
has_arxiv_id = False
for id_entry in _alternative_identifiers:
if id_entry['scheme'] == 'arXiv':
has_arxiv_id = True
break
category = check_category('a', value)
if not _arxiv_eprints:
raise ManualMigrationRequired(message='037__ is missing')
if not has_arxiv_id and category:
raise ManualMigrationRequired(
message='arXiv ID in 037__ missing, '
'but arXiv subject category found')

if clean_val('9', value, str) != 'LANL EDS':
raise UnexpectedValue(subfield='9')
_entry = _arxiv_eprints[0]
if category in _entry['categories']:
raise IgnoreKey('arxiv_eprints')
_entry['categories'].append(category)
return _arxiv_eprints
_subjects = self.get('subjects', [])
entry = {'scheme': 'arXiv', 'value': category}
if entry in _subjects:
raise IgnoreKey('subjects')
_subjects.append(entry)
return _subjects


@model.over('languages', '^041__')
Expand Down Expand Up @@ -796,12 +812,13 @@ def note(self, key, value):
def alternative_abstracts(self, key, value):
"""Translates abstracts fields."""
abstract = self.get('abstract', None)
_alternative_abstracts = self.get('alternative_abstracts', [])
if not abstract:
# takes first abstract as main
self["abstract"] = clean_val('a', value, str, req=True)
raise IgnoreKey('alternative_abstracts')
return clean_val('a', value, str, req=True)
# 'source': clean_val('9', value, str) # TODO until the answer comes
new_abstract = clean_val('a', value, str, req=True)
return new_abstract if new_abstract not in _alternative_abstracts else None


@model.over('licenses', '^540__')
Expand Down
5 changes: 3 additions & 2 deletions cds_dojson/marc21/fields/books/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@

from cds_dojson.marc21.fields.books.errors import ManualMigrationRequired, \
MissingRequiredField, UnexpectedValue
from cds_dojson.marc21.fields.books.utils import is_excluded, extract_parts, \
extract_volume_number
from cds_dojson.marc21.fields.books.utils import extract_parts, \
extract_volume_number, is_excluded
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.book import model

from .base import alternative_titles as alternative_titles_base


Expand Down
12 changes: 6 additions & 6 deletions cds_dojson/marc21/fields/books/multipart.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
import re

from dojson.errors import IgnoreKey
from dojson.utils import for_each_value, filter_values, force_list
from dojson.utils import for_each_value, force_list

from cds_dojson.marc21.fields.books.base import book_series as base_book_series
from cds_dojson.marc21.fields.books.errors import UnexpectedValue, \
ManualMigrationRequired, MissingRequiredField
from cds_dojson.marc21.fields.books.errors import MissingRequiredField, \
UnexpectedValue
from cds_dojson.marc21.fields.books.utils import extract_parts, \
extract_volume_number, extract_volume_info
from cds_dojson.marc21.fields.utils import clean_val, out_strip, \
filter_list_values
extract_volume_info, extract_volume_number
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.multipart import model


Expand Down
16 changes: 13 additions & 3 deletions cds_dojson/marc21/fields/books/serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Books fields."""
from dojson.utils import filter_values, for_each_value
from dojson.utils import for_each_value

from cds_dojson.marc21.fields.books.errors import UnexpectedValue
from cds_dojson.marc21.fields.utils import clean_val, out_strip
from cds_dojson.marc21.fields.books.multipart import \
isbns as multipart_identifiers
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.serial import model


Expand All @@ -42,3 +44,11 @@ def title(self, key, value):
self['identifiers'] = _identifiers
self['mode_of_issuance'] = 'SERIAL'
return clean_val('a', value, str, req=True)


@model.over('identifiers', '^020__')
@filter_list_values
@for_each_value
def identifiers(self, key, value):
    """Translates identifiers fields.

    Handles the ``020__`` MARC field for serials by delegating to the
    multipart ``isbns`` rule (imported in this module as
    ``multipart_identifiers``), passing ``self``, ``key`` and ``value``
    through unchanged.

    The result of the delegated call is not returned, so the body of this
    function always evaluates to ``None``.
    """
    # NOTE(review): the delegate's return value is discarded here.
    # Presumably this rule relies on the delegate mutating ``self`` or
    # raising (e.g. IgnoreKey) -- confirm whether a ``return`` was intended.
    multipart_identifiers(self, key, value)
3 changes: 1 addition & 2 deletions cds_dojson/marc21/fields/books/standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,9 @@
from __future__ import unicode_literals

from dojson.errors import IgnoreKey
from dojson.utils import filter_values, for_each_value

from cds_dojson.marc21.fields.books.errors import UnexpectedValue
from cds_dojson.marc21.fields.books.utils import is_excluded, extract_parts
from cds_dojson.marc21.fields.books.utils import extract_parts, is_excluded
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.standard import model
Expand Down
1 change: 0 additions & 1 deletion cds_dojson/marc21/fields/books/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

from cds_dojson.marc21.fields.books.errors import MissingRequiredField


MAX_PAGES_NUMBER = 8192

RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt)[\s\.]*(\d+)'
Expand Down
12 changes: 11 additions & 1 deletion cds_dojson/marc21/models/books/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
from ....overdo import OverdoJSONSchema
from ..base import model as cds_base


COMMON_IGNORE_FIELDS = {
'003',
'005',
'020__q',
'020__c',
'0248_a',
'0248_p',
'041__h', # 206 cds-dojson
Expand All @@ -41,8 +41,10 @@
'082042',
'0820_2',
'082__2',
'100__9',
'111__d',
'111__f',
'246__i',
'269__a', # preprint info
'269__b', # preprint info
'269__c', # preprint date
Expand All @@ -57,26 +59,32 @@
'502__c', # thesis_info/institutions
'502__d', # thesis_info/date (publication)
'5208_a', # 206 cds-dojson
'520__9',
'536__a', # founding info, dropped
'536__c',
'536__f',
'536__r',
'595__z',
'650172',
'65017a',
'650272',
'65027a',
'690__c', # 206 cds-dojson
'694__9',
'694__a',
'700__9',
'773__r', # publication_info/parent_report_number
'773__z', # publication_info/parent_isbn
'775__c', # related edition's year (it will be resolved)
'852__c',
'852__h',
'852__p',
'900__s', # 206 cds-dojson
'900__u', # 206 cds-dojson
'900__y', # 206 cds-dojson
'901__a', # record affiliation
'901__u',
'916__a',
'916__d',
'916__e',
'916__y',
Expand All @@ -90,6 +98,8 @@
'963__a',
'964__a',
'970__a',
'970__d',
'980__c',
'981__a',
}

Expand Down
4 changes: 2 additions & 2 deletions cds_dojson/marc21/models/books/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
from __future__ import unicode_literals

from ..base import model as cds_base
from .base import CDSOverdoBookBase, COMMON_IGNORE_FIELDS
from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase
from .base import model as books_base


class CDSBook(CDSOverdoBookBase):
"""Translation Index for CDS Books."""

__query__ = '690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
'690C_:BOOKSUGGESTION OR 980__:PROCEEDINGS OR 980__:PERI OR ' \
'980__:PROCEEDINGS OR 980__:PERI OR ' \
'(-980:STANDARD 980:BOOK) OR ' \
'697C_:LEGSERLIB ' \
'-980__:DELETED -980__:MIGRATED -980:__STANDARD' \
Expand Down
8 changes: 6 additions & 2 deletions cds_dojson/marc21/models/books/multipart.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,22 @@
from __future__ import unicode_literals

from ..base import model as cds_base
from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase
from .base import model as books_base
from .base import CDSOverdoBookBase, COMMON_IGNORE_FIELDS


class CDSMultipart(CDSOverdoBookBase):
"""Translation Index for CDS Books."""

__query__ = '(690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
'690C_:BOOKSUGGESTION OR 980__:PROCEEDINGS OR 980__:PERI OR ' \
'980__:PROCEEDINGS OR ' \
'697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)'\
'AND 246__:/[a-zA-Z0-9]+/ '

__schema__ = 'https://127.0.0.1:5000/schemas/series/series-v1.0.0.json'

__model_ignore_keys__ = {
'020__c',
'021__a',
'021__b',
'022__a',
Expand Down Expand Up @@ -73,6 +74,7 @@ class CDSMultipart(CDSOverdoBookBase):
'100__a',
'100__e',
'100__u',
'100__9',
'110__a',
'111__9',
'111__a',
Expand All @@ -85,6 +87,7 @@ class CDSMultipart(CDSOverdoBookBase):
'210__a',
'222__a',
'242__a',
'246__i',
'250__a',
'260__a',
'260__b',
Expand Down Expand Up @@ -165,6 +168,7 @@ class CDSMultipart(CDSOverdoBookBase):
'775__b',
'775__c',
'775__w',
'852__p',
'8564_8',
'8564_s',
'8564_t',
Expand Down

0 comments on commit 1806a5b

Please sign in to comment.