From 8c65fe03b2dc56acbe53e3f78e8d0e28ee62c0ad Mon Sep 17 00:00:00 2001 From: Karolina Przerwa Date: Tue, 29 Sep 2020 10:24:03 +0200 Subject: [PATCH] books: rules update * fixes broken fields * tested on 100k data set * fixes migration additional info object --- cds_dojson/marc21/fields/books/base.py | 184 ++++---- cds_dojson/marc21/fields/books/utils.py | 2 +- .../marc21/fields/books/values_mapping.py | 6 +- cds_dojson/marc21/fields/utils.py | 26 +- cds_dojson/marc21/models/books/base.py | 40 +- cds_dojson/marc21/models/books/book.py | 32 +- cds_dojson/marc21/models/books/multipart.py | 5 +- cds_dojson/marc21/models/books/standard.py | 19 +- tests/test_books.py | 408 ++++++++---------- tests/test_matcher.py | 4 +- tests/test_standard.py | 15 +- 11 files changed, 390 insertions(+), 351 deletions(-) diff --git a/cds_dojson/marc21/fields/books/base.py b/cds_dojson/marc21/fields/books/base.py index aa7e22c5..019ea6e8 100644 --- a/cds_dojson/marc21/fields/books/base.py +++ b/cds_dojson/marc21/fields/books/base.py @@ -72,15 +72,17 @@ def created(self, key, value): if key == '916__': if 's' in value: - _created_by.update( - {'type': mapping(ACQUISITION_METHOD, - clean_val('s', value, str))}) + _created_by.update({'type': mapping(ACQUISITION_METHOD, + clean_val('s', value, str, + default="migration"), + raise_exception=True + )}) self['created_by'] = _created_by - date = clean_val('w', value, int, regex_format=r'\d{4}$') - if date: - year, week = str(date)[:4], str(date)[4:] - date = get_week_start(int(year), int(week)) - return date.isoformat() + date = clean_val('w', value, int, regex_format=r'\d{4}$') + if date: + year, week = str(date)[:4], str(date)[4:] + date = get_week_start(int(year), int(week)) + return date.isoformat() elif key == '595__': try: sub_a = clean_val('a', value, str, @@ -89,7 +91,8 @@ def created(self, key, value): source = sub_a[:3] self["source"] = source year, month = int(sub_a[3:7]), int(sub_a[7:]) - return datetime.date(year, month, 1).isoformat() + self['_created'] = datetime.date(year, month, 1).isoformat() + raise IgnoreKey('_created') except UnexpectedValue as e: e.subfield = 'a' self['internal_notes'] = internal_notes(self, key, value) @@ -99,17 +102,13 @@ def created(self, key, value): @model.over('internal_notes', '^595__') -@filter_list_values def internal_notes(self, key, value): """Translates private notes field.""" _internal_notes = self.get('internal_notes', []) - for v in force_list(value): - note = {'value': clean_val('a', v, str, req=True), - # TODO, waiting for an answer - 'source': clean_val('9', v, str), - } - _internal_notes.append(note) + internal_note = {'value': clean_val('a', v, str, req=True)} + if internal_note not in _internal_notes: + _internal_notes.append(internal_note) return _internal_notes @@ -118,22 +117,20 @@ def internal_notes(self, key, value): @out_strip def collection(self, key, value): """Translates collection field - WARNING - also document type field.""" - _migration = self.get('_migration', {}) - _tags = _migration.get('tags', []) - + _migration = self['_migration'] + _tags = _migration['tags'] for v in force_list(value): result_a = mapping(COLLECTION, clean_val('a', v, str)) result_b = mapping(COLLECTION, clean_val('b', v, str)) if result_a: - _tags.append(result_a) + _tags.append(result_a) if result_a not in _tags else None _migration['has_tags'] = True if result_b: - _tags.append(result_b) + _tags.append(result_b) if result_b not in _tags else None _migration['has_tags'] = True if not result_a and not result_b: self['document_type'] = document_type(self, key, value) raise IgnoreKey('_migration') - _migration['tags'] = _tags return _migration @@ -268,22 +265,24 @@ def publication_info(self, key, value): @model.over('extensions', '(^925__)') -@filter_list_values -@for_each_value +@filter_values def standard_review(self, key, value): """Translates standard_status field.""" - return{'standard_review:applicability': clean_val('i', value, str), - 'standard_review:validity': clean_val('v', value, str), - 'standard_review:checkdate': clean_val('z', value, str), - 'standard_review:expert': clean_val('p', value, str) - } + _extensions = self.get('extensions', {}) + _extensions.update( + {'standard_review:applicability': clean_val('i', value, str), + 'standard_review:validity': clean_val('v', value, str), + 'standard_review:checkdate': clean_val('z', value, str), + 'standard_review:expert': clean_val('p', value, str) + }) + return _extensions @model.over('publication_info', '^962__') def publication_additional(self, key, value): """Translates additional publication info.""" _publication_info = self.get('publication_info', []) - _migration = self.get("_migration", {}) + _migration = self["_migration"] empty = not bool(_publication_info) for i, v in enumerate(force_list(value)): temp_info = {} @@ -297,7 +296,6 @@ def publication_additional(self, key, value): # assume that if we have a parent journal # then the doc is a periodical issue self['document_type'] = 'PERIODICAL_ISSUE' - self['_migration'] = _migration n_subfield = clean_val('n', v, str) if n_subfield.upper() == 'BOOK': temp_info.update({'material': 'BOOK'}) @@ -324,8 +322,8 @@ def related_records(self, key, value): RELATED records """ - _migration = self.get('_migration', {}) - _related = _migration.get('related', []) + _migration = self['_migration'] + _related = _migration['related'] relation_type = 'other' try: if key == '775__' and 'b' in value: @@ -346,33 +344,73 @@ def related_records(self, key, value): @model.over('extensions', '^693__') -@filter_list_values -@for_each_value +@filter_values def accelerator_experiments(self, key, value): """Translates accelerator_experiments field.""" - return {'unit:accelerator': clean_val('a', value, str), - 'unit:experiment': clean_val('e', value, str), - 'unit:project': clean_val('p', value, str) - } + _extensions = self.get('extensions', {}) + + sub_a = clean_val('a', value, str) + sub_e = clean_val('e', value, str) + sub_p = clean_val('p', value, str) + + accelerators = _extensions.get('unit:accelerator', []) + experiment = _extensions.get('unit:experiment', []) + project = _extensions.get('unit:project', []) + + if sub_a and sub_a not in accelerators: + accelerators.append(sub_a) + if sub_e and sub_e not in experiment: + experiment.append(sub_e) + if sub_p and sub_p not in project: + project.append(sub_p) + + _extensions.update({ + "unit:accelerator": accelerators, + "unit:experiment": experiment, + "unit:project": project + }) + return _extensions -# TODO - discuss how we would like to keep links to holdings (files and ebooks) @model.over('urls', '^8564_') +@filter_list_values @for_each_value -@filter_values def urls(self, key, value): """Translates urls field.""" - try: - clean_val('y', value, str, manual=True) - except ManualMigrationRequired as e: - e.subfield = 't' - raise e - url = clean_val('u', value, str, req=True) - if 'cds.cern.ch' not in url: - return {'value': url} - # TODO: instead of IgnoreKey if link starts with cds.cern.ch it should be - # linked as files to the record, issue #200 - raise IgnoreKey('urls') + sub_y = clean_val('y', value, str, default='') + sub_u = clean_val('u', value, str, req=True) + + eitems_ebl = self['_migration']['eitems_ebl'] + eitems_external = self['_migration']['eitems_external'] + eitems_proxy = self['_migration']['eitems_proxy'] + eitems_files = self['_migration']['eitems_file_links'] + + url = {'value': sub_u} + if sub_y and sub_y != 'ebook': + url['description'] = sub_y + + # EBL publisher login required + if all([elem in sub_u for elem in ['cds', '.cern.ch' '/auth.py']]): + eitems_ebl.append(url) + self['_migration']['eitems_has_ebl'] = True + # EzProxy links + elif 'ezproxy.cern.ch' in sub_u: + url['value'] = url['value'].replace( + 'https://ezproxy.cern.ch/login?url=', '') + eitems_proxy.append(url) + self['_migration']['eitems_has_proxy'] = True + # local files + elif all([elem in sub_u for elem in + ['cds', '.cern.ch/record/', '/files']]): + eitems_files.append(url) + self['_migration']['eitems_has_files'] = True + elif sub_y == 'ebook': + eitems_external.append(url) + self['_migration']['eitems_has_external'] = True + else: + # if none of the above, it is just external url + # attached to the document + return url @model.over('identifiers', '^020__', ) @@ -443,8 +481,11 @@ def alternative_identifiers(self, key, value): else: raise UnexpectedValue(subfield='2') if key == '035__': + if 'CERCER' in sub_a: + raise IgnoreKey('alternative_identifiers') sub_9 = clean_val('9', value, str, req=True) - + if 'CERCER' in sub_9: + raise IgnoreKey('alternative_identifiers') # conference_info.identifiers mixed data if sub_9.upper() == 'INSPIRE-CNUM': _conference_info = self.get('conference_info', {}) @@ -453,7 +494,7 @@ def alternative_identifiers(self, key, value): {'scheme': 'INSPIRE_CNUM', 'value': sub_a}) _conference_info.update({'identifiers': _prev_identifiers}) self['conference_info'] = _conference_info - raise IgnoreKey('external_system_identifiers') + raise IgnoreKey('alternative_identifiers') elif sub_9.upper() in EXTERNAL_SYSTEM_IDENTIFIERS: indentifier_entry.update({'value': sub_a, @@ -472,22 +513,19 @@ def alternative_identifiers(self, key, value): @filter_list_values def dois(self, key, value): """Translates dois fields.""" - # TODO might be moved to item level or prefixed _migration, - # decision pending _identifiers = self.get('identifiers', []) for v in force_list(value): material = mapping(MATERIALS, clean_val('q', v, str, transform='lower'), raise_exception=True) - - _identifiers.append( - { - 'value': clean_val('a', v, str, req=True), - 'material': material, - 'source': clean_val('9', v, str), # TODO sources - 'scheme': 'DOI', - } - ) + doi = { + 'value': clean_val('a', v, str, req=True), + 'material': material, + 'source': clean_val('9', v, str), + 'scheme': 'DOI', + } + if doi not in _identifiers: + _identifiers.append(doi) return _identifiers @@ -541,7 +579,7 @@ def barcodes(self, key, value): val_n = clean_val('n', value, str) val_x = clean_val('x', value, str) - _migration = self.get('_migration', {'volumes': []}) + _migration = self['_migration'] _migration['volumes'].append(dict( volume=extract_volume_number( val_n, @@ -550,7 +588,6 @@ def barcodes(self, key, value): ), barcode=val_x )) - self['_migration'] = _migration raise IgnoreKey('barcodes') @@ -633,7 +670,7 @@ def subject_classification(self, key, value): raise IgnoreKey('subjects') -@model.over('keywords', '(^6531_)') +@model.over('keywords', '^6531_') @for_each_value @filter_values def keywords(self, key, value): @@ -727,12 +764,14 @@ def imprint(self, key, value): date = parser.parse(clean_val('c', value, str, req=True)) except ParserError: raise UnexpectedValue(subfield='c') + except Exception: + raise UnexpectedValue(subfield='c') self['publication_year'] = str(date.date().year) return { 'date': clean_val('c', value, str, req=True), 'place': clean_val('a', value, str), 'publisher': clean_val('b', value, str), - 'reprint': reprint, + 'reprint_date': reprint, } @@ -743,16 +782,13 @@ def book_series(self, key, value): val_n = clean_val('n', value, str) val_x = clean_val('x', value, str) - _migration = self.get('_migration', {}) - if 'serials' not in _migration: - _migration['serials'] = [] + _migration = self['_migration'] _migration['serials'].append({ 'title': clean_val('a', value, str), 'volume': clean_val('v', value, str), - 'issn': clean_val('x', value, str), + 'issn': val_x }) _migration['has_serial'] = True - self['_migration'] = _migration raise IgnoreKey('book_series') diff --git a/cds_dojson/marc21/fields/books/utils.py b/cds_dojson/marc21/fields/books/utils.py index 8d032454..2e4391a8 100644 --- a/cds_dojson/marc21/fields/books/utils.py +++ b/cds_dojson/marc21/fields/books/utils.py @@ -24,7 +24,7 @@ MAX_PAGES_NUMBER = 8192 -RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt)[\s\.]*(\d+)' +RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt|t)[\s\.]*(\d+)' RE_VOLUME_NUMBER = re.compile(RE_STR_VOLUME_NUMBER, re.IGNORECASE) RE_VOLUME_INFO = re.compile( diff --git a/cds_dojson/marc21/fields/books/values_mapping.py b/cds_dojson/marc21/fields/books/values_mapping.py index 24a81f0b..290a2f57 100644 --- a/cds_dojson/marc21/fields/books/values_mapping.py +++ b/cds_dojson/marc21/fields/books/values_mapping.py @@ -42,6 +42,8 @@ ACQUISITION_METHOD = { 'user': ['H'], 'batchuploader': ['N', 'M'], + 'migration': ['migration'], + 'r': 'user' } MEDIUM_TYPES = [ @@ -142,7 +144,7 @@ ] -def mapping(field_map, val, raise_exception=False): +def mapping(field_map, val, raise_exception=False, default_val=None): """ Maps the old value to a new one according to the map. @@ -165,5 +167,7 @@ def mapping(field_map, val, raise_exception=False): elif isinstance(field_map, list): if val in field_map: return val + elif default_val: + return default_val if raise_exception: raise UnexpectedValue diff --git a/cds_dojson/marc21/fields/utils.py b/cds_dojson/marc21/fields/utils.py index 7098f688..92ee6b4a 100644 --- a/cds_dojson/marc21/fields/utils.py +++ b/cds_dojson/marc21/fields/utils.py @@ -98,14 +98,19 @@ def clean_val(subfield, value, var_type, req=False, regex_format=None, return default raise MissingRequiredField if to_clean is not None: - if var_type is str: - return clean_str(to_clean, regex_format, req, transform) - elif var_type is bool: - return bool(to_clean) - elif var_type is int: - return int(to_clean) - else: - raise NotImplementedError + try: + if var_type is str: + return clean_str(to_clean, regex_format, req, transform) + elif var_type is bool: + return bool(to_clean) + elif var_type is int: + return int(to_clean) + else: + raise NotImplementedError + except ValueError: + raise UnexpectedValue(subfield=subfield) + except TypeError: + raise UnexpectedValue(subfield=subfield) def clean_email(value): @@ -328,7 +333,10 @@ def _get_correct_books_contributor_role(subfield, role): 'ill.': 'Ilustrator', 'ill': 'Ilustrator', } - clean_role = role.lower() + if isinstance(role, str): + clean_role = role.lower() + else: + raise UnexpectedValue(subfield=subfield, message=' unknown role') if clean_role not in translations: raise UnexpectedValue(subfield=subfield, message=' unknown role') return translations[clean_role] diff --git a/cds_dojson/marc21/models/books/base.py b/cds_dojson/marc21/models/books/base.py index f1074c22..3749fa25 100644 --- a/cds_dojson/marc21/models/books/base.py +++ b/cds_dojson/marc21/models/books/base.py @@ -19,6 +19,8 @@ """Base models for common fields.""" from __future__ import unicode_literals +from copy import deepcopy + from dojson._compat import iteritems from dojson.errors import IgnoreKey, MissingRule from dojson.utils import GroupableOrderedDict @@ -26,6 +28,33 @@ from ....overdo import OverdoJSONSchema from ..base import model as cds_base + +def get_migration_dict(): + """Return migration extra data.""" + __migration_dict__ = dict( + record_type='document', + volumes=[], + serials=[], + has_serial=False, + is_multipart=False, + has_tags=False, + has_related=False, + has_journal=False, + tags=[], + journal_record_legacy_id='', + eitems_proxy=[], + eitems_has_proxy=False, + eitems_file_links=[], + eitems_has_files=False, + eitems_external=[], + eitems_has_external=False, + eitems_ebl=[], + eitems_has_ebl=False, + related=[], + ) + + return deepcopy(__migration_dict__) + COMMON_IGNORE_FIELDS = { '003', '005', @@ -48,6 +77,7 @@ '100__9', '111__d', '111__f', + '145__a', '246__i', '269__a', # preprint info '269__b', # preprint info @@ -57,6 +87,7 @@ '340__a', '440_3a', # 206 cds-dojson '541__9', + '541__a', '541__h', '502__a', # thesis_info/defense_date '502__b', # thesis_info/degree_type @@ -71,6 +102,7 @@ '540__b', '540__f', '595__z', + '595__9', '650172', '65017a', '650272', @@ -89,6 +121,9 @@ '852__c', '852__h', '852__p', + '8564_8', # bibdoc id + '8564_s', # file identifier + '8564_x', # subformat identifier '900__s', # 206 cds-dojson '900__u', # 206 cds-dojson '900__y', # 206 cds-dojson @@ -117,7 +152,7 @@ class CDSOverdoBookBase(OverdoJSONSchema): """Translation base Index for CDS Books.""" - def do(self, blob, ignore_missing=True, exception_handlers=None): + def do(self, blob, ignore_missing=True, exception_handlers=None, init_fields=None): """Translate blob values and instantiate new model instance. Raises ``MissingRule`` when no rule matched and ``ignore_missing`` @@ -155,6 +190,9 @@ def clean_missing(exc, output, key, value): output = {} + if init_fields: + output.update(**init_fields) + if self.index is None: self.build() if isinstance(blob, GroupableOrderedDict): diff --git a/cds_dojson/marc21/models/books/book.py b/cds_dojson/marc21/models/books/book.py index ed8a25ed..ebd5276b 100644 --- a/cds_dojson/marc21/models/books/book.py +++ b/cds_dojson/marc21/models/books/book.py @@ -19,8 +19,10 @@ """Book model.""" from __future__ import unicode_literals +from copy import deepcopy + from ..base import model as cds_base -from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase +from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase, get_migration_dict from .base import model as books_base @@ -31,33 +33,27 @@ class CDSBook(CDSOverdoBookBase): '980__:PROCEEDINGS OR 980__:PERI OR ' \ '(-980:STANDARD 980:BOOK) OR ' \ '697C_:LEGSERLIB ' \ - '-980__:DELETED -980__:MIGRATED -980:__STANDARD' \ + '-980__:DELETED -980__:MIGRATED -980:__STANDARD -596:MULTIVOLUMES' __schema__ = 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json' - __ignore_keys__ = COMMON_IGNORE_FIELDS + __model_ignore_keys__ = { + '020__b', # this field is used to match multipart monograph items as volumes + } + + __ignore_keys__ = COMMON_IGNORE_FIELDS | __model_ignore_keys__ + + __json_init_dict__ = {'_migration': {**get_migration_dict()}} def do(self, blob, ignore_missing=True, exception_handlers=None): """Set schema after translation depending on the model.""" json = super(CDSBook, self).do( blob=blob, ignore_missing=ignore_missing, - exception_handlers=exception_handlers) + exception_handlers=exception_handlers, + init_fields=deepcopy(self.__json_init_dict__) + ) json['$schema'] = self.__class__.__schema__ - - if '_migration' not in json: - json['_migration'] = {} - json['_migration'].setdefault('record_type', 'document') - json['_migration'].setdefault('volumes', []) - json['_migration'].setdefault('serials', []) - json['_migration'].setdefault('has_serial', False) - json['_migration'].setdefault('is_multipart', False) - json['_migration'].setdefault('has_tags', False) - json['_migration'].setdefault('has_related', False) - json['_migration'].setdefault('has_journal', False) - json['_migration'].setdefault('tags', []) - json['_migration'].setdefault('journal_record_legacy_recid', '') - return json diff --git a/cds_dojson/marc21/models/books/multipart.py b/cds_dojson/marc21/models/books/multipart.py index 217f055b..48b0d2bd 100644 --- a/cds_dojson/marc21/models/books/multipart.py +++ b/cds_dojson/marc21/models/books/multipart.py @@ -27,10 +27,9 @@ class CDSMultipart(CDSOverdoBookBase): """Translation Index for CDS Books.""" - __query__ = '(690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \ + __query__ = '(596__:MULTIVOLUMES AND 690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \ '980__:PROCEEDINGS OR ' \ - '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)'\ - 'AND 246__:/[a-zA-Z0-9]+/ ' + '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)' __schema__ = 'https://127.0.0.1:5000/schemas/series/series-v1.0.0.json' diff --git a/cds_dojson/marc21/models/books/standard.py b/cds_dojson/marc21/models/books/standard.py index 1d3d7fd2..f508925d 100644 --- a/cds_dojson/marc21/models/books/standard.py +++ b/cds_dojson/marc21/models/books/standard.py @@ -20,8 +20,10 @@ from __future__ import unicode_literals +from copy import deepcopy + from ..base import model as cds_base -from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase +from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase, get_migration_dict from .base import model as books_base @@ -34,21 +36,18 @@ class CDSStandard(CDSOverdoBookBase): __ignore_keys__ = COMMON_IGNORE_FIELDS + __json_init_dict__ = {'_migration': {**get_migration_dict()}} + def do(self, blob, ignore_missing=True, exception_handlers=None): """Set schema after translation depending on the model.""" json = super(CDSStandard, self).do( blob=blob, ignore_missing=ignore_missing, - exception_handlers=exception_handlers) + exception_handlers=exception_handlers, + init_fields=deepcopy(self.__json_init_dict__), + ) + json['$schema'] = self.__class__.__schema__ - json['_migration'] = { - 'record_type': 'document', - 'has_serial': False, - 'is_multipart': False, - 'has_keywords': False, - 'has_related': False, - 'volumes': [] - } return json diff --git a/tests/test_books.py b/tests/test_books.py index f0bc95ec..84469dd1 100644 --- a/tests/test_books.py +++ b/tests/test_books.py @@ -20,12 +20,15 @@ from __future__ import absolute_import, print_function, unicode_literals +from copy import deepcopy + import pytest from dojson.errors import MissingRule from cds_dojson.marc21.fields.books.errors import ManualMigrationRequired, \ MissingRequiredField, UnexpectedValue from cds_dojson.marc21.fields.books.values_mapping import MATERIALS, mapping +from cds_dojson.marc21.models.books.base import get_migration_dict from cds_dojson.marc21.models.books.book import model from cds_dojson.marc21.utils import create_record @@ -36,21 +39,14 @@ def check_transformation(marcxml_body, json_body): """Check transformation.""" blob = create_record(marcxml.format(marcxml_body)) - record = model.do(blob, ignore_missing=False) + + record = {'_migration': {**get_migration_dict()}} + + record.update(**model.do(blob, ignore_missing=False)) + expected = { '$schema': 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json', - '_migration': { - 'has_tags': False, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], - 'tags': [], - } + '_migration': {**get_migration_dict()} } expected.update(**json_body) @@ -218,6 +214,9 @@ def test_created(app): random text + + random text + """, { 'internal_notes': [ {'value': 'random text'}, @@ -253,16 +252,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIB'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, }) check_transformation( @@ -272,16 +264,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIB'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, }) check_transformation( @@ -291,16 +276,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIB'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, }) check_transformation( @@ -311,16 +289,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIBINTLAW'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, } ) @@ -332,16 +303,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['BOOKSHOP'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, } ) @@ -353,16 +317,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['BOOKSHOP'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, } ) @@ -374,16 +331,9 @@ def test_collections(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIBLEGRES'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, } ) @@ -471,16 +421,9 @@ def test_document_type_collection(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIB'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, 'document_type': 'BOOK', }) @@ -494,16 +437,9 @@ def test_document_type_collection(app): """, { '_migration': { + **get_migration_dict(), 'tags': ['LEGSERLIB'], 'has_tags': True, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'serials': [], }, 'document_type': 'BOOK', }) @@ -514,44 +450,49 @@ def test_urls(app): with app.app_context(): check_transformation( """ - - 42 - - - cds.cern.ch + + 1336159 + 726479 + + http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf + + + Description + - """, { - 'document_type': 'PROCEEDINGS', - }) + """, { + "_migration": + { + **get_migration_dict(), + 'eitems_has_files': True, + 'eitems_file_links': [ + {'description': 'Description', + 'value': 'http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf'}] + } + } + ) check_transformation( """ - cds.cern.ch + https://cds.cern.ch/record/12345/files/abc.pdf """, { + "_migration": + { + **get_migration_dict(), + 'eitems_has_files': True, + 'eitems_file_links': [ + { + 'value': 'https://cds.cern.ch/record/12345/files/abc.pdf'}] + } }) check_transformation( """ - - 1336158 - 3334918 - - http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-A2.pdf?subformat=pdfa - - pdfa - 1336158 2445021 http://awesome.domain/with/a/path - - 1336159 - 726479 - - http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf - - 1336157 2412918 @@ -564,17 +505,98 @@ def test_urls(app): {'value': 'http://another.domain/with/a/path'}, ], }) - with pytest.raises(ManualMigrationRequired): - check_transformation( - """ - - cds.cern.ch - description - - """, { - 'urls': [{'value': 'cds.cern.ch', - 'description': 'description'}], - }) + check_transformation( + """ + + https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560 + ebook + + """, + { + "_migration": + { + **get_migration_dict(), + 'eitems_has_ebl': True, + 'eitems_ebl': [ + { + 'value': 'https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560'}] + } + } + ) + check_transformation( + """ + + https://learning.oreilly.com/library/view/-/9781118491300/?ar + ebook + + """, + { + "_migration": + { + **get_migration_dict(), + 'eitems_has_external': True, + 'eitems_external': [ + { + 'value': 'https://learning.oreilly.com/library/view/-/9781118491300/?ar'}] + } + } + ) + check_transformation( + """ + + https://ezproxy.cern.ch/login?url=https://www.worldscientific.com/toc/rast/10 + ebook + + """, + { + "_migration": + { + **get_migration_dict(), + 'eitems_has_proxy': True, + 'eitems_proxy': [ + { + 'value': 'https://www.worldscientific.com/toc/rast/10'}] + } + } + ) + check_transformation( + """ + + https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560 + ebook + + + https://learning.oreilly.com/library/view/-/9781118491300/?ar + ebook + + """, + { + "_migration": + { + **get_migration_dict(), + 'eitems_has_ebl': True, + 'eitems_ebl': [ + { + 'value': 'https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560'}, + ], + 'eitems_external': [ + { + 'value': 'https://learning.oreilly.com/library/view/-/9781118491300/?ar'}, + ], + 'eitems_has_external': True, + } + } + ) + check_transformation( + """ + + google.com + description + + """, { + 'urls': [{'value': 'google.com', + 'description': 'description'}], + }) def test_authors(app): @@ -846,16 +868,9 @@ def test_publication_info(app): """, {'document_type': 'PERIODICAL_ISSUE', '_migration': { - 'serials': [], + **get_migration_dict(), 'journal_record_legacy_recid': "2155631", - 'has_tags': False, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, 'has_journal': True, - 'record_type': 'document', - 'volumes': [], - 'tags': [], }, 'conference_info': { 'identifiers': [ @@ -928,11 +943,11 @@ def test_extensions(app): """, { - 'extensions': [{ + 'extensions': { 'standard_review:applicability': 'applicable at CERN', 'standard_review:checkdate': 'Reviewed in December 2019', 'standard_review:expert': 'Expert ICS-25.160', - }], + }, } ) check_transformation( @@ -945,12 +960,12 @@ def test_extensions(app): """, { - 'extensions': [{ + 'extensions': { 'standard_review:applicability': 'no longer applicable', 'standard_review:validity': 'withdrawn', 'standard_review:checkdate': 'Reviewed in December 2019', 'standard_review:expert': 'Expert ICS-25.160', - }], + }, } ) check_transformation( @@ -972,19 +987,16 @@ def test_extensions(app): """, { - 'extensions': [ - {'unit:accelerator': 'CERN LHC', - 'unit:experiment': 'ATLAS'}, - {'unit:accelerator': 'CERN LHC', - 'unit:experiment': 'CMS', - 'unit:project': 'FCC', - }, - {'standard_review:applicability': 'no longer applicable', + 'extensions': + {'unit:accelerator': ['CERN LHC'], + 'unit:experiment': ['ATLAS', 'CMS'], + 'unit:project': ['FCC'], + 'standard_review:applicability': 'no longer applicable', 'standard_review:validity': 'withdrawn', 'standard_review:checkdate': 'Reviewed in December 2019', 'standard_review:expert': 'Expert ICS-25.160', - }, - ], + } + } ) @@ -1002,16 +1014,8 @@ def test_related_record(app): """, { '_migration': { - 'serials': [], - 'has_tags': False, - 'is_multipart': False, + **get_migration_dict(), 'has_related': True, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], 'related': [{ 'related_recid': '748392', "relation_type": "Test text" @@ -1029,16 +1033,8 @@ def test_related_record(app): """, { '_migration': { - 'serials': [], - 'has_tags': False, - 'is_multipart': False, + **get_migration_dict(), 'has_related': True, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], 'related': [{'related_recid': '7483924', 'relation_type': "other"}], }, @@ -1055,16 +1051,8 @@ def test_related_record(app): """, { '_migration': { - 'serials': [], - 'has_tags': False, - 'is_multipart': False, + **get_migration_dict(), 'has_related': True, - 'has_serial': False, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], 'related': [ {'related_recid': '7483924', 'relation_type': 'other'}, {'related_recid': '748', 'relation_type': 'other'}], @@ -1089,14 +1077,12 @@ def test_accelerator_experiments(app): """, { - 'extensions': [ - {'unit:accelerator': 'CERN LHC', - 'unit:experiment': 'ATLAS'}, - {'unit:accelerator': 'CERN LHC', - 'unit:experiment': 'CMS', - 'unit:project': 'FCC', + 'extensions': + {'unit:accelerator': ['CERN LHC'], + 'unit:experiment': ['ATLAS', 'CMS'], + 'unit:project': ['FCC'], } - ] + } ) @@ -1442,15 +1428,14 @@ def test_alternative_identifiers(app): """, { }) - with pytest.raises(UnexpectedValue): - check_transformation( - """ - - CERCER - 2365039 - - """, { - }) + check_transformation( + """ + + CERCER + 2365039 + + """, { + }) check_transformation( """ @@ -1689,7 +1674,7 @@ def test_imprint(app): 'place': 'Sydney', 'publisher': 'Allen & Unwin', 'date': '2013', - 'reprint': '2015', + 'reprint_date': '2015', }, }) @@ -2377,6 +2362,7 @@ def test_book_series(app): """, { '_migration': { + **get_migration_dict(), 'serials': [ { 'title': 'Minutes', @@ -2384,15 +2370,7 @@ def test_book_series(app): 'volume': None } ], - 'has_tags': False, - 'is_multipart': False, - 'has_related': False, 'has_serial': True, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], } } ) @@ -2406,6 +2384,7 @@ def test_book_series(app): """, { '_migration': { + **get_migration_dict(), 'serials': [ { 'title': 'De Gruyter studies in mathematical physics', @@ -2413,15 +2392,7 @@ def test_book_series(app): 'volume': '16' } ], - 'has_tags': False, - 'is_multipart': False, - 'has_related': False, 'has_serial': True, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], } } ) @@ -2434,6 +2405,7 @@ def test_book_series(app): """, { '_migration': { + **get_migration_dict(), 'serials': [ { 'title': 'Springer tracts in modern physics', @@ -2441,15 +2413,7 @@ def test_book_series(app): 'volume': '267' } ], - 'has_tags': False, - 'is_multipart': False, - 'has_related': False, 'has_serial': True, - 'has_journal': False, - 'journal_record_legacy_recid': '', - 'record_type': 'document', - 'volumes': [], - 'tags': [], } } ) @@ -2654,21 +2618,15 @@ def test_volume_barcodes(app): """, dict( title='Mathematische Methoden der Physik', - _migration=dict( - record_type='document', - has_serial=False, - is_multipart=False, - has_tags=False, - has_related=False, - has_journal=False, - journal_record_legacy_recid='', - volumes=[ - dict(barcode='80-1209-8', volume=1), - dict(barcode='B00004172', volume=1), - ], - serials=[], - tags=[], - ), + _migration={ + **get_migration_dict(), + **dict( + volumes=[ + dict(barcode='80-1209-8', volume=1), + dict(barcode='B00004172', volume=1), + ], + ), } + ) ) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 550a142a..0a6975ff 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -59,12 +59,14 @@ def test_marc21_matcher_books(): multipart_blob1 = { '690C_': [{'a': 'BOOK'}], '245__': [{'a': 'Test '}], + '596__': [{'a': 'MULTIVOLUMES'}], '246__': [{'p': 'Volume Title', 'n': '2'}] } multipart_blob2 = { '690C_': [{'a': 'BOOK'}], '245__': [{'a': 'Test '}], - '246__': [{'n': '2'}] + '246__': [{'n': '2'}], + '596__': [{'a': 'MULTIVOLUMES'}], } standard_blob1 = {'690C_': [{'a': 'STANDARD'}]} journal_blob = {'980__': [{'a': 'PERI'}]} diff --git a/tests/test_standard.py b/tests/test_standard.py index 7db7a8d8..823e8fb9 100644 --- a/tests/test_standard.py +++ b/tests/test_standard.py @@ -20,6 +20,9 @@ from __future__ import absolute_import, print_function, unicode_literals +from copy import deepcopy + +from cds_dojson.marc21.models.books.base import get_migration_dict from cds_dojson.marc21.models.books.standard import model from cds_dojson.marc21.utils import create_record @@ -29,16 +32,12 @@ def check_transformation(marcxml_body, json_body): blob = create_record(marcxml.format(marcxml_body)) - record = model.do(blob, ignore_missing=False) + record = {'_migration': {**get_migration_dict()}} + + record.update(**model.do(blob, ignore_missing=False)) expected = { '$schema': 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json', - '_migration': {'has_keywords': False, - 'is_multipart': False, - 'has_related': False, - 'has_serial': False, - 'record_type': 'document', - 'volumes': [] - } + '_migration': {**get_migration_dict()} } expected.update(**json_body) assert record == expected