From 8c65fe03b2dc56acbe53e3f78e8d0e28ee62c0ad Mon Sep 17 00:00:00 2001
From: Karolina Przerwa <karolina.m.przerwa@gmail.com>
Date: Tue, 29 Sep 2020 10:24:03 +0200
Subject: [PATCH] books: rules update

* fixes broken field translation rules
* tested on a 100k data set
* fixes initialisation of the `_migration` additional info object
---
 cds_dojson/marc21/fields/books/base.py        | 184 ++++----
 cds_dojson/marc21/fields/books/utils.py       |   2 +-
 .../marc21/fields/books/values_mapping.py     |   6 +-
 cds_dojson/marc21/fields/utils.py             |  26 +-
 cds_dojson/marc21/models/books/base.py        |  40 +-
 cds_dojson/marc21/models/books/book.py        |  32 +-
 cds_dojson/marc21/models/books/multipart.py   |   5 +-
 cds_dojson/marc21/models/books/standard.py    |  19 +-
 tests/test_books.py                           | 408 ++++++++----------
 tests/test_matcher.py                         |   4 +-
 tests/test_standard.py                        |  15 +-
 11 files changed, 390 insertions(+), 351 deletions(-)

diff --git a/cds_dojson/marc21/fields/books/base.py b/cds_dojson/marc21/fields/books/base.py
index aa7e22c5..019ea6e8 100644
--- a/cds_dojson/marc21/fields/books/base.py
+++ b/cds_dojson/marc21/fields/books/base.py
@@ -72,15 +72,17 @@ def created(self, key, value):
 
     if key == '916__':
         if 's' in value:
-            _created_by.update(
-                {'type': mapping(ACQUISITION_METHOD,
-                                 clean_val('s', value, str))})
+            _created_by.update({'type': mapping(ACQUISITION_METHOD,
+                                                clean_val('s', value, str,
+                                                          default="migration"),
+                                                raise_exception=True
+                                                )})
             self['created_by'] = _created_by
-        date = clean_val('w', value, int, regex_format=r'\d{4}$')
-        if date:
-            year, week = str(date)[:4], str(date)[4:]
-            date = get_week_start(int(year), int(week))
-            return date.isoformat()
+            date = clean_val('w', value, int, regex_format=r'\d{4}$')
+            if date:
+                year, week = str(date)[:4], str(date)[4:]
+                date = get_week_start(int(year), int(week))
+                return date.isoformat()
     elif key == '595__':
         try:
             sub_a = clean_val('a', value, str,
@@ -89,7 +91,8 @@ def created(self, key, value):
                 source = sub_a[:3]
                 self["source"] = source
                 year, month = int(sub_a[3:7]), int(sub_a[7:])
-                return datetime.date(year, month, 1).isoformat()
+                self['_created'] = datetime.date(year, month, 1).isoformat()
+                raise IgnoreKey('_created')
         except UnexpectedValue as e:
             e.subfield = 'a'
             self['internal_notes'] = internal_notes(self, key, value)
@@ -99,17 +102,13 @@ def created(self, key, value):
 
 
 @model.over('internal_notes', '^595__')
-@filter_list_values
 def internal_notes(self, key, value):
     """Translates private notes field."""
     _internal_notes = self.get('internal_notes', [])
-
     for v in force_list(value):
-        note = {'value': clean_val('a', v, str, req=True),
-                # TODO, waiting for an answer
-                'source': clean_val('9', v, str),
-                }
-        _internal_notes.append(note)
+        internal_note = {'value': clean_val('a', v, str, req=True)}
+        if internal_note not in _internal_notes:
+            _internal_notes.append(internal_note)
     return _internal_notes
 
 
@@ -118,22 +117,20 @@ def internal_notes(self, key, value):
 @out_strip
 def collection(self, key, value):
     """Translates collection field - WARNING - also document type field."""
-    _migration = self.get('_migration', {})
-    _tags = _migration.get('tags', [])
-
+    _migration = self['_migration']
+    _tags = _migration['tags']
     for v in force_list(value):
         result_a = mapping(COLLECTION, clean_val('a', v, str))
         result_b = mapping(COLLECTION, clean_val('b', v, str))
         if result_a:
-            _tags.append(result_a)
+            _tags.append(result_a) if result_a not in _tags else None
             _migration['has_tags'] = True
         if result_b:
-            _tags.append(result_b)
+            _tags.append(result_b) if result_b not in _tags else None
             _migration['has_tags'] = True
         if not result_a and not result_b:
             self['document_type'] = document_type(self, key, value)
             raise IgnoreKey('_migration')
-    _migration['tags'] = _tags
     return _migration
 
 
@@ -268,22 +265,24 @@ def publication_info(self, key, value):
 
 
 @model.over('extensions', '(^925__)')
-@filter_list_values
-@for_each_value
+@filter_values
 def standard_review(self, key, value):
     """Translates standard_status field."""
-    return{'standard_review:applicability': clean_val('i', value, str),
-           'standard_review:validity': clean_val('v', value, str),
-           'standard_review:checkdate': clean_val('z', value, str),
-           'standard_review:expert': clean_val('p', value, str)
-           }
+    _extensions = self.get('extensions', {})
+    _extensions.update(
+        {'standard_review:applicability': clean_val('i', value, str),
+         'standard_review:validity': clean_val('v', value, str),
+         'standard_review:checkdate': clean_val('z', value, str),
+         'standard_review:expert': clean_val('p', value, str)
+         })
+    return _extensions
 
 
 @model.over('publication_info', '^962__')
 def publication_additional(self, key, value):
     """Translates additional publication info."""
     _publication_info = self.get('publication_info', [])
-    _migration = self.get("_migration", {})
+    _migration = self["_migration"]
     empty = not bool(_publication_info)
     for i, v in enumerate(force_list(value)):
         temp_info = {}
@@ -297,7 +296,6 @@ def publication_additional(self, key, value):
             # assume that if we have a parent journal
             # then the doc is a periodical issue
             self['document_type'] = 'PERIODICAL_ISSUE'
-            self['_migration'] = _migration
         n_subfield = clean_val('n', v, str)
         if n_subfield.upper() == 'BOOK':
             temp_info.update({'material': 'BOOK'})
@@ -324,8 +322,8 @@ def related_records(self, key, value):
 
     RELATED records
     """
-    _migration = self.get('_migration', {})
-    _related = _migration.get('related', [])
+    _migration = self['_migration']
+    _related = _migration['related']
     relation_type = 'other'
     try:
         if key == '775__' and 'b' in value:
@@ -346,33 +344,73 @@ def related_records(self, key, value):
 
 
 @model.over('extensions', '^693__')
-@filter_list_values
-@for_each_value
+@filter_values
 def accelerator_experiments(self, key, value):
     """Translates accelerator_experiments field."""
-    return {'unit:accelerator': clean_val('a', value, str),
-            'unit:experiment': clean_val('e', value, str),
-            'unit:project': clean_val('p', value, str)
-            }
+    _extensions = self.get('extensions', {})
+
+    sub_a = clean_val('a', value, str)
+    sub_e = clean_val('e', value, str)
+    sub_p = clean_val('p', value, str)
+
+    accelerators = _extensions.get('unit:accelerator', [])
+    experiment = _extensions.get('unit:experiment', [])
+    project = _extensions.get('unit:project', [])
+
+    if sub_a and sub_a not in accelerators:
+        accelerators.append(sub_a)
+    if sub_e and sub_e not in experiment:
+        experiment.append(sub_e)
+    if sub_p and sub_p not in project:
+        project.append(sub_p)
+
+    _extensions.update({
+        "unit:accelerator": accelerators,
+        "unit:experiment": experiment,
+        "unit:project": project
+    })
+    return _extensions
 
 
-# TODO - discuss how we would like to keep links to holdings (files and ebooks)
 @model.over('urls', '^8564_')
+@filter_list_values
 @for_each_value
-@filter_values
 def urls(self, key, value):
     """Translates urls field."""
-    try:
-        clean_val('y', value, str, manual=True)
-    except ManualMigrationRequired as e:
-        e.subfield = 't'
-        raise e
-    url = clean_val('u', value, str, req=True)
-    if 'cds.cern.ch' not in url:
-        return {'value': url}
-    # TODO: instead of IgnoreKey if link starts with cds.cern.ch it should be
-    # linked as files to the record, issue #200
-    raise IgnoreKey('urls')
+    sub_y = clean_val('y', value, str, default='')
+    sub_u = clean_val('u', value, str, req=True)
+
+    # eitem links are stored under _migration; only plain urls are returned
+    _migration = self['_migration']
+    eitems_ebl = _migration['eitems_ebl']
+    eitems_external = _migration['eitems_external']
+    eitems_proxy = _migration['eitems_proxy']
+    eitems_files = _migration['eitems_file_links']
+
+    url = {'value': sub_u}
+    if sub_y and sub_y != 'ebook':
+        url['description'] = sub_y
+
+    # EBL publisher login required
+    if 'cds' in sub_u and '.cern.ch/auth.py' in sub_u:
+        eitems_ebl.append(url)
+        _migration['eitems_has_ebl'] = True
+    # EzProxy links
+    elif 'ezproxy.cern.ch' in sub_u:
+        url['value'] = sub_u.replace('https://ezproxy.cern.ch/login?url=', '')
+        eitems_proxy.append(url)
+        _migration['eitems_has_proxy'] = True
+    # local files
+    elif all(part in sub_u for part in ('cds', '.cern.ch/record/', '/files')):
+        eitems_files.append(url)
+        _migration['eitems_has_files'] = True
+    elif sub_y == 'ebook':
+        eitems_external.append(url)
+        _migration['eitems_has_external'] = True
+    else:
+        # if none of the above, it is just external url
+        # attached to the document
+        return url
 
 
 @model.over('identifiers', '^020__', )
@@ -443,8 +481,11 @@ def alternative_identifiers(self, key, value):
         else:
             raise UnexpectedValue(subfield='2')
     if key == '035__':
+        if 'CERCER' in sub_a:
+            raise IgnoreKey('alternative_identifiers')
         sub_9 = clean_val('9', value, str, req=True)
-
+        if 'CERCER' in sub_9:
+            raise IgnoreKey('alternative_identifiers')
         # conference_info.identifiers mixed data
         if sub_9.upper() == 'INSPIRE-CNUM':
             _conference_info = self.get('conference_info', {})
@@ -453,7 +494,7 @@ def alternative_identifiers(self, key, value):
                 {'scheme': 'INSPIRE_CNUM', 'value': sub_a})
             _conference_info.update({'identifiers': _prev_identifiers})
             self['conference_info'] = _conference_info
-            raise IgnoreKey('external_system_identifiers')
+            raise IgnoreKey('alternative_identifiers')
 
         elif sub_9.upper() in EXTERNAL_SYSTEM_IDENTIFIERS:
             indentifier_entry.update({'value': sub_a,
@@ -472,22 +513,19 @@ def alternative_identifiers(self, key, value):
 @filter_list_values
 def dois(self, key, value):
     """Translates dois fields."""
-    # TODO might be moved to item level or prefixed _migration,
-    # decision pending
     _identifiers = self.get('identifiers', [])
     for v in force_list(value):
         material = mapping(MATERIALS,
                            clean_val('q', v, str, transform='lower'),
                            raise_exception=True)
-
-        _identifiers.append(
-            {
-                'value': clean_val('a', v, str, req=True),
-                'material': material,
-                'source': clean_val('9', v, str),  # TODO sources
-                'scheme': 'DOI',
-            }
-        )
+        doi = {
+            'value': clean_val('a', v, str, req=True),
+            'material': material,
+            'source': clean_val('9', v, str),
+            'scheme': 'DOI',
+        }
+        if doi not in _identifiers:
+            _identifiers.append(doi)
     return _identifiers
 
 
@@ -541,7 +579,7 @@ def barcodes(self, key, value):
     val_n = clean_val('n', value, str)
     val_x = clean_val('x', value, str)
 
-    _migration = self.get('_migration', {'volumes': []})
+    _migration = self['_migration']
     _migration['volumes'].append(dict(
         volume=extract_volume_number(
             val_n,
@@ -550,7 +588,6 @@ def barcodes(self, key, value):
         ),
         barcode=val_x
     ))
-    self['_migration'] = _migration
     raise IgnoreKey('barcodes')
 
 
@@ -633,7 +670,7 @@ def subject_classification(self, key, value):
         raise IgnoreKey('subjects')
 
 
-@model.over('keywords', '(^6531_)')
+@model.over('keywords', '^6531_')
 @for_each_value
 @filter_values
 def keywords(self, key, value):
@@ -727,12 +764,14 @@ def imprint(self, key, value):
         date = parser.parse(clean_val('c', value, str, req=True))
     except ParserError:
         raise UnexpectedValue(subfield='c')
+    except Exception:
+        raise UnexpectedValue(subfield='c')
     self['publication_year'] = str(date.date().year)
     return {
         'date': clean_val('c', value, str, req=True),
         'place': clean_val('a', value, str),
         'publisher': clean_val('b', value, str),
-        'reprint': reprint,
+        'reprint_date': reprint,
     }
 
 
@@ -743,16 +782,13 @@ def book_series(self, key, value):
     val_n = clean_val('n', value, str)
     val_x = clean_val('x', value, str)
 
-    _migration = self.get('_migration', {})
-    if 'serials' not in _migration:
-        _migration['serials'] = []
+    _migration = self['_migration']
     _migration['serials'].append({
         'title': clean_val('a', value, str),
         'volume': clean_val('v', value, str),
-        'issn': clean_val('x', value, str),
+        'issn': val_x
     })
     _migration['has_serial'] = True
-    self['_migration'] = _migration
     raise IgnoreKey('book_series')
 
 
diff --git a/cds_dojson/marc21/fields/books/utils.py b/cds_dojson/marc21/fields/books/utils.py
index 8d032454..2e4391a8 100644
--- a/cds_dojson/marc21/fields/books/utils.py
+++ b/cds_dojson/marc21/fields/books/utils.py
@@ -24,7 +24,7 @@
 
 MAX_PAGES_NUMBER = 8192
 
-RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt)[\s\.]*(\d+)'
+RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt|t)[\s\.]*(\d+)'
 
 RE_VOLUME_NUMBER = re.compile(RE_STR_VOLUME_NUMBER, re.IGNORECASE)
 RE_VOLUME_INFO = re.compile(
diff --git a/cds_dojson/marc21/fields/books/values_mapping.py b/cds_dojson/marc21/fields/books/values_mapping.py
index 24a81f0b..290a2f57 100644
--- a/cds_dojson/marc21/fields/books/values_mapping.py
+++ b/cds_dojson/marc21/fields/books/values_mapping.py
@@ -42,6 +42,8 @@
 ACQUISITION_METHOD = {
     'user': ['H'],
     'batchuploader': ['N', 'M'],
+    'migration': ['migration'],
+    'r': 'user'  # NOTE(review): key/value look swapped - confirm
 }
 
 MEDIUM_TYPES = [
@@ -142,7 +144,7 @@
 ]
 
 
-def mapping(field_map, val, raise_exception=False):
+def mapping(field_map, val, raise_exception=False, default_val=None):
     """
     Maps the old value to a new one according to the map.
 
@@ -165,5 +167,7 @@ def mapping(field_map, val, raise_exception=False):
         elif isinstance(field_map, list):
             if val in field_map:
                 return val
+        elif default_val:
+            return default_val
         if raise_exception:
             raise UnexpectedValue
diff --git a/cds_dojson/marc21/fields/utils.py b/cds_dojson/marc21/fields/utils.py
index 7098f688..92ee6b4a 100644
--- a/cds_dojson/marc21/fields/utils.py
+++ b/cds_dojson/marc21/fields/utils.py
@@ -98,14 +98,19 @@ def clean_val(subfield, value, var_type, req=False, regex_format=None,
             return default
         raise MissingRequiredField
     if to_clean is not None:
-        if var_type is str:
-            return clean_str(to_clean, regex_format, req, transform)
-        elif var_type is bool:
-            return bool(to_clean)
-        elif var_type is int:
-            return int(to_clean)
-        else:
-            raise NotImplementedError
+        try:
+            if var_type is str:
+                return clean_str(to_clean, regex_format, req, transform)
+            elif var_type is bool:
+                return bool(to_clean)
+            elif var_type is int:
+                return int(to_clean)
+            else:
+                raise NotImplementedError
+        except ValueError:
+            raise UnexpectedValue(subfield=subfield)
+        except TypeError:
+            raise UnexpectedValue(subfield=subfield)
 
 
 def clean_email(value):
@@ -328,7 +333,10 @@ def _get_correct_books_contributor_role(subfield, role):
         'ill.': 'Ilustrator',
         'ill': 'Ilustrator',
     }
-    clean_role = role.lower()
+    if isinstance(role, str):
+        clean_role = role.lower()
+    else:
+        raise UnexpectedValue(subfield=subfield, message=' unknown role')
     if clean_role not in translations:
         raise UnexpectedValue(subfield=subfield, message=' unknown role')
     return translations[clean_role]
diff --git a/cds_dojson/marc21/models/books/base.py b/cds_dojson/marc21/models/books/base.py
index f1074c22..3749fa25 100644
--- a/cds_dojson/marc21/models/books/base.py
+++ b/cds_dojson/marc21/models/books/base.py
@@ -19,6 +19,8 @@
 """Base models for common fields."""
 from __future__ import unicode_literals
 
+from copy import deepcopy
+
 from dojson._compat import iteritems
 from dojson.errors import IgnoreKey, MissingRule
 from dojson.utils import GroupableOrderedDict
@@ -26,6 +28,33 @@
 from ....overdo import OverdoJSONSchema
 from ..base import model as cds_base
 
+
+def get_migration_dict():
+    """Return a new dict with the default ``_migration`` extra data."""
+    defaults = {
+        'record_type': 'document',
+        'volumes': [],
+        'serials': [],
+        'has_serial': False,
+        'is_multipart': False,
+        'has_tags': False,
+        'has_related': False,
+        'has_journal': False,
+        'tags': [],
+        # '..._recid', not '..._id': the key name used before this change
+        'journal_record_legacy_recid': '',
+        'eitems_proxy': [],
+        'eitems_has_proxy': False,
+        'eitems_file_links': [],
+        'eitems_has_files': False,
+        'eitems_external': [],
+        'eitems_has_external': False,
+        'eitems_ebl': [],
+        'eitems_has_ebl': False,
+        'related': [],
+    }
+    return deepcopy(defaults)
+
 COMMON_IGNORE_FIELDS = {
     '003',
     '005',
@@ -48,6 +77,7 @@
     '100__9',
     '111__d',
     '111__f',
+    '145__a',
     '246__i',
     '269__a',  # preprint info
     '269__b',  # preprint info
@@ -57,6 +87,7 @@
     '340__a',
     '440_3a',  # 206 cds-dojson
     '541__9',
+    '541__a',
     '541__h',
     '502__a',  # thesis_info/defense_date
     '502__b',  # thesis_info/degree_type
@@ -71,6 +102,7 @@
     '540__b',
     '540__f',
     '595__z',
+    '595__9',
     '650172',
     '65017a',
     '650272',
@@ -89,6 +121,9 @@
     '852__c',
     '852__h',
     '852__p',
+    '8564_8',  # bibdoc id
+    '8564_s',  # file identifier
+    '8564_x',  # subformat identifier
     '900__s',  # 206 cds-dojson
     '900__u',  # 206 cds-dojson
     '900__y',  # 206 cds-dojson
@@ -117,7 +152,7 @@
 class CDSOverdoBookBase(OverdoJSONSchema):
     """Translation base Index for CDS Books."""
 
-    def do(self, blob, ignore_missing=True, exception_handlers=None):
+    def do(self, blob, ignore_missing=True, exception_handlers=None, init_fields=None):
         """Translate blob values and instantiate new model instance.
 
         Raises ``MissingRule`` when no rule matched and ``ignore_missing``
@@ -155,6 +190,9 @@ def clean_missing(exc, output, key, value):
 
         output = {}
 
+        if init_fields:
+            output.update(**init_fields)
+
         if self.index is None:
             self.build()
         if isinstance(blob, GroupableOrderedDict):
diff --git a/cds_dojson/marc21/models/books/book.py b/cds_dojson/marc21/models/books/book.py
index ed8a25ed..ebd5276b 100644
--- a/cds_dojson/marc21/models/books/book.py
+++ b/cds_dojson/marc21/models/books/book.py
@@ -19,8 +19,10 @@
 """Book model."""
 from __future__ import unicode_literals
 
+from copy import deepcopy
+
 from ..base import model as cds_base
-from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase
+from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase, get_migration_dict
 from .base import model as books_base
 
 
@@ -31,33 +33,27 @@ class CDSBook(CDSOverdoBookBase):
                 '980__:PROCEEDINGS OR 980__:PERI OR ' \
                 '(-980:STANDARD 980:BOOK) OR ' \
                 '697C_:LEGSERLIB ' \
-                '-980__:DELETED -980__:MIGRATED -980:__STANDARD' \
+                '-980__:DELETED -980__:MIGRATED -980:__STANDARD -596:MULTIVOLUMES'
 
     __schema__ = 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json'
 
-    __ignore_keys__ = COMMON_IGNORE_FIELDS
+    __model_ignore_keys__ = {
+        '020__b',  # used to match multipart monograph items as volumes
+    }
+
+    __ignore_keys__ = COMMON_IGNORE_FIELDS | __model_ignore_keys__
+    # template only: do() passes a deep copy of it to every translation
+    __json_init_dict__ = {'_migration': get_migration_dict()}
 
     def do(self, blob, ignore_missing=True, exception_handlers=None):
         """Set schema after translation depending on the model."""
         json = super(CDSBook, self).do(
             blob=blob,
             ignore_missing=ignore_missing,
-            exception_handlers=exception_handlers)
+            exception_handlers=exception_handlers,
+            init_fields=deepcopy(self.__json_init_dict__)
+        )
         json['$schema'] = self.__class__.__schema__
-
-        if '_migration' not in json:
-            json['_migration'] = {}
-        json['_migration'].setdefault('record_type', 'document')
-        json['_migration'].setdefault('volumes', [])
-        json['_migration'].setdefault('serials', [])
-        json['_migration'].setdefault('has_serial', False)
-        json['_migration'].setdefault('is_multipart', False)
-        json['_migration'].setdefault('has_tags', False)
-        json['_migration'].setdefault('has_related', False)
-        json['_migration'].setdefault('has_journal', False)
-        json['_migration'].setdefault('tags', [])
-        json['_migration'].setdefault('journal_record_legacy_recid', '')
-
         return json
 
 
diff --git a/cds_dojson/marc21/models/books/multipart.py b/cds_dojson/marc21/models/books/multipart.py
index 217f055b..48b0d2bd 100644
--- a/cds_dojson/marc21/models/books/multipart.py
+++ b/cds_dojson/marc21/models/books/multipart.py
@@ -27,10 +27,9 @@
 class CDSMultipart(CDSOverdoBookBase):
     """Translation Index for CDS Books."""
 
-    __query__ = '(690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
+    __query__ = '(596__:MULTIVOLUMES AND 690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
                 '980__:PROCEEDINGS OR ' \
-                '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)'\
-                'AND 246__:/[a-zA-Z0-9]+/ '
+                '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)'
 
     __schema__ = 'https://127.0.0.1:5000/schemas/series/series-v1.0.0.json'
 
diff --git a/cds_dojson/marc21/models/books/standard.py b/cds_dojson/marc21/models/books/standard.py
index 1d3d7fd2..f508925d 100644
--- a/cds_dojson/marc21/models/books/standard.py
+++ b/cds_dojson/marc21/models/books/standard.py
@@ -20,8 +20,10 @@
 
 from __future__ import unicode_literals
 
+from copy import deepcopy
+
 from ..base import model as cds_base
-from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase
+from .base import COMMON_IGNORE_FIELDS, CDSOverdoBookBase, get_migration_dict
 from .base import model as books_base
 
 
@@ -34,21 +36,18 @@ class CDSStandard(CDSOverdoBookBase):
 
     __ignore_keys__ = COMMON_IGNORE_FIELDS
 
+    __json_init_dict__ = {'_migration': get_migration_dict()}
+
     def do(self, blob, ignore_missing=True, exception_handlers=None):
         """Set schema after translation depending on the model."""
         json = super(CDSStandard, self).do(
             blob=blob,
             ignore_missing=ignore_missing,
-            exception_handlers=exception_handlers)
+            exception_handlers=exception_handlers,
+            init_fields=deepcopy(self.__json_init_dict__),
+        )
+
         json['$schema'] = self.__class__.__schema__
-        json['_migration'] = {
-            'record_type': 'document',
-            'has_serial': False,
-            'is_multipart': False,
-            'has_keywords': False,
-            'has_related': False,
-            'volumes': []
-        }
         return json
 
 
diff --git a/tests/test_books.py b/tests/test_books.py
index f0bc95ec..84469dd1 100644
--- a/tests/test_books.py
+++ b/tests/test_books.py
@@ -20,12 +20,15 @@
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+from copy import deepcopy
+
 import pytest
 from dojson.errors import MissingRule
 
 from cds_dojson.marc21.fields.books.errors import ManualMigrationRequired, \
     MissingRequiredField, UnexpectedValue
 from cds_dojson.marc21.fields.books.values_mapping import MATERIALS, mapping
+from cds_dojson.marc21.models.books.base import get_migration_dict
 from cds_dojson.marc21.models.books.book import model
 from cds_dojson.marc21.utils import create_record
 
@@ -36,21 +39,14 @@
 def check_transformation(marcxml_body, json_body):
     """Check transformation."""
     blob = create_record(marcxml.format(marcxml_body))
-    record = model.do(blob, ignore_missing=False)
+
+    record = {'_migration': {**get_migration_dict()}}
+
+    record.update(**model.do(blob, ignore_missing=False))
+
     expected = {
         '$schema': 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json',
-        '_migration': {
-            'has_tags': False,
-            'is_multipart': False,
-            'has_related': False,
-            'has_serial': False,
-            'has_journal': False,
-            'journal_record_legacy_recid': '',
-            'record_type': 'document',
-            'volumes': [],
-            'serials': [],
-            'tags': [],
-        }
+        '_migration': {**get_migration_dict()}
     }
 
     expected.update(**json_body)
@@ -218,6 +214,9 @@ def test_created(app):
             <datafield tag="595" ind1=" " ind2=" ">
                 <subfield code="a">random text</subfield>
             </datafield>
+            <datafield tag="595" ind1=" " ind2=" ">
+                <subfield code="a">random text</subfield>
+            </datafield>
             """, {
                 'internal_notes': [
                     {'value': 'random text'},
@@ -253,16 +252,9 @@ def test_collections(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIB'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             })
         check_transformation(
@@ -272,16 +264,9 @@ def test_collections(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIB'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             })
         check_transformation(
@@ -291,16 +276,9 @@ def test_collections(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIB'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             })
         check_transformation(
@@ -311,16 +289,9 @@ def test_collections(app):
             """,
             {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIBINTLAW'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             }
         )
@@ -332,16 +303,9 @@ def test_collections(app):
             """,
             {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['BOOKSHOP'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             }
         )
@@ -353,16 +317,9 @@ def test_collections(app):
             """,
             {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['BOOKSHOP'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             }
         )
@@ -374,16 +331,9 @@ def test_collections(app):
             """,
             {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIBLEGRES'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
             }
         )
@@ -471,16 +421,9 @@ def test_document_type_collection(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIB'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
                 'document_type': 'BOOK',
             })
@@ -494,16 +437,9 @@ def test_document_type_collection(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'tags': ['LEGSERLIB'],
                     'has_tags': True,
-                    'is_multipart': False,
-                    'has_related': False,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'serials': [],
                 },
                 'document_type': 'BOOK',
             })
@@ -514,44 +450,49 @@ def test_urls(app):
     with app.app_context():
         check_transformation(
             """
-            <datafield tag="960" ind1=" " ind2=" ">
-                <subfield code="a">42</subfield>
-            </datafield>
-            <datafield tag="8564" ind1=" " ind2=" ">
-                <subfield code="u">cds.cern.ch</subfield>
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="8">1336159</subfield>
+                <subfield code="s">726479</subfield>
+                <subfield code="u">
+                    http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf
+                </subfield>
+                <subfield code="y">
+                Description
+                </subfield>
             </datafield>
-            """, {
-                'document_type': 'PROCEEDINGS',
-            })
+        """, {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_files': True,
+                        'eitems_file_links': [
+                            {'description': 'Description',
+                             'value': 'http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf'}]
+                    }
+            }
+        )
         check_transformation(
             """
             <datafield tag="8564" ind1=" " ind2=" ">
-                <subfield code="u">cds.cern.ch</subfield>
+                <subfield code="u">https://cds.cern.ch/record/12345/files/abc.pdf</subfield>
             </datafield>
             """, {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_files': True,
+                        'eitems_file_links': [
+                            {
+                                'value': 'https://cds.cern.ch/record/12345/files/abc.pdf'}]
+                    }
             })
         check_transformation(
             """
-            <datafield tag="856" ind1="4" ind2=" ">
-                <subfield code="8">1336158</subfield>
-                <subfield code="s">3334918</subfield>
-                <subfield code="u">
-                http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-A2.pdf?subformat=pdfa
-                </subfield>
-                <subfield code="x">pdfa</subfield>
-            </datafield>
             <datafield tag="856" ind1="4" ind2=" ">
                 <subfield code="8">1336158</subfield>
                 <subfield code="s">2445021</subfield>
                 <subfield code="u">http://awesome.domain/with/a/path</subfield>
             </datafield>
-            <datafield tag="856" ind1="4" ind2=" ">
-                <subfield code="8">1336159</subfield>
-                <subfield code="s">726479</subfield>
-                <subfield code="u">
-                http://cds.cern.ch/record/1393420/files/NF-EN-13480-2-AC6.pdf
-                </subfield>
-            </datafield>
             <datafield tag="856" ind1="4" ind2=" ">
                 <subfield code="8">1336157</subfield>
                 <subfield code="s">2412918</subfield>
@@ -564,17 +505,98 @@ def test_urls(app):
                     {'value': 'http://another.domain/with/a/path'},
                 ],
             })
-        with pytest.raises(ManualMigrationRequired):
-            check_transformation(
-                """
-                <datafield tag="8564" ind1=" " ind2=" ">
-                    <subfield code="u">cds.cern.ch</subfield>
-                    <subfield code="y">description</subfield>
-                </datafield>
-                """, {
-                    'urls': [{'value': 'cds.cern.ch',
-                              'description': 'description'}],
-                })
+        check_transformation(
+            """
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="u">https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560</subfield>
+                <subfield code="y">ebook</subfield>
+            </datafield>
+            """,
+            {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_ebl': True,
+                        'eitems_ebl': [
+                            {
+                                'value': 'https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560'}]
+                    }
+            }
+        )
+        check_transformation(
+            """
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="u">https://learning.oreilly.com/library/view/-/9781118491300/?ar</subfield>
+                <subfield code="y">ebook</subfield>
+            </datafield>
+            """,
+            {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_external': True,
+                        'eitems_external': [
+                            {
+                                'value': 'https://learning.oreilly.com/library/view/-/9781118491300/?ar'}]
+                    }
+            }
+        )
+        check_transformation(
+            """
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="u">https://ezproxy.cern.ch/login?url=https://www.worldscientific.com/toc/rast/10</subfield>
+                <subfield code="y">ebook</subfield>
+            </datafield>
+            """,
+            {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_proxy': True,
+                        'eitems_proxy': [
+                            {
+                                'value': 'https://www.worldscientific.com/toc/rast/10'}]
+                    }
+            }
+        )
+        check_transformation(
+            """
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="u">https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560</subfield>
+                <subfield code="y">ebook</subfield>
+            </datafield>
+            <datafield tag="856" ind1="4" ind2=" ">
+                <subfield code="u">https://learning.oreilly.com/library/view/-/9781118491300/?ar</subfield>
+                <subfield code="y">ebook</subfield>
+            </datafield>
+            """,
+            {
+                "_migration":
+                    {
+                        **get_migration_dict(),
+                        'eitems_has_ebl': True,
+                        'eitems_ebl': [
+                            {
+                                'value': 'https://cdsweb.cern.ch/auth.py?r=EBLIB_P_1139560'},
+                        ],
+                        'eitems_external': [
+                            {
+                                'value': 'https://learning.oreilly.com/library/view/-/9781118491300/?ar'},
+                        ],
+                        'eitems_has_external': True,
+                    }
+            }
+        )
+        check_transformation(
+            """
+            <datafield tag="8564" ind1=" " ind2=" ">
+                <subfield code="u">google.com</subfield>
+                <subfield code="y">description</subfield>
+            </datafield>
+            """, {
+                'urls': [{'value': 'google.com',
+                          'description': 'description'}],
+            })
 
 
 def test_authors(app):
@@ -846,16 +868,9 @@ def test_publication_info(app):
             """,
             {'document_type': 'PERIODICAL_ISSUE',
              '_migration': {
-                 'serials': [],
+                 **get_migration_dict(),
                  'journal_record_legacy_recid': "2155631",
-                 'has_tags': False,
-                 'is_multipart': False,
-                 'has_related': False,
-                 'has_serial': False,
                  'has_journal': True,
-                 'record_type': 'document',
-                 'volumes': [],
-                 'tags': [],
              },
              'conference_info': {
                  'identifiers': [
@@ -928,11 +943,11 @@ def test_extensions(app):
             </datafield>
             """,
             {
-                'extensions': [{
+                'extensions': {
                     'standard_review:applicability': 'applicable at CERN',
                     'standard_review:checkdate': 'Reviewed in December 2019',
                     'standard_review:expert': 'Expert ICS-25.160',
-                }],
+                },
             }
         )
         check_transformation(
@@ -945,12 +960,12 @@ def test_extensions(app):
             </datafield>
             """,
             {
-                'extensions': [{
+                'extensions': {
                     'standard_review:applicability': 'no longer applicable',
                     'standard_review:validity': 'withdrawn',
                     'standard_review:checkdate': 'Reviewed in December 2019',
                     'standard_review:expert': 'Expert ICS-25.160',
-                }],
+                },
             }
         )
         check_transformation(
@@ -972,19 +987,16 @@ def test_extensions(app):
             </datafield>
             """,
             {
-                'extensions': [
-                    {'unit:accelerator': 'CERN LHC',
-                     'unit:experiment': 'ATLAS'},
-                    {'unit:accelerator': 'CERN LHC',
-                     'unit:experiment': 'CMS',
-                     'unit:project': 'FCC',
-                     },
-                    {'standard_review:applicability': 'no longer applicable',
+                'extensions':
+                    {'unit:accelerator': ['CERN LHC'],
+                     'unit:experiment': ['ATLAS', 'CMS'],
+                     'unit:project': ['FCC'],
+                     'standard_review:applicability': 'no longer applicable',
                      'standard_review:validity': 'withdrawn',
                      'standard_review:checkdate': 'Reviewed in December 2019',
                      'standard_review:expert': 'Expert ICS-25.160',
-                     },
-                ],
+                     }
+
             }
         )
 
@@ -1002,16 +1014,8 @@ def test_related_record(app):
             """,
             {
                 '_migration': {
-                    'serials': [],
-                    'has_tags': False,
-                    'is_multipart': False,
+                    **get_migration_dict(),
                     'has_related': True,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'tags': [],
                     'related': [{
                         'related_recid': '748392',
                         "relation_type": "Test text"
@@ -1029,16 +1033,8 @@ def test_related_record(app):
                 """,
                 {
                     '_migration': {
-                        'serials': [],
-                        'has_tags': False,
-                        'is_multipart': False,
+                        **get_migration_dict(),
                         'has_related': True,
-                        'has_serial': False,
-                        'has_journal': False,
-                        'journal_record_legacy_recid': '',
-                        'record_type': 'document',
-                        'volumes': [],
-                        'tags': [],
                         'related': [{'related_recid': '7483924',
                                      'relation_type': "other"}],
                     },
@@ -1055,16 +1051,8 @@ def test_related_record(app):
             """,
             {
                 '_migration': {
-                    'serials': [],
-                    'has_tags': False,
-                    'is_multipart': False,
+                    **get_migration_dict(),
                     'has_related': True,
-                    'has_serial': False,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'tags': [],
                     'related': [
                         {'related_recid': '7483924', 'relation_type': 'other'},
                         {'related_recid': '748', 'relation_type': 'other'}],
@@ -1089,14 +1077,12 @@ def test_accelerator_experiments(app):
             </datafield>
             """,
             {
-                'extensions': [
-                    {'unit:accelerator': 'CERN LHC',
-                     'unit:experiment': 'ATLAS'},
-                    {'unit:accelerator': 'CERN LHC',
-                     'unit:experiment': 'CMS',
-                     'unit:project': 'FCC',
+                'extensions':
+                    {'unit:accelerator': ['CERN LHC'],
+                     'unit:experiment': ['ATLAS', 'CMS'],
+                     'unit:project': ['FCC'],
                      }
-                ]
+
             }
         )
 
@@ -1442,15 +1428,14 @@ def test_alternative_identifiers(app):
                 """, {
                 })
 
-        with pytest.raises(UnexpectedValue):
-            check_transformation(
-                """
-                <datafield tag="035" ind1=" " ind2=" ">
-                    <subfield code="9">CERCER</subfield>
-                    <subfield code="a">2365039</subfield>
-                </datafield>
-                """, {
-                })
+        check_transformation(
+            """
+            <datafield tag="035" ind1=" " ind2=" ">
+                <subfield code="9">CERCER</subfield>
+                <subfield code="a">2365039</subfield>
+            </datafield>
+            """, {
+            })
 
         check_transformation(
             """
@@ -1689,7 +1674,7 @@ def test_imprint(app):
                     'place': 'Sydney',
                     'publisher': 'Allen & Unwin',
                     'date': '2013',
-                    'reprint': '2015',
+                    'reprint_date': '2015',
                 },
             })
 
@@ -2377,6 +2362,7 @@ def test_book_series(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'serials': [
                         {
                             'title': 'Minutes',
@@ -2384,15 +2370,7 @@ def test_book_series(app):
                             'volume': None
                         }
                     ],
-                    'has_tags': False,
-                    'is_multipart': False,
-                    'has_related': False,
                     'has_serial': True,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'tags': [],
                 }
             }
         )
@@ -2406,6 +2384,7 @@ def test_book_series(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'serials': [
                         {
                             'title': 'De Gruyter studies in mathematical physics',
@@ -2413,15 +2392,7 @@ def test_book_series(app):
                             'volume': '16'
                         }
                     ],
-                    'has_tags': False,
-                    'is_multipart': False,
-                    'has_related': False,
                     'has_serial': True,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'tags': [],
                 }
             }
         )
@@ -2434,6 +2405,7 @@ def test_book_series(app):
             </datafield>
             """, {
                 '_migration': {
+                    **get_migration_dict(),
                     'serials': [
                         {
                             'title': 'Springer tracts in modern physics',
@@ -2441,15 +2413,7 @@ def test_book_series(app):
                             'volume': '267'
                         }
                     ],
-                    'has_tags': False,
-                    'is_multipart': False,
-                    'has_related': False,
                     'has_serial': True,
-                    'has_journal': False,
-                    'journal_record_legacy_recid': '',
-                    'record_type': 'document',
-                    'volumes': [],
-                    'tags': [],
                 }
             }
         )
@@ -2654,21 +2618,15 @@ def test_volume_barcodes(app):
             """,
             dict(
                 title='Mathematische Methoden der Physik',
-                _migration=dict(
-                    record_type='document',
-                    has_serial=False,
-                    is_multipart=False,
-                    has_tags=False,
-                    has_related=False,
-                    has_journal=False,
-                    journal_record_legacy_recid='',
-                    volumes=[
-                        dict(barcode='80-1209-8', volume=1),
-                        dict(barcode='B00004172', volume=1),
-                    ],
-                    serials=[],
-                    tags=[],
-                ),
+                _migration={
+                    **get_migration_dict(),
+                    **dict(
+                        volumes=[
+                            dict(barcode='80-1209-8', volume=1),
+                            dict(barcode='B00004172', volume=1),
+                        ],
+                    ), }
+
             )
         )
 
diff --git a/tests/test_matcher.py b/tests/test_matcher.py
index 550a142a..0a6975ff 100644
--- a/tests/test_matcher.py
+++ b/tests/test_matcher.py
@@ -59,12 +59,14 @@ def test_marc21_matcher_books():
     multipart_blob1 = {
         '690C_': [{'a': 'BOOK'}],
         '245__': [{'a': 'Test '}],
+        '596__': [{'a': 'MULTIVOLUMES'}],
         '246__': [{'p': 'Volume Title', 'n': '2'}]
     }
     multipart_blob2 = {
         '690C_': [{'a': 'BOOK'}],
         '245__': [{'a': 'Test '}],
-        '246__': [{'n': '2'}]
+        '246__': [{'n': '2'}],
+        '596__': [{'a': 'MULTIVOLUMES'}],
     }
     standard_blob1 = {'690C_': [{'a': 'STANDARD'}]}
     journal_blob = {'980__': [{'a': 'PERI'}]}
diff --git a/tests/test_standard.py b/tests/test_standard.py
index 7db7a8d8..823e8fb9 100644
--- a/tests/test_standard.py
+++ b/tests/test_standard.py
@@ -20,6 +20,7 @@
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+from cds_dojson.marc21.models.books.base import get_migration_dict
 from cds_dojson.marc21.models.books.standard import model
 from cds_dojson.marc21.utils import create_record
 
@@ -29,16 +30,15 @@
 
 def check_transformation(marcxml_body, json_body):
     blob = create_record(marcxml.format(marcxml_body))
-    record = model.do(blob, ignore_missing=False)
+    # Seed `_migration` with the shared defaults; a `_migration` key in the
+    # model output replaces this entry when the record is updated below.
+    record = {'_migration': {**get_migration_dict()}}
+
+    record.update(**model.do(blob, ignore_missing=False))
     expected = {
         '$schema': 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json',
-        '_migration': {'has_keywords': False,
-                       'is_multipart': False,
-                       'has_related': False,
-                       'has_serial': False,
-                       'record_type': 'document',
-                       'volumes': []
-                       }
+        # Expected defaults are built by the same helper as the seed above.
+        '_migration': {**get_migration_dict()}
     }
     expected.update(**json_body)
     assert record == expected