Skip to content

Commit

Permalink
multipart monographs transformations
Browse files Browse the repository at this point in the history
  • Loading branch information
FlorianCassayre committed Nov 23, 2020
1 parent 7da8f7a commit 076630d
Show file tree
Hide file tree
Showing 9 changed files with 541 additions and 330 deletions.
4 changes: 2 additions & 2 deletions cds_dojson/marc21/fields/books/base.py
Expand Up @@ -735,13 +735,13 @@ def alternative_titles(self, key, value):
_alternative_titles.append({
'value': clean_val('a', value, str, req=True),
'type': 'TRANSLATED_TITLE',
'language': 'en',
'language': 'EN',
})
if 'b' in value:
_alternative_titles.append({
'value': clean_val('b', value, str, req=True),
'type': 'TRANSLATED_SUBTITLE',
'language': 'en',
'language': 'EN',
})
return _alternative_titles

Expand Down
241 changes: 211 additions & 30 deletions cds_dojson/marc21/fields/books/multipart.py
Expand Up @@ -18,32 +18,39 @@
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Books fields."""
import re
from copy import deepcopy

from dojson.errors import IgnoreKey
from dojson.utils import for_each_value, force_list

from cds_dojson.marc21.fields.books.base import book_series as base_book_series
from cds_dojson.marc21.fields.books.errors import MissingRequiredField, \
UnexpectedValue
from cds_dojson.marc21.fields.books.utils import extract_parts, \
extract_volume_info, extract_volume_number
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.fields.utils import build_contributor_books, \
clean_val, filter_list_values, out_strip
from cds_dojson.marc21.models.books.multipart import model

from .base import alternative_identifiers as alternative_identifiers_base
from .base import urls as urls_base
from .values_mapping import MATERIALS, mapping

@model.over('legacy_recid', '^001')
def recid(self, key, value):
    """Convert the 001 control field into the integer legacy record id."""
    legacy_id = int(value)
    return legacy_id

def _insert_volume(_migration, volume_number, volume_obj):
"""Finds the corresponding volume, or create it, and insert the attribute."""
volumes = _migration['volumes']
volume_obj = deepcopy(volume_obj)
volume_obj['volume'] = volume_number
volumes.append(volume_obj)
return volume_obj

@model.over('identifiers', '^020__')

@model.over('identifiers', '^020__', override=True)
@filter_list_values
@for_each_value
def isbns(self, key, value):
"""Translates isbns stored in the record."""
_migration = self.get('_migration', {'volumes': []})
_migration = self['_migration']
_identifiers = self.get('identifiers', [])

val_u = clean_val('u', value, str)
Expand All @@ -57,21 +64,28 @@ def isbns(self, key, value):
if volume_info:
# if we have volume there it means that the ISBN is of the volume
volume_obj = {
'volume': volume_info['volume'],
'isbn': clean_val('a', value, str),
'physical_description': volume_info['description'].strip(),
'is_electronic': val_b is not None,
}
_migration['volumes'].append(volume_obj)
self['_migration'] = _migration
_insert_volume(_migration, volume_info['volume'], volume_obj)
raise IgnoreKey('identifiers')
if set_search:
self['physical_description'] = set_search.group(1).strip()
isbn = {'scheme': 'ISBN', 'value': val_a}
return isbn if isbn not in _identifiers else None
if not volume_info:
# Try to find a volume number
if extract_volume_number(val_u, search=True):
volume_number = extract_volume_number(val_u)
if volume_number:
# volume, but without description
volume_obj = {
'isbn': clean_val('a', value, str),
'is_electronic': val_b is not None,
}
_insert_volume(_migration, volume_number, volume_obj)
raise IgnoreKey('identifiers')
elif extract_volume_number(val_u, search=True):
raise UnexpectedValue(
subfield='u',
message=' found volume but failed to parse description'
Expand All @@ -92,6 +106,106 @@ def isbns(self, key, value):
raise UnexpectedValue(subfield='a', message=' isbn not provided')


@model.over('dois', '^0247_', override=True)
@filter_list_values
def dois(self, key, value):
    """Translate DOIs, routing volume-specific ones to ``_migration``.

    For every 0247_ value:

    * subfield ``2``, when present, must be ``DOI``;
    * when subfield ``q`` carries volume information (as recognised by
      ``extract_volume_info``), the DOI is stored on a volume entry through
      ``_insert_volume`` together with its material and source;
    * otherwise the DOI is added to the record ``identifiers`` (duplicates
      are skipped), with ``material`` mapped from subfield ``q``.

    :raises UnexpectedValue: if subfield ``2`` is not ``DOI``, or if
        subfield ``q`` looks like ``"<text> (<something>)"`` but no volume
        could be extracted from it.
    """
    _migration = self['_migration']
    _identifiers = self.get('identifiers', [])
    for v in force_list(value):
        val_2 = clean_val('2', v, str)
        if val_2 and val_2 != 'DOI':
            raise UnexpectedValue(
                subfield='2', message=' field is not equal to DOI')
        val_q = clean_val('q', v, str, transform='lower')
        # Subfield q is optional. Only parse it when there is a value, the
        # same way ``urls`` guards subfield y; feeding a missing value to
        # the regex helpers would fail with a TypeError instead of a
        # migration error.
        volume_info = extract_volume_info(val_q) if val_q else None
        doi = {
            'value': clean_val('a', v, str, req=True),
            'source': clean_val('9', v, str),
            'scheme': 'DOI',
        }
        if volume_info:
            # this identifier is for a specific volume
            volume_obj = {
                'doi': doi['value'],
                'material': mapping(
                    MATERIALS,
                    volume_info['description'],
                    raise_exception=True,
                ),
                'source': doi['source'],
            }
            _insert_volume(_migration, volume_info['volume'], volume_obj)
            continue

        # A parenthesised suffix that was not recognised as a volume is
        # reported rather than silently treated as a material.
        if val_q and re.match(r'.* \(.*\)', val_q):
            raise UnexpectedValue(
                subfield='q',
                message=' found a volume number but could not extract it')
        doi['material'] = mapping(MATERIALS, val_q, raise_exception=True)
        if doi not in _identifiers:
            _identifiers.append(doi)
    if _identifiers:
        self['identifiers'] = _identifiers


@model.over('alternative_identifiers', '(^035__)|(^036__)', override=True)
def alternative_identifiers(self, key, value):
    """Translate 035/036 fields by delegating to the rule from ``.base``."""
    translated = alternative_identifiers_base(self, key, value)
    return translated


@model.over('barcode', '^088__', override=True)
def barcode(self, key, value):
    """Translate 088 fields into report numbers and per-volume barcodes.

    Each value is handled according to the subfields it carries:

    * ``a`` or ``9`` alone: a ``report_number`` identifier is appended to
      the record ``identifiers``; a value coming from ``9`` is marked
      ``hidden``.
    * ``n`` and ``x`` together: ``x`` is stored as the barcode of the volume
      whose number is extracted from ``n``.

    Every value of the field is processed. A report number no longer stops
    the loop early, which used to drop any value that followed it.

    :raises UnexpectedValue: if a report number is combined with ``n``/``x``
        or if both ``a`` and ``9`` are present.
    :raises MissingRequiredField: if only one of ``n``/``x`` is present, or
        neither (and no report number).
    :raises IgnoreKey: always, once all values are processed, so that no
        ``barcode`` key is written on the record.
    """
    _migration = self['_migration']
    for v in force_list(value):
        val_a = clean_val('a', v, str)
        val_n = clean_val('n', v, str)
        val_x = clean_val('x', v, str)
        val_9 = clean_val('9', v, str)
        if val_a or val_9:
            if val_n or val_x or (val_a and val_9):
                raise UnexpectedValue()
            identifier = {'scheme': 'report_number', 'value': val_a or val_9}
            if val_9:
                identifier['hidden'] = True
            identifiers = self.get('identifiers', [])
            identifiers.append(identifier)
            self['identifiers'] = identifiers
            # Move on to the next value instead of aborting the whole rule.
            continue

        if val_n and val_x:
            volume_number = extract_volume_number(
                val_n,
                raise_exception=True,
                subfield='n'
            )
            _insert_volume(_migration, volume_number, {'barcode': val_x})
        elif val_x:
            raise MissingRequiredField(
                subfield='n',
                message=' this record is missing a volume number')
        else:
            raise MissingRequiredField(
                subfield='x',
                message=' this record is missing a barcode number')
    raise IgnoreKey('barcode')


@model.over('authors', '(^100__)|(^700__)', override=True)
@filter_list_values
def authors(self, key, value):
    """Collect contributors in ``_migration`` and expose their full names.

    The contributor built by ``build_contributor_books`` is added to
    ``_migration['authors']`` unless it is empty or already present.
    ``_migration['other_authors']`` is set when subfield ``u`` contains
    ``et al.`` or ``et al``. ``self['authors']`` is rewritten as the list of
    ``full_name`` values of all collected contributors.

    :raises IgnoreKey: always; the record is updated through side effects.
    """
    _migration = self['_migration']
    collected = _migration.get('authors', [])
    contributor = build_contributor_books(value)
    if contributor and contributor not in collected:
        collected.append(contributor)
    try:
        if 'u' in value:
            sub_u = list(force_list(value.get('u')))
            if any(marker in sub_u for marker in ('et al.', 'et al')):
                _migration['other_authors'] = True
    except UnexpectedValue:
        pass
    _migration['authors'] = collected
    self['authors'] = [entry['full_name'] for entry in collected]
    raise IgnoreKey('authors')


@model.over('title', '^245__')
@out_strip
def title(self, key, value):
Expand All @@ -108,17 +222,16 @@ def title(self, key, value):
@model.over('_migration', '^246__')
def migration(self, key, value):
"""Translates volumes titles."""
_series_title = self.get('title', None)
volume_title = self.get('title', None)

# I added this in the model, I'm sure it's there
_migration = self.get('_migration', {})
if 'volumes' not in _migration:
_migration['volumes'] = []

for v in force_list(value):
# check if it is a multipart monograph
val_n = clean_val('n', v, str)
val_p = clean_val('p', v, str)
val_y = clean_val('y', v, str)
if not val_n and not val_p:
raise UnexpectedValue(
subfield='n', message=' this record is probably not a series')
Expand All @@ -127,12 +240,6 @@ def migration(self, key, value):
subfield='n', message=' volume title exists but no volume number'
)

if val_p and extract_volume_number(val_p, search=True):
# Some records have the volume number in p
raise UnexpectedValue(
subfield='p', message=' found volume number in the title'
)

volume_index = re.findall(r'\d+', val_n) if val_n else None
if volume_index and len(volume_index) > 1:
raise UnexpectedValue(subfield='n',
Expand All @@ -143,11 +250,16 @@ def migration(self, key, value):
raise_exception=True,
subfield='n'
)
volume_obj = {'title': val_p,
'volume': volume_number,
}
_migration['volumes'].append(volume_obj)
if not _series_title:
obj = {'title': val_p or volume_title}
if val_y:
if re.match('\\d+', val_y) and 1800 <= int(val_y) <= 2021:
obj['publication_year'] = int(val_y)
else:
raise UnexpectedValue(subfield='y',
message=' unrecognized publication year')

_insert_volume(_migration, volume_number, obj)
if not volume_title:
raise MissingRequiredField(
subfield='a', message=' this record is missing a main title')

Expand All @@ -173,8 +285,77 @@ def number_of_volumes(self, key, value):
raise IgnoreKey('number_of_volumes')


@model.over('book_series', '^490__')
@model.over('book_series', '^490__', override=True)
@filter_list_values
@for_each_value
def book_series(self, key, value):
"""Match barcodes to volumes."""
base_book_series(self, key, value)
"""Translates issns stored in the record."""
val_a = clean_val('a', value, str)
val_x = clean_val('x', value, str)

if val_a:
if 'title' in self:
raise UnexpectedValue(subfield='a', message=' series has already a title')
if val_x:
raise UnexpectedValue(subfield='x', message=' should not be present')
self['title'] = val_a
else:
raise UnexpectedValue(subfield='x')

raise IgnoreKey('book_series')


@model.over('is_singleton', '^596__')
def is_singleton(self, key, value):
    """Record the multivolume kind found in subfield ``a`` of 596.

    ``MULTIVOLUMES-1`` stores ``True`` and ``MULTIVOLUMES-X`` (or ``-x``)
    stores ``False`` under ``_migration['is_singleton']``.

    :raises Exception: for ``MULTIVOLUMES-MANUAL``.
    :raises UnexpectedValue: for any other tag.
    :raises IgnoreKey: always, after ``_migration`` has been updated.
    """
    tag = clean_val('a', value, str)
    _migration = self['_migration']
    known_tags = {
        'MULTIVOLUMES-1': True,
        'MULTIVOLUMES-X': False,
        'MULTIVOLUMES-x': False,
    }
    if tag == 'MULTIVOLUMES-MANUAL':
        raise Exception('This record should not be migrated!')
    if tag not in known_tags:
        raise UnexpectedValue(
            subfield='a', message=' unrecognized migration multipart tag'
        )
    _migration['is_singleton'] = known_tags[tag]
    raise IgnoreKey('is_singleton')


@model.over('serial_id', '^597__')
def serial_id(self, key, value):
    """Store subfield ``a`` of 597 as ``_migration['serial_id']``.

    :raises IgnoreKey: always; only ``_migration`` is updated.
    """
    self['_migration']['serial_id'] = clean_val('a', value, str)
    raise IgnoreKey('serial_id')


@model.over('urls', '^8564_', override=True)
def urls(self, key, value):
    """Translate 8564_ urls, attaching volume-specific ones to a volume.

    When subfield ``y`` carries volume information (as recognised by
    ``extract_volume_info``), the url is stored on a volume entry through
    ``_insert_volume``; only the ``ebook`` description is accepted there.
    Otherwise the value is handed to the ``urls`` rule imported from
    ``.base``.

    :raises UnexpectedValue: if a volume url has a description other than
        ``ebook``.
    :raises IgnoreKey: when the url was attached to a volume.
    """
    sub_y = clean_val('y', value, str, default='')
    sub_u = clean_val('u', value, str, req=True)

    _migration = self['_migration']

    volume_info = extract_volume_info(sub_y) if sub_y else None
    if not volume_info:
        return urls_base(self, key, value)

    # url for a specific volume
    # TODO?
    description = volume_info['description']
    volume_number = volume_info['volume']
    if description != 'ebook':
        raise UnexpectedValue(subfield='y',
                              message=' unsupported value')
    _insert_volume(
        _migration,
        volume_number,
        {'url': sub_u, 'description': description},
    )
    raise IgnoreKey('urls')
22 changes: 12 additions & 10 deletions cds_dojson/marc21/fields/books/utils.py
Expand Up @@ -24,13 +24,14 @@

MAX_PAGES_NUMBER = 8192

RE_STR_VOLUME_NUMBER = r'(v(ol(ume)?)?|part|p|pt|t)[\s\.]*(\d+)'
RE_STR_VOLUME_PREFIX = r'(?:(?:[Vv](?:ol(?:ume)?)?|[Pp](?:art(?:ie)?|t)?|[Tt](?:eil)?|[Bb]d|[Tt]ome?|course|conference|fasc(?:icule)?|book|unit|suppl|Tafeln|Tomo)[\s\.]*)'
RE_STR_ROMAN_NUMERAL = r'(?:(?:IX|IV|V?I{1,3})|X{1,3}(?:IX|IV|V?I{0,3}))'
RE_STR_VOLUME_SUFFIX = r'(\d{{1,4}}|\d\d?[a-zA-Z]|[a-zA-Z]\d|[A-H]|{})'.format(RE_STR_ROMAN_NUMERAL)
RE_STR_VOLUME = r'(?:{}?{})'.format(RE_STR_VOLUME_PREFIX, RE_STR_VOLUME_SUFFIX)
RE_STR_SPECIAL = r'[^0-9A-Za-zÀ-ÿ\-/]'

RE_VOLUME_NUMBER = re.compile(RE_STR_VOLUME_NUMBER, re.IGNORECASE)
RE_VOLUME_INFO = re.compile(
r'(.*?)\({}\)'.format(RE_STR_VOLUME_NUMBER),
re.IGNORECASE
)
RE_VOLUME_NUMBER = re.compile(r'(?:^|{}){}(?:$|{})'.format(RE_STR_SPECIAL, RE_STR_VOLUME, RE_STR_SPECIAL))
RE_VOLUME_INFO = re.compile(r'(.*?)\({}\)'.format(RE_STR_VOLUME))


def is_excluded(value):
Expand Down Expand Up @@ -99,14 +100,15 @@ def extract_parts(value):
def extract_volume_number(value, search=False, raise_exception=False,
subfield=None):
"""Extract the volume number from a string, returns None if not matched."""
regex = RE_VOLUME_NUMBER
if search:
func = RE_VOLUME_NUMBER.search
func = regex.search
else:
func = RE_VOLUME_NUMBER.match
func = regex.match

result = func(value.strip())
if result:
return int(result.group(4))
return result.group(1)

if raise_exception:
raise MissingRequiredField(
Expand All @@ -121,6 +123,6 @@ def extract_volume_info(value):
if result:
return dict(
description=result.group(1).strip(),
volume=int(result.group(5)),
volume=result.group(2),
)
return None
1 change: 1 addition & 0 deletions cds_dojson/marc21/fields/utils.py
Expand Up @@ -161,6 +161,7 @@ def filter_list_values(f):
def wrapper(self, key, value, **kwargs):
out = f(self, key, value)
if out:
print(out)
clean_list = [dict((k, v) for k, v in elem.items()
if v) for elem in out if elem]
clean_list = [elem for elem in clean_list if elem]
Expand Down

0 comments on commit 076630d

Please sign in to comment.