Skip to content

Commit

Permalink
migration: updated schema changes
Browse files Browse the repository at this point in the history
* changed all the identifiers to common structure
* removed all the emails
* removed funding info
* restructured title objects
  • Loading branch information
kpsherva committed Jun 30, 2020
1 parent ac076e3 commit ad5b9cd
Show file tree
Hide file tree
Showing 14 changed files with 1,616 additions and 575 deletions.
409 changes: 234 additions & 175 deletions cds_dojson/marc21/fields/books/base.py

Large diffs are not rendered by default.

80 changes: 47 additions & 33 deletions cds_dojson/marc21/fields/books/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,39 +27,51 @@
MissingRequiredField, UnexpectedValue
from cds_dojson.marc21.fields.books.utils import is_excluded, extract_parts, \
extract_volume_number
from cds_dojson.marc21.fields.utils import clean_val
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.book import model
from .base import alternative_titles as alternative_titles_base


@model.over('alternative_titles', '^246__')
@for_each_value
@filter_values
@model.over('alternative_titles', '(^246__)|(^242__)')
@filter_list_values
def alternative_titles(self, key, value):
"""Alternative titles."""
if ('n' in value and 'p' not in value) or \
('n' not in value and 'p' in value):
raise MissingRequiredField(subfield='n or p')
_alternative_titles = self.get('alternative_titles', [])

if 'p' in value:
_migration = self.get('_migration', {})
if 'volumes' not in _migration:
_migration['volumes'] = []
if key == '242__':
_alternative_titles += (alternative_titles_base(self, key, value))
elif key == '246__':
if ('n' in value and 'p' not in value) or \
('n' not in value and 'p' in value):
raise MissingRequiredField(subfield='n or p')

val_n = clean_val('n', value, str)
_migration['volumes'].append({
'volume': extract_volume_number(val_n, raise_exception=True),
'title': clean_val('p', value, str),
})
_migration['is_multipart'] = True
_migration['record_type'] = 'multipart'
self['_migration'] = _migration
raise IgnoreKey('alternative_titles')
else:
return {
'title': clean_val('a', value, str, req=True),
'subtitle': clean_val('b', value, str),
'source': clean_val('i', value, str),
}
if 'p' in value:
_migration = self.get('_migration', {})
if 'volumes' not in _migration:
_migration['volumes'] = []

val_n = clean_val('n', value, str)
_migration['volumes'].append({
'volume': extract_volume_number(val_n, raise_exception=True),
'title': clean_val('p', value, str),
})
_migration['is_multipart'] = True
_migration['record_type'] = 'multipart'
self['_migration'] = _migration
raise IgnoreKey('alternative_titles')
else:
if 'a' in value:
_alternative_titles.append({
'value': clean_val('a', value, str, req=True),
'type': 'ALTERNATIVE_TITLE',
})
if 'b' in value:
_alternative_titles.append({
'value': clean_val('b', value, str, req=True),
'type': 'SUBTITLE',
})
return _alternative_titles


@model.over('number_of_pages', '^300__') # item
Expand All @@ -72,21 +84,23 @@ def number_of_pages(self, key, value):
parts = extract_parts(val)
if parts['has_extra']:
raise UnexpectedValue(subfield='a')
if parts['physical_description']:
self['physical_description'] = parts['physical_description']
if parts['physical_copy_description']:
self['physical_copy_description'] = parts['physical_copy_description']
if parts['number_of_pages']:
return parts['number_of_pages']
raise UnexpectedValue(subfield='a')


@model.over('title', '^245__')
@filter_values
@out_strip
def title(self, key, value):
"""Translates title."""
if 'title' in self:
raise UnexpectedValue()

return {
'title': clean_val('a', value, str, req=True),
'subtitle': clean_val('b', value, str),
}
if 'b' in value:
_alternative_titles = self.get('alternative_titles', [])
_alternative_titles.append(
{'value': clean_val('b', value, str), 'type': 'SUBTITLE'})
self['alternative_titles'] = _alternative_titles
return clean_val('a', value, str, req=True)
23 changes: 15 additions & 8 deletions cds_dojson/marc21/fields/books/multipart.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
ManualMigrationRequired, MissingRequiredField
from cds_dojson.marc21.fields.books.utils import extract_parts, \
extract_volume_number, extract_volume_info
from cds_dojson.marc21.fields.utils import clean_val, out_strip
from cds_dojson.marc21.fields.utils import clean_val, out_strip, \
filter_list_values
from cds_dojson.marc21.models.books.multipart import model


Expand All @@ -37,13 +38,13 @@ def recid(self, key, value):
return int(value)


@model.over('isbns', '^020__')
@out_strip
@model.over('identifiers', '^020__')
@filter_list_values
@for_each_value
def isbns(self, key, value):
"""Translates isbns stored in the record."""
_migration = self.get('_migration', {'volumes': []})
_isbns = self.get('isbns', [])
_identifiers = self.get('identifiers', [])

val_u = clean_val('u', value, str)
val_a = clean_val('a', value, str)
Expand All @@ -54,6 +55,7 @@ def isbns(self, key, value):
# if set found it means that the isbn is for the whole multipart
set_search = re.search('(.*?)\(set\.*\)', val_u)
if volume_info:
# if we have volume there it means that the ISBN is of the volume
volume_obj = {
'volume': volume_info['volume'],
'isbn': clean_val('a', value, str),
Expand All @@ -62,9 +64,11 @@ def isbns(self, key, value):
}
_migration['volumes'].append(volume_obj)
self['_migration'] = _migration
raise IgnoreKey('identifiers')
if set_search:
self['physical_description'] = set_search.group(1).strip()
return val_a if val_a not in _isbns else None # monograph isbn
isbn = {'scheme': 'ISBN', 'value': val_a}
return isbn if isbn not in _identifiers else None
if not volume_info:
# Try to find a volume number
if extract_volume_number(val_u, search=True):
Expand All @@ -74,13 +78,16 @@ def isbns(self, key, value):
)
else:
self['physical_description'] = val_u
return val_a if val_a not in _isbns else None
isbn = {'scheme': 'ISBN', 'value': val_a}
return isbn if isbn not in _identifiers else None
if not set_search and not volume_info:
self['physical_description'] = val_u
return val_a if val_a not in _isbns else None
isbn = {'scheme': 'ISBN', 'value': val_a}
return isbn if isbn not in _identifiers else None
elif not val_u and val_a:
# if there is no volume info, only an isbn
return val_a if val_a not in _isbns else None
isbn = {'scheme': 'ISBN', 'value': val_a}
return isbn if isbn not in _identifiers else None
else:
raise UnexpectedValue(subfield='a', message=' isbn not provided')

Expand Down
4 changes: 3 additions & 1 deletion cds_dojson/marc21/fields/books/serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ def recid(self, key, value):
@filter_values
def title(self, key, value):
"""Translates book series title."""
_identifiers = self.get('identifiers', [])
issn = clean_val('x', value, str)
if issn:
self['issn'] = issn
_identifiers.append({'scheme': 'ISSN', 'value': issn})
self['identifiers'] = _identifiers
self['mode_of_issuance'] = 'SERIAL'
return {'title': clean_val('a', value, str, req=True)}
48 changes: 30 additions & 18 deletions cds_dojson/marc21/fields/books/standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,34 @@

from cds_dojson.marc21.fields.books.errors import UnexpectedValue
from cds_dojson.marc21.fields.books.utils import is_excluded, extract_parts
from cds_dojson.marc21.fields.utils import clean_val
from cds_dojson.marc21.fields.utils import clean_val, filter_list_values, \
out_strip
from cds_dojson.marc21.models.books.standard import model


@model.over('title_translations', '^246__')
@for_each_value
@filter_values
@model.over('alternative_titles', '^246__')
@filter_list_values
def title_translations(self, key, value):
"""Translates title translations."""
return {
'title': clean_val('a', value, str, req=True),
'language': 'fr',
'subtitle': clean_val('b', value, str),
'source': clean_val('i', value, str),
}
_alternative_titles = self.get('alternative_titles', [])

if 'a' in value:
_alternative_titles.append({
'value': clean_val('a', value, str, req=True),
'type': 'TRANSLATED_TITLE',
'language': 'fr',
})
if 'b' in value:
_alternative_titles.append({
'value': clean_val('b', value, str, req=True),
'type': 'TRANSLATED_SUBTITLE',
'language': 'fr',
})
return _alternative_titles
# 'source': clean_val('i', value, str),

@model.over('number_of_pages', '^300__') # item

@model.over('number_of_pages', '^300__') # item
def number_of_pages(self, key, value):
"""Translates number_of_pages fields."""
val = clean_val('a', value, str)
Expand All @@ -51,21 +61,23 @@ def number_of_pages(self, key, value):
parts = extract_parts(val)
if parts['has_extra']:
raise UnexpectedValue(subfield='a')
if parts['physical_description']:
self['physical_description'] = parts['physical_description']
if parts['physical_copy_description']:
self['physical_copy_description'] = parts['physical_copy_description']
if parts['number_of_pages']:
return parts['number_of_pages']
raise UnexpectedValue(subfield='a')


@model.over('title', '^245__')
@filter_values
@out_strip
def title(self, key, value):
"""Translates title."""
if 'title' in self:
raise UnexpectedValue()

return {
'title': clean_val('a', value, str, req=True),
'subtitle': clean_val('b', value, str),
}
if 'b' in value:
_alternative_titles = self.get('alternative_titles', [])
_alternative_titles.append(
{'value': clean_val('b', value, str), 'type': 'SUBTITLE'})
self['alternative_titles'] = _alternative_titles
return clean_val('a', value, str, req=True)
2 changes: 1 addition & 1 deletion cds_dojson/marc21/fields/books/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def extract_parts(value):
return {
"has_extra": bool(valid_parts_count > 0),
"number_of_pages": number_of_pages,
"physical_description": physical_description,
"physical_copy_description": physical_description,
}


Expand Down
4 changes: 2 additions & 2 deletions cds_dojson/marc21/fields/books/values_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@
}

ACQUISITION_METHOD = {
'submitter': ['h'],
'batchuploader': ['n', 'm'],
'user': ['H'],
'batchuploader': ['N', 'M'],
}

MEDIUM_TYPES = [
Expand Down
27 changes: 15 additions & 12 deletions cds_dojson/marc21/fields/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def clean_val(subfield, value, var_type, req=False, regex_format=None,
def clean_email(value):
"""Cleans the email field."""
if value:
email = value.strip().replace(' [CERN]', '@cern.ch').\
email = value.strip().replace(' [CERN]', '@cern.ch'). \
replace('[CERN]', '@cern.ch')
return email

Expand All @@ -129,6 +129,7 @@ def get_week_start(year, week):

def replace_in_result(phrase, replace_with, key=None):
"""Replaces string values in list with given string."""

def the_decorator(fn_decorated):
def proxy(*args, **kwargs):
res = fn_decorated(*args, **kwargs)
Expand All @@ -140,12 +141,15 @@ def proxy(*args, **kwargs):
return [dict((k, v.replace(phrase, replace_with).strip())
for k, v in elem.items()) for elem in res]
return res

return proxy

return the_decorator


def filter_list_values(f):
"""Remove None and blank string values from list of dictionaries."""

@functools.wraps(f)
def wrapper(self, key, value, **kwargs):
out = f(self, key, value)
Expand All @@ -158,11 +162,13 @@ def wrapper(self, key, value, **kwargs):
return clean_list
else:
raise IgnoreKey(key)

return wrapper


def out_strip(fn_decorated):
"""Decorator cleaning output values of trailing and following spaces."""

def proxy(self, key, value, **kwargs):
res = fn_decorated(self, key, value, **kwargs)
if not res:
Expand All @@ -178,6 +184,7 @@ def proxy(self, key, value, **kwargs):
return cleaned
else:
return res

return proxy


Expand Down Expand Up @@ -365,28 +372,24 @@ def build_contributor_books(value):
if not value.get('a'):
return []

value_9 = clean_val('9', value, str)

contributor = {
'ids': _extract_json_ids(value, 'schema') or None,
'identifiers': _extract_json_ids(value, 'scheme') or None,
'full_name': value.get('name') or clean_val('a', value, str),
'email': clean_email(value.get('email')),
'roles': [
_get_correct_books_contributor_role(
'e', value.get('e', 'author')).lower()
],
'curated_relation': True if value_9 == '#BEARD#' else None
}

value_u = value.get('u')
if value_u:
contributor['affiliations'] = list(force_list(value_u))
values_u_list = list(force_list(value_u))
other = ['et al.', 'et al']
found_other = [i for i in other if i in contributor['affiliations']]
if found_other:
for x in found_other:
if x in contributor['affiliations']:
contributor['affiliations'].remove(x)
for x in other:
if x in values_u_list:
values_u_list.remove(x)
contributor['affiliations'] = [{'name': x} for x in
values_u_list]
contributor = dict(
(k, v) for k, v in iteritems(contributor) if v is not None
)
Expand Down
5 changes: 5 additions & 0 deletions cds_dojson/marc21/models/books/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
'269__a', # preprint info
'269__b', # preprint info
'269__c', # preprint date
'270__m', # conference email
'300__b', # 206 cds-dojson
'340__a',
'440_3a', # 206 cds-dojson
Expand All @@ -56,6 +57,10 @@
'502__c', # thesis_info/institutions
'502__d', # thesis_info/date (publication)
'5208_a', # 206 cds-dojson
'536__a', # funding info, dropped
'536__c',
'536__f',
'536__r',
'650172',
'65017a',
'650272',
Expand Down

0 comments on commit ad5b9cd

Please sign in to comment.