books: fixed record match queries

* fixed dates and pages types in the output
CERNDocumentServer · Jul 8, 2020 · 746653c · 746653c
1 parent f3664d9
commit 746653c
Show file tree

Hide file tree

Showing 11 changed files with 110 additions and 84 deletions.
diff --git a/cds_dojson/marc21/fields/books/base.py b/cds_dojson/marc21/fields/books/base.py
@@ -25,6 +25,7 @@
 
 import pycountry
 from dateutil import parser
+from dateutil.parser import ParserError
 from dojson.errors import IgnoreKey
 from dojson.utils import filter_values, flatten, for_each_value, force_list
 
@@ -433,7 +434,7 @@ def alternative_identifiers(self, key, value):
         if field_type and field_type.lower() == 'doi':
             # if 0247__2 == doi it is a DOI identifier
             self['identifiers'] = dois(self, key, value)
-            raise IgnoreKey('external_system_identifiers')
+            raise IgnoreKey('alternative_identifiers')
         elif field_type and field_type.lower() == 'asin':
             indentifier_entry.update({'value': sub_a,
                                       'scheme': 'ASIN'})
@@ -616,6 +617,7 @@ def languages(self, key, value):
 @out_strip
 def subject_classification(self, key, value):
     """Translates subject classification field."""
+    prev_subjects = self.get('subjects', [])
     _subject_classification = {'value': clean_val('a', value, str, req=True)}
     if key == '080__':
         _subject_classification.update({'scheme': 'UDC'})
@@ -630,7 +632,10 @@ def subject_classification(self, key, value):
             _subject_classification.update({'scheme': 'ICS'})
     elif key.startswith('050'):
         _subject_classification.update({'scheme': 'LoC'})
-    return _subject_classification
+    if _subject_classification not in prev_subjects:
+        return _subject_classification
+    else:
+        raise IgnoreKey('subjects')
 
 
 @model.over('keywords', '(^084__)|(^6531_)')
@@ -644,14 +649,14 @@ def keywords(self, key, value):
         sub_2 = clean_val('2', value, str)
         if sub_2 and sub_2 == 'PACS':
             _keywords.append({
-                'name': clean_val('a', value, str, req=True),
+                'value': clean_val('a', value, str, req=True),
                 'source': 'PACS',
             })
         else:
             raise IgnoreKey('keywords')
     elif key == '6531_':
         _keywords.append({
-            'name': clean_val('a', value, str),
+            'value': clean_val('a', value, str),
             'source': value.get('9') or value.get('g'),
             # Easier to solve here
         })
@@ -737,8 +742,11 @@ def imprint(self, key, value):
     reprint = clean_val('g', value, str)
     if reprint:
         reprint = reprint.lower().replace('repr.', '').strip()
-    date = parser.parse(clean_val('c', value, str, req=True))
-    self['publication_year'] = date.date().year
+    try:
+        date = parser.parse(clean_val('c', value, str, req=True))
+    except ParserError:
+        raise UnexpectedValue(subfield='c')
+    self['publication_year'] = str(date.date().year)
     return {
         'date': clean_val('c', value, str, req=True),
         'place': clean_val('a', value, str),
@@ -768,22 +776,18 @@ def book_series(self, key, value):
 
 
 @model.over('note', '^500__')
-@filter_values
+@out_strip
 def note(self, key, value):
     """Translates public notes."""
     # merge all found notes
-    _note = self.get('note', {})
-    _value = _note.get('value', '')
-    if _value:
-        _value = \
-            "{0} / {1}".format(_value, clean_val('a', value, str, req=True))
+    _note = self.get('note', "")
+    if _note:
+        _note = \
+            "{0} / {1}".format(_note, clean_val('a', value, str, req=True))
     else:
-        _value = clean_val('a', value, str, req=True)
+        _note = clean_val('a', value, str, req=True)
 
-    return {
-        'value': _value,
-        'source': clean_val('9', value, str)
-    }
+    return _note
 
 
 @model.over('alternative_abstracts', '^520__')

diff --git a/cds_dojson/marc21/fields/books/book.py b/cds_dojson/marc21/fields/books/book.py
@@ -87,7 +87,7 @@ def number_of_pages(self, key, value):
     if parts['physical_copy_description']:
         self['physical_copy_description'] = parts['physical_copy_description']
     if parts['number_of_pages']:
-        return parts['number_of_pages']
+        return str(parts['number_of_pages'])
     raise UnexpectedValue(subfield='a')
 
 

diff --git a/cds_dojson/marc21/fields/books/multipart.py b/cds_dojson/marc21/fields/books/multipart.py
@@ -93,13 +93,16 @@ def isbns(self, key, value):
 
 
 @model.over('title', '^245__')
-@filter_values
+@out_strip
 def title(self, key, value):
     """Translates book series title."""
     # assume that it goes by order of fields and check 245 first
-    return {'title': clean_val('a', value, str),
-            'subtitle': clean_val('b', value, str),
-            }
+    if 'b' in value:
+        _alternative_titles = self.get('alternative_titles', [])
+        _alternative_titles.append({'type': 'SUBTITLE',
+                                    'value': clean_val('b', value, str)})
+        self['alternative_titles'] = _alternative_titles
+    return clean_val('a', value, str)
 
 
 @model.over('_migration', '^246__')

diff --git a/cds_dojson/marc21/fields/books/serial.py b/cds_dojson/marc21/fields/books/serial.py
@@ -20,7 +20,7 @@
 from dojson.utils import filter_values, for_each_value
 
 from cds_dojson.marc21.fields.books.errors import UnexpectedValue
-from cds_dojson.marc21.fields.utils import clean_val
+from cds_dojson.marc21.fields.utils import clean_val, out_strip
 from cds_dojson.marc21.models.books.serial import model
 
 
@@ -32,7 +32,7 @@ def recid(self, key, value):
 
 @model.over('title', '^490__')
 @for_each_value
-@filter_values
+@out_strip
 def title(self, key, value):
     """Translates book series title."""
     _identifiers = self.get('identifiers', [])
@@ -41,4 +41,4 @@ def title(self, key, value):
         _identifiers.append({'scheme': 'ISSN', 'value': issn})
         self['identifiers'] = _identifiers
     self['mode_of_issuance'] = 'SERIAL'
-    return {'title': clean_val('a', value, str, req=True)}
+    return clean_val('a', value, str, req=True)
diff --git a/cds_dojson/marc21/models/books/book.py b/cds_dojson/marc21/models/books/book.py
@@ -29,7 +29,9 @@ class CDSBook(CDSOverdoBookBase):
 
     __query__ = '690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
                 '690C_:BOOKSUGGESTION OR 980__:PROCEEDINGS OR 980__:PERI OR ' \
-                '697C_:LEGSERLIB OR 697C_:"ENGLISH BOOK CLUB" -980__:DELETED'
+                '(-980:STANDARD 980:BOOK) OR ' \
+                '697C_:LEGSERLIB ' \
+                '-980__:DELETED -980__:MIGRATED -980:__STANDARD' \
 
     __schema__ = 'https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json'
 

diff --git a/cds_dojson/marc21/models/books/multipart.py b/cds_dojson/marc21/models/books/multipart.py
@@ -29,7 +29,7 @@ class CDSMultipart(CDSOverdoBookBase):
 
     __query__ = '(690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
                 '690C_:BOOKSUGGESTION OR 980__:PROCEEDINGS OR 980__:PERI OR ' \
-                '697C_:LEGSERLIB OR 697C_:"ENGLISH BOOK CLUB" -980__:DELETED)'\
+                '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)'\
                 'AND 246__:/[a-zA-Z0-9]+/ '
 
     __schema__ = 'https://127.0.0.1:5000/schemas/series/series-v1.0.0.json'
@@ -67,6 +67,7 @@ class CDSMultipart(CDSOverdoBookBase):
         '082__a',
         '084__2',
         '084__a',
+        '084__c',
         '088__9',
         '088__a',
         '100__a',

diff --git a/cds_dojson/marc21/models/books/serial.py b/cds_dojson/marc21/models/books/serial.py
@@ -29,7 +29,7 @@ class CDSSerial(CDSOverdoBookBase):
 
     __query__ = '(690C_:BOOK OR 690C_:"YELLOW REPORT" OR ' \
                 '690C_:BOOKSUGGESTION OR 980__:PROCEEDINGS OR 980__:PERI OR ' \
-                '697C_:LEGSERLIB OR 697C_:"ENGLISH BOOK CLUB" -980__:DELETED)' \
+                '697C_:LEGSERLIB -980__:DELETED -980__:MIGRATED)' \
                 ' AND 490__:/[a-zA-Z0-9]+/'
 
     __schema__ = 'https://127.0.0.1:5000/schemas/series/series-v1.0.0.json'

diff --git a/cds_dojson/marc21/models/books/standard.py b/cds_dojson/marc21/models/books/standard.py
@@ -28,7 +28,7 @@
 class CDSStandard(CDSOverdoBookBase):
     """Translation Index for CDS Books."""
 
-    __query__ = '690C_:STANDARD -980__:DELETED'
+    __query__ = '690C_:STANDARD OR 980__:STANDARD -980__:DELETED -980__:MIGRATED'
 
     __schema__ = 'records/books/book/book-v.0.0.1.json'
 

diff --git a/tests/test_books.py b/tests/test_books.py
@@ -161,10 +161,10 @@ def test_subject_classification(app):
             </datafield>
             """, {
                 'keywords': [
-                    {'name': '13.75.Jz', 'source': 'PACS'},
-                    {'name': '13.60.Rj', 'source': 'PACS'},
-                    {'name': '14.20.Jn', 'source': 'PACS'},
-                    {'name': '25.80.Nv', 'source': 'PACS'},
+                    {'value': '13.75.Jz', 'source': 'PACS'},
+                    {'value': '13.60.Rj', 'source': 'PACS'},
+                    {'value': '14.20.Jn', 'source': 'PACS'},
+                    {'value': '25.80.Nv', 'source': 'PACS'},
                 ]
             }
         )
@@ -1626,7 +1626,7 @@ def test_imprint(app):
                 <subfield code="g">2015</subfield>
             </datafield>
             """, {
-                'publication_year': 2013,
+                'publication_year': "2013",
                 'imprint': {
                     'place': 'Sydney',
                     'publisher': 'Allen & Unwin',
@@ -1681,15 +1681,15 @@ def test_number_of_pages(app):
                 <subfield code="a">373 p</subfield>
             </datafield>
             """, {
-                'number_of_pages': 373,
+                'number_of_pages': "373",
             })
         check_transformation(
             """
             <datafield tag="300" ind1=" " ind2=" ">
                 <subfield code="a">480 p. ; 1 CD-ROM suppl</subfield>
             </datafield>
             """, {
-                'number_of_pages': 480,
+                'number_of_pages': "480",
                 'physical_copy_description': '1 CD-ROM'
             })
         check_transformation(
@@ -1698,7 +1698,7 @@ def test_number_of_pages(app):
                 <subfield code="a">42 p. ; 2 CD-ROM ; 1 DVD, 1 vhs</subfield>
             </datafield>
             """, {
-                'number_of_pages': 42,
+                'number_of_pages': "42",
                 'physical_copy_description': '2 CD-ROM, 1 DVD, 1 VHS'
             })
         check_transformation(
@@ -2209,10 +2209,7 @@ def test_note(app):
             </datafield>
             """,
             {
-                'note':
-                    {
-                        'value': """Translated from ... / No CD-ROM"""
-                    },
+                'note': """Translated from ... / No CD-ROM"""
             }
         )
         check_transformation(
@@ -2224,11 +2221,7 @@ def test_note(app):
                 </subfield>
             </datafield>
             """, {
-                'note':
-                    {
-                        'value': """Comments: Book, 380 p.,""",
-                        'source': 'arXiv',
-                    },
+                'note': """Comments: Book, 380 p.,"""
 
             }
         )
@@ -2504,7 +2497,7 @@ def test_541(app):
                         {'value': "practice and application",
                          'type': 'SUBTITLE'}
                     ],
-                    'recid': 2654497,
+                    'legacy_recid': 2654497,
                     'isbns': [
                         {
                             'medium': "print version, hardback",
@@ -2571,7 +2564,7 @@ def test_keywords(app):
             """,
             {
                 'keywords': [
-                    {'name': 'Keyword Name 1', 'source': 'PACS'},
+                    {'value': 'Keyword Name 1', 'source': 'PACS'},
                 ],
             }
         )

diff --git a/tests/test_matcher.py b/tests/test_matcher.py
@@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with Invenio; if not, write to the Free Software Foundation, Inc.,
 # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
-
+import pytest
 from dojson.contrib import marc21 as default
 
 from cds_dojson.marc21.models.books import book, standard, serial, multipart
@@ -70,7 +70,10 @@ def test_marc21_matcher_books():
 
     assert book.model == matcher(book_blob1, 'cds_dojson.marc21.models')
     assert book.model == matcher(book_blob2, 'cds_dojson.marc21.models')
-    assert book.model == matcher(book_blob3, 'cds_dojson.marc21.models')
+
+    with pytest.raises(AssertionError):
+        # English book club should not be matched
+        assert book.model == matcher(book_blob3, 'cds_dojson.marc21.models')
     assert standard.model == matcher(standard_blob1,
                                      'cds_dojson.marc21.models')
     assert serial.model == matcher(serial_blob1,