Merge 2b88c9e into 650a821

CERNDocumentServer · Jul 7, 2017 · 28bdce5 · 28bdce5
2 parents 650a821 + 2b88c9e
commit 28bdce5
Show file tree

Hide file tree

Showing 20 changed files with 1,315 additions and 1,103 deletions.
diff --git a/cds_dojson/marc21/fields/base.py b/cds_dojson/marc21/fields/base.py
@@ -30,7 +30,7 @@ def recid(self, key, value):
 @marc21.over('agency_code', '^003')
 def agency_code(self, key, value):
     """Control number identifier."""
-    return value
+    return value or 'SzGeCERN'
 
 
 @marc21.over('modification_date', '^005')

diff --git a/cds_dojson/marc21/fields/videos/__init__.py b/cds_dojson/marc21/fields/videos/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of CERN Document Server.
+# Copyright (C) 2017 CERN.
+#
+# Invenio is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Invenio is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Invenio; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+"""CDS Video fields."""
diff --git a/cds_dojson/marc21/fields/videos/video.py b/cds_dojson/marc21/fields/videos/video.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of CERN Document Server.
+# Copyright (C) 2017 CERN.
+#
+# Invenio is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Invenio is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Invenio; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+"""Video fields."""
+
+import re
+
+import requests
+from dojson.utils import filter_values, for_each_value, force_list
+
+from ...models.videos.video import model
+
+# Required fields
+
+
+@model.over('title', '^245_[1_]')
+@filter_values
+def title(self, key, value):
+    """Build the title object from a MARC 245 field.
+
+    Subfield ``a`` is stored as ``title`` and subfield ``b`` as ``subtitle``.
+    The result goes through ``filter_values`` (presumably dropping the
+    entries without a value; see ``dojson.utils``).
+    """
+    main_title = value.get('a')
+    subtitle = value.get('b')
+    return {'title': main_title, 'subtitle': subtitle}
+
+
+@model.over('description', '^520__')
+def description(self, key, value):
+    """Return subfield ``a`` of the MARC 520 field as the description."""
+    summary = value.get('a')
+    return summary
+
+
+@model.over('date', '^269__')
+def date(self, key, value):
+    """Return subfield ``c`` of the MARC 269 field as the date."""
+    raw_date = value.get('c')
+    return raw_date
+
+
+# NOTE(review): unlike the sibling rules this pattern has no leading '^';
+# confirm whether that is intentional.
+@model.over('publication_date', '937__')
+def publication_date(self, key, value):
+    """Return subfield ``c`` of the MARC 937 field as the publication date.
+
+    The value is passed through as found in the record.
+    """
+    # TODO: normalize data
+    raw_date = value.get('c')
+    return raw_date
+
+
+def _get_author_info_from_people_collection(info):
+    """Complete author information using the CDS auto-completion endpoint.
+
+    :param info: dictionary with the subfields of one author; ``a`` is read
+        as the author name and the presence of ``0`` means identifiers are
+        already known.
+    :returns: ``info`` itself when it already has ``0``, when it has no
+        name, or when the endpoint does not return exactly one match;
+        otherwise the single entry returned by the endpoint, with a ``name``
+        key added when the endpoint did not provide one.
+    :raises requests.RequestException: if the HTTP request fails or times
+        out.
+    """
+    # TODO: probably we will need to extract this somewhere else
+    URL = 'https://cds.cern.ch/submit/get_authors'
+    name = info.get('a')
+    if '0' in info or not name:
+        # There is already enough information or we don't have a name to query
+        return info
+    # Let requests build the query string so that names containing spaces,
+    # '&', '#' or non-ASCII characters are encoded properly, and bound the
+    # wait so a stalled server cannot block the conversion forever.
+    response = requests.get(
+        URL,
+        params={'query': name, 'relative_curdir': 'cdslabs/videos'},
+        timeout=30,
+    )
+    author_info = response.json()
+    if not author_info or len(author_info) > 1:
+        # Didn't find anything or found too many matches
+        return info
+
+    # Prepare author name
+    author_info = author_info[0]
+    if 'name' not in author_info:
+        author_info['name'] = '{0}, {1}'.format(author_info['lastname'],
+                                                author_info['firstname'])
+    return author_info
+
+
+def _get_correct_role(role):
+    """Clean up a contributor role.
+
+    Currently a pass-through: the role is returned exactly as received.
+    """
+    # TODO: decide on roles and values
+    cleaned_role = role
+    return cleaned_role
+
+
+def _extract_json_ids(info):
+    """Build the list of identifiers of a contributor.
+
+    Identifiers come from two places in ``info``:
+
+    - subfield ``0``, whose values look like ``AUTHOR|(CDS)<id>``,
+      ``AUTHOR|(INSPIRE)<id>`` or ``(SzGeCERN)<id>``; values in any other
+      form are ignored;
+    - the ``cernccid``, ``recid`` and ``inspireid`` keys, when present
+      (filled in by the auto-completion lookup).
+
+    :param info: dictionary with the author information.
+    :returns: list of ``{'value': ..., 'source': ...}`` dictionaries, where
+        ``source`` is one of ``CERN``, ``CDS`` or ``INSPIRE``.
+    """
+    SOURCES = {
+        'AUTHOR|(INSPIRE)': 'INSPIRE',
+        'AUTHOR|(CDS)': 'CDS',
+        '(SzGeCERN)': 'CERN'
+    }
+    regex = re.compile(r'((AUTHOR\|\((CDS|INSPIRE)\))|(\(SzGeCERN\)))(.*)')
+    ids = []
+    # A non-repeated subfield may arrive as a bare string; wrap it so we
+    # iterate over identifiers and not over single characters.
+    for id_ in force_list(info.get('0')) or []:
+        match = regex.match(id_)
+        if not match:
+            # Unknown scheme: skip it instead of failing on ``None.group``.
+            continue
+        ids.append({
+            'value': match.group(5),
+            'source': SOURCES[match.group(1)]
+        })
+    # Try and get the IDs from the auto-completion
+    auto_completion_keys = (
+        ('cernccid', 'CERN'),
+        ('recid', 'CDS'),
+        ('inspireid', 'INSPIRE'),
+    )
+    for key, source in auto_completion_keys:
+        if key in info:
+            ids.append({'value': info[key], 'source': source})
+
+    return ids
+
+
+@filter_values
+def _build_contributor(value):
+    """Create the contributor object for one author field.
+
+    The author information is first completed through
+    ``_get_author_info_from_people_collection``; name and affiliations are
+    then read from either the MARC subfields (``a``, ``u``) or the
+    auto-completion keys (``name``, ``affiliation``).
+    """
+    author = _get_author_info_from_people_collection(value)
+
+    identifiers = _extract_json_ids(author) or None
+    full_name = author.get('a') or author.get('name')
+    affiliations = force_list(author.get('u') or author.get('affiliation'))
+    role = _get_correct_role(author.get('e'))
+    email = author.get('email')
+
+    return {
+        'ids': identifiers,
+        'name': full_name,
+        'affiliations': affiliations,
+        'role': role,
+        'email': email,
+    }
+
+
+@model.over('contributors', '^(100|700)__')
+def contributors(self, key, value):
+    """Accumulate contributors from the MARC 100 and 700 fields.
+
+    Contributors already stored under ``contributors`` are kept and the ones
+    built from ``value`` are appended after them.
+    """
+    collected = self.get('contributors', [])
+    collected.extend(
+        _build_contributor(entry) for entry in force_list(value)
+    )
+    return collected
+
+
+@model.over('report_number', '^(037|088)__')
+@for_each_value
+def report_number(self, key, value):
+    """Report number.
+
+    Category and type are also derived from the report number: for the main
+    report number (037__) the first two dash-separated parts are stored in
+    ``self['category']`` and ``self['type']``.
+
+    :returns: subfield ``a`` or, failing that, subfield ``9``.
+    """
+    rn = value.get('a') or value.get('9')
+    if rn and key.startswith('037__'):
+        # Extract category and type only from main report number, i.e. 037__a
+        parts = rn.split('-')
+        # A report number without a dash cannot be split into category and
+        # type; leave both unset instead of failing on the unpacking.
+        if len(parts) >= 2:
+            self['category'], self['type'] = parts[:2]
+
+    return rn
+
+
+@model.over('duration', '^300__')
+def duration(self, key, value):
+    """Duration.
+
+    The new duration must be expressed in the form hh:mm:ss[.mmm]
+
+    :returns: the ``hh:mm:ss`` part of subfield ``a`` (any fractional part
+        is dropped), or ``None`` when the subfield is missing or does not
+        start with that form.
+    """
+    pattern = r'(\d{2}:\d{2}:\d{2})(\.\d+)?'
+    try:
+        return re.match(pattern, value.get('a')).group(1)
+    except (AttributeError, TypeError):
+        # AttributeError: the regex didn't match.
+        # TypeError: subfield ``a`` is missing or is not a string.
+        # TODO: should we try to match something else?
+        return None
+
+
+# Access
+@model.over('_access', '(^859__)|(^506[1_]_)')
+def access(self, key, value):
+    """Access rights.
+
+    It includes read/update access.
+    - 859__f contains the email of the submitter.
+    - 506__m/5061_d list of groups or emails of people who can access the
+      record. The groups are in the form <group-name> [CERN] which needs to be
+      transformed into the email form.
+
+    Rights already stored under ``_access`` are kept and extended.
+    """
+    rights = self.get('_access', {})
+    for field in force_list(value):
+        if key == '859__' and 'f' in field:
+            rights.setdefault('update', []).append(field.get('f'))
+        elif key.startswith('506'):
+            readers = rights.setdefault('read', [])
+            entries = force_list(field.get('d') or field.get('m', ''))
+            emails = [
+                entry.replace(' [CERN]', '@cern.ch')
+                for entry in entries if entry
+            ]
+            readers.extend(emails)
+    return rights
diff --git a/cds_dojson/marc21/models/videos/video.py b/cds_dojson/marc21/models/videos/video.py
@@ -31,6 +31,8 @@ class CDSVideo(OverdoJSONSchema):
 
     __schema__ = 'records/videos/video/video-v1.0.0.json'
 
+    __ignore_keys__ = {'035__9', '035__a', '5061_2', '5061_5', '5061_a'}
+
 
 model = CDSVideo(bases=(cds_base, ),
                  entry_point_group='cds_dojson.marc21.video')
diff --git a/cds_dojson/overdo.py b/cds_dojson/overdo.py
@@ -63,7 +63,7 @@ class Overdo(DoJSONOverdo):
     __query__ = ''
     """To be used by the matcher to find the proper model."""
 
-    __ignore_keys__ = []
+    __ignore_keys__ = set()
     """List of keys which don't need transformation."""
 
     def over(self, name, *source_tags, **kwargs):
@@ -89,7 +89,7 @@ def override(rule):
 
     def missing(self, blob, **kwargs):
         """Return keys with missing rules."""
-        return set(self.__class__.__ignore_keys__).symmetric_difference(
+        return self.__class__.__ignore_keys__.symmetric_difference(
             not_accessed_keys(blob))