Skip to content

Commit

Permalink
Merge branch 'develop' into feature/ingregulate
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Nov 22, 2017
2 parents d6e82da + b6bf925 commit 1f722dc
Show file tree
Hide file tree
Showing 10 changed files with 125 additions and 24 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
## Added
* Support for set blacklists for sources that follow OAI-PMH protocol
* Set whitelist for UA Campus Repository
* Support for an encrypted JSON field, now used in the SourceConfig model

## Changed
* Collect metadata in MODS format from UA Campus Repository
* Update columbia.edu harvester source config (disabled set to false)

## Fixed
* Backfill CHANGELOG.md to include `2.10.0` and `2.11.0`
Expand Down
6 changes: 6 additions & 0 deletions project/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from django.utils.log import DEFAULT_LOGGING

from celery.schedules import crontab
import jwe


def split(string, delim):
Expand All @@ -37,6 +38,11 @@ def split(string, delim):
# SECURITY WARNING: keep the secret key used in production secret!
# Read from the SECRET_KEY environment variable; the literal below is only the fallback
# used when that variable is unset.
SECRET_KEY = os.environ.get('SECRET_KEY', 'c^0=k9r3i2@kh=*=(w2r_-sc#fd!+b23y%)gs+^0l%=bt_dst0')

# Salt fed to the key-derivation step below. Read from the SALT environment variable,
# with a hard-coded fallback when unset.
# NOTE(review): the fallback is committed to the repository -- presumably intended for
# local development only; confirm production deployments set both SECRET_KEY and SALT.
SALT = os.environ.get('SALT', 'r_-78y%c^(w2_ds0d*=t!+c=s+^0l=bt%2isc#f2@kh=0k5r)g')

# Key derived from SECRET_KEY and SALT via jwe.kdf (both UTF-8 encoded first).
# NOTE(review): since the key is a function of SECRET_KEY and SALT, changing either one
# presumably makes values encrypted under the old key undecryptable -- verify before rotating.
SENSITIVE_DATA_KEY = jwe.kdf(SECRET_KEY.encode('utf-8'), SALT.encode('utf-8'))


# SECURITY WARNING: don't run with debug turned on in production!
# Environment variables are always strings, and bool() of any non-empty string is True,
# so bool('False') or bool('0') would leave debug mode ON. Parse the value explicitly:
# unset -> True (development default), empty or a common "off" spelling -> False,
# anything else -> True.
DEBUG = str(os.environ.get('DEBUG', True)).strip().lower() not in ('', '0', 'false', 'no', 'off')

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ bleach==1.4.3 # Apache 2.0
boto3==1.4.4 # Apache 2.0
celery==4.1.0 # BSD 3 Clause
colorlog==2.7.0 # MIT
cryptography==1.9 # BSD or Apache 2.0
dateparser==0.4.0 # BSD
django-allauth==0.31.0 # MIT
django-cors-headers==2.0.2 # MIT
Expand Down Expand Up @@ -34,6 +35,7 @@ pillow==4.0.0 # PIL Software License:
psycogreen==1.0 # BSD
psycopg2==2.6.1 # LGPL with exceptions or ZPL
pycountry==1.20 # LGPL 2.1
PyJWE==1.0.0 # Apache 2.0
pyyaml==3.11 # MIT
raven==6.1.0 # BSD
requests==2.10.0 # Apache 2.0
Expand Down
26 changes: 26 additions & 0 deletions share/migrations/0048_auto_20171113_1852.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.4 on 2017-11-13 18:52
from __future__ import unicode_literals

from django.db import migrations
import share.models.fields


class Migration(migrations.Migration):
    """Add the two encrypted kwargs fields to the ``sourceconfig`` model."""

    dependencies = [('share', '0047_auto_20171019_2018')]

    # Both fields share the same definition, so build one AddField per field name.
    operations = [
        migrations.AddField(
            model_name='sourceconfig',
            name=column,
            field=share.models.fields.EncryptedJSONField(blank=True, null=True),
        )
        for column in ('private_harvester_kwargs', 'private_transformer_kwargs')
    ]
31 changes: 30 additions & 1 deletion share/models/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from decimal import Decimal
from functools import partial

import six
from dateutil import parser
import jwe
from psycopg2.extras import Json
import six

from django import forms
from django.conf import settings
from django.contrib.contenttypes.fields import GenericRelation
from django.contrib.postgres import lookups
from django.contrib.postgres.fields.jsonb import JSONField
Expand Down Expand Up @@ -550,3 +552,30 @@ def bulk_related_objects(self):
# https://github.com/django/django/blob/master/django/db/models/deletion.py#L151
# Disable django cascading deletes for this field
raise AttributeError('This is a dirty hack')


class EncryptedJSONField(models.BinaryField):
    """
    This field transparently encrypts data in the database. It should probably only be used with PG unless
    the user takes into account the db specific trade-offs with TextFields.

    Values are serialized with ``json.dumps``, encrypted with ``jwe.encrypt`` using
    ``settings.SENSITIVE_DATA_KEY`` and stored as ``prefix + ciphertext``. Any falsy value
    (``None``, ``{}``, ``[]``, ``''``, ``0``, ``False``, ...) is stored as NULL and read back as ``None``.
    """
    # Marker prepended to every stored ciphertext so encrypted payloads are recognisable.
    prefix = b'jwe:::'

    def get_db_prep_value(self, input_json, connection=None, prepared=False, **kwargs):
        """
        Serialize and encrypt ``input_json`` for storage.

        ``connection`` and ``prepared`` are accepted (positionally or by keyword) to match the
        signature Django uses when calling this hook; neither is needed here.

        Returns ``None`` for falsy input, otherwise ``bytes`` starting with ``prefix``.
        """
        if not input_json:
            return None

        serialized = json.dumps(input_json).encode('utf-8')
        return self.prefix + jwe.encrypt(serialized, settings.SENSITIVE_DATA_KEY)

    def to_python(self, output_json):
        """
        Convert a stored value back into the decoded JSON object.

        Binary input (``bytes``, ``bytearray`` or ``memoryview``) is treated as a stored
        ciphertext and decrypted. Any other non-empty value is assumed to be an
        already-decoded Python object (e.g. when called during model validation) and is
        returned unchanged.

        Raises ``ValueError`` if binary input does not start with ``prefix``.
        """
        if not output_json:
            return None

        if not isinstance(output_json, (bytes, bytearray, memoryview)):
            # Already a Python value; slicing/decrypting it would fail.
            return output_json

        stored = bytes(output_json)
        if not stored.startswith(self.prefix):
            # Refuse to silently drop the first bytes of something we did not write.
            raise ValueError('Encrypted value is missing the expected {!r} prefix'.format(self.prefix))

        decrypted = jwe.decrypt(stored[len(self.prefix):], settings.SENSITIVE_DATA_KEY)
        return json.loads(decrypted.decode('utf-8'))

    def from_db_value(self, value, expression, connection, context=None):
        """Decode a value loaded from the database; ``context`` is optional and unused."""
        return self.to_python(value)
6 changes: 5 additions & 1 deletion share/models/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
from django.utils import timezone
from django.utils.deconstruct import deconstructible

from share.util.extensions import Extensions
from share.models.fields import EncryptedJSONField
from share.models.fuzzycount import FuzzyCountManager
from share.models.indexes import ConcurrentIndex
from share.util import chunked, placeholders, BaseJSONAPIMeta
from share.util.extensions import Extensions


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -129,6 +130,9 @@ class SourceConfig(models.Model):

disabled = models.BooleanField(default=False)

private_harvester_kwargs = EncryptedJSONField(blank=True, null=True)
private_transformer_kwargs = EncryptedJSONField(blank=True, null=True)

objects = NaturalKeyManager('label')

class JSONAPIMeta(BaseJSONAPIMeta):
Expand Down
43 changes: 22 additions & 21 deletions share/search/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,24 @@ class CreativeWorkFetcher(Fetcher):
),
'''

# Gather all the works we want, so postgres doesn't get confused by the huge query below
# Exclude works with empty titles or too many identifiers
'''
all_creative_works AS (
SELECT *
FROM share_creativework AS all_creative_works
WHERE id IN (SELECT id FROM pks)
AND title != ''
AND (
SELECT COUNT(*) FROM (
SELECT * FROM share_workidentifier
WHERE share_workidentifier.creative_work_id = all_creative_works.id
LIMIT %(max_identifiers)s + 1
) AS identifiers
) <= %(max_identifiers)s
),
'''

# For each work, construct the JSON that (after post-processing) will be sent to elasticsearch
'''
results AS (
Expand All @@ -234,19 +252,13 @@ class CreativeWorkFetcher(Fetcher):
, 'retractions', COALESCE(retractions, '{}')
, 'lineage', COALESCE(lineage, '{}')
) AS _source
FROM share_creativework AS creativework
FROM all_creative_works AS creativework
LEFT JOIN LATERAL (
SELECT sources FROM all_sources WHERE creativework_id = creativework.id
) AS sources ON TRUE
LEFT JOIN all_sources ON all_sources.creativework_id = creativework.id
LEFT JOIN LATERAL (
SELECT tags FROM all_tags WHERE creativework_id = creativework.id
) AS tags ON TRUE
LEFT JOIN all_tags ON all_tags.creativework_id = creativework.id
LEFT JOIN LATERAL (
SELECT related_agents FROM all_related_agents WHERE creativework_id = creativework.id
) AS related_agents ON TRUE
LEFT JOIN all_related_agents ON all_related_agents.creativework_id = creativework.id
LEFT JOIN LATERAL (
SELECT array_agg(identifier.uri) AS identifiers
Expand Down Expand Up @@ -360,17 +372,6 @@ class CreativeWorkFetcher(Fetcher):
AND work_relation.type = %(retraction_relation)s
AND NOT retraction.is_deleted
) AS retractions ON TRUE
'''

# Exclude works with empty titles or too many identifiers
'''
WHERE creativework.id IN (SELECT id FROM pks)
AND creativework.title != ''
AND (
SELECT COUNT(*) FROM share_workidentifier
WHERE share_workidentifier.creative_work_id = creativework.id
LIMIT %(max_identifiers)s + 1
) <= %(max_identifiers)s
)
'''
)
Expand Down
2 changes: 1 addition & 1 deletion share/sources/edu.columbia/source.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
configs:
- base_url: http://academiccommons.columbia.edu/catalog/oai
disabled: true
disabled: false
earliest_date: null
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
Expand Down
1 change: 1 addition & 0 deletions tests/factories/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class MockHarvester(BaseHarvester):

mock_entry = mock.create_autospec(pkg_resources.EntryPoint, instance=True)
mock_entry.name = self.key
mock_entry.module_name = self.key
mock_entry.resolve.return_value = MockHarvester

stevedore.ExtensionManager.ENTRY_POINT_CACHE['share.harvesters'].append(mock_entry)
Expand Down
30 changes: 30 additions & 0 deletions tests/share/util/test_encrypted_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pytest

from share.models.fields import EncryptedJSONField


class TestEncryptedJsonField:
    """Round-trip checks for ``EncryptedJSONField``."""

    @pytest.fixture
    def field(self):
        """Provide a nullable, blank-able field instance."""
        encrypted_field = EncryptedJSONField(null=True, blank=True)
        return encrypted_field

    @pytest.mark.parametrize('input_text, output_text, isempty', [
        (['atom', {'elements': ['hydrogen', 'oxygen', 1.0, 2]}], ['atom', {'elements': ['hydrogen', 'oxygen', 1.0, 2]}], False),
        ({'msg': u'hello'}, {'msg': u'hello'}, False),
        ({"model": u'찦차КЛМНО💁◕‿◕。)╱i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤  ǝɹol', "type": 'XE'}, {"model": u'찦차КЛМНО💁◕‿◕。)╱i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤  ǝɹol', "type": 'XE'}, False),
        ({}, None, True),
        ('', None, True),
        ([], None, True),
        (set(), None, True)
    ])
    def test_encrypt_and_decrypt(self, field, input_text, output_text, isempty):
        """Encrypt a value, check the stored form, then decrypt and compare."""
        stored = field.get_db_prep_value(input_text)

        # Empty inputs are stored as NULL; everything else becomes a byte string.
        if not isempty:
            assert isinstance(stored, bytes)
        else:
            assert stored is None

        restored = field.from_db_value(stored, None, None, None)
        assert restored == output_text

0 comments on commit 1f722dc

Please sign in to comment.