Initial commit

APSL · May 10, 2018 · a357fd0 · a357fd0
1 parent 88421af
commit a357fd0
Show file tree

Hide file tree

Showing 12 changed files with 377 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,70 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+docs/_static
+docs/_templates
+
+# PyBuilder
+target/
+
+# PyCharm
+.idea
+
+# Python decouple settings file
+settings.ini
+
+log/
+py27/
+py35/
+flake8/
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1 @@
+Marc Galofré
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -0,0 +1,8 @@
+==========
+Change log
+==========
+
+0.1 (2017-06-21)
+----------------
+
+* Initial release.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2018 - APSL
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,4 @@
+include AUTHORS
+include LICENSE
+include CHANGELOG.rst
+include README.rst
diff --git a/README.md → README.rst b/README.md → README.rst
diff --git a/hattori/__init__.py b/hattori/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# Package metadata. setup.py extracts these three values from this file with regular
+# expressions, so each one has to stay a plain quoted string assignment at the start of a line.
+__author__ = 'Marc Galofré'
+__email__ = 'mgalofre@apsl.net'
+__version__ = '0.1'
diff --git a/hattori/base.py b/hattori/base.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from six import string_types
+from django.conf import settings
+from bulk_update.helper import bulk_update
+from faker import Faker
+from multiprocessing import Pool
+
+# Name of the module that is looked up inside every app to find its anonymizers
+ANONYMIZER_MODULE_NAME = 'anonymizers'
+# Number of instances handed to each worker task in parallel mode; also the default bulk_update batch size
+DEFAULT_CHUNK_SIZE = 50
+
+logger = logging.getLogger(__name__)
+
+# Module-level Faker instance, created with the project's LANGUAGE_CODE as locale.
+# Falls back to Faker's default locale when that raises AttributeError
+# (presumably a missing setting or a locale Faker rejects -- TODO confirm).
+try:
+    faker = Faker(settings.LANGUAGE_CODE)
+except AttributeError:
+    faker = Faker()
+
+
+class BaseAnonymizer:
+
+    def __init__(self):
+        try:
+            getattr(self, 'model')
+            getattr(self, 'attributes')
+        except AttributeError:
+            logger.info('ERROR: Your anonymizer is missing the model or attributes definition!')
+            exit(1)
+
+    def get_query_set(self):
+        """
+        You can override this in your Anonymizer.
+        :return: QuerySet
+        """
+        return self.model.objects.all()
+
+    def get_allowed_value(self, replacer, model_instance, field_name):
+        retval = replacer()
+        max_length = model_instance._meta.get_field(field_name).max_length
+        if max_length:
+            retval = retval[:max_length]
+        return retval
+
+    def _process_instances(self, instances):
+        count_fields = 0
+        count_instances = 0
+
+        for model_instance in instances:
+            for field_name, replacer in self.attributes:
+                if callable(replacer):
+                    replaced_value = self.get_allowed_value(replacer, model_instance, field_name)
+                elif isinstance(replacer, string_types):
+                    replaced_value = replacer
+                else:
+                    raise TypeError('Replacers need to be callables or Strings!')
+                setattr(model_instance, field_name, replaced_value)
+                count_fields += 1
+            count_instances += 1
+        return instances, count_instances, count_fields
+
+    def _run_parallel(self, instances, parallel_processes):
+        count_instances = 0
+        count_fields = 0
+        instances_processed = []
+        chunks = [instances[i:i + DEFAULT_CHUNK_SIZE] for i in range(0, len(instances), DEFAULT_CHUNK_SIZE)]
+        pool = Pool(processes=parallel_processes)
+        futures = [pool.apply_async(self._process_instances, (objs,)) for objs in chunks]
+        for future in futures:
+            instances_parallel, count_instances_parallel, count_fields_parallel = future.get()
+            instances_processed += instances_parallel
+            count_instances += count_instances_parallel
+            count_fields += count_fields_parallel
+        pool.close()
+        pool.join()
+        return instances_processed, count_instances, count_fields
+
+    def run(self, batch_size=None, parallel_processes=0):
+        instances = self.get_query_set()
+        batch_size = DEFAULT_CHUNK_SIZE if batch_size is None else int(batch_size)
+
+        if parallel_processes > 1:
+            instances_processed, count_instances, count_fields = self._run_parallel(instances, parallel_processes)
+        else:
+            instances_processed, count_instances, count_fields = self._process_instances(instances)
+
+        bulk_update(instances_processed, update_fields=[attrs[0] for attrs in self.attributes],
+                    batch_size=batch_size)
+
+        return len(self.attributes), count_instances, count_fields
diff --git a/hattori/management/__init__.py b/hattori/management/__init__.py
diff --git a/hattori/management/commands/__init__.py b/hattori/management/commands/__init__.py
diff --git a/hattori/management/commands/anonymize_db.py b/hattori/management/commands/anonymize_db.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from importlib import import_module
+from importlib.util import find_spec
+import inspect
+import sys
+
+from django.conf import settings
+from django.core.management import BaseCommand
+
+from hattori.base import ANONYMIZER_MODULE_NAME, BaseAnonymizer
+
+
+class Command(BaseCommand):
+    help = 'This tool replaces real (user-)data of model instances in your database with mock data.'
+    modules = None  # List of anonymizers modules. They can be placed in every app
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '-a',
+            '--app',
+            help='Only anonymize the given app',
+            dest="app",
+            metavar="APP"
+        )
+        parser.add_argument(
+            "-m",
+            "--model",
+            "--models",
+            dest="models",
+            help="Models to anonymize. Separate multiples by comma.",
+            metavar="MODEL"
+        )
+        parser.add_argument(
+            "-b",
+            "--batch-size",
+            dest="batch_size",
+            help="batch size used in the bulk_update of the instances. Depends on the DB machine. Use 500 in vagrant.",
+            metavar="BATCH_SIZE",
+            type=int
+        )
+        parser.add_argument(
+            "-p",
+            "--parallel",
+            dest="parallel",
+            help="Number of parallel processes for parallel execution",
+            metavar="PARALLEL",
+            type=int,
+            default=0
+        )
+
+    def handle(self, *args, **options):
+        models = None
+        if options['models'] is not None:
+            models = [m.strip() for m in options['models'].split(',')]
+
+        if options['parallel'] > 0:
+            self.stdout.write('Running in parallel mode with {} concurrent processes'.format(options['parallel']))
+        self.stdout.write('Autodiscovering anonymizers...')
+
+        modules = self._autodiscover_module(ANONYMIZER_MODULE_NAME, app=options['app'])
+        self.stdout.write('Found anonymizers for {} apps'.format(len(modules)))
+        total_replacements_count = 0
+        for module in modules:
+            self.stdout.write('{}:'.format(module.__package__))
+            anonymizers = self._get_app_anonymizers(module, models=models)
+
+            if len(anonymizers) == 0:
+                self.stdout.write('- No anonymizers or skipped by --app or --model arguments')
+                continue
+
+            for anonymizer_class_name in anonymizers:
+                anonymizer = getattr(module, anonymizer_class_name)()
+                self.stdout.write('- {}'.format(anonymizer.model.__name__))
+                # Start the anonymizing process
+                number_of_replaced_fields = anonymizer.run(options['batch_size'], options['parallel'])
+                self.stdout.write('-- {} fields, {} model instances, {} total replacements'.format(
+                    number_of_replaced_fields[0],
+                    number_of_replaced_fields[1],
+                    number_of_replaced_fields[2]
+                ))
+                total_replacements_count += number_of_replaced_fields[2]
+        self.stdout.write(self.style.SUCCESS('DONE. Replaced {} values in total'.format(total_replacements_count)))
+
+    def _autodiscover_module(self, module_name, app=None):
+        apps_to_search = [app] if app else settings.INSTALLED_APPS
+
+        modules = []
+        for app in apps_to_search:
+            try:
+                import_module(app)
+                app_path = sys.modules[app].__path__
+            except AttributeError:
+                continue
+            except ImportError:
+                self.stdout.write(self.style.ERROR('ERROR: Can not find app ' + app))
+                exit(1)
+            try:
+                find_spec(module_name, app_path)
+            except ImportError:
+                continue
+            import_module('%s.%s' % (app, module_name))
+            modules.append(sys.modules['%s.%s' % (app, module_name)])
+        return modules
+
+    def _get_app_anonymizers(self, module, models=None):
+        if models:
+            return [m[0] for m in inspect.getmembers(module, inspect.isclass)
+                    if BaseAnonymizer in m[1].__bases__ and m[1].model.__name__ in models]
+        else:
+            return [m[0] for m in inspect.getmembers(module, inspect.isclass) if BaseAnonymizer in m[1].__bases__]
diff --git a/setup.py b/setup.py
@@ -0,0 +1,71 @@
+# -*- encoding: utf-8 -*-
+
+import os
+import re
+import codecs
+
+# NOTE(review): the distutils fallback only provides ``setup``; ``find_packages`` is still used
+# in the ``setup()`` call further down, so that path would fail there with a NameError.
+try:
+    from setuptools import setup, find_packages
+except ImportError:
+    from distutils.core import setup
+
+
+def get_version(package):
+    """
+    Return package version as listed in `__version__` in `init.py`.
+    """
+    init_py = codecs.open(os.path.abspath(os.path.join(package, '__init__.py')), encoding='utf-8').read()
+    return re.search("^__version__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE).group(1)
+
+
+def get_author(package):
+    """
+    Return package author as listed in `__author__` in `init.py`.
+    """
+    init_py = codecs.open(os.path.abspath(os.path.join(package, '__init__.py')), encoding='utf-8').read()
+    return re.search("^__author__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE).group(1)
+
+
+def get_email(package):
+    """
+    Return package email as listed in `__email__` in `init.py`.
+    """
+    init_py = codecs.open(os.path.abspath(os.path.join(package, '__init__.py')), encoding='utf-8').read()
+    return re.search("^__email__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE).group(1)
+
+
+def get_long_description():
+    """
+    return the long description from README.rst file
+    :return:
+    """
+    return codecs.open(os.path.join(os.path.dirname(__file__), 'README.rst'), encoding='utf-8').read()
+
+
+# Package definition. Version, author and email are read from hattori/__init__.py and the
+# long description from README.rst, so those values are maintained in a single place.
+setup(
+    name='django-hattori',
+    version=get_version('hattori'),
+    author=get_author('hattori'),
+    author_email=get_email('hattori'),
+    url='https://github.com/APSL/django-hattori',
+    # Packages whose name starts with "tests" are left out of the distribution
+    packages=find_packages(exclude=['tests*']),
+    description='Command to anonymize sensitive data.',
+    long_description=get_long_description(),
+    install_requires=[
+        'Django>=1.8',
+        'django-bulk-update>=2.2.0',
+        'Faker>=0.8.13',
+        'six',
+    ],
+    classifiers=[
+        'Environment :: Web Environment',
+        'Intended Audience :: Developers',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.5',
+        'Operating System :: OS Independent',
+        'Topic :: Software Development'
+    ],
+    include_package_data=True,
+    zip_safe=False,
+)