Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mgalofre committed May 10, 2018
1 parent 88421af commit a357fd0
Show file tree
Hide file tree
Showing 12 changed files with 377 additions and 0 deletions.
70 changes: 70 additions & 0 deletions .gitignore
@@ -0,0 +1,70 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/
docs/_static
docs/_templates

# PyBuilder
target/

# PyCharm
.idea

# Python decouple settings file
settings.ini

log/
py27/
py35/
flake8/
1 change: 1 addition & 0 deletions AUTHORS
@@ -0,0 +1 @@
Marc Galofré
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -0,0 +1,8 @@
==========
Change log
==========

0.1 (2017-06-21)
----------------

* Initial release.
20 changes: 20 additions & 0 deletions LICENSE
@@ -0,0 +1,20 @@
Copyright (c) 2018 - APSL

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4 changes: 4 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,4 @@
include AUTHORS
include LICENSE
include CHANGELOG.rst
include README.rst
File renamed without changes.
3 changes: 3 additions & 0 deletions hattori/__init__.py
@@ -0,0 +1,3 @@
# Package metadata.
# setup.py extracts these values by regex-matching lines of the form
# ``<dunder name> = '<value>'`` in this file, so keep every assignment on its
# own line, starting at column 0, with a plain quoted string literal.
__author__ = 'Marc Galofré'
__email__ = 'mgalofre@apsl.net'
__version__ = '0.1'
90 changes: 90 additions & 0 deletions hattori/base.py
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
import logging

from six import string_types
from django.conf import settings
from bulk_update.helper import bulk_update
from faker import Faker
from multiprocessing import Pool

# Name of the per-app module that the anonymize_db command looks for when
# autodiscovering anonymizer classes.
ANONYMIZER_MODULE_NAME = 'anonymizers'
# Number of instances handed to each worker task in parallel mode; also used
# as the bulk_update batch size when the caller does not supply one.
DEFAULT_CHUNK_SIZE = 50

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def _make_faker():
    """Build the shared Faker instance.

    The project's ``LANGUAGE_CODE`` setting is tried as the Faker locale
    first; if that raises ``AttributeError``, a Faker with its default
    locale is returned instead.
    """
    # NOTE(review): presumably AttributeError covers both a missing setting
    # and a locale Faker does not support - confirm against the Faker version
    # in use.
    try:
        return Faker(settings.LANGUAGE_CODE)
    except AttributeError:
        return Faker()


# Shared fake-data generator created once at import time.
faker = _make_faker()


class BaseAnonymizer:
    """Base class for anonymizers that overwrite model fields with mock data.

    Subclasses must define two class attributes:

    * ``model``: the model class whose instances are anonymized.
    * ``attributes``: a list of ``(field_name, replacer)`` tuples, where
      ``replacer`` is either a callable returning the new value or a string
      that is assigned verbatim.
    """

    def __init__(self):
        """Validate that the subclass declares ``model`` and ``attributes``.

        :raises SystemExit: with exit status 1 when either one is missing.
        """
        try:
            getattr(self, 'model')
            getattr(self, 'attributes')
        except AttributeError:
            # Logged at ERROR level so the message is still shown with the
            # default logging configuration (INFO records would be dropped
            # and the process would exit without any explanation).
            logger.error('ERROR: Your anonymizer is missing the model or attributes definition!')
            # ``exit`` is only injected by the ``site`` module; raising
            # SystemExit directly has the same effect everywhere.
            raise SystemExit(1)

    def get_query_set(self):
        """
        You can override this in your Anonymizer.
        :return: QuerySet
        """
        return self.model.objects.all()

    def get_allowed_value(self, replacer, model_instance, field_name):
        """Call ``replacer`` and truncate its result to the field's max length.

        :param replacer: zero-argument callable producing the new value.
        :param model_instance: instance whose ``_meta`` describes the field.
        :param field_name: name of the field that will receive the value.
        :return: the generated value, cut to ``max_length`` when the field
            defines a truthy one.
        """
        retval = replacer()
        max_length = model_instance._meta.get_field(field_name).max_length
        if max_length:
            retval = retval[:max_length]
        return retval

    def _process_instances(self, instances):
        """Apply every configured replacer to every instance, in memory only.

        :param instances: iterable of model instances to modify in place.
        :return: tuple ``(instances, count_instances, count_fields)`` where
            ``instances`` is the object that was passed in and
            ``count_fields`` is the total number of values replaced.
        :raises TypeError: if a replacer is neither a callable nor a string.
        """
        count_fields = 0
        count_instances = 0

        for model_instance in instances:
            for field_name, replacer in self.attributes:
                if callable(replacer):
                    replaced_value = self.get_allowed_value(replacer, model_instance, field_name)
                elif isinstance(replacer, string_types):
                    replaced_value = replacer
                else:
                    raise TypeError('Replacers need to be callables or Strings!')
                setattr(model_instance, field_name, replaced_value)
                count_fields += 1
            count_instances += 1
        return instances, count_instances, count_fields

    def _run_parallel(self, instances, parallel_processes):
        """Process ``instances`` in chunks spread over a pool of processes.

        :param instances: sliceable, sized collection of model instances.
        :param parallel_processes: number of worker processes in the pool.
        :return: tuple ``(instances_processed, count_instances, count_fields)``
            aggregated over all chunks, in chunk order.
        """
        count_instances = 0
        count_fields = 0
        instances_processed = []
        total = len(instances)
        chunks = [instances[start:start + DEFAULT_CHUNK_SIZE]
                  for start in range(0, total, DEFAULT_CHUNK_SIZE)]
        pool = Pool(processes=parallel_processes)
        try:
            futures = [pool.apply_async(self._process_instances, (objs,)) for objs in chunks]
            for future in futures:
                # The instances come back as modified by the worker process.
                instances_parallel, count_instances_parallel, count_fields_parallel = future.get()
                instances_processed += instances_parallel
                count_instances += count_instances_parallel
                count_fields += count_fields_parallel
        finally:
            # Always release the worker processes, even if a task raised.
            pool.close()
            pool.join()
        return instances_processed, count_instances, count_fields

    def run(self, batch_size=None, parallel_processes=0):
        """Anonymize all instances from ``get_query_set`` and persist them.

        :param batch_size: batch size passed to ``bulk_update``; defaults to
            ``DEFAULT_CHUNK_SIZE`` when ``None``.
        :param parallel_processes: worker process count; values of 1 or less
            run sequentially in the current process.
        :return: tuple ``(number of configured attributes, instances
            processed, total values replaced)``.
        """
        instances = self.get_query_set()
        batch_size = DEFAULT_CHUNK_SIZE if batch_size is None else int(batch_size)

        if parallel_processes > 1:
            instances_processed, count_instances, count_fields = self._run_parallel(instances, parallel_processes)
        else:
            instances_processed, count_instances, count_fields = self._process_instances(instances)

        bulk_update(instances_processed, update_fields=[attrs[0] for attrs in self.attributes],
                    batch_size=batch_size)

        return len(self.attributes), count_instances, count_fields
Empty file added hattori/management/__init__.py
Empty file.
Empty file.
110 changes: 110 additions & 0 deletions hattori/management/commands/anonymize_db.py
@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
from importlib import import_module
from importlib.util import find_spec
import inspect
import sys

from django.conf import settings
from django.core.management import BaseCommand

from hattori.base import ANONYMIZER_MODULE_NAME, BaseAnonymizer


class Command(BaseCommand):
    """Management command that runs every discovered anonymizer.

    Anonymizer classes are looked up in the ``ANONYMIZER_MODULE_NAME`` module
    of each app, optionally restricted by ``--app`` and ``--model``.
    """
    help = 'This tool replaces real (user-)data of model instances in your database with mock data.'
    modules = None  # List of anonymizers modules. They can be placed in every app

    def add_arguments(self, parser):
        """Register the command line options of the command."""
        parser.add_argument(
            '-a',
            '--app',
            help='Only anonymize the given app',
            dest="app",
            metavar="APP"
        )
        parser.add_argument(
            "-m",
            "--model",
            "--models",
            dest="models",
            help="Models to anonymize. Separate multiples by comma.",
            metavar="MODEL"
        )
        parser.add_argument(
            "-b",
            "--batch-size",
            dest="batch_size",
            help="batch size used in the bulk_update of the instances. Depends on the DB machine. Use 500 in vagrant.",
            metavar="BATCH_SIZE",
            type=int
        )
        parser.add_argument(
            "-p",
            "--parallel",
            dest="parallel",
            help="Number of parallel processes for parallel execution",
            metavar="PARALLEL",
            type=int,
            default=0
        )

    def handle(self, *args, **options):
        """Discover the anonymizers, run each one and report the totals."""
        models = None
        if options['models'] is not None:
            models = [m.strip() for m in options['models'].split(',')]

        # BaseAnonymizer.run only uses worker processes when more than one is
        # requested, so only announce parallel mode in that case.
        if options['parallel'] > 1:
            self.stdout.write('Running in parallel mode with {} concurrent processes'.format(options['parallel']))
        self.stdout.write('Autodiscovering anonymizers...')

        modules = self._autodiscover_module(ANONYMIZER_MODULE_NAME, app=options['app'])
        self.stdout.write('Found anonymizers for {} apps'.format(len(modules)))
        total_replacements_count = 0
        for module in modules:
            self.stdout.write('{}:'.format(module.__package__))
            anonymizers = self._get_app_anonymizers(module, models=models)

            if not anonymizers:
                self.stdout.write('- No anonymizers or skipped by --app or --model arguments')
                continue

            for anonymizer_class_name in anonymizers:
                anonymizer = getattr(module, anonymizer_class_name)()
                self.stdout.write('- {}'.format(anonymizer.model.__name__))
                # Start the anonymizing process
                fields, instances, replacements = anonymizer.run(options['batch_size'], options['parallel'])
                self.stdout.write('-- {} fields, {} model instances, {} total replacements'.format(
                    fields,
                    instances,
                    replacements
                ))
                total_replacements_count += replacements
        self.stdout.write(self.style.SUCCESS('DONE. Replaced {} values in total'.format(total_replacements_count)))

    def _autodiscover_module(self, module_name, app=None):
        """Import ``<app>.<module_name>`` for every app that provides it.

        :param module_name: name of the submodule to look for in each app.
        :param app: when given, only this app is searched; otherwise every
            entry of ``settings.INSTALLED_APPS`` is.
        :return: list of the imported submodules.
        :raises SystemExit: with status 1 when an app cannot be imported.
        """
        # NOTE(review): INSTALLED_APPS entries are imported as module paths;
        # entries that point at an AppConfig class would presumably fail the
        # import below - confirm whether that case needs support.
        apps_to_search = [app] if app else settings.INSTALLED_APPS

        modules = []
        for app_name in apps_to_search:
            try:
                import_module(app_name)
                # Accessing __path__ filters out apps that are plain modules
                # rather than packages (they cannot contain a submodule).
                sys.modules[app_name].__path__
            except AttributeError:
                continue
            except ImportError:
                self.stdout.write(self.style.ERROR('ERROR: Can not find app ' + app_name))
                sys.exit(1)

            submodule_name = '%s.%s' % (app_name, module_name)
            # find_spec returns None (it does not raise) when the submodule
            # does not exist, so both outcomes have to be checked.
            try:
                spec = find_spec(submodule_name)
            except ImportError:
                continue
            if spec is None:
                continue
            modules.append(import_module(submodule_name))
        return modules

    def _get_app_anonymizers(self, module, models=None):
        """Return the names of the anonymizer classes defined in ``module``.

        Only classes that list ``BaseAnonymizer`` as a direct base are
        considered.

        :param module: imported anonymizers module to inspect.
        :param models: optional list of model class names; when truthy, only
            anonymizers whose ``model.__name__`` is in it are returned.
        :return: list of class names, in ``inspect.getmembers`` order.
        """
        names = []
        for name, member in inspect.getmembers(module, inspect.isclass):
            if BaseAnonymizer not in member.__bases__:
                continue
            if models and member.model.__name__ not in models:
                continue
            names.append(name)
        return names
71 changes: 71 additions & 0 deletions setup.py
@@ -0,0 +1,71 @@
# -*- encoding: utf-8 -*-

import os
import re
import codecs

try:
from setuptools import setup, find_packages
except ImportError:
from distutils.core import setup


def get_version(package):
    """
    Return package version as listed in `__version__` in `__init__.py`.

    :param package: path of the package directory containing `__init__.py`.
    :return: the version string.
    :raises RuntimeError: if no `__version__` assignment is found.
    """
    init_path = os.path.abspath(os.path.join(package, '__init__.py'))
    # Context manager so the file handle is closed deterministically.
    with codecs.open(init_path, encoding='utf-8') as init_file:
        init_py = init_file.read()
    match = re.search(r"^__version__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE)
    if match is None:
        raise RuntimeError('Unable to find __version__ in ' + init_path)
    return match.group(1)


def get_author(package):
    """
    Return package author as listed in `__author__` in `__init__.py`.

    :param package: path of the package directory containing `__init__.py`.
    :return: the author string.
    :raises RuntimeError: if no `__author__` assignment is found.
    """
    init_path = os.path.abspath(os.path.join(package, '__init__.py'))
    # Context manager so the file handle is closed deterministically.
    with codecs.open(init_path, encoding='utf-8') as init_file:
        init_py = init_file.read()
    match = re.search(r"^__author__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE)
    if match is None:
        raise RuntimeError('Unable to find __author__ in ' + init_path)
    return match.group(1)


def get_email(package):
    """
    Return package email as listed in `__email__` in `__init__.py`.

    :param package: path of the package directory containing `__init__.py`.
    :return: the email string.
    :raises RuntimeError: if no `__email__` assignment is found.
    """
    init_path = os.path.abspath(os.path.join(package, '__init__.py'))
    # Context manager so the file handle is closed deterministically.
    with codecs.open(init_path, encoding='utf-8') as init_file:
        init_py = init_file.read()
    match = re.search(r"^__email__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE)
    if match is None:
        raise RuntimeError('Unable to find __email__ in ' + init_path)
    return match.group(1)


def get_long_description():
    """
    Return the long description from the README.rst file.

    The file is looked up in the directory that contains this script and is
    decoded as UTF-8.

    :return: the full text of README.rst.
    """
    readme_path = os.path.join(os.path.dirname(__file__), 'README.rst')
    # Context manager so the file handle is closed deterministically.
    with codecs.open(readme_path, encoding='utf-8') as readme:
        return readme.read()


# Directory of the package whose __init__.py holds the project metadata.
_PACKAGE = 'hattori'

# Runtime dependencies installed together with the package.
_INSTALL_REQUIRES = [
    'Django>=1.8',
    'django-bulk-update>=2.2.0',
    'Faker>=0.8.13',
    'six',
]

# Trove classifiers published with the distribution.
_CLASSIFIERS = [
    'Environment :: Web Environment',
    'Intended Audience :: Developers',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.5',
    'Operating System :: OS Independent',
    'Topic :: Software Development'
]

# Version, author and email are read from the package's __init__.py so they
# are declared in a single place.
setup(
    name='django-hattori',
    version=get_version(_PACKAGE),
    author=get_author(_PACKAGE),
    author_email=get_email(_PACKAGE),
    url='https://github.com/APSL/django-hattori',
    packages=find_packages(exclude=['tests*']),
    description='Command to anonymize sensitive data.',
    long_description=get_long_description(),
    install_requires=_INSTALL_REQUIRES,
    classifiers=_CLASSIFIERS,
    include_package_data=True,
    zip_safe=False,
)

0 comments on commit a357fd0

Please sign in to comment.