From f2b4463c6255058db36c1e7462cfeb51dec58584 Mon Sep 17 00:00:00 2001 From: Erik Rose Date: Tue, 24 Jan 2012 20:38:17 -0800 Subject: [PATCH] [bug 720865] Remove race conditions that were causing mapping conflicts in production. * Remove support for putting different doctypes in different indices. * Remove the ability to limit reindexing to given doctypes. * Make get_mapping() return less boilerplate. Factor that up. --- apps/forums/models.py | 34 ++++++------- apps/questions/models.py | 52 +++++++++----------- apps/search/es_utils.py | 38 +++++++------- apps/search/management/commands/esreindex.py | 19 ++----- apps/search/models.py | 40 ++++++--------- apps/sumo/tests/__init__.py | 8 ++- apps/wiki/models.py | 33 ++++++------- settings.py | 2 +- 8 files changed, 93 insertions(+), 133 deletions(-) diff --git a/apps/forums/models.py b/apps/forums/models.py index 9605641e35e..661c10379a3 100644 --- a/apps/forums/models.py +++ b/apps/forums/models.py @@ -179,25 +179,21 @@ def update_last_post(self, exclude_post=None): @classmethod def get_mapping(cls): - mapping = { - 'properties': { - 'id': {'type': 'integer'}, - 'thread_id': {'type': 'integer'}, - 'forum_id': {'type': 'integer'}, - 'title': {'type': 'string', 'analyzer': 'snowball'}, - 'is_sticky': {'type': 'boolean'}, - 'is_locked': {'type': 'boolean'}, - 'author_id': {'type': 'integer'}, - 'author_ord': {'type': 'string'}, - 'content': {'type': 'string', 'analyzer': 'snowball', - 'store': 'yes', - 'term_vector': 'with_positions_offsets'}, - 'created': {'type': 'date'}, - 'updated': {'type': 'date'}, - 'replies': {'type': 'integer'} - } - } - return mapping + return { + 'id': {'type': 'integer'}, + 'thread_id': {'type': 'integer'}, + 'forum_id': {'type': 'integer'}, + 'title': {'type': 'string', 'analyzer': 'snowball'}, + 'is_sticky': {'type': 'boolean'}, + 'is_locked': {'type': 'boolean'}, + 'author_id': {'type': 'integer'}, + 'author_ord': {'type': 'string'}, + 'content': {'type': 'string', 'analyzer': 'snowball', + 'store': 'yes', + 'term_vector': 'with_positions_offsets'}, + 'created': {'type': 'date'}, + 'updated': {'type': 'date'}, + 'replies': {'type': 'integer'}} def extract_document(self): """Extracts interesting thing from a Thread and its Posts""" diff --git a/apps/questions/models.py b/apps/questions/models.py index bfe2bf20cec..631c367193b 100755 --- a/apps/questions/models.py +++ b/apps/questions/models.py @@ -283,34 +283,30 @@ def my_tags(self): @classmethod def get_mapping(cls): - mapping = { - 'properties': { - 'id': {'type': 'long'}, - 'question_id': {'type': 'long'}, - 'title': {'type': 'string', 'analyzer': 'snowball'}, - 'question_content': - {'type': 'string', 'analyzer': 'snowball', - # TODO: Stored because originally, this is the - # only field we were excerpting on. Standardize - # one way or the other. - 'store': 'yes', 'term_vector': 'with_positions_offsets'}, - 'answer_content': - {'type': 'string', 'analyzer': 'snowball'}, - 'replies': {'type': 'integer'}, - 'is_solved': {'type': 'boolean'}, - 'is_locked': {'type': 'boolean'}, - 'has_answers': {'type': 'boolean'}, - 'has_helpful': {'type': 'boolean'}, - 'created': {'type': 'date'}, - 'updated': {'type': 'date'}, - 'question_creator': {'type': 'string'}, - 'answer_creator': {'type': 'string'}, - 'question_votes': {'type': 'integer'}, - 'answer_votes': {'type': 'integer'}, - 'tag': {'type': 'string'} - } - } - return mapping + return { + 'id': {'type': 'long'}, + 'question_id': {'type': 'long'}, + 'title': {'type': 'string', 'analyzer': 'snowball'}, + 'question_content': + {'type': 'string', 'analyzer': 'snowball', + # TODO: Stored because originally, this is the + # only field we were excerpting on. Standardize + # one way or the other. + 'store': 'yes', 'term_vector': 'with_positions_offsets'}, + 'answer_content': + {'type': 'string', 'analyzer': 'snowball'}, + 'replies': {'type': 'integer'}, + 'is_solved': {'type': 'boolean'}, + 'is_locked': {'type': 'boolean'}, + 'has_answers': {'type': 'boolean'}, + 'has_helpful': {'type': 'boolean'}, + 'created': {'type': 'date'}, + 'updated': {'type': 'date'}, + 'question_creator': {'type': 'string'}, + 'answer_creator': {'type': 'string'}, + 'question_votes': {'type': 'integer'}, + 'answer_votes': {'type': 'integer'}, + 'tag': {'type': 'string'}} def extract_document(self): """Extracts indexable attributes from a Question and its answers.""" diff --git a/apps/search/es_utils.py b/apps/search/es_utils.py index a99bac7efc0..82278307b06 100644 --- a/apps/search/es_utils.py +++ b/apps/search/es_utils.py @@ -66,12 +66,9 @@ def format_time(time_to_go): return "%dm %ds" % (time_to_go / 60, time_to_go % 60) -def es_reindex_with_progress(doctypes=None, percent=100): +def es_reindex_with_progress(percent=100): """Rebuild Elastic indexes as you iterate over yielded progress ratios. - :arg doctypes: Defaults to None which will index all doctypes. - Otherwise indexes the doctypes specified. See - :py:func:`.get_doctype_stats()` for what doctypes look like. :arg percent: Defaults to 100. Allows you to specify how much of each doctype you want to index. This is useful for development where doing a full reindex takes an hour. @@ -79,36 +76,35 @@ def es_reindex_with_progress(doctypes=None, percent=100): """ from search.models import get_search_models - es = elasticutils.get_es() - search_models = get_search_models() - if doctypes: - search_models = [cls for cls in search_models - if cls._meta.db_table in doctypes] - - if len(search_models) == len(get_search_models()): - index = settings.ES_INDEXES.get('default') - if index is not None: - # If we're indexing everything and there's a default index - # specified in settings, then we delete and recreate it. - es.delete_index_if_exists(index) - es.create_index(index) - total = sum([cls.objects.count() for cls in search_models]) + es = elasticutils.get_es() + index = settings.ES_INDEXES['default'] + es.delete_index_if_exists(index) + # There should be no mapping-conflict race here since the index doesn't + # exist. Live indexing should just fail. + + # Simultaneously create the index and the mappings, so live indexing + # doesn't get a chance to index anything between the two and infer a bogus + # mapping (which ES then freaks out over when we try to lay in an + # incompatible explicit mapping). + mappings = dict((cls._meta.db_table, {'properties': cls.get_mapping()}) + for cls in search_models) + es.create_index(index, settings={'mappings': mappings}) + total = sum([cls.objects.count() for cls in search_models]) to_index = [cls.index_all(percent) for cls in search_models] - return (float(done) / total for done, _ in izip(count(1), chain(*to_index))) -def es_reindex(doctypes=None, percent=100): +def es_reindex(percent=100): """Rebuild ElasticSearch indexes See :py:func:`.es_reindex_with_progress` for argument details. """ - [x for x in es_reindex_with_progress(doctypes, percent) if False] + [x for x in es_reindex_with_progress(percent) if False] def es_whazzup(): diff --git a/apps/search/management/commands/esreindex.py b/apps/search/management/commands/esreindex.py index 59dcd4d775b..ea418fca17f 100644 --- a/apps/search/management/commands/esreindex.py +++ b/apps/search/management/commands/esreindex.py @@ -1,6 +1,8 @@ import logging -from django.core.management.base import BaseCommand, CommandError from optparse import make_option + +from django.core.management.base import BaseCommand, CommandError + from search.es_utils import es_reindex from search.models import get_search_models @@ -14,17 +16,6 @@ class Command(BaseCommand): def handle(self, *args, **options): logging.basicConfig(level=logging.INFO) percent = options['percent'] - if percent > 100 or percent < 1: + if not 1 <= percent <= 100: raise CommandError('percent should be between 1 and 100') - - if args: - search_models = get_search_models() - possible_doctypes = dict((cls._meta.db_table, cls) - for cls in search_models) - for mem in args: - if mem not in possible_doctypes: - raise CommandError('"%s" is not a valid doctype (%s)' % - (mem, possible_doctypes.keys())) - - # args are the list of doctypes to index. - es_reindex(args, percent) + es_reindex(percent) diff --git a/apps/search/models.py b/apps/search/models.py index 871f234efaf..8b919b812cd 100644 --- a/apps/search/models.py +++ b/apps/search/models.py @@ -60,11 +60,11 @@ class SearchMixin(object): """ @classmethod def get_mapping(self): - """Returns the ES mapping defition for this document type + """Return the ES mapping properties for this document type. - This must be implemented. It should return an ES mapping. + For example... :: - For examples, see the codebase. + {'id': {'type': 'integer'}, ...} """ raise NotImplementedError @@ -80,12 +80,6 @@ def extract_document(self): """ raise NotImplementedError - @classmethod - def get_es_index(cls): - """Returns the index for this class""" - indexes = settings.ES_INDEXES - return indexes.get(cls._meta.db_table) or indexes['default'] - def index_later(self): """Register myself to be indexed at the end of the request.""" _local_tasks().add((index_task.delay, (self.__class__, (self.id,)))) @@ -110,23 +104,12 @@ def index_all(cls, percent=100): es = es_utils.get_es() doc_type = cls._meta.db_table - index = cls.get_es_index() - - if index != settings.ES_INDEXES.get('default'): - # If this doctype isn't using the default index, then this - # doctype is responsible for deleting and re-creating the - # index. - es.delete_index_if_exists(index) - es.create_index(index) + index = settings.ES_INDEXES['default'] start_time = time.time() log.info('reindex %s into %s index', doc_type, index) - log.info('setting up mapping....') - mapping = cls.get_mapping() - es.put_mapping(doc_type, mapping, index) - log.info('iterating through %s....', doc_type) total = cls.objects.count() to_index = int(total * (percent / 100.0)) @@ -187,12 +170,16 @@ def index(cls, document, bulk=False, force_insert=False, refresh=False, if es is None: es = elasticutils.get_es() - index = cls.get_es_index() + index = settings.ES_INDEXES['default'] doc_type = cls._meta.db_table # TODO: handle pyes.urllib3.TimeoutErrors here. - es.index(document, index=index, doc_type=doc_type, id=document['id'], - bulk=bulk, force_insert=force_insert) + es.index(document, + index=index, + doc_type=doc_type, + id=document['id'], + bulk=bulk, + force_insert=force_insert) if refresh: es.refresh(timesleep=0) @@ -203,10 +190,11 @@ def unindex(cls, id): if not settings.ES_LIVE_INDEXING: return - index = cls.get_es_index() doc_type = cls._meta.db_table try: - elasticutils.get_es().delete(index, doc_type, id) + elasticutils.get_es().delete(settings.ES_INDEXES['default'], + doc_type, + id) except pyes.exceptions.NotFoundException: # Ignore the case where we try to delete something that's # not there. diff --git a/apps/sumo/tests/__init__.py b/apps/sumo/tests/__init__.py index 2f1e1866ce1..64f406c5cbf 100644 --- a/apps/sumo/tests/__init__.py +++ b/apps/sumo/tests/__init__.py @@ -78,7 +78,7 @@ def tearDown(self): self.teardown_indexes() super(ElasticTestCase, self).tearDown() - def refresh(self, index='default'): + def refresh(self): # Any time we're doing a refresh, we're making sure that the # index is ready to be queried. Given that, it's almost # always the case that we want to run all the generated tasks, @@ -86,8 +86,7 @@ def refresh(self, index='default'): from search.models import generate_tasks generate_tasks() - es = get_es() - es.refresh(settings.ES_INDEXES[index], timesleep=0) + get_es().refresh(settings.ES_INDEXES['default'], timesleep=0) def setup_indexes(self): """(Re-)create ES indexes.""" @@ -115,8 +114,7 @@ def setup_indexes(self): def teardown_indexes(self): es = get_es() - for index in settings.ES_INDEXES.values(): - es.delete_index_if_exists(index) + es.delete_index_if_exists(settings.ES_INDEXES['default']) settings.ES_LIVE_INDEXING = False diff --git a/apps/wiki/models.py b/apps/wiki/models.py index 7e85ed1c400..3e2a08726ee 100644 --- a/apps/wiki/models.py +++ b/apps/wiki/models.py @@ -628,25 +628,20 @@ def is_watched_by(self, user): @classmethod def get_mapping(cls): - mapping = { - 'properties': { - 'id': {'type': 'integer'}, - 'title': {'type': 'string', 'analyzer': 'snowball'}, - 'locale': {'type': 'string', 'index': 'not_analyzed'}, - 'current': {'type': 'integer'}, - 'parent_id': {'type': 'integer'}, - 'content': - {'type': 'string', 'analyzer': 'snowball'}, - 'category': {'type': 'integer'}, - 'slug': {'type': 'string'}, - 'is_archived': {'type': 'boolean'}, - 'summary': {'type': 'string', 'analyzer': 'snowball'}, - 'keywords': {'type': 'string', 'analyzer': 'snowball'}, - 'updated': {'type': 'date'}, - 'tag': {'type': 'string'} - } - } - return mapping + return { + 'id': {'type': 'integer'}, + 'title': {'type': 'string', 'analyzer': 'snowball'}, + 'locale': {'type': 'string', 'index': 'not_analyzed'}, + 'current': {'type': 'integer'}, + 'parent_id': {'type': 'integer'}, + 'content': {'type': 'string', 'analyzer': 'snowball'}, + 'category': {'type': 'integer'}, + 'slug': {'type': 'string'}, + 'is_archived': {'type': 'boolean'}, + 'summary': {'type': 'string', 'analyzer': 'snowball'}, + 'keywords': {'type': 'string', 'analyzer': 'snowball'}, + 'updated': {'type': 'date'}, + 'tag': {'type': 'string'}} def extract_document(self): d = {} diff --git a/settings.py b/settings.py index 10088061356..db810c27e29 100644 --- a/settings.py +++ b/settings.py @@ -592,7 +592,7 @@ def JINJA_CONFIG(): # # Connection information for Elastic ES_HOSTS = ['127.0.0.1:9200'] -ES_INDEXES = {'default': 'sumo'} +ES_INDEXES = {'default': 'sumo'} # Doesn't support non-default indexes atm. ES_LIVE_INDEXING = False # Keep indexes up to date as objects are made/deleted ES_TIMEOUT = 5 # 5 second timeouts for querying ES_INDEXING_TIMEOUT = 30 # 30 second timeouts for all things indexing