-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SNO-172-upgrade-to-elasticsearch7 #296
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,10 +31,6 @@ | |
|
||
# An index to store non-content metadata | ||
META_MAPPING = { | ||
'_all': { | ||
'enabled': False, | ||
'analyzer': 'snovault_search_analyzer' | ||
}, | ||
'dynamic_templates': [ | ||
{ | ||
'store_generic': { | ||
|
@@ -49,12 +45,6 @@ | |
} | ||
|
||
|
||
PATH_FIELDS = ['submitted_file_name'] | ||
NON_SUBSTRING_FIELDS = ['uuid', '@id', 'submitted_by', 'md5sum', | ||
'references', 'submitted_file_name'] | ||
KEYWORD_FIELDS = ['schema_version', 'uuid', 'accession', 'alternate_accessions', | ||
'aliases', 'status', 'date_created', 'submitted_by', | ||
'internal_status', 'target', 'biosample_type'] | ||
TEXT_FIELDS = ['pipeline_error_detail', 'description', 'notes'] | ||
|
||
|
||
|
@@ -90,7 +80,6 @@ def schema_mapping(name, schema): | |
properties[k] = mapping | ||
return { | ||
'type': 'object', | ||
'include_in_all': False, | ||
'properties': properties, | ||
} | ||
|
||
|
@@ -122,9 +111,7 @@ def schema_mapping(name, schema): | |
|
||
if type_ == 'string': | ||
|
||
if name in KEYWORD_FIELDS: | ||
field_type = 'keyword' | ||
elif name in TEXT_FIELDS: | ||
if name in TEXT_FIELDS: | ||
field_type = 'text' | ||
else: | ||
field_type = 'keyword' | ||
|
@@ -133,10 +120,6 @@ def schema_mapping(name, schema): | |
'type': field_type | ||
} | ||
|
||
# these fields are unintentially partially matching some small search | ||
# keywords because fields are analyzed by nGram analyzer | ||
if name in NON_SUBSTRING_FIELDS: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No equivalent replacement of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is the newer commit: afbe476#diff-13be881525721a1251be911ff12f4c7a722ced10eb689a96d8c5821e8d9cd0e8R140 |
||
sub_mapping['include_in_all'] = False | ||
return sub_mapping | ||
|
||
if type_ == 'number': | ||
|
@@ -167,12 +150,13 @@ def index_settings(): | |
'settings': { | ||
'index.max_result_window': 99999, | ||
'index.mapping.total_fields.limit': 5000, | ||
'index.number_of_shards': 5, | ||
'index.number_of_replicas': 2, | ||
'index.number_of_shards': 1, | ||
'index.number_of_replicas': 0, | ||
Bek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
'index.max_ngram_diff': 32, | ||
'analysis': { | ||
'filter': { | ||
'substring': { | ||
'type': 'nGram', | ||
'type': 'ngram', | ||
'min_gram': 1, | ||
'max_gram': 33 | ||
}, | ||
|
@@ -273,10 +257,6 @@ def audit_mapping(): | |
|
||
def es_mapping(mapping): | ||
return { | ||
'_all': { | ||
'enabled': True, | ||
'analyzer': 'snovault_search_analyzer' | ||
}, | ||
'dynamic_templates': [ | ||
{ | ||
'template_principals_allowed': { | ||
|
@@ -293,6 +273,7 @@ def es_mapping(mapping): | |
'match_mapping_type': "string", | ||
'mapping': { | ||
'type': 'keyword', | ||
'copy_to': '_all' | ||
}, | ||
}, | ||
}, | ||
|
@@ -302,6 +283,7 @@ def es_mapping(mapping): | |
'match_mapping_type': "string", | ||
'mapping': { | ||
'type': 'keyword', | ||
'copy_to': '_all' | ||
}, | ||
}, | ||
}, | ||
|
@@ -325,16 +307,19 @@ def es_mapping(mapping): | |
} | ||
}, | ||
}, | ||
} | ||
}, | ||
], | ||
'properties': { | ||
'_all': { | ||
'type': 'text', | ||
'store': False, | ||
'analyzer': 'snovault_search_analyzer' | ||
}, | ||
'uuid': { | ||
'type': 'keyword', | ||
'include_in_all': False, | ||
}, | ||
'tid': { | ||
'type': 'keyword', | ||
'include_in_all': False, | ||
}, | ||
'item_type': { | ||
'type': 'keyword', | ||
|
@@ -343,33 +328,26 @@ def es_mapping(mapping): | |
'object': { | ||
'type': 'object', | ||
'enabled': False, | ||
'include_in_all': False, | ||
}, | ||
'properties': { | ||
'type': 'object', | ||
'enabled': False, | ||
'include_in_all': False, | ||
}, | ||
'propsheets': { | ||
'type': 'object', | ||
'enabled': False, | ||
'include_in_all': False, | ||
}, | ||
'embedded_uuids': { | ||
'type': 'keyword', | ||
'include_in_all': False, | ||
}, | ||
'linked_uuids': { | ||
'type': 'keyword', | ||
'include_in_all': False, | ||
}, | ||
'paths': { | ||
'type': 'keyword', | ||
'include_in_all': False, | ||
}, | ||
'audit': { | ||
'type': 'object', | ||
'include_in_all': False, | ||
'properties': { | ||
'ERROR': { | ||
'type': 'object', | ||
|
@@ -465,19 +443,16 @@ def type_mapping(types, item_type, embed=True): | |
for prop in props: | ||
new_mapping = new_mapping[prop]['properties'] | ||
new_mapping[last]['boost'] = boost | ||
if last in NON_SUBSTRING_FIELDS: | ||
new_mapping[last]['include_in_all'] = False | ||
else: | ||
new_mapping[last]['include_in_all'] = True | ||
new_mapping[last]['copy_to'] = '_all' | ||
return mapping | ||
|
||
|
||
def create_elasticsearch_index(es, index, body): | ||
es.indices.create(index=index, body=body, wait_for_active_shards=1, ignore=[400, 404], master_timeout='5m', request_timeout=300) | ||
|
||
|
||
def set_index_mapping(es, index, doc_type, mapping): | ||
es.indices.put_mapping(index=index, doc_type=doc_type, body=mapping, ignore=[400], request_timeout=300) | ||
def set_index_mapping(es, index, mapping): | ||
es.indices.put_mapping(index=index, body=mapping, ignore=[400], request_timeout=300) | ||
|
||
|
||
def create_snovault_index_alias(es, indices): | ||
|
@@ -497,20 +472,19 @@ def run(app, collections=None, dry_run=False): | |
indices = [] | ||
for collection_name in collections: | ||
if collection_name == 'meta': | ||
doc_type = 'meta' | ||
mapping = META_MAPPING | ||
else: | ||
index = doc_type = collection_name | ||
index = collection_name | ||
collection = registry[COLLECTIONS].by_item_type[collection_name] | ||
mapping = es_mapping(type_mapping(registry[TYPES], collection.type_info.item_type)) | ||
|
||
if mapping is None: | ||
continue # Testing collections | ||
if dry_run: | ||
print(json.dumps(sorted_dict({index: {doc_type: mapping}}), indent=4)) | ||
print(json.dumps(sorted_dict({index: {collection_name: mapping}}), indent=4)) | ||
continue | ||
create_elasticsearch_index(es, index, index_settings()) | ||
set_index_mapping(es, index, doc_type, {doc_type: mapping}) | ||
set_index_mapping(es, index, mapping) | ||
if collection_name != 'meta': | ||
indices.append(index) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this always had
properties
why the if/else now?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if there is no mapping directive exists in the schema, object should be stored without its fields being analyzed.
else
clause prevents unnecessary dynamic mapping andupdate_mapping
events in elasticsearch.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't seem to be working properly for some fields, e.g.
cloud_metadata
in files. (Might have something to do with it being a calculated_property.)Before and after. Now you can't look up cloud_metadata.url=* for example.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, previous assumption was that if the field was important, it would be in the schema. This can be fixed, implication is that any other object inserted that doesn't have a mapping directive in the schema will have all of its field values be analyzed as keyword.