bug-1885978: remove raw crash from indexing code #6560

Merged: 1 commit, Mar 20, 2024
12 changes: 3 additions & 9 deletions socorro/external/es/crashstorage.py
@@ -216,7 +216,7 @@ def fix_datetime(value):
 def build_document(src, crash_document, fields, all_keys):
     """Given a source document and fields and valid keys, builds a document to index.
 
-    :param dict src: the source document with raw_crash and processed_crash keys
+    :param dict src: the source document with "processed_crash" key
     :param dict crash_document: the document to fill
     :param list fields: the list of fields in super search fields
     :param set all_keys: the list of valid keys
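
Not part of the diff: a conceptual sketch of what build_document does, assuming each field dict carries "namespace", "in_database_name", and "destination_keys" (the shape suggested by the docstrings and the validation test later on this page); the real implementation lives at the hunk above.

```python
def build_document_sketch(src, crash_document, fields, all_keys):
    """Rough illustration only: copy each field's value from src into
    crash_document, honoring the all_keys filter."""

    def get_path(doc, dotted):
        # Walk a dotted path like "processed_crash.uuid" through nested dicts.
        for part in dotted.split("."):
            if not isinstance(doc, dict) or part not in doc:
                return None
            doc = doc[part]
        return doc

    def set_path(doc, dotted, value):
        # Create intermediate dicts as needed, then set the leaf value.
        *parents, leaf = dotted.split(".")
        for part in parents:
            doc = doc.setdefault(part, {})
        doc[leaf] = value

    for field in fields:
        value = get_path(src, f"{field['namespace']}.{field['in_database_name']}")
        if value is None:
            continue
        for dest_key in field.get("destination_keys", []):
            if dest_key in all_keys:
                set_path(crash_document, dest_key, value)
```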
@@ -270,7 +270,7 @@ def build_document(src, crash_document, fields, all_keys):
 
 
 class ESCrashStorage(CrashStorageBase):
-    """This sends raw and processed crash reports to Elasticsearch."""
+    """Indexes documents based on the processed crash to Elasticsearch."""
 
     # These regex will catch field names from Elasticsearch exceptions. They
     # have been tested with Elasticsearch 1.4.
@@ -498,14 +498,10 @@ def save_processed_crash(self, raw_crash, processed_crash):
         es_doctype = self.get_doctype()
         all_valid_keys = self.get_keys(index_name, es_doctype)
 
-        src = {
-            "raw_crash": copy.deepcopy(raw_crash),
Collaborator Author commented:
One less deepcopy should make it a little faster, though I'm not sure we'd notice.
"processed_crash": copy.deepcopy(processed_crash),
}
src = {"processed_crash": copy.deepcopy(processed_crash)}

crash_document = {
"crash_id": crash_id,
"raw_crash": {},
"processed_crash": {},
}
build_document(src, crash_document, fields=FIELDS, all_keys=all_valid_keys)
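
Aside, not from the PR: the deepcopy comment above can be sanity-checked with a small timeit sketch. The dict shapes and sizes below are invented; only the copy.deepcopy pattern comes from the diff.

```python
import copy
import timeit

# Invented stand-ins; real crash dicts vary widely in size.
raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
processed_crash = {f"field_{i}": {"nested": list(range(10))} for i in range(100)}

# Before this change: two deepcopies per save. After: one.
two_copies = timeit.timeit(
    lambda: {
        "raw_crash": copy.deepcopy(raw_crash),
        "processed_crash": copy.deepcopy(processed_crash),
    },
    number=1_000,
)
one_copy = timeit.timeit(
    lambda: {"processed_crash": copy.deepcopy(processed_crash)},
    number=1_000,
)
print(f"two deepcopies: {two_copies:.3f}s  one deepcopy: {one_copy:.3f}s")
```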
@@ -533,8 +529,6 @@ def _capture(key, data):
                 # we can fix it later.
                 self.logger.exception(f"something went wrong when capturing {key}")
 
-        _capture("raw_crash_size", crash_document["raw_crash"])
-        _capture("processed_crash_size", crash_document["processed_crash"])
         _capture("crash_document_size", crash_document)
 
     def _index_crash(self, connection, es_index, es_doctype, crash_document, crash_id):
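
For context, a sketch of what the _capture helper here plausibly does, pieced together from the logger line above and the old test assertion that an empty raw crash produced a size of 2, the length of "{}"; this is an assumption, not the actual Socorro code.

```python
import json
import logging

logger = logging.getLogger(__name__)


def capture_size(metrics, key, data):
    # Serialize the document and emit its size as a histogram. A failure to
    # serialize should never break crash storage, hence the broad except.
    try:
        metrics.histogram(f"processor.es.{key}", value=len(json.dumps(data)))
    except Exception:
        logger.exception(f"something went wrong when capturing {key}")
```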
16 changes: 8 additions & 8 deletions socorro/external/es/super_search_fields.py
@@ -297,8 +297,8 @@ def boolean_field(
     show up in aggregations.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
 
     :returns: super search field specification as a dict
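
Illustrative calls, not from the diff, showing the two namespace forms the updated docstring describes; both fields are hypothetical.

```python
# Field rooted at the top of the processed crash.
accessibility = boolean_field(
    name="accessibility",
    namespace="processed_crash",
    in_database_name="accessibility",
)

# Field rooted at a dotted path deeper inside the processed crash.
oom_small = boolean_field(
    name="oom_small_allocation",
    namespace="processed_crash.memory_report",
    in_database_name="oom_small_allocation",
)
```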
@@ -332,8 +332,8 @@ def keyword_field(
     names, fields that have a limited set of choices, etc.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
     :param choices: a list of valid values for the dropdown
 
@@ -367,8 +367,8 @@ def integer_field(
     """Generates a whole number field.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
     :param storage_mapping_type: the storage mapping type to use for Elasticsearch;
         "short", "integer", "long"
@@ -403,8 +403,8 @@ def float_field(
     """Generates a floating point field.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
 
     :returns: super search field specification as a dict
16 changes: 7 additions & 9 deletions socorro/tests/conftest.py
@@ -240,19 +240,19 @@ def get_url(self):
     def refresh(self):
         self.conn.refresh()
 
-    def index_crash(self, raw_crash, processed_crash, refresh=True):
+    def index_crash(self, processed_crash, refresh=True):
         """Index a single crash and refresh"""
-        self._crashstorage.save_processed_crash(raw_crash, processed_crash)
+        self._crashstorage.save_processed_crash(
+            raw_crash={},
+            processed_crash=processed_crash,
+        )
 
         if refresh:
             self.refresh()
 
-    def index_many_crashes(
-        self, number, raw_crash=None, processed_crash=None, loop_field=None
-    ):
+    def index_many_crashes(self, number, processed_crash=None, loop_field=None):
         """Index multiple crashes and refresh at the end"""
         processed_crash = processed_crash or {}
-        raw_crash = raw_crash or {}
 
         crash_ids = []
         for i in range(number):
@@ -262,9 +262,7 @@ def index_many_crashes(
             if loop_field is not None:
                 processed_copy[loop_field] = processed_crash[loop_field] % i
 
-            self.index_crash(
-                raw_crash=raw_crash, processed_crash=processed_copy, refresh=False
-            )
+            self.index_crash(processed_crash=processed_copy, refresh=False)
 
         self.refresh()
         return crash_ids
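
For illustration, not in the diff: a hypothetical call with the trimmed signature. The "version" field and template are made up; loop_field relies on the % interpolation shown in the hunk above.

```python
# Index three crashes whose "version" value varies per iteration.
crash_ids = es_helper.index_many_crashes(
    3,
    processed_crash={"version": "100.0.%s"},
    loop_field="version",
)
```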
2 changes: 0 additions & 2 deletions socorro/tests/external/es/test_analyzers.py
@@ -34,7 +34,6 @@ def test_semicolon_keywords(self, es_helper):
 
         value1 = "/path/to/dll;;foo;C:\\bar\\boo"
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": crash_id_1,
                 "app_init_dlls": value1,
@@ -43,7 +42,6 @@
         )
         value2 = "/path/to/dll;D:\\bar\\boo"
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": crash_id_2,
                 "app_init_dlls": value2,
39 changes: 10 additions & 29 deletions socorro/tests/external/es/test_crashstorage.py
@@ -73,8 +73,6 @@
     },
 }
 
-SAMPLE_RAW_CRASH = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
-
 REMOVED_VALUE = object()
 
 
@@ -103,13 +101,12 @@ def build_crashstorage(self):
 
     def test_index_crash(self, es_helper):
         """Test indexing a crash document."""
-        raw_crash = deepcopy(SAMPLE_RAW_CRASH)
         processed_crash = deepcopy(SAMPLE_PROCESSED_CRASH)
         processed_crash["date_processed"] = date_to_string(utc_now())
 
         crashstorage = self.build_crashstorage()
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
@@ -121,9 +118,6 @@
 
     def test_index_crash_indexable_keys(self, es_helper):
         """Test indexing ONLY indexes valid, known keys."""
-        raw_crash = {
-            "InvalidKey": "alpha",
-        }
         processed_crash = {
             "another_invalid_key": "alpha",
             "date_processed": date_to_string(utc_now()),
Expand All @@ -133,7 +127,7 @@ def test_index_crash_indexable_keys(self, es_helper):

crashstorage = self.build_crashstorage()
crashstorage.save_processed_crash(
raw_crash=raw_crash,
raw_crash={},
processed_crash=processed_crash,
)

@@ -143,11 +137,8 @@
             id=processed_crash["uuid"],
         )
 
-        # Verify keys that aren't in super_search_fields aren't in the raw or processed
-        # crash parts
-        raw_crash = doc["_source"]["raw_crash"]
-        assert list(sorted(raw_crash.keys())) == []
Collaborator Author commented:
One less "sort an empty list" in the codebase. 🎉
+        # Verify keys that aren't in super_search_fields aren't in the final
+        # document
         processed_crash = doc["_source"]["processed_crash"]
         assert list(sorted(processed_crash.keys())) == [
             "date_processed",
@@ -188,33 +179,27 @@ def test_index_crash_mapping_keys(self, es_helper):
 
         # Create a crash for this week and save it
         now_uuid = create_new_ooid(timestamp=now)
-        raw_crash = {
-            "BuildID": "20200506000000",
-        }
         processed_crash = {
             field: "this week",
             "date_processed": date_to_string(now),
             "uuid": now_uuid,
         }
 
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
         # Create a crash for four weeks ago with the bum mapping and save it
         old_uuid = create_new_ooid(timestamp=four_weeks_ago)
-        raw_crash = {
-            "BuildID": "20200506000000",
-        }
         processed_crash = {
             field: "this week",
             "date_processed": date_to_string(now - timedelta(days=28)),
             "uuid": old_uuid,
         }
 
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
@@ -239,9 +224,8 @@
             assert field not in doc["_source"]["processed_crash"]
 
     def test_crash_size_capture(self):
-        """Verify we capture raw/processed crash sizes in ES crashstorage"""
+        """Verify saving a processed crash emits a metric for crash document size"""
         crash_id = create_new_ooid()
-        raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
         processed_crash = {
             "date_processed": "2012-04-08 10:56:41.558922",
             "uuid": crash_id,
@@ -250,13 +234,11 @@
         crashstorage = self.build_crashstorage()
         with MetricsMock() as mm:
             crashstorage.save_processed_crash(
-                raw_crash=raw_crash,
+                raw_crash={},
                 processed_crash=processed_crash,
             )
 
-            mm.assert_histogram("processor.es.raw_crash_size", value=2)
-            mm.assert_histogram("processor.es.processed_crash_size", value=96)
-            mm.assert_histogram("processor.es.crash_document_size", value=186)
+            mm.assert_histogram("processor.es.crash_document_size", value=169)
Collaborator Author commented:
This value changes because the document no longer contains a "raw_crash": {},.

Collaborator Author commented:
Now that I'm thinking about this, we could stop emitting the processed crash size. The "Socorro prod app metrics" dashboard only has a panel for the crash document size.

I'll fix this now--one less metric.
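
A quick arithmetic check of the 186 to 169 change, not part of the PR: assuming the size metric is the length of the JSON-serialized document (consistent with the old raw_crash_size assertion of 2, the length of "{}"), dropping the empty raw_crash member accounts for exactly 17 bytes.

```python
import json

with_raw = {"crash_id": "abc123", "raw_crash": {}, "processed_crash": {}}
without_raw = {"crash_id": "abc123", "processed_crash": {}}

# json.dumps uses ", " and ": " separators by default, so the dropped
# '"raw_crash": {}' pair plus its ", " separator is 17 characters.
assert len(json.dumps(with_raw)) - len(json.dumps(without_raw)) == 17  # 186 - 169
```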


     def test_index_data_capture(self, es_helper):
         """Verify we capture index data in ES crashstorage"""
@@ -352,7 +334,6 @@ def test_delete_expired_indices(self, es_helper):
     )
     def test_indexing_bad_data(self, key, value, expected_value, es_helper):
         crash_id = create_new_ooid()
-        raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
         processed_crash = {
             "date_processed": date_from_ooid(crash_id),
             "uuid": crash_id,
@@ -368,7 +349,7 @@ def test_indexing_bad_data(self, key, value, expected_value, es_helper):
 
         # Save the crash data and then fetch it and verify the value is as expected
         crashstorage = self.build_crashstorage()
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
         es_helper.refresh()
2 changes: 0 additions & 2 deletions socorro/tests/external/es/test_query.py
@@ -31,15 +31,13 @@ def test_get(self, es_helper):
 
         datestamp = date_to_string(utc_now())
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": create_new_ooid(),
                 "date_processed": datestamp,
                 "product": "WaterWolf",
             },
         )
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": create_new_ooid(),
                 "date_processed": datestamp,
4 changes: 1 addition & 3 deletions socorro/tests/external/es/test_super_search_fields.py
@@ -97,7 +97,6 @@ def test_get_mapping(self, es_helper):
         properties = mapping[doctype]["properties"]
 
         print(json.dumps(properties, indent=4, sort_keys=True))
-        assert "raw_crash" not in properties
         assert "processed_crash" in properties
 
         processed_crash = properties["processed_crash"]["properties"]
@@ -218,8 +217,7 @@ def test_validate_super_search_fields(name, properties):
     if properties.get("destination_keys"):
         for key in properties["destination_keys"]:
             possible_keys = [
-                # Old keys we're probably migrating from
-                f"raw_crash.{properties['in_database_name']}",
Collaborator Author commented:
None of the keys start with raw_crash and we want to make sure that's true going forward. We only want to index values that have been normalized and validated and those are in the processed crash.
+                # Old key we're possibly migrating from
                 f"processed_crash.{properties['in_database_name']}",
                 # New key we're probably migrating to
                 f"processed_crash.{properties['name']}",
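
Following up on the comment above: a hypothetical guard test, not in this PR, that would enforce the invariant going forward. FIELDS and the field-dict shape are assumed from the surrounding test code.

```python
def test_no_raw_crash_destination_keys():
    # Every destination key must live under processed_crash, since only
    # normalized and validated values belong in the index.
    for name, properties in FIELDS.items():
        for key in properties.get("destination_keys", []):
            assert key.startswith("processed_crash."), f"{name}: {key}"
```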