bug-1885978: remove raw crash from indexing code #6560

Merged: 1 commit, Mar 20, 2024
12 changes: 3 additions & 9 deletions socorro/external/es/crashstorage.py
@@ -216,7 +216,7 @@ def fix_datetime(value):
 def build_document(src, crash_document, fields, all_keys):
     """Given a source document and fields and valid keys, builds a document to index.
 
-    :param dict src: the source document with raw_crash and processed_crash keys
+    :param dict src: the source document with "processed_crash" key
     :param dict crash_document: the document to fill
     :param list fields: the list of fields in super search fields
     :param set all_keys: the list of valid keys
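
Not part of the diff: a conceptual sketch of what build_document does, assuming each field dict carries "namespace", "in_database_name", and "destination_keys" (the shape suggested by the docstrings and the validation test later on this page); the real implementation lives at the hunk above.

```python
def build_document_sketch(src, crash_document, fields, all_keys):
    """Rough illustration only: copy each field's value from src into
    crash_document, honoring the all_keys filter."""

    def get_path(doc, dotted):
        # Walk a dotted path like "processed_crash.uuid" through nested dicts.
        for part in dotted.split("."):
            if not isinstance(doc, dict) or part not in doc:
                return None
            doc = doc[part]
        return doc

    def set_path(doc, dotted, value):
        # Create intermediate dicts as needed, then set the leaf value.
        *parents, leaf = dotted.split(".")
        for part in parents:
            doc = doc.setdefault(part, {})
        doc[leaf] = value

    for field in fields:
        value = get_path(src, f"{field['namespace']}.{field['in_database_name']}")
        if value is None:
            continue
        for dest_key in field.get("destination_keys", []):
            if dest_key in all_keys:
                set_path(crash_document, dest_key, value)
```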
@@ -270,7 +270,7 @@ def build_document(src, crash_document, fields, all_keys):
 
 
 class ESCrashStorage(CrashStorageBase):
-    """This sends raw and processed crash reports to Elasticsearch."""
+    """Indexes documents based on the processed crash to Elasticsearch."""
 
     # These regex will catch field names from Elasticsearch exceptions. They
     # have been tested with Elasticsearch 1.4.
@@ -498,14 +498,10 @@ def save_processed_crash(self, raw_crash, processed_crash):
         es_doctype = self.get_doctype()
         all_valid_keys = self.get_keys(index_name, es_doctype)
 
-        src = {
-            "raw_crash": copy.deepcopy(raw_crash),
Collaborator Author commented:
One less deepcopy should make it a little faster, though I'm not sure we'd notice.
"processed_crash": copy.deepcopy(processed_crash),
}
src = {"processed_crash": copy.deepcopy(processed_crash)}

crash_document = {
"crash_id": crash_id,
"raw_crash": {},
"processed_crash": {},
}
build_document(src, crash_document, fields=FIELDS, all_keys=all_valid_keys)
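
Aside, not from the PR: the deepcopy comment above can be sanity-checked with a small timeit sketch. The dict shapes and sizes below are invented; only the copy.deepcopy pattern comes from the diff.

```python
import copy
import timeit

# Invented stand-ins; real crash dicts vary widely in size.
raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
processed_crash = {f"field_{i}": {"nested": list(range(10))} for i in range(100)}

# Before this change: two deepcopies per save. After: one.
two_copies = timeit.timeit(
    lambda: {
        "raw_crash": copy.deepcopy(raw_crash),
        "processed_crash": copy.deepcopy(processed_crash),
    },
    number=1_000,
)
one_copy = timeit.timeit(
    lambda: {"processed_crash": copy.deepcopy(processed_crash)},
    number=1_000,
)
print(f"two deepcopies: {two_copies:.3f}s  one deepcopy: {one_copy:.3f}s")
```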
@@ -533,8 +529,6 @@ def _capture(key, data):
                 # we can fix it later.
                 self.logger.exception(f"something went wrong when capturing {key}")
 
-        _capture("raw_crash_size", crash_document["raw_crash"])
-        _capture("processed_crash_size", crash_document["processed_crash"])
         _capture("crash_document_size", crash_document)
 
     def _index_crash(self, connection, es_index, es_doctype, crash_document, crash_id):
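
For context, a sketch of what the _capture helper here plausibly does, pieced together from the logger line above and the old test assertion that an empty raw crash produced a size of 2, the length of "{}"; this is an assumption, not the actual Socorro code.

```python
import json
import logging

logger = logging.getLogger(__name__)


def capture_size(metrics, key, data):
    # Serialize the document and emit its size as a histogram. A failure to
    # serialize should never break crash storage, hence the broad except.
    try:
        metrics.histogram(f"processor.es.{key}", value=len(json.dumps(data)))
    except Exception:
        logger.exception(f"something went wrong when capturing {key}")
```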
16 changes: 8 additions & 8 deletions socorro/external/es/super_search_fields.py
@@ -297,8 +297,8 @@ def boolean_field(
     show up in aggregations.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
 
     :returns: super search field specification as a dict
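
Illustrative calls, not from the diff, showing the two namespace forms the updated docstring describes; both fields are hypothetical.

```python
# Field rooted at the top of the processed crash.
accessibility = boolean_field(
    name="accessibility",
    namespace="processed_crash",
    in_database_name="accessibility",
)

# Field rooted at a dotted path deeper inside the processed crash.
oom_small = boolean_field(
    name="oom_small_allocation",
    namespace="processed_crash.memory_report",
    in_database_name="oom_small_allocation",
)
```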
@@ -332,8 +332,8 @@ def keyword_field(
     names, fields that have a limited set of choices, etc.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
     :param choices: a list of valid values for the dropdown
 
@@ -367,8 +367,8 @@ def integer_field(
     """Generates a whole number field.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
     :param storage_mapping_type: the storage mapping type to use for Elasticsearch;
         "short", "integer", "long"
@@ -403,8 +403,8 @@ def float_field(
     """Generates a floating point field.
 
     :param name: the name used to query the field in super search
-    :param namespace: either "raw_crash" or "processed_crash"; note that we're moving
-        to a model where we pull everything from the processed_crash, so prefer that
+    :param namespace: either "processed_crash" or some dotted path to a key deep in
+        the processed crash
     :param in_database_name: the field in the processed crash to pull this data from
 
     :returns: super search field specification as a dict
16 changes: 7 additions & 9 deletions socorro/tests/conftest.py
@@ -240,19 +240,19 @@ def get_url(self):
     def refresh(self):
         self.conn.refresh()
 
-    def index_crash(self, raw_crash, processed_crash, refresh=True):
+    def index_crash(self, processed_crash, refresh=True):
         """Index a single crash and refresh"""
-        self._crashstorage.save_processed_crash(raw_crash, processed_crash)
+        self._crashstorage.save_processed_crash(
+            raw_crash={},
+            processed_crash=processed_crash,
+        )
 
         if refresh:
             self.refresh()
 
-    def index_many_crashes(
-        self, number, raw_crash=None, processed_crash=None, loop_field=None
-    ):
+    def index_many_crashes(self, number, processed_crash=None, loop_field=None):
         """Index multiple crashes and refresh at the end"""
         processed_crash = processed_crash or {}
-        raw_crash = raw_crash or {}
 
         crash_ids = []
         for i in range(number):
@@ -262,9 +262,7 @@ def index_many_crashes(
             if loop_field is not None:
                 processed_copy[loop_field] = processed_crash[loop_field] % i
 
-            self.index_crash(
-                raw_crash=raw_crash, processed_crash=processed_copy, refresh=False
-            )
+            self.index_crash(processed_crash=processed_copy, refresh=False)
 
         self.refresh()
         return crash_ids
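
For illustration, not in the diff: a hypothetical call with the trimmed signature. The "version" field and template are made up; loop_field relies on the % interpolation shown in the hunk above.

```python
# Index three crashes whose "version" value varies per iteration.
crash_ids = es_helper.index_many_crashes(
    3,
    processed_crash={"version": "100.0.%s"},
    loop_field="version",
)
```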
2 changes: 0 additions & 2 deletions socorro/tests/external/es/test_analyzers.py
@@ -34,7 +34,6 @@ def test_semicolon_keywords(self, es_helper):
 
         value1 = "/path/to/dll;;foo;C:\\bar\\boo"
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": crash_id_1,
                 "app_init_dlls": value1,
@@ -43,7 +42,6 @@
         )
         value2 = "/path/to/dll;D:\\bar\\boo"
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": crash_id_2,
                 "app_init_dlls": value2,
39 changes: 10 additions & 29 deletions socorro/tests/external/es/test_crashstorage.py
@@ -73,8 +73,6 @@
     },
 }
 
-SAMPLE_RAW_CRASH = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
-
 REMOVED_VALUE = object()
 
 
@@ -103,13 +101,12 @@ def build_crashstorage(self):
 
     def test_index_crash(self, es_helper):
         """Test indexing a crash document."""
-        raw_crash = deepcopy(SAMPLE_RAW_CRASH)
         processed_crash = deepcopy(SAMPLE_PROCESSED_CRASH)
         processed_crash["date_processed"] = date_to_string(utc_now())
 
         crashstorage = self.build_crashstorage()
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
@@ -121,9 +118,6 @@
 
     def test_index_crash_indexable_keys(self, es_helper):
         """Test indexing ONLY indexes valid, known keys."""
-        raw_crash = {
-            "InvalidKey": "alpha",
-        }
         processed_crash = {
             "another_invalid_key": "alpha",
             "date_processed": date_to_string(utc_now()),
Expand All @@ -133,7 +127,7 @@ def test_index_crash_indexable_keys(self, es_helper):

crashstorage = self.build_crashstorage()
crashstorage.save_processed_crash(
raw_crash=raw_crash,
raw_crash={},
processed_crash=processed_crash,
)

@@ -143,11 +137,8 @@
             id=processed_crash["uuid"],
         )
 
-        # Verify keys that aren't in super_search_fields aren't in the raw or processed
-        # crash parts
-        raw_crash = doc["_source"]["raw_crash"]
-        assert list(sorted(raw_crash.keys())) == []
Collaborator Author commented:
One less "sort an empty list" in the codebase. 🎉
+        # Verify keys that aren't in super_search_fields aren't in the final
+        # document
         processed_crash = doc["_source"]["processed_crash"]
         assert list(sorted(processed_crash.keys())) == [
             "date_processed",
@@ -188,33 +179,27 @@ def test_index_crash_mapping_keys(self, es_helper):
 
         # Create a crash for this week and save it
         now_uuid = create_new_ooid(timestamp=now)
-        raw_crash = {
-            "BuildID": "20200506000000",
-        }
         processed_crash = {
             field: "this week",
             "date_processed": date_to_string(now),
             "uuid": now_uuid,
         }
 
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
         # Create a crash for four weeks ago with the bum mapping and save it
         old_uuid = create_new_ooid(timestamp=four_weeks_ago)
-        raw_crash = {
-            "BuildID": "20200506000000",
-        }
         processed_crash = {
             field: "this week",
             "date_processed": date_to_string(now - timedelta(days=28)),
             "uuid": old_uuid,
         }
 
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
 
@@ -239,9 +224,8 @@
             assert field not in doc["_source"]["processed_crash"]
 
     def test_crash_size_capture(self):
-        """Verify we capture raw/processed crash sizes in ES crashstorage"""
+        """Verify saving a processed crash emits a metric for crash document size"""
         crash_id = create_new_ooid()
-        raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
         processed_crash = {
             "date_processed": "2012-04-08 10:56:41.558922",
             "uuid": crash_id,
@@ -250,13 +234,11 @@
         crashstorage = self.build_crashstorage()
         with MetricsMock() as mm:
             crashstorage.save_processed_crash(
-                raw_crash=raw_crash,
+                raw_crash={},
                 processed_crash=processed_crash,
             )
 
-            mm.assert_histogram("processor.es.raw_crash_size", value=2)
-            mm.assert_histogram("processor.es.processed_crash_size", value=96)
-            mm.assert_histogram("processor.es.crash_document_size", value=186)
+            mm.assert_histogram("processor.es.crash_document_size", value=169)
Collaborator Author commented:
This value changes because the document no longer contains a "raw_crash": {},.

Collaborator Author commented:
Now that I'm thinking about this, we could stop emitting the processed crash size. The "Socorro prod app metrics" dashboard only has a panel for the crash document size.

I'll fix this now--one less metric.
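
A quick arithmetic check of the 186 to 169 change, not part of the PR: assuming the size metric is the length of the JSON-serialized document (consistent with the old raw_crash_size assertion of 2, the length of "{}"), dropping the empty raw_crash member accounts for exactly 17 bytes.

```python
import json

with_raw = {"crash_id": "abc123", "raw_crash": {}, "processed_crash": {}}
without_raw = {"crash_id": "abc123", "processed_crash": {}}

# json.dumps uses ", " and ": " separators by default, so the dropped
# '"raw_crash": {}' pair plus its ", " separator is 17 characters.
assert len(json.dumps(with_raw)) - len(json.dumps(without_raw)) == 17  # 186 - 169
```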


     def test_index_data_capture(self, es_helper):
         """Verify we capture index data in ES crashstorage"""
@@ -352,7 +334,6 @@ def test_delete_expired_indices(self, es_helper):
     )
     def test_indexing_bad_data(self, key, value, expected_value, es_helper):
         crash_id = create_new_ooid()
-        raw_crash = {"ProductName": "Firefox", "ReleaseChannel": "nightly"}
         processed_crash = {
             "date_processed": date_from_ooid(crash_id),
             "uuid": crash_id,
@@ -368,7 +349,7 @@ def test_indexing_bad_data(self, key, value, expected_value, es_helper):
 
         # Save the crash data and then fetch it and verify the value is as expected
         crashstorage = self.build_crashstorage()
         crashstorage.save_processed_crash(
-            raw_crash=raw_crash,
+            raw_crash={},
             processed_crash=processed_crash,
         )
         es_helper.refresh()
2 changes: 0 additions & 2 deletions socorro/tests/external/es/test_query.py
@@ -31,15 +31,13 @@ def test_get(self, es_helper):
 
         datestamp = date_to_string(utc_now())
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": create_new_ooid(),
                 "date_processed": datestamp,
                 "product": "WaterWolf",
             },
         )
         es_helper.index_crash(
-            raw_crash={},
             processed_crash={
                 "uuid": create_new_ooid(),
                 "date_processed": datestamp,
4 changes: 1 addition & 3 deletions socorro/tests/external/es/test_super_search_fields.py
@@ -97,7 +97,6 @@ def test_get_mapping(self, es_helper):
         properties = mapping[doctype]["properties"]
 
         print(json.dumps(properties, indent=4, sort_keys=True))
-        assert "raw_crash" not in properties
         assert "processed_crash" in properties
 
         processed_crash = properties["processed_crash"]["properties"]
@@ -218,8 +217,7 @@ def test_validate_super_search_fields(name, properties):
     if properties.get("destination_keys"):
         for key in properties["destination_keys"]:
             possible_keys = [
-                # Old keys we're probably migrating from
-                f"raw_crash.{properties['in_database_name']}",
Collaborator Author commented:
None of the keys start with raw_crash and we want to make sure that's true going forward. We only want to index values that have been normalized and validated and those are in the processed crash.
+                # Old key we're possibly migrating from
                 f"processed_crash.{properties['in_database_name']}",
                 # New key we're probably migrating to
                 f"processed_crash.{properties['name']}",
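
Following up on the comment above: a hypothetical guard test, not in this PR, that would enforce the invariant going forward. FIELDS and the field-dict shape are assumed from the surrounding test code.

```python
def test_no_raw_crash_destination_keys():
    # Every destination key must live under processed_crash, since only
    # normalized and validated values belong in the index.
    for name, properties in FIELDS.items():
        for key in properties.get("destination_keys", []):
            assert key.startswith("processed_crash."), f"{name}: {key}"
```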