bug-1881575: support storing crashes in gcs #977

Merged: 9 commits, Mar 7, 2024

7 changes: 5 additions & 2 deletions .circleci/config.yml
@@ -85,13 +85,16 @@ jobs:
docker compose run --rm ci shell bash -c 'cd systemtest && NGINX_TESTS=0 POST_CHECK=1 HOST=http://ci-web:8000 pytest -vv'

- run:
name: Run systemtest with pubsub
name: Run systemtest with pubsub and gcs
command: |
echo 'CRASHMOVER_CRASHPUBLISH_CLASS=antenna.ext.pubsub.crashpublish.PubSubCrashPublish' >> my.env
echo 'CRASHMOVER_CRASHSTORAGE_CLASS=antenna.ext.gcs.crashstorage.GcsCrashStorage' >> my.env
docker compose run --rm ci shell ./bin/run_setup.sh
docker compose up --detach --wait --wait-timeout=10 ci-web
docker compose run --rm ci shell bash -c 'cd systemtest && NGINX_TESTS=0 POST_CHECK=1 HOST=http://ci-web:8000 pytest -vv'
sed '$d' -i my.env # remove config on last line
# remove config on last two lines
sed '$d' -i my.env
sed '$d' -i my.env

- run:
name: Push to Dockerhub
21 changes: 21 additions & 0 deletions antenna/ext/crashstorage_base.py
@@ -4,6 +4,8 @@

import logging

from antenna.util import get_date_from_crash_id


logger = logging.getLogger(__name__)

@@ -17,6 +19,25 @@ class Config:
def __init__(self, config):
self.config = config.with_options(self)

def _path_join(self, *paths):
return "/".join(paths)

def _get_raw_crash_path(self, crash_id):
date = get_date_from_crash_id(crash_id)
return self._path_join("v1", "raw_crash", date, crash_id)

def _get_dump_names_path(self, crash_id):
return self._path_join("v1", "dump_names", crash_id)

def _get_dump_name_path(self, crash_id, dump_name):
# NOTE(willkg): This is something that Socorro collector did. I'm not
# really sure why, but in order to maintain backwards compatibility, we
# need to keep doing it.
if dump_name in (None, "", "upload_file_minidump"):
dump_name = "dump"

return self._path_join("v1", dump_name, crash_id)

def publish_crash(self, crash_report):
"""Save the crash report."""
raise NotImplementedError
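
For illustration, here is a sketch of the object paths these shared helpers produce; the crash ID below is hypothetical, and `get_date_from_crash_id` derives the date portion from the crash ID itself:

```python
# Illustrative only: paths produced by the base-class helpers for a
# hypothetical crash ID whose embedded date is 2024-02-27.
crash_id = "de1bb258-cbbf-4589-a673-34f800240227"

# _get_raw_crash_path(crash_id)
#   -> "v1/raw_crash/20240227/de1bb258-cbbf-4589-a673-34f800240227"
# _get_dump_names_path(crash_id)
#   -> "v1/dump_names/de1bb258-cbbf-4589-a673-34f800240227"
# _get_dump_name_path(crash_id, "upload_file_minidump")  # normalized to "dump"
#   -> "v1/dump/de1bb258-cbbf-4589-a673-34f800240227"
# _get_dump_name_path(crash_id, "browser")
#   -> "v1/browser/de1bb258-cbbf-4589-a673-34f800240227"
```
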
41 changes: 12 additions & 29 deletions antenna/ext/fs/crashstorage.py
@@ -9,7 +9,7 @@
from everett.manager import Option

from antenna.ext.crashstorage_base import CrashStorageBase
from antenna.util import get_date_from_crash_id, json_ordered_dumps
from antenna.util import json_ordered_dumps


logger = logging.getLogger(__name__)
@@ -19,18 +19,19 @@ class FSCrashStorage(CrashStorageBase):
"""Save raw crash files to the file system.

This generates a tree something like this which mirrors what we do
on S3:
on S3 and GCS:

::

<FS_ROOT>/
<YYYYMMDD>/
raw_crash/
<CRASHID>.json
dump_names/
<CRASHID>.json
<DUMP_NAME>/
<CRASHID>
v1/
dump_names/
<CRASHID>
<DUMPNAME>/
<CRASHID>
raw_crash/
<YYYYMMDD>/
<CRASHID>


Couple of things to note:
@@ -63,26 +64,8 @@ def __init__(self, config):
if not os.path.isdir(self.root):
os.makedirs(self.root)

def _get_raw_crash_path(self, crash_id):
"""Return path for where the raw crash should go."""
return os.path.join(
self.root, get_date_from_crash_id(crash_id), "raw_crash", crash_id + ".json"
)

def _get_dump_names_path(self, crash_id):
"""Return path for where the dump_names list should go."""
return os.path.join(
self.root,
get_date_from_crash_id(crash_id),
"dump_names",
crash_id + ".json",
)

def _get_dump_name_path(self, crash_id, dump_name):
"""Return path for a given dump."""
return os.path.join(
self.root, get_date_from_crash_id(crash_id), dump_name, crash_id
)
def _path_join(self, *paths):
return os.path.join(self.root, *paths)

def _save_file(self, fn, contents):
logger.debug("Saving file %r", fn)
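
With this change the FS backend reuses the base-class path helpers and only overrides `_path_join` to root everything under `FS_ROOT`. A minimal sketch of the effect (the root directory is hypothetical):

```python
import os

class PathDemo:
    """Sketch: FSCrashStorage._path_join roots the shared v1/... layout."""

    root = "/tmp/antenna-crashes"  # hypothetical FS_ROOT

    def _path_join(self, *paths):
        return os.path.join(self.root, *paths)

print(PathDemo()._path_join("v1", "dump_names", "some-crash-id"))
# -> /tmp/antenna-crashes/v1/dump_names/some-crash-id
```
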
3 changes: 3 additions & 0 deletions antenna/ext/gcs/__init__.py
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
163 changes: 163 additions & 0 deletions antenna/ext/gcs/crashstorage.py
@@ -0,0 +1,163 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

import logging
import os
import uuid

from everett.manager import Option
from google.auth.credentials import AnonymousCredentials
from google.cloud import storage

from antenna.app import register_for_verification
from antenna.ext.crashstorage_base import CrashStorageBase
from antenna.util import json_ordered_dumps

logger = logging.getLogger(__name__)


def generate_test_filepath():
"""Generate a unique-ish test filepath."""
return "test/testfile-%s.txt" % uuid.uuid4()


class GcsCrashStorage(CrashStorageBase):
"""Save raw crash files to GCS.

This will save raw crash files to GCS in a pseudo-tree something like this:

::

<BUCKET>
v1/
dump_names/
<CRASHID>
<DUMPNAME>/
<CRASHID>
raw_crash/
<YYYYMMDD>/
<CRASHID>


**Authentication**

The Google Cloud SDK will automatically detect credentials as described in
https://googleapis.dev/python/google-api-core/latest/auth.html:

- If you're running in a Google Virtual Machine Environment (Compute Engine, App
Engine, Cloud Run, Cloud Functions), authentication should "just work".
- If you're developing locally, the easiest way to authenticate is using the `Google
Cloud SDK <http://cloud.google.com/sdk>`_::

$ gcloud auth application-default login

- If you're running your application elsewhere, you should download a `service account
<https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating>`_
JSON keyfile and point to it using an environment variable::

$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"


**Local emulator**

If you set the environment variable ``STORAGE_EMULATOR_HOST=http://host:port``,
then this will connect to a local GCS emulator.


"""

class Config:
bucket_name = Option(
doc=(
"Google Cloud Storage bucket to save to. Note that the bucket must "
"already have been created."
),
)

def __init__(self, config):
self.config = config.with_options(self)
self.bucket = self.config("bucket_name")

if emulator := os.environ.get("STORAGE_EMULATOR_HOST"):
logger.debug(
"STORAGE_EMULATOR_HOST detected, connecting to emulator: %s",
emulator,
)
self.client = storage.Client(
credentials=AnonymousCredentials(),
project="test",
)
else:
self.client = storage.Client()
@relud (Member, Author) commented on Feb 27, 2024:

Unlike for pubsub in #974 (comment), the default retry and timeout behavior for google-cloud-storage is better defined. We use two network methods in this class: Client.get_bucket and Blob.upload_from_string, both of which have a default timeout of 60 seconds. Blob.upload_from_string is expected not to retry because the client assumes it is not an idempotent action. Client.get_bucket may retry, and I think that's fine given there is a default timeout set and a default retry deadline of 120 seconds. Overall, a single file upload is bounded by (bucket retry deadline) + (bucket RPC timeout) + (blob upload RPC timeout) => 120 + 60 + 60 => 240 seconds.

tl;dr I think this client sets sane defaults for retry and timeout, and we shouldn't mess with them unless/until we see an issue.
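
If the defaults ever do prove insufficient, both calls accept explicit retry/timeout arguments. A hypothetical tuning sketch (not part of this PR; parameter names are from google-cloud-storage and google-api-core):

```python
from google.api_core.retry import Retry
from google.cloud import storage

client = storage.Client()
# get_bucket is idempotent, so a custom Retry with a tighter deadline is safe.
bucket = client.get_bucket("my-bucket", timeout=30, retry=Retry(deadline=60))
blob = bucket.blob("test/example.txt")
# upload_from_string is treated as non-idempotent; set only a per-RPC timeout.
blob.upload_from_string(b"data", timeout=30)
```
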


register_for_verification(self.verify_write_to_bucket)

def _save_file(self, path, data):
"""Save a single file to GCS.

:arg str path: the path to save to
:arg bytes data: the data to save

"""
bucket = self.client.get_bucket(self.bucket)
blob = bucket.blob(path)
blob.upload_from_string(data)

def verify_write_to_bucket(self):
"""Verify GCS bucket exists and can be written to."""
self._save_file(generate_test_filepath(), b"test")

def check_health(self, state):
"""Check GCS connection health."""
try:
# get the bucket to verify GCS is up and we can connect to it.
self.client.get_bucket(self.bucket)
except Exception as exc:
state.add_error("GcsCrashStorage", repr(exc))

def save_raw_crash(self, crash_id, raw_crash):
"""Save the raw crash and related dumps.

.. Note::

If you're saving the raw crash and dumps, make sure to save the raw
crash last.

:arg str crash_id: The crash id as a string.
:arg dict raw_crash: The raw crash as a dict.

"""
self._save_file(
self._get_raw_crash_path(crash_id),
json_ordered_dumps(raw_crash).encode("utf-8"),
)

def save_dumps(self, crash_id, dumps):
"""Save dump data.

:arg str crash_id: The crash id
:arg dict dumps: dump name -> dump

"""
# Save dump_names even if there are no dumps
self._save_file(
self._get_dump_names_path(crash_id),
json_ordered_dumps(sorted(dumps.keys())).encode("utf-8"),
)

# Save dumps
for dump_name, dump in dumps.items():
self._save_file(self._get_dump_name_path(crash_id, dump_name), dump)

def save_crash(self, crash_report):
"""Save crash data."""
crash_id = crash_report.crash_id
raw_crash = crash_report.raw_crash
dumps = crash_report.dumps

# Save dumps first
self.save_dumps(crash_id, dumps)

# Save raw crash
self.save_raw_crash(crash_id, raw_crash)
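
A minimal usage sketch of the save path (the crash ID, dump bytes, and `storage` instance here are hypothetical; in Antenna the crashmover drives these calls via configuration):

```python
# Hypothetical wiring: `storage` is a configured GcsCrashStorage instance.
crash_id = "de1bb258-cbbf-4589-a673-34f800240227"
dumps = {"upload_file_minidump": b"MDMP..."}  # dump name -> dump bytes
raw_crash = {"ProductName": "Firefox", "Version": "123.0"}

storage.save_dumps(crash_id, dumps)          # writes dump_names plus each dump
storage.save_raw_crash(crash_id, raw_crash)  # raw crash is saved last, as noted
```
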
22 changes: 1 addition & 21 deletions antenna/ext/s3/crashstorage.py
@@ -8,7 +8,7 @@

from antenna.app import register_for_verification
from antenna.ext.crashstorage_base import CrashStorageBase
from antenna.util import get_date_from_crash_id, json_ordered_dumps
from antenna.util import json_ordered_dumps


logger = logging.getLogger(__name__)
@@ -63,26 +63,6 @@ def check_health(self, state):
"""Check connection health."""
self.connection.check_health(state)

def _get_raw_crash_path(self, crash_id):
return "v1/raw_crash/{date}/{crash_id}".format(
date=get_date_from_crash_id(crash_id),
crash_id=crash_id,
)

def _get_dump_names_path(self, crash_id):
return f"v1/dump_names/{crash_id}"

def _get_dump_name_path(self, crash_id, dump_name):
# NOTE(willkg): This is something that Socorro collector did. I'm not
# really sure why, but in order to maintain backwards compatibility, we
# need to keep doing it.
if dump_name in (None, "", "upload_file_minidump"):
dump_name = "dump"

return "v1/{dump_name}/{crash_id}".format(
dump_name=dump_name, crash_id=crash_id
)

def save_raw_crash(self, crash_id, raw_crash):
"""Save the raw crash and related dumps.

4 changes: 2 additions & 2 deletions bin/run_setup.sh
@@ -22,8 +22,8 @@ python ./bin/s3_cli.py create "${CRASHMOVER_CRASHSTORAGE_BUCKET_NAME}"
python ./bin/s3_cli.py list_buckets

echo "Delete and create GCS bucket..."
python ./bin/gcs_cli.py delete "${CRASHMOVER_CRASHSTORAGE_GCS_BUCKET_NAME}"
python ./bin/gcs_cli.py create "${CRASHMOVER_CRASHSTORAGE_GCS_BUCKET_NAME}"
python ./bin/gcs_cli.py delete "${CRASHMOVER_CRASHSTORAGE_BUCKET_NAME}"
python ./bin/gcs_cli.py create "${CRASHMOVER_CRASHSTORAGE_BUCKET_NAME}"
python ./bin/gcs_cli.py list_buckets

echo "Delete and create SQS queue..."
2 changes: 2 additions & 0 deletions docker-compose.yml
@@ -52,6 +52,7 @@ services:
env_file:
- docker/config/local_dev.env
- docker/config/test.env
- my.env
links:
- fakesentry
- gcs-emulator
@@ -67,6 +68,7 @@
# exclude docker/config/test.env because this will be used for systemtest
# which requires store and publish to actually happen
- docker/config/local_dev.env
- my.env
command: web

# Web container is a prod-like fully-functioning Antenna container
2 changes: 1 addition & 1 deletion docker/config/local_dev.env
@@ -25,7 +25,7 @@ CRASHMOVER_CRASHPUBLISH_SUBSCRIPTION_NAME=local_dev_socorro_sub
PUBSUB_EMULATOR_HOST=pubsub:5010

# GCS settings
CRASHMOVER_CRASHSTORAGE_GCS_BUCKET_NAME=antennabucket
CRASHMOVER_CRASHSTORAGE_BUCKET_NAME=antennabucket

# Set GCS library to use emulator
# This has to be an https url or google_cloud_storage won't work
19 changes: 19 additions & 0 deletions docs/configuration.rst
@@ -241,6 +241,25 @@ supported.
configuration here.


Google Cloud Storage
--------------------

The ``GcsCrashStorage`` class will save crash data to Google Cloud Storage.

.. autocomponentconfig:: antenna.ext.gcs.crashstorage.GcsCrashStorage
:show-docstring:
:case: upper
:namespace: crashmover_crashstorage
:show-table:

When set as the CrashMover crashstorage class, configuration
for this class is in the ``CRASHMOVER_CRASHSTORAGE`` namespace.

Example::

CRASHMOVER_CRASHSTORAGE_BUCKET_NAME=mybucket
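
To route crashes through this class, also set the crashstorage class (as the
CI configuration in this PR does)::

    CRASHMOVER_CRASHSTORAGE_CLASS=antenna.ext.gcs.crashstorage.GcsCrashStorage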


Crash publish
=============
