diff --git a/.gitignore b/.gitignore index 66019d4..b33ab23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Custom ignores observability/examples/simple/observability-simple - +_build # Python ignores # Byte-compiled / optimized / DLL files diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..731b37f --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,17 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-20.04 + tools: + python: "3.9" + +sphinx: + configuration: docs/conf.py + +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/screenshots-dashboards-alerts.png b/docs/_static/screenshots-dashboards-alerts.png new file mode 100644 index 0000000..d210551 Binary files /dev/null and b/docs/_static/screenshots-dashboards-alerts.png differ diff --git a/docs/_static/screenshots-dashboards-availability.png b/docs/_static/screenshots-dashboards-availability.png new file mode 100644 index 0000000..7876c91 Binary files /dev/null and b/docs/_static/screenshots-dashboards-availability.png differ diff --git a/docs/_static/screenshots-dashboards-docker-metrics.png b/docs/_static/screenshots-dashboards-docker-metrics.png new file mode 100644 index 0000000..0abdb13 Binary files /dev/null and b/docs/_static/screenshots-dashboards-docker-metrics.png differ diff --git a/docs/_static/screenshots-dashboards-es-metrics.png b/docs/_static/screenshots-dashboards-es-metrics.png new file mode 100644 index 0000000..022aaf1 Binary files /dev/null and b/docs/_static/screenshots-dashboards-es-metrics.png differ diff --git a/docs/_static/screenshots-dashboards-vm-metrics.png b/docs/_static/screenshots-dashboards-vm-metrics.png new file mode 100644 index 0000000..1c2ec59 Binary files /dev/null and b/docs/_static/screenshots-dashboards-vm-metrics.png differ diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..4fd3c9e --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,37 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html +import os +import sys +sys.path.insert(0, os.path.abspath("../observability/docs")) + +print("Hello") +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'CogStack Platform Toolkit' +copyright = '2025, CogStack Org' +author = 'CogStack Org' +release = 'latest' +html_title = "CogStack Platform Toolkit" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx_rtd_theme', + 'sphinx.ext.autodoc', + 'myst_parser', + 'sphinx.ext.inheritance_diagram', +] +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "furo" +html_static_path = ['_static'] diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..6402543 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,15 @@ + +# Cogstack Platform Toolkit + +This project provides utilities for running Cogstack in production. + +- [CogStack Observability](observability/_index.md) + +```{toctree} +:hidden: + +observability/_index + +``` + + diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..8084272 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/observability/docs/overview.md b/docs/observability/_index.md similarity index 80% rename from observability/docs/overview.md rename to docs/observability/_index.md index 84158f2..00fb27f 100644 --- a/observability/docs/overview.md +++ b/docs/observability/_index.md @@ -1,4 +1,4 @@ -# Cogstack Observability Stack +# Cogstack Observability This project provides observability of a cogstack deployment. @@ -9,10 +9,15 @@ It provides the following features: - Blackbox Probing of services to find service level indicators of uptime and latency - A working inventory of what is running where -## Contents See the [Quickstart](./get-started/quickstart.md) to see how to easily run this stack. 
+```{toctree} +:maxdepth: 2 +get-started/_index +setup/_index +customization/_index +reference/_index - +``` diff --git a/docs/observability/customization/_index.md b/docs/observability/customization/_index.md new file mode 100644 index 0000000..715e31b --- /dev/null +++ b/docs/observability/customization/_index.md @@ -0,0 +1,19 @@ +# Customization + +```{include} custom-dashboards.md +:heading-offset: 1 +``` + +```{include} custom-prometheus-configs.md +:heading-offset: 1 +``` + + +```{toctree} +:titlesonly: +:hidden: + +custom-prometheus-configs.md +custom-dashboards.md + +``` \ No newline at end of file diff --git a/docs/observability/customization/custom-dashboards.md b/docs/observability/customization/custom-dashboards.md new file mode 100644 index 0000000..87c98b3 --- /dev/null +++ b/docs/observability/customization/custom-dashboards.md @@ -0,0 +1,15 @@ +# Custom Dashboards +You can setup custom dashboards as json files, and include them along with the defaults in this project. + +Grafana is setup with preconfigured dashboards, datasource, and alerting. These will work when prometheus is run in this stack, and is dependent on all the metrics following defined rules. + +It is advised that any edits or new configs get committed back into your git repository, and stick with grafana provisioning instead of allowing manual edits. 
+ + +## How to add a new dashboard with provisioning + +- Mount new dashboard files in the `/etc/grafana/provisioning/dashboards/site` directory +- To remove or change the existing dashboards, mount over the existing files there + +For more info see [Grafana Dashboard Provisioning](https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards) + diff --git a/docs/observability/customization/custom-prometheus-configs.md b/docs/observability/customization/custom-prometheus-configs.md new file mode 100644 index 0000000..5ec75c2 --- /dev/null +++ b/docs/observability/customization/custom-prometheus-configs.md @@ -0,0 +1,17 @@ +# Custom Prometheus Configuration +You can add completely custom prometheus scrape configs and recording rules by mounting in docker. + +- `site/prometheus/scrape-configs/*.yml`. This is for advanced configuration. + +Any yml file put in this directory will be used as standard prometheus scrape configs. This will give full flexibility over what metrics are collected and all features in prometheus. Add any further configs that you want prometheus to use. + +```yaml +# Custom scrape config definition +scrape_configs: + - job_name: custom-scrape-config # Scrape configuration to get metrics from elasticsearch, eg index size. 
+ static_configs: + - targets: + - my-custom-target:9114 + labels: + custom_label: custom # (Optional) +``` \ No newline at end of file diff --git a/docs/observability/get-started/_index.md b/docs/observability/get-started/_index.md new file mode 100644 index 0000000..0dc7dff --- /dev/null +++ b/docs/observability/get-started/_index.md @@ -0,0 +1,7 @@ +# Getting Started + +```{toctree} +:maxdepth: 2 +quickstart +userguide-tutorial +``` diff --git a/observability/docs/get-started/quickstart.md b/docs/observability/get-started/quickstart.md similarity index 65% rename from observability/docs/get-started/quickstart.md rename to docs/observability/get-started/quickstart.md index 565ff5c..28e18a7 100644 --- a/observability/docs/get-started/quickstart.md +++ b/docs/observability/get-started/quickstart.md @@ -1,32 +1,34 @@ -## QuickStart +# QuickStart This tutorial guides you through running the simplest setup of the observability stack using example configuration files and Docker Compose. After completing these steps, you will have a full observability stack running locally, showing the availability of web pages you want to target -### Requirements +## Requirements - Docker installed ([install Docker](https://docs.docker.com/get-docker/)) - Docker Compose installed ([install Docker Compose](https://docs.docker.com/compose/install/)) - A terminal with network access +## Steps + ### Step 1: Run the Quickstart script Run this quickstart script to setup the project ```bash -curl https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/main/observability/examples/simple/quickstart.sh | bash +curl https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/refs/heads/main/observability/examples/simple/quickstart.sh | bash ``` Now go to "http://localhost/grafana" to see the dashboards Thats everything. The stack is running and you can see the availability. 
+If you can't use the script, see the [Manual Quickstart](../reference/quickstart-manual.md) to set up your own files. + ### Optional Step: Probe your own web page Now you can look at getting monitoring of your own page -In your current folder, edit the file `prometheus/scrape-configs/probers/probe-simple.yml` that you downloaded from git. - -Add the following yml to the bottom of the file: +1. In your current folder, in the file `prometheus/scrape-configs/probers/probe-simple.yml` add the following yml to the bottom of the file: ```yaml - targets: @@ -36,23 +38,24 @@ Add the following yml to the bottom of the file: job: probe-my-own-site ``` +Be careful with the indentation in yml: this target must be at the same depth as the existing contents. -The change should get applied automatically, but if you dont want to wait then run +2. Restart the containers with: ``` docker compose restart ``` Now refresh the grafana dashboard, and you can see the availability of google.com, it's probably 100%! - ## Next steps This is the end of this quickstart tutorial, that enables probing availability of endpoints. 
For the next steps we can: +- Look deeper into the observability dashboards, on [Dashboards Userguide](./userguide-tutorial.md) - Productionise our deployment to enable further features -- Enable *Telemetry* like VM memory usage, and Elasticsearch index size, by running Exporters +- Configure *Telemetry* like VM memory usage, and Elasticsearch index size, by running Exporters - Enable *Alerting* based on our availability and a defined Service Level Objective (SLO) -- Look further into the available dashboards +- Set up further *Probing* of our running services to get availability metrics - Fully customize the stack with our own dashboards, recording rules and metrics diff --git a/docs/observability/get-started/userguide-tutorial.md b/docs/observability/get-started/userguide-tutorial.md new file mode 100644 index 0000000..a2d43d6 --- /dev/null +++ b/docs/observability/get-started/userguide-tutorial.md @@ -0,0 +1,76 @@ +# Dashboard User Guide +This guide walks you through how to monitor your stack using the included Grafana dashboards. It shows how to use each dashboard, and some ideas of what things to look out for. + +## Availability - How well are things running? +![Availability Dashboard](../../_static/screenshots-dashboards-availability.png) + +Open the Cogstack Monitoring Dashboard on [localhost/grafana](http://localhost/grafana/d/NEzutrbMk/cogstack-monitoring-dashboard) + +Use the percentage uptime charts at the top to see the availability over a given time period. For example, “Over the last 8 hours, we have 99.5% availability on my service”. + +Use the time filter in the top right corner of the page to change the window, for example change it to 30 days to see availability for the total month. + +Look for trends like: +- Has there been a full outage of a service for 5 minutes, where 5m availability goes to 0? +- Is there some disruption over the time period, where my 5m availability stays high, but my 6h availability is going down? 
+- Have we met the service level objective, if we set the time threshold to 30 days? + +Use the filters at the top, or click in the table to better filter the view down to specific targets, services or hosts. + +See [Setup Probing](../setup/probing.md) to do the full setup of probers. + +## Inventory - What is running? +![Docker Metrics Dashboard](../../_static/screenshots-dashboards-docker-metrics.png) + +Use the Docker Metrics dashboard to check which containers are running, where, and whether they're healthy. This is useful for verifying deployments or diagnosing issues. + +The dashboard above includes the hostnames, IP addresses and any other details configured. + +Check for things like: +- Containers not running where you thought they should be by looking at the hostname for each container +- Containers restarting unexpectedly, by looking at the "Running" column in the table + +See [telemetry](../setup/telemetry.md) to set this up + +## Telemetry - How can I see details of resources? +Some additional dashboards are setup to provide more metrics. + +### VM Metrics +![ VM Metrics dashboard ](../../_static/screenshots-dashboards-vm-metrics.png) + +Open the VM Metrics dashboard on [localhost/grafana](http://localhost/grafana/d/rYdddlPWk/vm-metrics-in-cogstack) + +Select a VM from the host dropdown . + +Look for things like: + +- CPU Usage — is a process using too much CPU? +- Memory Usage — if you're running out of RAM +- Disk IO / Space — alerts you to low disk conditions +- Trends over time, by setting the time filter to 30 days. Is your disk usage increasing over time? + +### Elasticsearch Metrics +![ElasticSearch Metrics Dashboard](../../_static/screenshots-dashboards-es-metrics.png) +Open the Elasticsearch Metrics dashboard on [localhost/grafana](http://localhost/grafana/d/n_nxrE_mk/elasticsearch-metrics-in-cogstack) + +This dashboard helps you understand how your ElasticSearch or Opensearch cluster is behaving. 
+ +Look at: +- Cluster health status — shows yellow/red states immediately +- Index size per shard — to detect unbalanced index growth +- Query latency and throughput — useful during heavy search loads + +See [telemetry](../setup/telemetry.md) to set this up + +## Alerting - When should I look at this? +Alerting is setup using Grafana Alerts, but paused by default + +When alerts are setup, the grafana graphs will show when the alerts were fired. +![Alerts Firing on dashboard](../../_static/screenshots-dashboards-alerts.png) + +Two sets of rules are defined in this project: + +- Basic alerts using uptime. If over 5m or 6h, if it drops below a certain percentage uptime, send an alert +- Alerting on SLOs by using burn rates, for multi-window multi-rate alerts following best practices defined in [Google SRE - Prometheus Alerting: Turn SLOs into Alerts](https://sre.google/workbook/alerting-on-slos/) + +See [Alerting](../setup/alerting.md) to set this up diff --git a/docs/observability/reference/_index.md b/docs/observability/reference/_index.md new file mode 100644 index 0000000..10348bb --- /dev/null +++ b/docs/observability/reference/_index.md @@ -0,0 +1,9 @@ +# Reference + +```{toctree} +:maxdepth: 2 + +project-details.md +concept-materials.md + +``` diff --git a/docs/observability/reference/concept-materials.md b/docs/observability/reference/concept-materials.md new file mode 100644 index 0000000..a65cd13 --- /dev/null +++ b/docs/observability/reference/concept-materials.md @@ -0,0 +1,7 @@ +# Concepts +```{toctree} +:maxdepth: 2 +understanding-metrics.md + +``` + diff --git a/docs/observability/reference/project-details.md b/docs/observability/reference/project-details.md new file mode 100644 index 0000000..e3b3041 --- /dev/null +++ b/docs/observability/reference/project-details.md @@ -0,0 +1,9 @@ +# Further Project Details + + +```{toctree} +:maxdepth: 2 +quickstart-manual.md + +``` + diff --git a/observability/docs/get-started/quickstart-manual.md 
b/docs/observability/reference/quickstart-manual.md similarity index 96% rename from observability/docs/get-started/quickstart-manual.md rename to docs/observability/reference/quickstart-manual.md index 71d009c..9349606 100644 --- a/observability/docs/get-started/quickstart-manual.md +++ b/docs/observability/reference/quickstart-manual.md @@ -1,6 +1,5 @@ - -## Manual Quickstart - +# Manual Quickstart +//TODO The quickstart page uses a script to setup the folders for you. This page instead details how to do it manually, to provide clarity. @@ -11,7 +10,8 @@ Create the necessary directory structure mkdir -p observability-simple/prometheus/scrape-configs/probers mkdir -p observability-simple/prometheus/scrape-configs/exporters ``` - + Something + Download these two files from github, and place in the right folder diff --git a/docs/observability/reference/understanding-metrics.md b/docs/observability/reference/understanding-metrics.md new file mode 100644 index 0000000..fd56eb3 --- /dev/null +++ b/docs/observability/reference/understanding-metrics.md @@ -0,0 +1,28 @@ +# Understanding Concepts + +This page provides some reference explanations for the concepts used + +## Availability in depth +We measure the availability of the stack using prometheus and blackbox exporter. + +The exporter calls an endpoint defined in the yaml at a given frequency, and exposes the result as either a 0 or 1. + +The success metric is 0 or 1, so our uptime over a time period is the average of the value over that period. EG - `avg_over_time(probe_success[8h]) * 100 ` + +Probing frequency is defined by the prometheus scrape_interval in the prometheus config, the exporter itself doesnt know. Example interval by default is every 10s + + +### Availability at a given point in time +What does the percentage availability mean? Lets explain with an example: + +Say we see in our 8h availability graph, we have 98.77% availability at 15:00 yesterday. + +Our probe interval is every 10 seconds. 
This means that in 8 hours we make 2880 calls (8 × 3600 s ÷ 10 s). + +For 98.77% availability, we must have had about 35 calls fail over the time period (2880 × (1 − 0.9877) ≈ 35) + +35 failing calls over the time period could happen in a few ways: +- We could have just dropped 35 calls spaced evenly over the period of 8 hours, which probably can't be noticed +- We could have had an outage of 0% availability for almost 6 minutes in sequence, where the thing is properly broken for that period. This would mean 35 calls failed, so uptime over 8 hours is about 98.8% + +This shows why we want to understand availability over different time windows diff --git a/docs/observability/setup/_index.md b/docs/observability/setup/_index.md new file mode 100644 index 0000000..3c77f3d --- /dev/null +++ b/docs/observability/setup/_index.md @@ -0,0 +1,12 @@ +# Setup + +```{toctree} +:maxdepth: 2 + +production-setup.md +probing.md +telemetry.md +alerting.md + + +``` diff --git a/docs/observability/setup/alerting.md b/docs/observability/setup/alerting.md new file mode 100644 index 0000000..c697557 --- /dev/null +++ b/docs/observability/setup/alerting.md @@ -0,0 +1,104 @@ +# Alerting + +This guide explains how to enable and customize alerting in the CogStack observability stack using Grafana and Prometheus. + +By default, alerts are **paused**. The system is preconfigured to send alerts to a **Slack Webhook**, but this can be customized. + +There are two categories of alerting: + +* **Basic availability alerts**: Triggered when uptime falls below a threshold over short windows (5m or 6h). +* **Burn rate alerts**: Using multi-window multi-rate alerts following best practices in [Google SRE principles](https://sre.google/workbook/alerting-on-slos/), used to track compliance with SLOs. + +--- + +## How to Enable Alerting + +### 1. 
Define Your SLO + +To configure burn rate alerting, create a Prometheus recording rule to define your target SLO: + +``` +groups: + - name: slo-target-rules + rules: + - record: slo_target_over_30_days + expr: 0.95 + labels: + job: "probe-services" +``` + +* `expr`: Target SLO (e.g., `0.95` for 95% over 30 days) +* `job`: Must match the probe job name defined in your configuration. This allows you to have different SLOs for different endpoints. + +Place this file at: + +``` +prometheus/recording-rules/slo.yml +``` + +This should be mounted in the docker container under `/etc/prometheus/cogstack/site/prometheus/recording-rules/slo.yml`, which should be already setup if you followed the setup instructions. + +--- + +### 2. Configure Alerting Environment + +Set these environment variables to control alerting behavior: + +``` +ALERTING_PAUSE_AVAILABILITY_5M=true +ALERTING_PAUSE_AVAILABILITY_6H=true +ALERTING_PAUSE_BURN_RATE=true +SLACK_WEBHOOK_URL=https://hooks.slack.com/services/your-webhook +``` + +* Set any of the `ALERTING_PAUSE_*` variables to `false` to enable that alert type. +* `SLACK_WEBHOOK_URL` should be set to a webhook, which will send any alerts to slack. + +--- + +## Advanced Customization +### Customize Alert Contact points + +You can customize where alerts are sent by defining a new contact point in Grafana: + +``` +notifiers: + - name: "custom-contact" + type: "slack" + settings: + url: "https://hooks.slack.com/services/..." +``` + +Mount this file into: + +``` +/etc/grafana/provisioning/alerting/custom-contact.yml +``` + +Then update the environment variable: + +``` +ALERTING_DEFAULT_CONTACT=custom-contact +``` + +**Note:** mount only the exact file, and do not override the whole provisioning folder in the image, as this is already used to contain the defaults. + +--- + +### Add Custom Alerts +To define additional alert rules, create files in: + +``` +/etc/grafana/provisioning/alerting/ +``` + +Grafana will automatically load these at startup. 
+ +--- + +## Further Reading + +* [Grafana Alerting Provisioning](https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/) +* [Google SRE – Burn Rate Alerting](https://sre.google/workbook/alerting-on-slos/#4-alert-on-burn-rate) + +Let me know if you'd like to split this into multiple focused guides, e.g., one for basic uptime, one for SLO-based alerts. diff --git a/docs/observability/setup/probing.md b/docs/observability/setup/probing.md new file mode 100644 index 0000000..bb60714 --- /dev/null +++ b/docs/observability/setup/probing.md @@ -0,0 +1,130 @@ +# Availability + +This guide explains how to configure HTTP probers using Blackbox Exporter to monitor the availability of your services. These probers generate uptime and latency metrics, which can then be visualized in Grafana. + +See the [Reference](../reference/understanding-metrics.md) for an explanation of the metrics this generates. + +--- + +## How to Add New Probers + +To add a new prober target: + +1. Navigate to the folder: + + ``` + prometheus/scrape-configs/probers/ + ``` + +2. Create a new YAML file (e.g., `probe.my-services.yml`) with the following structure: + + ``` + # probe.my-services.yml + - targets: + - https://myservice.example.com/health + labels: + name: my-service # Mandatory - the name of the service being probed + job: my-services # Mandatory - used to group probes in dashboards + ip_address: "10.0.0.12" # Optional - IP of the host being probed + host: service-hostname # Optional - Human-readable hostname + region: eu-west # Optional - Any additional metadata label + ``` + +3. Ensure the folder is mounted in docker under `/etc/prometheus/cogstack/site/prometheus/scrape-configs/probers`, which it should be by default if you've followed the setup guids. Any valid `.yml` files in this folder will be automatically picked up and used as Blackbox targets. 
+ +--- + +## Advanced Setup + +### How to add Auth to the prober or further configurations + +To define how a probe behaves (e.g., add basic auth, headers, timeout, method), we will configure a module in the Blackbox Exporter config. + +#### Create a Blackbox Exporter Config file +You will need to create a new file, and then mount it over the existing provided vconfig + + +1. Create a new file: + + ``` + prometheus/blackbox-exporter/custom-blackbox-config.yml + ``` + +2. Add the existing defaults + +``` +modules: + http_get_200: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200] # Defaults to 2xx + method: GET + preferred_ip_protocol: "ip4" # defaults to "ip6" + tls_config: + insecure_skip_verify: true +``` + +3. Add your own module to the modules in that file +``` + http_2xx_custom: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200] # Defaults to 2xx + method: GET + preferred_ip_protocol: "ip4" # defaults to "ip6" + tls_config: + insecure_skip_verify: true + basic_auth: + username: my-user + password: example-pass +``` + +This example adds a module named `http_2xx_custom` that adds some basic auth credentials + +--- + +#### Reference the new module in your prober config + +In your probe YAML file, reference the module in the `module` field of the `labels` section: + +``` + - targets: + - https://myservice.example.com/health + labels: + name: my-service + module: http_2xx_custom # Optional - overrides the default Blackbox module +``` + +#### Mount the config file +You lastly need to mount the new config file and refer to it in docker compose + +``` + blackbox-exporter: + image: cogstacksystems/cogstack-observability-blackbox-exporter:latest + restart: unless-stopped + networks: + - observability + volumes: + - ./prometheus/blackbox-exporter:/config + command: + - "--config.file=/config/custom-blackbox-config.yml" +``` + +--- + +## Notes + +* 
Changes will take effect on the next Prometheus reload or container restart. +* Jobs with the same `job` label are grouped in dashboards to simplify analysis. +* Job labels need to line up with defined SLOs to enable Alerting +* Probers can be used for both external URLs, and direct to local docker containers. For example, we probe grafana on "cogstack-observability-grafana-1:3000/". If you want to probe local docker containers, note that the prober must share a docker network with the target containers + + +## External links +For full Blackbox Exporter documentation, see: + +- [Prometheus Blackbox Exporter](https://github.com/prometheus/blackbox_exporter) diff --git a/docs/observability/setup/production-setup.md b/docs/observability/setup/production-setup.md new file mode 100644 index 0000000..55f2ad6 --- /dev/null +++ b/docs/observability/setup/production-setup.md @@ -0,0 +1,103 @@ +# Production Setup Tutorial +//In Progress +This tutorial guides you through setting up the **CogStack Observability Stack** for production use. + +If you're new, we recommend completing the [Quickstart Tutorial](../get-started/quickstart.md) first to get a simplified setup running. + +By the end of the tutorial, you will have a complete stack offering all the observability features, customized to your usage. + +--- + +## Step 1: Understand the Folder Structure + +Your project configuration should follow this structure: + +``` +observability.docker-compose.yml +exporters.docker-compose.yml +prometheus/ + scrape-configs/ + exporters/ # Targets that expose metrics (e.g. 
for SLOs, summaries) + blackbox-exporter/ # (Optional) Custom Probe configuration +grafana/ # (Optional) Custom Grafana dashboards and config +``` + +## Step 2: Initialise the project + +Run: +```bash +curl https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/refs/heads/main/observability/examples/full/full-quickstart.sh | bash +``` + +This script will set up the whole folder structure, and download all the relevant files. + +### Script Details +The script automates making folders, and downloading these files: + +Downloads the example docker compose files: +- [docker-compose.yml](../../../observability/examples/full/docker-compose.yml) +- [exporters.docker-compose.yml](../../../observability/examples/full/exporters.docker-compose.yml) +- [exporters.elastic.docker-compose.yml](../../../observability/examples/full/exporters.elastic.docker-compose.yml) + +Downloads the prometheus configurations: +- [prometheus/scrape-configs/exporters/exporters.yml](../../../observability/examples/full/prometheus/scrape-configs/exporters/exporters.yml) +- [prometheus/scrape-configs/probers/probe-external.yml](../../../observability/examples/full/prometheus/scrape-configs/probers/probe-external.yml) +- [prometheus/scrape-configs/probers/probe-internal.yml](../../../observability/examples/full/prometheus/scrape-configs/probers/probe-internal.yml) +- [prometheus/scrape-configs/recording-rules/slo.yml](../../../observability/examples/full/prometheus/scrape-configs/recording-rules/slo.yml) + + +Inspect the results in your local directory, and see that it matches the folder layout defined in step 1. 
+ +## Step 3: Run the Stack +The files come with basic defaults, so we can now run the stack + + + ``` + docker compose up -d + docker compose -f exporters.docker-compose.yml up -d + ``` + +This will launch Prometheus, Grafana, and all required services with the default configuration. + + +## Step 4: Create Site-Specific Config Files + +You must provide your own scrape and recording rules to tell Prometheus what to monitor. + +* Exporters: Targets like Elasticsearch or Docker + → Add files in `scrape-configs/exporters/*.yml` + +* Probers: HTTP endpoints you want to monitor for availability + → Add files in `scrape-configs/probers/*.yml` + +* Recording Rules: Define uptime goals or custom aggregations + → Add files in `recording-rules/*.yml` + +Refer to the following How-To guides for creating each config: + +* [Configure Probers](./probing.md) +* [Add Exporters](./telemetry.md) +* [Enable Alerting](./alerting.md) +* [Customise Setup](../customization/_index.md) + +--- + + + +--- + +## What’s Next? + +Your observability stack is now monitoring your own services. + +Continue with: + +* [Grafana Dashboards](../get-started/userguide-tutorial.md) +* [Set up Alerts](./alerting.md) +* [Create custom views](../customization/_index.md) + +--- + +Example `.yml` files for each folder are available in the `observability/examples/full` directory of the repository. diff --git a/docs/observability/setup/telemetry.md b/docs/observability/setup/telemetry.md new file mode 100644 index 0000000..e282c22 --- /dev/null +++ b/docs/observability/setup/telemetry.md @@ -0,0 +1,35 @@ +# Telemetry + +We can get telemetry from our services and VMs displayed in our dashboards. This telemetry gives us things like memory usage, and running container versions. + +Using telemetry lets us get feedback from the stack, diagnose problems, and predict issues before they occur. + +## Prometheus Exporters +Prometheus gets metrics from "Exporters". These need to be run on each VM you want to get metrics from + +Run these exporters on each virtual machine. 
+ +- Node Exporter: This gives host metrics e.g. disk usage, memory +- Elastic Search Exporter: Get ES metrics like index size +- CAdvisor: This gives docker metrics, e.g. what containers are running + + Then add the host and IPs into a yaml file in `scrape-configs/exporters/` to tell prometheus where to scrape. + + +## How to run Exporters + +## Add Exporters to Prometheus +- `scrape-configs/exporters/` - + Add yaml files into this folder. These files should contain the targets for all Prometheus exporters, for example node_exporter or CAdvisor. Add every host and IP address that you want to collect /metrics from. + +```yaml +# Exporter example yml +- targets: + - 123.0.0.1:9100 # Enter your IP address and port of a target + labels: + job: node_exporter # Mandatory - Enter the type of metric being collected + host: my-host-name # (Optional) A readable hostname + custom_label: a_custom_label # (Optional) + # __metrics_path__: /path/metrics # Optionally override the metrics path, the default is just /metrics +# ... add all targets +``` \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..caa2a36 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +Sphinx==6.2.1 +sphinx-rtd-theme~=1.0 +myst-parser~=0.17 +sphinx-autoapi==3.0.0 +sphinx-autobuild +furo \ No newline at end of file diff --git a/observability/README.md b/observability/README.md index aff4413..d17c0ee 100644 --- a/observability/README.md +++ b/observability/README.md @@ -9,158 +9,7 @@ It provides the following features: - Blackbox Probing of services to find service level indicators of uptime and latency - A working inventory of what is running where -## Usage Guide - -The stack runs grafana with a set of dashboards. 
-Access Grafana on `http://your-ip/grafana` - -### Grafana Dashboards -- Availability -- Elasticsearch -- VM Metrics (Memory use, CPU etc) -- Docker Metrics (Running containers) - -### Alerting -The alerts are paused by default. - -Alerting is based on either pure availability on 5 minutes or 6 hours, as well as a burn rate implementation. - -See [Google SRE Guide](https://sre.google/workbook/alerting-on-slos/#4-alert-on-burn-rate) which explains burn rate alerting. The alerting setup here follows the recommendations in the SRE handbook for Multiwindow, Multiburn rate alerting. - -For burn rate alerting, ensure that a recording rule is setup to create a record for `slo_target_over_30_days`, with a job label that matches your probe job labels. See the prometheus readme in this project. - -## How to Run the stack - -See the /examples folder for a working example of running this - -To setup the stack for your deployment: -- Create prometheus configurations as listed below -- Copy this docker-compose.yml file -- Mount your site config files into `/etc/prometheus/cogstack/site` -- Run with docker compose - -To collect metrics from VMs to fill out the dashboards -- Run the Exporters on each VM as detailed below - - -## Grafana Configuration - -Grafana is setup with preconfigured dashboards, datasource, and alerting. These will work when prometheus is run in this stack, and is dependent on all the metrics following defined rules. 
- -It is advised that any edits or new configs get committed back into your git repository, and stick with grafana provisioning instead of allowing manual edits - -### Customise grafana - -#### Dashboards -- Mount new dashboard files in the `/etc/grafana/provisioning/dashboards/site` directory -- To remove or change the existing, then mount over the existing files there - -For more info see [Grafana Alerting Provisioning](https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards) - -#### Alerting -- Enable/Disable alerts using environment variables -- By default alerts will send to slack. Provide the env variable `SLACK_WEBHOOK_URL` to go there -- To change where the alerts are sent: create and mount custom a custom contact point in `/etc/grafana/provisioning/alerting/custom-contact.yml`. Then change the environment variable `ALERTING_DEFAULT_CONTACT` to use that name -- Add custom alerts by mounting alert files in `/etc/grafana/provisioning/alerting/`. - -For more info see [Grafana Provisioning](https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/) - - -## Prometheus Configurations - -Prometheus is preconfigured with rules and configs, though needs additional site specific configs to scrape your stack. For example, it needs to be told exactly what hostname and port elasticsearch is running on, in order to check availability. - -This is managed through using different prometheus configuration files, which include different scrape config files. - -### Prometheus Site Configurations - -Prometheus is used to get metrics from targets. It can target anything exporting metrics. 
- -To use this project, you need to define your own targets in yml files and mount in docker following this folder structure: -``` - site/ - /scrape-configs/exporters/*.yml - /scrape-configs/probers/*.yml - /scrape-configs/*.yml - /recording-rules/*yml -``` - -There are 3 ways to add targets to prometheus in this project: - -- `scrape-configs/probers/*.yml`. -Add yaml files to this folder as probe targets. Any yml files put into this directory, for example "probe.example.yml", will be used as targets to probe for availability using blackbox exporter. Add any URLs that you want to measure the availability of. - -```yaml -# Prober yml -- targets: - - https://google.com/something - labels: - name: google-homepage # Mandatory - the name of the service being probed - job: override_job # (Optional. Default is "probe-cogstack-availability") Customise a job to enable grouping in the dashboard - ip_address: "123.0.0.1" # (Optional) The IP address - host: a_hostname # (Optional) A readable hostname - custom_label: a_custom_label # (Optional) Any other label - -``` -Note that URLs need to be accessible from the host running prometheus - -- `scrape-configs/exporters/` - - Add yaml files into this folder. These file should contain all exporter prometheus metrics, for example from node_exporter or CAdvisor. Add any hosts and ip addresses you want to collect /metrics from will be retrieved - -```yaml -# Exporter example yml -- targets: - - 123.0.0.1:9100 # Enter your IP address and port of a target - labels: - job: node_exporter # Mandatory - Enter the type of metric being collected - host: my-host-name # (Optional) A readable hostname - custom_label: a_custom_label # (Optional) - # __metrics_path__: /path/metrics # Optionally override the metrics path, the default is just /metrics -# ... add all targets -``` - -- `scrape-configs/*.yml`. This is for advanced configuration. Any yml file put in this directory will be used as standard promethues scrape configs. 
This will give full flexibility over what metrics are collected and all features in prometheus. Add any further configs that you want prometheus to use. - -```yaml -# Custom scrape config definition -scrape_configs: - - job_name: custom-scrape-config # Scrape configuration to get metrics from elasticsearch, eg index size. - static_configs: - - targets: - - my-custom-target:9114 - labels: - custom_label: custom # (Optional) -``` - -- `/recording-rules/*.yml` - -Add recording rules in here. - -To enable the burn rate alerting feature, you must include a recording rule file with the following contents. - -```yaml -groups: - - name: slo-target-rules - rules: - - record: slo_target_over_30_days # (Dont change) - expr: 0.95 # Mandatory - Specify the SLO you want to target, for example 0.95 for 95% uptime over 30 days - labels: - job: "probe-cogstack-availability" #Mandatory - name the job, which must match the job in the probe targets defined -``` - -## Exporters - -Prometheus gets metrics from "Exporters". These need to be run on each VM you want to get metrics from - -Run these exporters on each virtual machine. - -- Node Exporter: This gives host metrics eg disk usage, memory -- Elastic Search Exporter: Get ES metrics like index size -- CAdvisor: This gives docker metrics, eg what containers are running - - Then add the host and IPs into a yaml file in `scrape-configs/exporters/` to tell prometheus where to scrape. - - +See the docs in the docs root folder for the full documentation ## Local Development @@ -171,3 +20,5 @@ docker compose up -d ``` This will start the observability stack, set to probe and monitor itself. 
+ +Access Grafana on `http://localhost/grafana` \ No newline at end of file diff --git a/observability/docs/get-started/setup-tutorial.md b/observability/docs/get-started/setup-tutorial.md deleted file mode 100644 index 8748972..0000000 --- a/observability/docs/get-started/setup-tutorial.md +++ /dev/null @@ -1,17 +0,0 @@ - -This page - - - -## Run the Observability Stack using Docker Compose - -See the /examples folder for a working example of running this - -To setup the stack for your deployment: -- Create prometheus configurations as listed below -- Copy this docker-compose.yml file -- Mount your site config files into `/etc/prometheus/cogstack/site` -- Run with docker compose - -To collect metrics from VMs to fill out the dashboards -- Run the Exporters on each VM as detailed below \ No newline at end of file diff --git a/observability/docs/get-started/userguide-tutorial.md b/observability/docs/get-started/userguide-tutorial.md deleted file mode 100644 index e69de29..0000000 diff --git a/observability/examples/full/docker-compose.yml b/observability/examples/full/docker-compose.yml new file mode 100755 index 0000000..8fb9362 --- /dev/null +++ b/observability/examples/full/docker-compose.yml @@ -0,0 +1,38 @@ +# Observability main stack. Prometheus and Grafana. 
+# Depends on docker-compose.exporters.yml for the network +name: "cogstack-observability" +services: + prometheus: + image: cogstacksystems/cogstack-observability-prometheus:latest + restart: unless-stopped + volumes: + - ./prometheus:/etc/prometheus/cogstack/site/ + - prometheus-data:/prometheus + networks: + - observability + grafana: + image: cogstacksystems/cogstack-observability-grafana:latest + restart: unless-stopped + volumes: + - grafana-data:/var/lib/grafana + networks: + - observability + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true # Allows use of grafana without sign in + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + traefik: + image: cogstacksystems/cogstack-observability-traefik:latest + networks: + - observability + restart: unless-stopped + ports: + - "80:80" + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro # So that Traefik can listen to the Docker events +networks: + observability: + driver: bridge + +volumes: + prometheus-data: + grafana-data: \ No newline at end of file diff --git a/observability/examples/full/exporters.docker-compose.yml b/observability/examples/full/exporters.docker-compose.yml new file mode 100755 index 0000000..0cae3bf --- /dev/null +++ b/observability/examples/full/exporters.docker-compose.yml @@ -0,0 +1,40 @@ +# Exporter deployment on each VM for exporting metrics. 
+name: "cogstack-observability-exporters" +services: + node-exporter: + image: prom/node-exporter + restart: unless-stopped + networks: + - observability-exporters + ports: + - 9100:9100 + labels: + - "traefik.enable=true" + - "traefik.http.routers.node-exporter.rule=PathPrefix(`/node-exporter`)" + - "traefik.http.middlewares.node-exporter-stripprefix.stripprefix.prefixes=/node-exporter" + - "traefik.http.routers.node-exporter.middlewares=node-exporter-stripprefix@docker" + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 9116:8080 + networks: + - observability-exporters + command: + - '-housekeeping_interval=10s' + - '-docker_only=true' + security_opt: + - no-new-privileges:true + labels: + - "traefik.enable=true" + - "traefik.http.routers.cadvisor.rule=PathPrefix(`/cadvisor`)" + - "traefik.http.middlewares.cadvisor-stripprefix.stripprefix.prefixes=/cadvisor" + - "traefik.http.routers.cadvisor.middlewares=cadvisor-stripprefix@docker" + +networks: + observability-exporters: + driver: bridge \ No newline at end of file diff --git a/observability/examples/full/exporters.elastic.docker-compose.yml b/observability/examples/full/exporters.elastic.docker-compose.yml new file mode 100644 index 0000000..3a02bd5 --- /dev/null +++ b/observability/examples/full/exporters.elastic.docker-compose.yml @@ -0,0 +1,21 @@ +# Exporter deployment on each VM for exporting metrics. 
+name: "cogstack-observability-exporters" +services: + elasticsearch_exporter: + image: quay.io/prometheuscommunity/elasticsearch-exporter:latest + restart: unless-stopped + command: + - '--es.uri=${ES_URL}' + - '--es.ssl-skip-verify' + ports: + - "9114:9114" + environment: + - ES_USERNAME=${ES_USERNAME} + - ES_PASSWORD=${ES_PASSWORD} + networks: + - observability-exporters + +networks: + observability-exporters: + name: cogstack-observability-exporters_observability-exporters + external: true \ No newline at end of file diff --git a/observability/examples/full/full-quickstart.sh b/observability/examples/full/full-quickstart.sh new file mode 100644 index 0000000..3f77c9a --- /dev/null +++ b/observability/examples/full/full-quickstart.sh @@ -0,0 +1,28 @@ + +#!/bin/bash +set -e +download_to() { + local path="$1" + local url="https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/main/observability/examples/full/${path}" + + echo "Downloading ${path} from ${url}" + mkdir -p "$(dirname "$path")" + curl -fsSL -o "$path" "$url" +} + +mkdir -p cogstack-observability/prometheus/scrape-configs/probers +mkdir -p cogstack-observability/prometheus/scrape-configs/exporters +mkdir -p cogstack-observability/prometheus/scrape-configs/recording-rules +cd cogstack-observability + +download_to docker-compose.yml +download_to exporters.docker-compose.yml +download_to exporters.elastic.docker-compose.yml + +download_to prometheus/scrape-configs/probers/probe-internal.yml +download_to prometheus/scrape-configs/probers/probe-external.yml +download_to prometheus/scrape-configs/exporters/exporters.yml +download_to prometheus/scrape-configs/recording-rules/slo.yml + + +echo "Setup complete in cogstack-observability/" diff --git a/observability/examples/full/prometheus/scrape-configs/exporters/exporters.yml b/observability/examples/full/prometheus/scrape-configs/exporters/exporters.yml new file mode 100644 index 0000000..8036d08 --- /dev/null +++ 
b/observability/examples/full/prometheus/scrape-configs/exporters/exporters.yml @@ -0,0 +1,5 @@ +- targets: + - cogstack-observability-node-exporter-1:9100 + labels: + job: node_exporter + host: localhost \ No newline at end of file diff --git a/observability/examples/full/prometheus/scrape-configs/probers/probe-external.yml b/observability/examples/full/prometheus/scrape-configs/probers/probe-external.yml new file mode 100644 index 0000000..b3d7353 --- /dev/null +++ b/observability/examples/full/prometheus/scrape-configs/probers/probe-external.yml @@ -0,0 +1,6 @@ +# Example of probe targets +- targets: + - https://cogstack.org + labels: + name: cogstack-homepage + job: probe-services \ No newline at end of file diff --git a/observability/examples/full/prometheus/scrape-configs/probers/probe-internal.yml b/observability/examples/full/prometheus/scrape-configs/probers/probe-internal.yml new file mode 100644 index 0000000..b3d7353 --- /dev/null +++ b/observability/examples/full/prometheus/scrape-configs/probers/probe-internal.yml @@ -0,0 +1,6 @@ +# Example of probe targets +- targets: + - https://cogstack.org + labels: + name: cogstack-homepage + job: probe-services \ No newline at end of file diff --git a/observability/examples/full/prometheus/scrape-configs/recording-rules/slo.yml b/observability/examples/full/prometheus/scrape-configs/recording-rules/slo.yml new file mode 100644 index 0000000..440913c --- /dev/null +++ b/observability/examples/full/prometheus/scrape-configs/recording-rules/slo.yml @@ -0,0 +1,8 @@ +groups: + - name: slo-target-rules + rules: + # What SLO am I targeting + - record: slo_target_over_30_days + expr: 0.95 # We target 95% uptime over 30 days + labels: + job: "probe-external-demo-apps" #Job here must match the job in the probe targets \ No newline at end of file diff --git a/observability/examples/simple/docker-compose.yml b/observability/examples/simple/docker-compose.yml index 659a067..7f2f8d1 100755 --- 
a/observability/examples/simple/docker-compose.yml +++ b/observability/examples/simple/docker-compose.yml @@ -39,11 +39,6 @@ services: restart: unless-stopped networks: - observability - labels: - - "traefik.enable=true" - - "traefik.http.routers.node-exporter.rule=PathPrefix(`/node-exporter`)" - - "traefik.http.middlewares.node-exporter-stripprefix.stripprefix.prefixes=/node-exporter" - - "traefik.http.routers.node-exporter.middlewares=node-exporter-stripprefix@docker" networks: observability: driver: bridge diff --git a/observability/examples/simple/quickstart.sh b/observability/examples/simple/quickstart.sh index 45b67b7..a252b82 100644 --- a/observability/examples/simple/quickstart.sh +++ b/observability/examples/simple/quickstart.sh @@ -15,8 +15,7 @@ curl -fsSL -o prometheus/scrape-configs/probers/probe-simple.yml \ echo "Downloading exporters-simple.yml into prometheus/scrape-configs/exporters/..." curl -fsSL -o prometheus/scrape-configs/exporters/exporters-simple.yml \ - https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/main/observability/examples/simple/prometheus/scrape-configs/exporters/exporters-simple.yaml - + https://raw.githubusercontent.com/CogStack/cogstack-platform-toolkit/refs/heads/main/observability/examples/simple/prometheus/scrape-configs/exporters/exporters-simple.yml echo "Setup complete in observability-simple/"