Skip to content

Commit

Permalink
Example of exporting cuebot metrics (#844)
Browse files Browse the repository at this point in the history
Export cuebot metrics in a Prometheus format.

Closes #843
  • Loading branch information
larsbijl committed Jan 16, 2021
1 parent 6da325b commit 3133eb1
Show file tree
Hide file tree
Showing 4 changed files with 248 additions and 0 deletions.
52 changes: 52 additions & 0 deletions connectors/prometheus_metrics/Dockerfile
@@ -0,0 +1,52 @@
# Image for the OpenCue Prometheus metrics exporter (the `metrics` script).
FROM centos:7
# A non-empty PYTHONUNBUFFERED makes Python write stdout/stderr unbuffered.
ENV PYTHONUNBUFFERED 1

WORKDIR /opt/opencue

# Base packages: EPEL repository, a C compiler and the Python headers
# (presumably needed to build native extensions of the pip dependencies).
RUN yum -y install \
epel-release \
gcc \
python-devel \
time

# pip for the default `python` interpreter plus a Python 3.6 toolchain.
RUN yum -y install \
python-pip \
python36 \
python36-devel \
python36-pip

# Bring pip and setuptools up to date for both interpreters.
# NOTE(review): `python` is presumably CentOS 7's Python 2; confirm that an
# unpinned pip/setuptools upgrade still supports that interpreter.
RUN python -m pip install --upgrade pip
RUN python3.6 -m pip install --upgrade pip

RUN python -m pip install --upgrade setuptools
RUN python3.6 -m pip install --upgrade setuptools

# Copy only the license and dependency manifests, then install from them.
COPY LICENSE ./
COPY requirements.txt ./
COPY connectors/prometheus_metrics/requirements_metrics.txt ./

RUN python -m pip install -r requirements.txt -r requirements_metrics.txt
RUN python3.6 -m pip install -r requirements.txt -r requirements_metrics.txt

# Exporter script, protobuf definitions and the pycue client sources.
COPY connectors/prometheus_metrics/metrics ./metrics
COPY proto/ ./proto
COPY pycue/README.md ./pycue/
COPY pycue/setup.py ./pycue/
COPY pycue/FileSequence ./pycue/FileSequence
COPY pycue/opencue ./pycue/opencue

# Generate the Python protobuf and gRPC modules from ./proto/*.proto into
# pycue/opencue/compiled_proto.
RUN python -m grpc_tools.protoc \
-I=./proto \
--python_out=./pycue/opencue/compiled_proto \
--grpc_python_out=./pycue/opencue/compiled_proto \
./proto/*.proto

# Fix imports to work in both Python 2 and 3. See
# <https://github.com/protocolbuffers/protobuf/issues/1491> for more info.
RUN 2to3 -wn -f import pycue/opencue/compiled_proto/*_pb2*.py

# Install pycue for both interpreters.
RUN cd pycue && python setup.py install

RUN cd pycue && python3.6 setup.py install

# Run the exporter script copied to /opt/opencue/metrics with Python 3.
ENTRYPOINT ["python3", "/opt/opencue/metrics"]
182 changes: 182 additions & 0 deletions connectors/prometheus_metrics/metrics
@@ -0,0 +1,182 @@
#!/usr/bin/env python
import time

import opencue

from prometheus_client import start_http_server
from prometheus_client import Gauge


# Host counts, split by a ``status`` label (hardware state / lock state).
CLUE_HOSTS_HARDWARE = Gauge(
    'cue_hosts_hardware_total',
    'hosts hardware status',
    labelnames=['status'],
)
CLUE_HOSTS_LOCK = Gauge(
    'cue_hosts_lock_total',
    'hosts lock status',
    labelnames=['status'],
)

# Core ("proc") totals accumulated over the hosts returned by Cuebot.
CLUE_PROCS = Gauge('cue_procs_total', 'number of Procs')
CLUE_PROCS_USABLE = Gauge('cue_procs_usable_total', 'number of usable Procs')
CLUE_PROCS_USED = Gauge(
    'cue_procs_used_total',
    'number of Procs currently in use',
)

# Per-show job figures: frame counts by status and remaining core seconds.
CLUE_FRAMES = Gauge(
    'cue_frames',
    'number of frames ',
    labelnames=['status', 'show'],
)
CLUE_REMAIN = Gauge(
    'cue_remain',
    'remaining core seconds (estimated) ',
    labelnames=['show'],
)

# Cuebot system statistics. Each gauge below is fed from the field of the
# same (lower-case) name on the object returned by
# opencue.api.getSystemStats().

# "manage" figures.
MANAGE_WAITING = Gauge('cue_manage_waiting_total', '')
MANAGE_REMAINING_CAPACITY = Gauge('cue_manage_remaining_capacity_total', '')
MANAGE_THREADS = Gauge('cue_manage_threads_total', '')
MANAGE_EXECUTED = Gauge('cue_manage_executed_total', '')
MANAGE_REJECTED = Gauge('cue_manage_rejected_total', '')

# "dispatch" figures.
DISPATCH_WAITING = Gauge('cue_dispatch_waiting_total', '')
DISPATCH_REMAINING_CAPACITY = Gauge('cue_dispatch_remaining_capacity_total', '')
DISPATCH_THREADS = Gauge('cue_dispatch_threads_total', '')
DISPATCH_EXECUTED = Gauge('cue_dispatch_executed_total', '')
DISPATCH_REJECTED = Gauge('cue_dispatch_rejected_total', '')

# "report" figures.
REPORT_WAITING = Gauge('cue_report_waiting_total', '')
REPORT_REMAINING_CAPACITY = Gauge('cue_report_remaining_capacity_total', '')
REPORT_THREADS = Gauge('cue_report_threads_total', '')
REPORT_EXECUTED = Gauge('cue_report_executed_total', '')
REPORT_REJECTED = Gauge('cue_report_rejected_total', '')

# "booking" figures.
BOOKING_WAITING = Gauge('cue_booking_waiting_total', '')
BOOKING_REMAINING_CAPACITY = Gauge('cue_booking_remaining_capacity_total', '')
BOOKING_THREADS = Gauge('cue_booking_threads_total', '')
BOOKING_SLEEP_MILLIS = Gauge('cue_booking_sleep_millis_total', '')
BOOKING_EXECUTED = Gauge('cue_booking_executed_total', '')
BOOKING_REJECTED = Gauge('cue_booking_rejected_total', '')

# Remaining individual system-stat fields.
HOST_BALANCE_SUCCESS = Gauge('cue_host_balance_success_total', '')
HOST_BALANCE_FAILED = Gauge('cue_host_balance_failed_total', '')
KILLED_OFFENDER_PROCS = Gauge('cue_killed_offender_procs_total', '')
KILLED_OOM_PROCS = Gauge('cue_killed_oom_procs_total', '')
CLEARED_PROCS = Gauge('cue_cleared_procs_total', '')
BOOKING_ERRORS = Gauge('cue_booking_errors_total', '')
BOOKING_RETRIES = Gauge('cue_booking_retries_total', '')
BOOKED_PROCS = Gauge('cue_booked_procs_total', '')
REQ_FOR_DATA = Gauge('cue_req_for_data_total', '')
REQ_FOR_FUNCTION = Gauge('cue_req_for_function_total', '')
REQ_ERRORS = Gauge('cue_req_errors_total', '')
UNBOOKED_PROCS = Gauge('cue_unbooked_procs_total', '')
PICKED_UP_CORES = Gauge('cue_picked_up_cores_total', '')
STRANDED_CORES = Gauge('cue_stranded_cores_total', '')


# Frame-state buckets exported per show through the CLUE_FRAMES gauge.
_FRAME_STATUSES = ('pending', 'dead', 'eaten', 'succeeded', 'running')

# Numeric host state codes mapped to the ``status`` label they are counted
# under.
# NOTE(review): the codes are carried over unchanged from the previous
# hard-coded comparisons; verify them against the LockState / HardwareState
# enums of the Cuebot version in use.
_LOCK_STATE_LABELS = {0: 'open', 1: 'locked', 2: 'nimby_locked'}
_HARDWARE_STATE_LABELS = {
    2: 'rebooting',
    3: 'reboot_when_idle',
    4: 'shutdown_when_idle',
    5: 'repair',
}


def _export_job_metrics(known_shows):
    """Publish per-show frame counts and remaining core seconds.

    Args:
        known_shows: set of show names exported in earlier cycles. It is
            updated in place. Shows in it that no longer own any job returned
            by Cuebot are exported with zeros so their series do not keep a
            stale non-zero value.
    """
    frames_by_show = {}
    remaining_by_show = {}

    for job in opencue.api.getJobs():
        show = job.show()
        if show not in frames_by_show:
            frames_by_show[show] = dict.fromkeys(_FRAME_STATUSES, 0)
            remaining_by_show[show] = 0

        totals = frames_by_show[show]
        totals['pending'] += job.pendingFrames()
        totals['dead'] += job.deadFrames()
        totals['eaten'] += job.eatenFrames()
        totals['succeeded'] += job.succeededFrames()
        totals['running'] += job.runningFrames()

        remaining_by_show[show] += job.coreSecondsRemaining()

    # Shows seen before but absent now would otherwise keep reporting the
    # last values that were set for them.
    for show in known_shows.difference(frames_by_show):
        frames_by_show[show] = dict.fromkeys(_FRAME_STATUSES, 0)
        remaining_by_show[show] = 0
    known_shows.update(frames_by_show)

    for show, totals in frames_by_show.items():
        for status, count in totals.items():
            CLUE_FRAMES.labels(status=status, show=show).set(count)

    for show, seconds in remaining_by_show.items():
        CLUE_REMAIN.labels(show=show).set(seconds)


def _export_host_metrics():
    """Publish host counts by lock/hardware state and the core totals."""
    lock_counts = dict.fromkeys(_LOCK_STATE_LABELS.values(), 0)
    hardware_counts = dict.fromkeys(_HARDWARE_STATE_LABELS.values(), 0)
    hardware_counts['up'] = 0
    hardware_counts['down'] = 0
    total_procs = used_procs = usable_procs = 0

    for host in opencue.api.getHosts():
        # Hosts reporting a code outside the tables are not counted in any
        # state bucket, as before.
        lock_label = _LOCK_STATE_LABELS.get(host.lockState())
        if lock_label is not None:
            lock_counts[lock_label] += 1

        state_label = _HARDWARE_STATE_LABELS.get(host.state())
        if state_label is not None:
            hardware_counts[state_label] += 1

        cores = host.cores()
        if host.isUp():
            hardware_counts['up'] += 1
            if not host.isLocked():
                usable_procs += cores
            # NOTE(review): used cores are counted for every host that is up,
            # locked or not; confirm this nesting against the original file.
            used_procs += cores - host.coresIdle()
        else:
            hardware_counts['down'] += 1

        total_procs += cores

    # Every label is set on each cycle, including those whose count is zero.
    for label, count in lock_counts.items():
        CLUE_HOSTS_LOCK.labels(status=label).set(count)
    for label, count in hardware_counts.items():
        CLUE_HOSTS_HARDWARE.labels(status=label).set(count)

    CLUE_PROCS_USABLE.set(usable_procs)
    CLUE_PROCS_USED.set(used_procs)
    CLUE_PROCS.set(total_procs)


def _export_system_stats():
    """Copy each field of Cuebot's system stats into its matching gauge."""
    system_stats = opencue.api.getSystemStats()

    gauge_fields = (
        (MANAGE_WAITING, 'manage_waiting'),
        (MANAGE_REMAINING_CAPACITY, 'manage_remaining_capacity'),
        (MANAGE_THREADS, 'manage_threads'),
        (MANAGE_EXECUTED, 'manage_executed'),
        (MANAGE_REJECTED, 'manage_rejected'),
        (DISPATCH_WAITING, 'dispatch_waiting'),
        (DISPATCH_REMAINING_CAPACITY, 'dispatch_remaining_capacity'),
        (DISPATCH_THREADS, 'dispatch_threads'),
        (DISPATCH_EXECUTED, 'dispatch_executed'),
        (DISPATCH_REJECTED, 'dispatch_rejected'),
        (REPORT_WAITING, 'report_waiting'),
        (REPORT_REMAINING_CAPACITY, 'report_remaining_capacity'),
        (REPORT_THREADS, 'report_threads'),
        (REPORT_EXECUTED, 'report_executed'),
        (REPORT_REJECTED, 'report_rejected'),
        (BOOKING_WAITING, 'booking_waiting'),
        (BOOKING_REMAINING_CAPACITY, 'booking_remaining_capacity'),
        (BOOKING_THREADS, 'booking_threads'),
        (BOOKING_SLEEP_MILLIS, 'booking_sleep_millis'),
        (BOOKING_EXECUTED, 'booking_executed'),
        (BOOKING_REJECTED, 'booking_rejected'),
        (HOST_BALANCE_SUCCESS, 'host_balance_success'),
        (HOST_BALANCE_FAILED, 'host_balance_failed'),
        (KILLED_OFFENDER_PROCS, 'killed_offender_procs'),
        (KILLED_OOM_PROCS, 'killed_oom_procs'),
        (CLEARED_PROCS, 'cleared_procs'),
        (BOOKING_ERRORS, 'booking_errors'),
        (BOOKING_RETRIES, 'booking_retries'),
        (BOOKED_PROCS, 'booked_procs'),
        (REQ_FOR_DATA, 'req_for_data'),
        (REQ_FOR_FUNCTION, 'req_for_function'),
        (REQ_ERRORS, 'req_errors'),
        (UNBOOKED_PROCS, 'unbooked_procs'),
        (PICKED_UP_CORES, 'picked_up_cores'),
        (STRANDED_CORES, 'stranded_cores'),
    )
    for gauge, field in gauge_fields:
        gauge.set(getattr(system_stats, field))


def main(interval=30):
    """Poll Cuebot forever and mirror its state into the Prometheus gauges.

    Each cycle exports the job, host and system statistics, then sleeps.
    This function never returns.

    Args:
        interval: seconds to sleep between two collection cycles.
    """
    import traceback

    known_shows = set()
    while True:
        try:
            _export_job_metrics(known_shows)
            _export_host_metrics()
            _export_system_stats()
        except Exception:  # pylint: disable=broad-except
            # Top-level boundary: a failed cycle (e.g. Cuebot not reachable
            # yet) is reported and retried on the next cycle instead of
            # terminating the exporter.
            traceback.print_exc()

        time.sleep(interval)


if __name__ == '__main__':
    # Start the Prometheus HTTP endpoint on port 8302, then enter the
    # collection loop; main() loops forever, so it must be called last.
    start_http_server(8302)
    main()
1 change: 1 addition & 0 deletions connectors/prometheus_metrics/requirements_metrics.txt
@@ -0,0 +1 @@
prometheus-client==0.9.0
13 changes: 13 additions & 0 deletions sandbox/docker-compose.yml
Expand Up @@ -56,3 +56,16 @@ services:
volumes:
- /tmp/rqd/logs:/tmp/rqd/logs
- /tmp/rqd/shots:/tmp/rqd/shots

# Prometheus metrics exporter for Cuebot, built from the connector Dockerfile.
metrics:
build:
context: ./
dockerfile: ./connectors/prometheus_metrics/Dockerfile
environment:
# Points the exporter at the "cuebot" service (presumably read by the OpenCue
# Python client; confirm).
- CUEBOT_HOSTS=cuebot
depends_on:
- cuebot
links:
- cuebot
ports:
# Publishes port 8302, the port the metrics script passes to
# start_http_server().
- "8302:8302"

0 comments on commit 3133eb1

Please sign in to comment.