From f2c37a55a4456c4af8b3b3b53cdd4f703a910b29 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Fri, 7 Jun 2019 11:30:11 -0700
Subject: [PATCH] ci: Collect CPU usage statistics on Azure

This commit adds a script which we'll execute on Azure Pipelines which
is intended to run in the background and passively collect CPU usage
statistics for our builders. The intention here is that we can use this
information over time to diagnose issues with builders, see where we can
optimize our build, fix parallelism issues, etc. This might not end up
being too useful in the long run but it's data we've wanted to collect
for quite some time now, so here's a stab at it!

Comments about how this is intended to work can be found in the python
script used here to collect CPU usage statistics.

Closes #48828
---
 .azure-pipelines/steps/run.yml |  16 +++
 src/ci/cpu-usage-over-time.py  | 175 +++++++++++++++++++++++++++++++++
 2 files changed, 191 insertions(+)
 create mode 100644 src/ci/cpu-usage-over-time.py

diff --git a/.azure-pipelines/steps/run.yml b/.azure-pipelines/steps/run.yml
index cfa28a6a1d70f..38aa022b624db 100644
--- a/.azure-pipelines/steps/run.yml
+++ b/.azure-pipelines/steps/run.yml
@@ -11,6 +11,12 @@ steps:
 - checkout: self
   fetchDepth: 2
 
+# Spawn a background process to collect CPU usage statistics which we'll upload
+# at the end of the build. See the comments in the script here for more
+# information.
+- bash: python src/ci/cpu-usage-over-time.py &> cpu-usage.csv &
+  displayName: "Collect CPU-usage statistics in the background"
+
 - bash: printenv | sort
   displayName: Show environment variables
 
@@ -136,3 +142,13 @@ steps:
     AWS_SECRET_ACCESS_KEY: $(AWS_SECRET_ACCESS_KEY)
   condition: and(succeeded(), or(eq(variables.DEPLOY, '1'), eq(variables.DEPLOY_ALT, '1')))
   displayName: Upload artifacts
+
+# Upload CPU usage statistics that we've been gathering this whole time. Always
+# execute this step in case we want to inspect failed builds, but don't let
+# errors here ever fail the build since this is just informational.
+- bash: aws s3 cp --acl public-read cpu-usage.csv s3://$DEPLOY_BUCKET/rustc-builds/$BUILD_SOURCEVERSION/cpu-$SYSTEM_JOBNAME.csv
+  env:
+    AWS_SECRET_ACCESS_KEY: $(AWS_SECRET_ACCESS_KEY)
+  condition: contains(variables, 'AWS_SECRET_ACCESS_KEY')
+  continueOnError: true
+  displayName: Upload CPU usage statistics
diff --git a/src/ci/cpu-usage-over-time.py b/src/ci/cpu-usage-over-time.py
new file mode 100644
index 0000000000000..78427a6360a9f
--- /dev/null
+++ b/src/ci/cpu-usage-over-time.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# ignore-tidy-linelength
+
+# This is a small script that we use on CI to collect CPU usage statistics of
+# our builders. By seeing graphs of CPU usage over time we hope to correlate
+# that with possible improvements to Rust's own build system, ideally diagnosing
+# that either builders are always fully using their CPU resources or they're
+# idle for long stretches of time.
+#
+# This script is relatively simple, but it's platform specific. Each platform
+# (OSX/Windows/Linux) has a different way of calculating the current state of
+# CPU at a point in time. We then compare two captured states to determine the
+# percentage of time spent in one state versus another. The state capturing is
+# all platform-specific but the loop at the bottom is the cross platform part
+# that executes everywhere.
+#
+# # Viewing statistics
+#
+# All builders will upload their CPU statistics as CSV files to our S3 buckets.
+# These URLS look like:
+#
+#   https://$bucket.s3.amazonaws.com/rustc-builds/$commit/cpu-$builder.csv
+#
+# for example
+#
+#   https://rust-lang-ci2.s3.amazonaws.com/rustc-builds/68baada19cd5340f05f0db15a3e16d6671609bcc/cpu-x86_64-apple.csv
+#
+# Each CSV file has two columns. The first is the timestamp of the measurement
+# and the second column is the % of idle cpu time in that time slice. Ideally
+# the second column is always zero.
+#
+# Once you've downloaded a file there's various ways to plot it and visualize
+# it. For command line usage you can use a script like so:
+#
+#      set timefmt '%Y-%m-%dT%H:%M:%S'
+#      set xdata time
+#      set ylabel "Idle CPU %"
+#      set xlabel "Time"
+#      set datafile sep ','
+#      set term png
+#      set output "printme.png"
+#      set grid
+#      builder = "i686-apple"
+#      plot "cpu-".builder.".csv" using 1:2 with lines title builder
+#
+# Executed as `gnuplot < ./foo.plot` it will generate a graph called
+# `printme.png` which you can then open up. If you know how to improve this
+# script or the viewing process that would be much appreciated :) (or even if
+# you know how to automate it!)
+
+import datetime
+import sys
+import time
+
+if sys.platform == 'linux2':
+    class State:
+        def __init__(self):
+            with open('/proc/stat', 'r') as file:
+                data = file.readline().split()
+            if data[0] != 'cpu':
+                raise Exception('did not start with "cpu"')
+            self.user = int(data[1])
+            self.nice = int(data[2])
+            self.system = int(data[3])
+            self.idle = int(data[4])
+            self.iowait = int(data[5])
+            self.irq = int(data[6])
+            self.softirq = int(data[7])
+            self.steal = int(data[8])
+            self.guest = int(data[9])
+            self.guest_nice = int(data[10])
+
+        def idle_since(self, prev):
+            user = self.user - prev.user
+            nice = self.nice - prev.nice
+            system = self.system - prev.system
+            idle = self.idle - prev.idle
+            iowait = self.iowait - prev.iowait
+            irq = self.irq - prev.irq
+            softirq = self.softirq - prev.softirq
+            steal = self.steal - prev.steal
+            guest = self.guest - prev.guest
+            guest_nice = self.guest_nice - prev.guest_nice
+            total = user + nice + system + idle + iowait + irq + softirq + steal + guest + guest_nice
+            return float(idle) / float(total) * 100
+
+elif sys.platform == 'win32':
+    from ctypes.wintypes import DWORD
+    from ctypes import Structure, windll, WinError, GetLastError, byref
+
+    class FILETIME(Structure):
+        _fields_ = [
+            ("dwLowDateTime", DWORD),
+            ("dwHighDateTime", DWORD),
+        ]
+
+    class State:
+        def __init__(self):
+            idle, kernel, user = FILETIME(), FILETIME(), FILETIME()
+
+            success = windll.kernel32.GetSystemTimes(
+                byref(idle),
+                byref(kernel),
+                byref(user),
+            )
+
+            assert success, WinError(GetLastError())[1]
+
+            self.idle = (idle.dwHighDateTime << 32) | idle.dwLowDateTime
+            self.kernel = (kernel.dwHighDateTime << 32) | kernel.dwLowDateTime
+            self.user = (user.dwHighDateTime << 32) | user.dwLowDateTime
+
+        def idle_since(self, prev):
+            idle = self.idle - prev.idle
+            user = self.user - prev.user
+            kernel = self.kernel - prev.kernel
+            return float(idle) / float(user + kernel) * 100
+
+elif sys.platform == 'darwin':
+    from ctypes import *
+    libc = cdll.LoadLibrary('/usr/lib/libc.dylib')
+
+    PROESSOR_CPU_LOAD_INFO = c_int(2)
+    CPU_STATE_USER = 0
+    CPU_STATE_SYSTEM = 1
+    CPU_STATE_IDLE = 2
+    CPU_STATE_NICE = 3
+    c_int_p = POINTER(c_int)
+
+    class State:
+        def __init__(self):
+            num_cpus_u = c_uint(0)
+            cpu_info = c_int_p()
+            cpu_info_cnt = c_int(0)
+            err = libc.host_processor_info(
+                libc.mach_host_self(),
+                PROESSOR_CPU_LOAD_INFO,
+                byref(num_cpus_u),
+                byref(cpu_info),
+                byref(cpu_info_cnt),
+            )
+            assert err == 0
+            self.user = 0
+            self.system = 0
+            self.idle = 0
+            self.nice = 0
+            cur = 0
+            while cur < cpu_info_cnt.value:
+                self.user += cpu_info[cur + CPU_STATE_USER]
+                self.system += cpu_info[cur + CPU_STATE_SYSTEM]
+                self.idle += cpu_info[cur + CPU_STATE_IDLE]
+                self.nice += cpu_info[cur + CPU_STATE_NICE]
+                cur += num_cpus_u.value
+
+        def idle_since(self, prev):
+            user = self.user - prev.user
+            system = self.system - prev.system
+            idle = self.idle - prev.idle
+            nice = self.nice - prev.nice
+            return float(idle) / float(user + system + idle + nice) * 100.0
+
+else:
+    print('unknown platform', sys.platform)
+    sys.exit(1)
+
+cur_state = State();
+print("Time,Idle")
+while True:
+    time.sleep(1);
+    next_state = State();
+    now = datetime.datetime.utcnow().isoformat()
+    idle = next_state.idle_since(cur_state)
+    print("%s,%s" % (now, idle))
+    sys.stdout.flush()
+    cur_state = next_state