In [20]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

import concurrent.futures, json, os, re, requests, subprocess

# Fetch the shared utility repository on first run. Any existing 'python-utils'
# path (file or directory) is treated as "already cloned" and left untouched.
if not os.path.exists('python-utils'):
    # check_output raises subprocess.CalledProcessError if git exits non-zero.
    subprocess.check_output('git clone https://github.com/CMU-CREATE-Lab/python-utils.git', shell=True)

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's global namespace.

    Parameters:
        filename_or_url: path to a local .ipynb file, or an http(s) URL that
            returns the notebook JSON.

    Both notebook layouts are handled: nbformat >= 4 (top-level 'cells' with
    'source') and older formats (cells under 'worksheets'[0] with 'input').
    Only cells whose 'cell_type' is 'code' are run; they are joined with
    newlines and executed as a single unit.

    Raises whatever the download, JSON parsing, or the executed notebook code
    raises.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Close the file deterministically instead of leaking the handle, and
        # read as UTF-8 (the encoding of notebook JSON) regardless of locale.
        with open(filename_or_url, encoding='utf-8') as notebookFile:
            nb = json.load(notebookFile)

    if nb['nbformat'] >= 4:
        cells, sourceKey = nb['cells'], 'source'
    else:
        cells, sourceKey = nb['worksheets'][0]['cells'], 'input'

    src = [''.join(cell[sourceKey]) for cell in cells if cell['cell_type'] == 'code']

    # SECURITY: this runs arbitrary code from the notebook with full access to
    # this module's globals. Only point it at trusted files/URLs.
    exec('\n'.join(src), globals())

# Run the code cells of the cloned utils notebook inside this notebook's globals.
# NOTE(review): notebook_wide_display, Stat and sleep_until_next_period are used
# in this notebook but not defined in it -- presumably utils.ipynb provides them;
# confirm against that notebook.
exec_ipynb('python-utils/utils.ipynb')
notebook_wide_display()

In [21]:
import os, psutil, subprocess, time

In [22]:
def error(msg):
    """Record a failed health check.

    Logs `msg` at 'critical' level for the ThumbnailWorker service, then adds
    it to the global `errors` list that reportStatus() summarizes.
    """
    # No `global` statement needed: the list is mutated in place, not rebound.
    Stat.log('ThumbnailWorker', 'critical', msg)
    errors.append(msg)
    
def reportStatus():
    """Log the overall outcome of the current health-check pass.

    If the global `errors` list is non-empty, logs a 'down' status with the
    failure count and the messages joined by ' | ' as details. Otherwise logs
    an 'up' status with the global `cgiCount`, the 5-minute load average and
    the CPU percentage sampled over 3 seconds (so the healthy path blocks for
    that long).
    """
    if errors:
        summary = 'Failed %d health checks' % len(errors)
        Stat.log('ThumbnailWorker', 'down', summary, details=(' | '.join(errors)))
        return

    # Values are gathered in the same order the format string consumes them.
    runningCgis = cgiCount
    load5m = os.getloadavg()[1]
    cpuPercent = psutil.cpu_percent(3)
    summary = 'Healthy, %d thumbnails running, load %.1f, CPU %d%%' % (runningCgis, load5m, cpuPercent)
    Stat.log('ThumbnailWorker', 'up', summary)
        
def warning(msg):
    """Log `msg` at 'warning' level for the ThumbnailWorker service.

    Only logs: nothing is appended to the global `errors` list, so a warning
    does not affect the up/down result computed from that list.
    """
    Stat.log('ThumbnailWorker', 'warning', msg)
    
def clearErrors():
    """Start a fresh pass by rebinding the global `errors` to a new empty list."""
    # Rebinding (rather than clearing in place) is why `global` is required here.
    global errors
    errors = []

# Highest acceptable 5-minute load average.
loadAvgMax = 50

def checkLoadAvgMax():
    """Record an error when the 5-minute load average is above `loadAvgMax`."""
    # os.getloadavg() returns (1-minute, 5-minute, 15-minute); only the middle value is tested.
    fiveMinuteLoad = os.getloadavg()[1]
    if fiveMinuteLoad > loadAvgMax:
        message = 'Load avg %.1f (test > %.1f)' % (fiveMinuteLoad, loadAvgMax)
        error(message)

cpuMax = 90 # % of all CPU (i.e. 100=all cores at full utilization)

def checkCPUMax():
    """Record an error when CPU utilization is above `cpuMax` percent.

    The measurement is taken over a 3-second interval, so this call blocks
    for that long.
    """
    cpuPercent = psutil.cpu_percent(interval=3)
    if cpuPercent > cpuMax:
        message = 'CPU usage %d%% (test > %d%%)' % (cpuPercent, cpuMax)
        error(message)
        
# Percent-full threshold above which a local filesystem is reported.
maxDiskUsage = 95

def checkDiskUsage():
    """Record an error for every local filesystem fuller than `maxDiskUsage` percent.

    Runs `df` restricted to local filesystems (tmpfs and devtmpfs excluded)
    and reads the usage percentage and mount point from each data row.

    Rows without a numeric "NN%" usage column (for example a "-" placeholder)
    are skipped, and mount points containing spaces are read whole. Splitting
    on whitespace and parsing the second-to-last field raised ValueError on
    such rows.

    Raises subprocess.CalledProcessError if `df` exits non-zero.
    """
    dfCommand = 'df -l --print-type --exclude-type=tmpfs --exclude-type=devtmpfs'
    dfOutput = subprocess.check_output(dfCommand, shell=True).decode('utf8')

    # The first line is the column header.
    for row in dfOutput.splitlines()[1:]:
        # Anchor on the usage column ("NN%"); everything after it is the mount
        # point, which may itself contain whitespace.
        match = re.search(r'\s(\d+)%\s+(\S.*?)\s*$', row)
        if match is None:
            # Blank line, or a row whose usage is not a number: nothing to test.
            continue
        diskUsage = int(match.group(1))
        mountPoint = match.group(2)
        if diskUsage > maxDiskUsage:
            error('%s is %d%% full (test > %d%%)' % (mountPoint, diskUsage, maxDiskUsage))

In [23]:
def checkCGIs():
    """Count running thumbnail-server.rb processes and kill any that ran too long.

    A process matches when its joined command line contains both 'ruby' and
    '/thumbnail-server.rb'. Matches no older than 20 minutes are counted in
    the global `cgiCount`; older ones are logged via warning() and killed.

    Processes that exit (or cannot be inspected) while being examined are
    skipped instead of aborting the whole pass. A kill refused for lack of
    permission is recorded via error(); a kill that finds the process already
    gone is logged via warning().
    """
    global cgiCount
    cgiCount = 0
    maxAge = 20 * 60 # 20 minutes
    for p in psutil.process_iter():
        try:
            cmdline = ' '.join(p.cmdline())
            if not ('ruby' in cmdline and '/thumbnail-server.rb' in cmdline):
                continue
            age = time.time() - p.create_time()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process vanished between enumeration and inspection, or its
            # details are unreadable; either way it cannot be classified.
            continue

        if age <= maxAge:
            cgiCount += 1
            continue

        warning('thumbnail-server.rb pid=%d running too long (%.1f mins > %.1f mins), killing' % (p.pid, age/60.0, maxAge/60.0))
        try:
            p.kill()
        except psutil.AccessDenied:
            # Name the process actually being killed (this message used to say chromedriver).
            error('Insufficient permission to kill thumbnail-server.rb pid=%d' % p.pid)
        except psutil.NoSuchProcess:
            warning('thumbnail-server.rb pid=%d cannot be killed because it no longer exists' % p.pid)


In [24]:
# Upper bound on simultaneously running chromedriver processes;
# checkChromedrivers() records an error when the count exceeds it.
maxChromedriverCount = 20

def checkChromedriver(p):
    """Vet one chromedriver process `p` (a psutil.Process).

    - Parent pid 1: the process is treated as orphaned; it is logged via
      warning() and killed.
    - Otherwise the parent's process name must be 'ruby'; any other parent is
      recorded via error().

    The parent pid is read once so the decision and the message agree even if
    the process is re-parented meanwhile. A process (or parent) that exits
    during the check no longer raises psutil.NoSuchProcess out of this
    function.
    """
    try:
        parentPid = p.ppid()
    except psutil.NoSuchProcess:
        # Exited since it was enumerated; nothing left to check.
        return

    if parentPid == 1:
        warning('Killing chromedriver pid=%d because process has no parent' % p.pid)
        try:
            p.kill()
        except psutil.AccessDenied:
            # Message text (including its spelling) kept as-is in case log consumers match on it.
            error('Insufficient permission to kill chromdriver pid=%d' % p.pid)
        except psutil.NoSuchProcess:
            warning('chromedriver pid=%d cannot be killed because it no longer exists' % p.pid)
        return

    try:
        parentName = psutil.Process(parentPid).name()
    except psutil.NoSuchProcess:
        # The parent exited after ppid() was read. If the chromedriver ends up
        # with parent pid 1, a later pass takes the orphan branch above.
        return

    if parentName != 'ruby':
        error('chromedriver pid=%d has unknown parent process pid=%d name=%s' % (p.pid, parentPid, parentName))

def checkChromedrivers():
    """Vet every chromedriver process, then enforce `maxChromedriverCount`.

    Each process named 'chromedriver' is passed to checkChromedriver(). After
    a fixed 5-second pause (giving any killed processes time to exit) the
    chromedrivers are counted again, and an error is recorded if the count is
    above `maxChromedriverCount`.

    Processes that exit while being enumerated are ignored rather than
    letting psutil.NoSuchProcess abort the pass.
    """
    def isChromedriver(proc):
        # name() raises NoSuchProcess when the process has already exited.
        try:
            return proc.name() == 'chromedriver'
        except psutil.NoSuchProcess:
            return False

    for p in psutil.process_iter():
        if isChromedriver(p):
            checkChromedriver(p)

    time.sleep(5) # wait for killed processes (if any) to die

    chromedriverCount = sum(1 for p in psutil.process_iter() if isChromedriver(p))

    if chromedriverCount > maxChromedriverCount:
        error('%d chromedrivers running (test > %d)' % (chromedriverCount, maxChromedriverCount))


In [25]:
def checkChrome(p):
    """Kill Chrome process `p` (a psutil.Process) if it is an orphaned headless instance.

    "Orphaned headless" means the parent pid is 1 and the command line
    contains '--headless'. The kill is announced via warning(); a refusal for
    lack of permission is recorded via error(), and a process that is already
    gone at kill time is logged via warning().

    A process that exits, or whose details cannot be read, while it is being
    inspected is skipped instead of raising out of this function.
    """
    try:
        orphanedHeadless = p.ppid() == 1 and '--headless' in ' '.join(p.cmdline())
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Cannot be classified (already exited or unreadable): leave it alone.
        return

    if not orphanedHeadless:
        return

    warning('Killing headless chrome pid=%d because process has no parent' % p.pid)
    try:
        p.kill()
    except psutil.AccessDenied:
        error('Insufficient permission to kill chrome pid=%d' % p.pid)
    except psutil.NoSuchProcess:
        warning('Chrome pid=%d cannot be killed because it no longer exists' % p.pid)

def checkChromes():
    """Run checkChrome() on every process whose name is 'chrome'.

    A process that exits while the process list is being walked is skipped
    instead of letting psutil.NoSuchProcess abort the pass.
    """
    for p in psutil.process_iter():
        try:
            isChrome = p.name() == 'chrome'
        except psutil.NoSuchProcess:
            # Gone between enumeration and the name lookup.
            continue
        if isChrome:
            checkChrome(p)

In [26]:
def checkAll():
    """Run one complete health-check pass.

    Resets the error list, runs each individual check in a fixed order, then
    reports the combined status.
    """
    clearErrors()
    healthChecks = (
        checkLoadAvgMax,
        checkCPUMax,
        checkDiskUsage,
        checkCGIs,
        checkChromedrivers,
        checkChromes,
    )
    for runCheck in healthChecks:
        runCheck()
    reportStatus()

In [27]:
# Monitor forever: run one full health-check pass, then wait for the next period.
# The loop has no exit condition and no exception handling, so it ends only on
# an interrupt or on an exception escaping checkAll().
# NOTE(review): sleep_until_next_period is not defined in this notebook --
# presumably it comes from utils.ipynb and 60 is a period in seconds; confirm.
while True:
    checkAll()    
    sleep_until_next_period(60)

Stat.log {"service": "ThumbnailWorker", "level": "critical", "summary": "Load avg 122.4 (test > 50.0)", "host": "hal20", "details": null, "datetime": "2019-09-26T19:04:12.679324-04:00", "payload": {}}
Stat.log {"service": "ThumbnailWorker", "level": "critical", "summary": "CPU usage 97% (test > 90%)", "host": "hal20", "details": null, "datetime": "2019-09-26T19:04:15.721996-04:00", "payload": {}}
Stat.log {"service": "ThumbnailWorker", "level": "critical", "summary": "Insufficient permission to kill chromdriver pid=12904", "host": "hal20", "details": null, "datetime": "2019-09-26T19:04:15.874527-04:00", "payload": {}}
Stat.log {"service": "ThumbnailWorker", "level": "critical", "summary": "Insufficient permission to kill chromdriver pid=14098", "host": "hal20", "details": null, "datetime": "2019-09-26T19:04:15.981086-04:00", "payload": {}}
Stat.log {"service": "ThumbnailWorker", "level": "critical", "summary": "Insufficient permission to kill chromdriver pid=15182", "host": "hal20", "d

KeyboardInterrupt: 