<h1>This notebook retrieves from ES the info from jobs_archive about 10 top users, and sends alarm if usage is above certain thresholds</h1>

In [1]:
import numpy as np
import re
import json
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from pandas.io.json import json_normalize
from IPython.display import display
from pandas import DataFrame
import pandas as pd
from datetime import datetime, timedelta
import datetime

<h2>Retrieve all job indexes from ES</h2>

In [2]:
#define function to filter on time
def time_filter(indices, last_days=1, pattern=''):
    if last_days == 0:
        return ["jobs_archive_*"]
    filtered = []
    if pattern:
        for i in indices:
            if pattern in i:
                filtered.append(i.rstrip())
        return filtered
    today = datetime.date.today()
    filtered = []
    datefmt = '%Y-%m-%d'
    for i in indices:
        day = re.sub(r'jobs_archive_', '', i).rstrip()
        #print(day)
        day = datetime.datetime.strptime(day, datefmt).date()
        diff = today - day
        if diff.days < last_days:
            filtered.append(i.rstrip())
    return filtered

In [3]:
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

#get job archive indices from ES
indices = es.cat.indices(index="jobs_archive_*", h="index", request_timeout=600).split('\n')
indices = sorted(indices)
indices = [x for x in indices if x != '']
if 'jobs_archive_2016_status' in indices:
    indices.remove('jobs_archive_2016_status')

<h2>Retrieve job archives of interest from ES</h2>

In [4]:
# retrieve job info from last 2 days
# use e.g. last_days=7 or pattern='2016-02' (no wildcard !)
NDAYS=2  #NDAYS=150 #NDAYS=''
PATTERN='' #PATTERN = '2016-03' #PATTERN=''
ind = time_filter(indices, last_days=NDAYS, pattern=PATTERN)
ind = ','.join(ind)
print(ind)

jobs_archive_2017-05-09,jobs_archive_2017-05-10


## Alerts and Alarms

In [5]:
from subscribers_new import subscribers
import alerts

S = subscribers()
A = alerts.alerts()

<h2>First Alarm</h2> 
<h3>get top 10 users/24 hours for walltime*core, and filter out sum walltime > 15 years</h3>

In [6]:
s = {
    "size": 0, 
    'query':{
        'bool':{
            'must':[
                { "term": {"prodsourcelabel":"user" } },
                { 'range' : {
                    'modificationtime' : {
                        "gte" : "now-1d",
                        "lt" :  "now"}
                    }                
                },
                { 'bool' : {
                    'must_not':[
                        { "term": {"produsername": "gangarbt" } },
                        { "term": {"processingtype":"pmerge" } } ]                        
                    }
                }
            ],
        }
    },
    "aggs": {
        "users":{
            "terms": { 
                "field": "produsername", 
                "order": {"walltime_core_sum": "desc"},
                "size": 10
            },
            "aggs": {
                "walltime_core_sum": {
                    "sum": {
                        "script" : {   # use scripted field to calculate corecount
                            "inline": "def core=doc['actualcorecount'].value; if (core!=null) {return doc['wall_time'].value * core} else {return doc['wall_time'].value}"
                        }
                    } 
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
#print(res) 

agg = res['aggregations']['users']['buckets']
#print(agg)

#create df
df_w = json_normalize(agg)
df_w['walltime_core_sum.value'] = df_w['walltime_core_sum.value'].apply(lambda x: timedelta(seconds=int(x)).days/365.2)

LIMIT_WALLTIME = 15 # 5 for testing
df_w = df_w[df_w["walltime_core_sum.value"] > LIMIT_WALLTIME]
#display(df_w)
df_w.columns = ['jobs', 'user', 'walltime used']
print(df_w.to_string())


    jobs               user  walltime used
0  39802  Pavel Starovoitov      12.237130
1  96322   Yasuyuki Okumura       7.382256
2  48252  Ferdinand Schenck       6.640197


In [7]:
if df_w.shape[0]>0:
    test_name='Top Analysis users [Large wall time]'
    for u in S.get_immediate_subscribers(test_name):
        body = 'Dear ' + u.name+',\n\n'
        body += 'the following users used substantial wall time (more than 15 years/last 24 hours):\n\n'
        body += df_w.to_string() + '\n'
        body += '\n To get more information about this alert message and its interpretation, please visit:\n'
        body += 'http://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis-User'
        body += '\nhttps://its.cern.ch/jira/browse/ADCDPA-1'
        body += '\n To change your alerts preferences please use the following link:\n'+u.link
        body += '\n\nBest regards,\nATLAS Alarm & Alert Service'
        #A.sendMail(test_name, u.email, body)
        #print(body)
    A.addAlert(test_name, u.name, str(df_w.shape[0])+' users with huge walltime.')
else:
    print('No Alarm')

<h2>Second Alarm</h2> 
<h3>get top 10 users/24 hours for inputfilebytes, and filter out sum input size > 500 TB</h3>

In [8]:
s = {
    "size": 0, # get one job entry only for debugging purposes    
    'query':{
        'bool':{
            'must':[
                { "term": {"prodsourcelabel":"user" } },
                { 'range' : {
                    'modificationtime' : {
                        "gte" : "now-1d",
                        "lt" :  "now"}
                    }                
                },
                { 'bool' : {
                    'must_not':[
                        { "term": {"produsername": "gangarbt" } },
                        { "term": {"processingtype":"pmerge" } } ]                        
                    }
                }
            ],
        }
    },
    "aggs": {
        "users":{
            "terms": { 
                "field": "produsername", 
                "order": {"inputsize_sum": "desc"},
                "size": 10
            },
            "aggs": {
                "inputsize_sum": {
                    "sum": { "field": "inputfilebytes" }                     
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
#print(res) 

agg = res['aggregations']['users']['buckets']
#print(agg)

#create df
df_i = json_normalize(agg)
df_i['inputsize_sum.value'] = df_i['inputsize_sum.value'].apply(lambda x: x*0.00000000000089)

LIMIT_INPUTSIZE = 500 # 5 for testing
df_i = df_i[df_i["inputsize_sum.value"] > LIMIT_INPUTSIZE]
#display(df_i)

df_i.columns = ['jobs', 'input size [TB]', 'user']
print(df_i.to_string())

     jobs  input size [TB]                                        user
0   35109       195.861671                               Takuto Kunigo
1   48252       123.912593                           Ferdinand Schenck
2    6565        91.466921                                  Shogo Kido
3  145448        77.706381                      Katja Hannele Mankinen
4    5028        69.537501                              Frederic Derue
5   22804        69.479996                          Salah-eddine Dahbi
6    2128        69.249816  Helena de Fatima Nunes Casimiro dos Santos
7   32270        62.583431                     Andrew Stephen Chisholm
8    1107        47.332905                              Jonathan Crane
9    3179        43.969427                              Dominik Krauss


In [9]:
if df_i.shape[0]>0:
    test_name='Top Analysis users [Large input data size]'
    for u in S.get_immediate_subscribers(test_name):
        body = 'Dear ' + u.name+',\n\n'
        body += 'the following users processed rather substantial input data (>500 TB/last 24 hours):\n\n'
        body += df_i.to_string() + '\n'
        body += '\n To get more information about this alert message and its interpretation, please visit:\n'
        body += 'http://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis-User'
        body += '\nhttps://its.cern.ch/jira/browse/ADCDPA-1'
        body += '\n To change your alerts preferences please use the following link:\n'+u.link
        body += '\n\nBest regards,\nATLAS Alarm & Alert Service'
        #A.sendMail(test_name, u.email, body)
        #print(body)
        A.addAlert(test_name, u.name, str(df_w.shape[0])+' users with huge walltime.')
else:
    print('No Alarm')        