<h1>This notebook retrieves information about the top 10 users from the jobs_archive indices in Elasticsearch (ES) and sends an alarm if their usage exceeds certain thresholds</h1>

In [82]:
import numpy as np
import re
import json
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from pandas.io.json import json_normalize
from IPython.display import display
from pandas import DataFrame
import pandas as pd
from datetime import datetime, timedelta
import datetime

<h2>Retrieve all job indexes from ES</h2>

In [83]:
#define function to filter on time
def time_filter(indices, last_days=1, pattern=''):
    """Select job-archive index names by substring pattern or by age in days.

    Parameters
    ----------
    indices : iterable of str
        Index names of the form ``jobs_archive_YYYY-MM-DD``, optionally
        followed by ``_reindexed``. Trailing whitespace is tolerated.
    last_days : int
        ``0`` short-circuits to the wildcard ``["jobs_archive_*"]``.
        Otherwise an index is kept when its date is fewer than ``last_days``
        days before today (``last_days=1`` keeps only today's index).
    pattern : str
        When non-empty (and ``last_days != 0``), the date logic is skipped
        and every index containing ``pattern`` as a substring is returned.

    Returns
    -------
    list of str
        The selected index names with trailing whitespace removed, in the
        order in which they appear in ``indices``.
    """
    if last_days == 0:
        return ["jobs_archive_*"]

    if pattern:
        return [name.rstrip() for name in indices if pattern in name]

    today = datetime.date.today()
    datefmt = '%Y-%m-%d'
    filtered = []
    for name in indices:
        # Reduce the index name to its bare date part.
        day = name.replace('jobs_archive_', '').replace('_reindexed', '').strip()
        try:
            day = datetime.datetime.strptime(day, datefmt).date()
        except ValueError:
            # Not a daily index (e.g. 'jobs_archive_2016_status'): skip it
            # instead of aborting the whole selection.
            continue
        if (today - day).days < last_days:
            filtered.append(name.rstrip())
    return filtered

In [84]:
# Client used by every query in this notebook.
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

# Indices matching the jobs_archive_* pattern that are not daily job archives
# and must never be handed to time_filter.
EXCLUDED_INDICES = ['jobs_archive_2016_status']

# get job archive indices from ES: the cat API answers with one name per line
raw_names = es.cat.indices(index="jobs_archive_*", h="index", request_timeout=600).split('\n')

# Strip each name before comparing: time_filter already has to rstrip() these
# names, so an exact match against the unstripped text could silently miss the
# excluded index. Blank lines (e.g. the trailing newline) are dropped.
indices = sorted(
    name.strip() for name in raw_names
    if name.strip() and name.strip() not in EXCLUDED_INDICES
)

<h2>Retrieve job archives of interest from ES</h2>

In [85]:
# Choose which daily indices the queries below will search.
#   NDAYS   - keep indices younger than this many days (alternatives tried: 150, '')
#   PATTERN - substring match instead of an age cut, e.g. '2016-03';
#             plain substring only, no wildcard!
NDAYS = 2
PATTERN = ''

# Elasticsearch takes several indices as one comma-separated string.
ind = ','.join(time_filter(indices, last_days=NDAYS, pattern=PATTERN))
print(ind)

jobs_archive_2017-11-29,jobs_archive_2017-11-30


## Alerts and Alarms

In [86]:
# Project-local helpers (not part of this notebook):
#   subscribers - provides get_immediate_subscribers(test_name); the objects it
#                 yields are used below through their .name, .email and .link
#   alerts      - provides sendMail(...) and addAlert(...)
from subscribers import subscribers
import alerts

# Shared by all alarm cells below.
S = subscribers()
A = alerts.alerts()

<h2>First Alarm</h2> 
<h3>Get the top 10 users of the last 24 hours ranked by walltime*cores, and keep only those whose summed walltime exceeds 15 years</h3>
<h3>Convert the summed walltime into the number of cores used per day (each job's walltime is multiplied by its actualcorecount when that field is available)</h3>

In [29]:
# Top 10 analysis users of the last 24 hours, ranked by summed walltime * cores.
# Selected: prodsourcelabel == "user", modified within the last day.
# Rejected: produsername gangarbt, pmerge jobs, and any job carrying a
# workinggroup (only users without workinggroup privileges are of interest).
s = {
    "size": 0,  # aggregation only: no individual job documents are returned
    'query': {
        'bool': {
            'must': [
                {"term": {"prodsourcelabel": "user"}},
                {'range': {
                    'modificationtime': {
                        "gte": "now-1d",
                        "lt": "now"}
                    }
                },
                {'bool': {
                    'must_not': [
                        {"term": {"produsername": "gangarbt"}},
                        {"term": {"processingtype": "pmerge"}},
                        {'exists': {"field": "workinggroup"}}
                        ]
                    }
                }
            ],
        }
    },
    "aggs": {
        "users": {
            "terms": {
                "field": "produsername",
                "order": {"walltime_core_sum": "desc"},
                "size": 10
            },
            "aggs": {
                "walltime_core_sum": {
                    "sum": {
                        # scripted field: wall_time weighted by actualcorecount,
                        # falling back to plain wall_time when the count is null
                        "script": {
                            "inline": "def core=doc['actualcorecount'].value; if (core!=null) {return doc['wall_time'].value * core} else {return doc['wall_time'].value}"
                        }
                    }
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
agg = res['aggregations']['users']['buckets']

# Unit conversion for the summed walltime (seconds). A single days-per-year
# value is used for both conversions so that 15 years corresponds to exactly
# 5475 core-days, the figure quoted in the alarm mail.
SECONDS_PER_DAY = 86400.
DAYS_PER_YEAR = 365.

# reindex() pins the column set and order: an empty bucket list then yields an
# empty frame with the expected columns instead of raising KeyError, and the
# renaming below does not depend on the column order chosen by json_normalize.
df_w = json_normalize(agg).reindex(columns=['doc_count', 'key', 'walltime_core_sum.value'])

core_days = df_w['walltime_core_sum.value'] / SECONDS_PER_DAY
df_w['walltime_core_sum.value'] = core_days / DAYS_PER_YEAR  # walltime [years]
df_w['ncores'] = core_days  # core-days within one day == average cores in use

LIMIT_WALLTIME = 15 # 5 for testing
df_w = df_w[df_w["walltime_core_sum.value"] > LIMIT_WALLTIME]

display(df_w)
df_w = df_w.rename(columns={
    'doc_count': 'jobs',
    'key': 'user',
    'walltime_core_sum.value': 'walltime used [years]',
    'ncores': 'number of cores',
})
print(df_w.to_string())


Unnamed: 0,doc_count,key,walltime_core_sum.value,ncores


Empty DataFrame
Columns: [jobs, user, walltime used [years], number of cores]
Index: []


In [30]:
# Mail every immediate subscriber of this test and record one alert per
# subscriber, but only when at least one user exceeded the walltime limit.
if df_w.shape[0]>0:
    test_name='Top Analysis users [Large wall time]'
    for u in S.get_immediate_subscribers(test_name):
        body = 'Dear ' + u.name+',\n\n'
        body += 'the following users used substantial wall time (more than 15 years/last 24 hours, corresponding to 5475 cores/day):\n\n'
        body += df_w.to_string() + '\n'
        body += '\n To get more information about this alert message and its interpretation, please visit:\n'
        body += 'http://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis-User'
        body += '\nhttps://its.cern.ch/jira/browse/ADCDPA-1'
        body += '\n To change your alerts preferences please use the following link:\n'+u.link
        body += '\n\nBest regards,\nATLAS Alarm & Alert Service'
        A.sendMail(test_name, u.email, body)
        # Must stay inside the loop, as in the other alarms: outside it only
        # the last subscriber was recorded, and with no subscribers at all `u`
        # was undefined (or left over from an earlier cell).
        A.addAlert(test_name, u.name, str(df_w.shape[0])+' users with huge walltime.')
else:
    print('No Alarm')

No Alarm


<h2>Second Alarm</h2> 
<h3>Get the top 10 users of the last 24 hours ranked by inputfilebytes, and keep only those whose summed input size exceeds 500 TB</h3>

In [31]:
# Top 10 analysis users of the last 24 hours, ranked by summed input file size.
# Selected: prodsourcelabel == "user", modified within the last day.
# Rejected: produsername gangarbt, pmerge jobs, closed/cancelled jobs, and any
# job carrying a workinggroup.
s = {
    "size": 0,  # aggregation only: no individual job documents are returned
    'query': {
        'bool': {
            'must': [
                {"term": {"prodsourcelabel": "user"}},
                {'range': {
                    'modificationtime': {
                        "gte": "now-1d",
                        "lt": "now"}
                    }
                },
                {'bool': {
                    'must_not': [
                        {"term": {"produsername": "gangarbt"}},
                        {"term": {"processingtype": "pmerge"}},
                        {"term": {"jobstatus": "closed"}},
                        {"term": {"jobstatus": "cancelled"}},
                        {'exists': {"field": "workinggroup"}}]
                    }
                }
            ],
        }
    },
    "aggs": {
        "users": {
            "terms": {
                "field": "produsername",
                "order": {"inputsize_sum": "desc"},
                "size": 10
            },
            "aggs": {
                "inputsize_sum": {
                    "sum": {"field": "inputfilebytes"}
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
agg = res['aggregations']['users']['buckets']

# Conversion factor applied to the summed inputfilebytes before comparing it
# with the TB limit.
# NOTE(review): 0.89e-12 is neither 1e-12 (bytes -> TB) nor 2**-40 ~ 0.909e-12
# (bytes -> TiB). It is kept unchanged so the alarm threshold does not move;
# confirm the intended unit.
BYTES_TO_TB = 0.00000000000089

# reindex() pins the column set and order: an empty bucket list then yields an
# empty frame with the expected columns instead of raising KeyError, and the
# renaming below does not depend on the column order chosen by json_normalize.
df_i = json_normalize(agg).reindex(columns=['doc_count', 'inputsize_sum.value', 'key'])
df_i['inputsize_sum.value'] = df_i['inputsize_sum.value'] * BYTES_TO_TB

LIMIT_INPUTSIZE = 500 # 5 for testing
df_i = df_i[df_i["inputsize_sum.value"] > LIMIT_INPUTSIZE]

df_i = df_i.rename(columns={
    'doc_count': 'jobs',
    'inputsize_sum.value': 'input size [TB]',
    'key': 'user',
})
print(df_i.to_string())

Empty DataFrame
Columns: [jobs, input size [TB], user]
Index: []


In [32]:
# Mail every immediate subscriber of this test and record one alert per
# subscriber, but only when at least one user exceeded the input-size limit.
if df_i.shape[0]>0:
    test_name='Top Analysis users [Large input data size]'
    for u in S.get_immediate_subscribers(test_name):
        body = 'Dear ' + u.name+',\n\n'
        body += 'the following users processed rather substantial input data (>500 TB/last 24 hours):\n\n'
        body += df_i.to_string() + '\n'
        body += '\n To get more information about this alert message and its interpretation, please visit:\n'
        body += 'http://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis-User'
        body += '\nhttps://its.cern.ch/jira/browse/ADCDPA-1'
        body += '\n To change your alerts preferences please use the following link:\n'+u.link
        body += '\n\nBest regards,\nATLAS Alarm & Alert Service'
        A.sendMail(test_name, u.email, body)
        # The alert summarises THIS alarm: count the rows of df_i. The previous
        # text was copied from the walltime alarm and reported df_w instead.
        A.addAlert(test_name, u.name, str(df_i.shape[0])+' users with large input data size.')
else:
    print('No Alarm')

No Alarm


<h2>Third Alarm</h2> 
<h3>Notify if user job efficiency drops below 70%</h3>

In [33]:
# Summed actualcorecount per job status for all user jobs of the last 24 hours.
# Selected: prodsourcelabel == "user", modified within the last day.
# Rejected: produsername gangarbt, pmerge jobs, cancelled/closed jobs.
s = {
    "size": 0,  # aggregation only: no individual job documents are returned
    'query': {
        'bool': {
            'must': [
                {"term": {"prodsourcelabel": "user"}},
                {'range': {
                    'modificationtime': {
                        "gte": "now-1d",
                        "lt": "now"}
                    }
                },
                {'bool': {
                    'must_not': [
                        {"term": {"produsername": "gangarbt"}},
                        {"term": {"processingtype": "pmerge"}},
                        {"term": {"jobstatus": "cancelled"}},
                        {"term": {"jobstatus": "closed"}}
                        ]
                    }
                }
            ],
        }
    },
    "aggs": {
        "status": {
            "terms": {
                "field": "jobstatus",
                "order": {"corecount_sum": "desc"},
                "size": 5
            },
            "aggs": {
                "corecount_sum": {
                    "sum": {"field": "actualcorecount"}
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
agg = res['aggregations']['status']['buckets']

#create df (kept for interactive inspection)
df_e = json_normalize(agg)

# Read the per-status sums straight from the buckets. A status without any job
# has no bucket at all, so default to 0.0 instead of indexing an empty frame
# (which raised IndexError and made the total == 0 alarm below unreachable).
core_sums = dict((bucket['key'], bucket['corecount_sum']['value']) for bucket in agg)
successful = core_sums.get('finished', 0.0)
failed = core_sums.get('failed', 0.0)
total = successful + failed

LIMIT_EFFICIENCY = 0.7
Alarm = ''
if (total==0):
    Alarm = "Alarm, no finished user jobs in last 24 hours"
else:
    efficiency = successful/total
    print(str(efficiency))
    if (efficiency < LIMIT_EFFICIENCY):
        # Two decimals: with one decimal a value such as 0.68 was reported as
        # "0.7", i.e. apparently not below the 0.7 limit.
        Alarm = "Alarm, user job efficiency is "+str(round(efficiency,2))

if (len(Alarm)>0):
    print(Alarm)

0.870428329767


In [34]:
# Notify the immediate subscribers when the efficiency cell raised an alarm
# (Alarm is a non-empty string in that case).
# NOTE(review): the mail text defines efficiency in terms of walltime, while
# the query cell sums actualcorecount per job status - confirm which is meant.
if Alarm:
    test_name = 'Top Analysis users [Low efficiency]'
    for u in S.get_immediate_subscribers(test_name):
        body = ''.join([
            'Dear ', u.name, ',\n\n',
            'the following alarm was raised regarding the global user job efficiency in the last 24 hours:\n\n',
            Alarm, '\n',
            '\n The efficiency is defined as walltime of successful jobs divided by the walltime of successful plus failed jobs',
            '\n The efficiency is calculated on all user jobs in the last 24 hours.',
            '\n To get more information about this alert message and its interpretation, please visit:\n',
            'http://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis',
            '\nhttp://atlas-kibana.mwt2.org:5601/app/kibana#/dashboard/FL-Analysis-User',
            '\n To change your alerts preferences please use the following link:\n', u.link,
            '\n\nBest regards,\nATLAS Alarm & Alert Service',
        ])
        A.sendMail(test_name, u.email, body)
        A.addAlert(test_name, u.name, Alarm)
else:
    print('No Alarm')

No Alarm


<h2>Fourth alarm -- DISABLED --- TO BE REVIEWED</h2> 
<h3>get name of users with >70 retries in last 24 hours, should we also add a lower limit on the number of jobs?</h3>

In [87]:
# Users with failed jobs at a very high attempt number in the last 24 hours.
# Selected: prodsourcelabel == "user", jobstatus == "failed", modified within
# the last day, attemptnr inside the range below.
# Rejected: produsername gangarbt, pmerge jobs.
s = {
    "size": 0,  # aggregation only: no individual job documents are returned
    'query': {
        'bool': {
            'must': [
                {"term": {"prodsourcelabel": "user"}},
                {"term": {"jobstatus": "failed"}},
                {'range': {
                    'modificationtime': {
                        "gte": "now-1d",
                        "lt": "now"}
                }},
                {'range': {
                    'attemptnr': {
                        # Alarm is DISABLED: the lower bound was raised from
                        # "70" to "999", so only attempt 999 can match.
                        "gte": "999",
                        "lt": "1000"},
                }},
                {'bool': {
                    'must_not': [
                        {"term": {"produsername": "gangarbt"}},
                        {"term": {"processingtype": "pmerge"}},
                        ]
                    }
                }
            ],
        }
    },
    "aggs": {
        "status": {
            "terms": {
                "field": "produsername",
                "order": {"corecount_sum": "desc"},
                "size": 5
            },
            "aggs": {
                "corecount_sum": {
                    "sum": {"field": "actualcorecount"}
                },
            }
        }
    }
}

res = es.search(index=ind, body=s, request_timeout=12000)
agg = res['aggregations']['status']['buckets']

#create df
df_a = json_normalize(agg)
if df_a.shape[0]>0:
    # Select and rename the columns by name: drop("doc_count", 1) relies on a
    # positional axis argument that recent pandas rejects, and assigning names
    # by position depends on the column order json_normalize happens to use.
    # NOTE(review): the column shown as 'jobs' is the summed actualcorecount;
    # the real job count (doc_count) is the one being discarded.
    df_a = df_a.reindex(columns=['corecount_sum.value', 'key']).rename(columns={
        'corecount_sum.value': 'jobs',
        'key': 'user',
    })
    print(df_a.to_string())

In [88]:
# Mail every immediate subscriber of this test and record one alert per
# subscriber, but only when the retrial query returned at least one user.
if df_a.shape[0]>0:
    test_name='Top Analysis users [Retrial attempts]'
    for u in S.get_immediate_subscribers(test_name):
        body = 'Dear ' + u.name+',\n\n'
        body += 'the following users have jobs with more than 70 retrials in the last 24 hours:\n\n'
        body += df_a.to_string() + '\n'
        body += '\n To get more information about what each user is doing, please visit:\n'
        # Iterate over the column values directly: Series.iteritems() no longer
        # exists in recent pandas, and only the value was used anyway.
        for username in df_a['user']:
            body += 'https://bigpanda.cern.ch/tasks/?username='+str(username)+'\n'
        body += '\n If deemed necessary, please contact the user to ask what he/she is doing:\n'
        body += '\nhttps://its.cern.ch/jira/browse/ADCDPA-1'
        body += '\n To change your alerts preferences please use the following link:\n'+u.link
        body += '\n\nBest regards,\nATLAS Alarm & Alert Service'
        A.sendMail(test_name, u.email, body)
        A.addAlert(test_name, u.name, str(df_a.shape[0])+' users with jobs with large retrial attempts.')
else:
    print('No Alarm')

No Alarm
