# Parameters

In [181]:
# Elasticsearch endpoint and the index pattern to read from.
db_host, db_port = 'atlas-kibana.mwt2.org', 9200
db_index = 'jobs_archive_2018-*'

# Fields requested for every job document.  Result rows are built in exactly
# this order, so the position of a name in this list is the column index of
# that field in each row of `results` -- reorder with care.
query_source = [
    # identification / placement
    'taskid', 'pandaid', 'jobstatus', 'computingsite',
    # error code + diagnostic message pairs
    'exeerrorcode', 'exeerrordiag',
    'piloterrorcode', 'piloterrordiag',
    'jobdispatchererrorcode', 'jobdispatchererrordiag',
    'ddmerrorcode', 'ddmerrordiag',
    'taskbuffererrorcode', 'taskbuffererrordiag',
    # payload description and resource usage
    'transexitcode', 'transformation',
    'wall_time', 'corecount',
]

# At least one of these clauses must match (the query uses
# minimum_should_match = 1): only failed jobs are selected.
query_should = [{'term': {'jobstatus': 'failed'}}]

# Documents matching any of these clauses are excluded.
query_must_not = [{'term': {'gShare': 'Analysis'}}]

# Query database

In [182]:
import datetime

from elasticsearch import Elasticsearch, helpers 
from elasticsearch.helpers import scan

# Connect to the Elasticsearch instance configured in the parameters cell.
es = Elasticsearch([{'host':db_host, 'port':db_port}], timeout=60)

# Expand the index pattern into an explicit, sorted, comma-separated list of
# index names.  The cat API answers with one name per line, so empty lines
# (e.g. after the trailing newline) are dropped.
indices = es.cat.indices(index=db_index, h='index',
                         request_timeout=600).split('\n')
indices = ','.join(sorted(name for name in indices if name != ''))

# Select documents matching at least one `should` clause and none of the
# `must_not` clauses; only the fields in query_source are returned.
# NOTE(review): 'size': 0 sits in the body while scan() pages through the
# hits with its own batch size -- confirm this entry is intentional.
job_query = {
    'size'   : 0,
    '_source': query_source,
    'query':   {'bool': {'should': query_should,
                         'minimum_should_match':1,
                         'must_not': query_must_not}}
}

scroll = scan(client=es, index=indices, query=job_query)

# One row per kept job; columns follow the order of query_source.
results = []
# 1-based number of the next row to be kept (skipped hits are not counted).
count = 1

print('inf> index: ', db_index)

for res in scroll:
    r = res['_source']

    # .get() instead of []: a document that does not carry one of the
    # requested fields has no such key in its _source, and a single such
    # document must not abort a scroll over millions of hits.  Missing fields
    # are stored as None, exactly like fields that are explicitly null.
    job_info = [r.get(source) for source in query_source]

    # Rows without a value in the first column (taskid) are discarded.
    if job_info[0] is None:
        continue

    results.append(job_info)

    if (count % 100000) == 0:
        print('inf> processed', count, 'events')
    count += 1

print('inf> total', len(results), 'tasks queried')

inf> index:  jobs_archive_2018-*
inf> processed 100000 events
inf> processed 200000 events
inf> processed 300000 events
inf> processed 400000 events
inf> processed 500000 events
inf> processed 600000 events
inf> processed 700000 events
inf> processed 800000 events
inf> processed 900000 events
inf> processed 1000000 events
inf> processed 1100000 events
inf> processed 1200000 events
inf> processed 1300000 events
inf> processed 1400000 events
inf> processed 1500000 events
inf> processed 1600000 events
inf> processed 1700000 events
inf> processed 1800000 events
inf> processed 1900000 events
inf> processed 2000000 events
inf> processed 2100000 events
inf> processed 2200000 events
inf> processed 2300000 events
inf> processed 2400000 events
inf> processed 2500000 events
inf> processed 2600000 events
inf> processed 2700000 events
inf> processed 2800000 events
inf> processed 2900000 events
inf> processed 3000000 events
inf> processed 3100000 events
inf> processed 3200000 events
inf> processed 3

In [183]:
%matplotlib inline
import matplotlib.pyplot as plt 
import re

# Accumulated loss (wall_time / 3600 * corecount) per transformation and per
# computing site.
wall_trans, wall_site = {}, {}

# Accumulated loss per filtered error message, one table per error source:
# execution, pilot, job dispatcher, DDM.
wall_exe, wall_pilot, wall_jobds, wall_ddm = {}, {}, {}, {}

# Accumulated loss per error code, same four error sources.
wall_exe_total, wall_pilot_total, wall_jobds_total, wall_ddm_total = {}, {}, {}, {}

def get_filter(words):
    """Normalise an error message so that similar messages share one key.

    The message is split on whitespace and every token is cleaned up:

    * ';' and ':' characters are removed;
    * a token longer than 30 characters, made only of digits, or containing
      '0x' is replaced by 'xxxx';
    * otherwise a token starting with a YYYY-MM-DD date is replaced by
      'xxxx-xx-xx'.

    Returns the cleaned tokens, each one preceded by a single space (so a
    non-empty result starts with a space; an empty message gives '').
    """
    def mask(token):
        bare = token.replace(';', '').replace(':', '')
        if len(bare) > 30 or bare.isdigit() or '0x' in bare:
            return 'xxxx'
        if re.match(r'\d{4}-\d{2}-\d{2}.*', bare):
            return 'xxxx-xx-xx'
        return bare

    return ''.join(' ' + mask(token) for token in words.split())
    
    
def fill_err(code, err, loss, wall_dict, wall_dict_total):
    """Add `loss` to the running totals of one error code and one message.

    code            -- key updated in `wall_dict_total`
    err             -- key updated in `wall_dict`
    loss            -- amount added to both entries
    wall_dict       -- totals keyed by error message, modified in place
    wall_dict_total -- totals keyed by error code, modified in place

    A key seen for the first time starts at `loss`.  Returns None.
    """
    for table, key in ((wall_dict_total, code), (wall_dict, err)):
        if key in table:
            table[key] += loss
        else:
            table[key] = loss
    
    
# Column position of every queried field inside a result row.  Rows are built
# in query_source order, so positions are looked up by name instead of being
# hard-coded; reordering or extending query_source then cannot silently shift
# the columns read below.
column = {name: position for position, name in enumerate(query_source)}

# (error-code field, error-message field, per-message table, per-code table)
error_tables = [
    ('exeerrorcode',           'exeerrordiag',           wall_exe,   wall_exe_total),
    ('piloterrorcode',         'piloterrordiag',         wall_pilot, wall_pilot_total),
    ('jobdispatchererrorcode', 'jobdispatchererrordiag', wall_jobds, wall_jobds_total),
    ('ddmerrorcode',           'ddmerrordiag',           wall_ddm,   wall_ddm_total),
]

for result in results:
    site    = result[column['computingsite']]
    process = result[column['transformation']]

    # A job without a core count is counted as a single-core job.
    core = result[column['corecount']]
    if core is None:
        core = 1

    # A null wall_time would make the division below raise TypeError; such a
    # job is kept in the tables but contributes zero loss.
    wall_time = result[column['wall_time']]
    if wall_time is None:
        wall_time = 0

    # presumably wall_time is in seconds, making this core-hours -- TODO confirm
    loss = (wall_time / 3600) * core

    wall_trans[process] = wall_trans.get(process, 0) + loss
    wall_site[site] = wall_site.get(site, 0) + loss

    # Charge the loss to every error source that reported a message.  Messages
    # are normalised first so that variants differing only in ids, addresses
    # or dates fall under the same key.
    for code_field, diag_field, wall_dict, wall_dict_total in error_tables:
        diag = result[column[diag_field]]
        if diag is not None:
            fill_err(result[column[code_field]], get_filter(diag), loss,
                     wall_dict, wall_dict_total)
            

            
            
            
def _by_loss_desc(table):
    """Return the (key, loss) pairs of `table`, largest loss first."""
    return sorted(table.items(), key=lambda x: -x[1])


def _write_table(path, rows):
    """Write one '<key> <loss>' line per row to `path`, replacing the file.

    The file is opened in a `with` block so the handle is closed even if a
    write fails part-way through.
    """
    with open(path, 'w') as output:
        for key, loss in rows:
            output.write('%s %s\n' % (key, loss))


# Loss per transformation / site / error code, sorted by decreasing loss.
data_trans = _by_loss_desc(wall_trans)
data_site  = _by_loss_desc(wall_site)
data_exe   = _by_loss_desc(wall_exe_total)
data_pil   = _by_loss_desc(wall_pilot_total)
data_dis   = _by_loss_desc(wall_jobds_total)
data_ddm   = _by_loss_desc(wall_ddm_total)

# Loss per filtered error message, sorted by decreasing loss.
data_exe_err = _by_loss_desc(wall_exe)
data_pil_err = _by_loss_desc(wall_pilot)
data_dis_err = _by_loss_desc(wall_jobds)
data_ddm_err = _by_loss_desc(wall_ddm)

# (progress label, output file, rows).  The output file names are kept exactly
# as they were.
# NOTE(review): 'dipatcherrorcode.txt' / 'dipatcherror.txt' look like typos of
# "dispatch"; left unchanged because other tools may read these paths.
reports = [
    ('transformation',            'transformation.txt',   data_trans),
    ('computingsite',             'computingsite.txt',    data_site),
    ('execution error codes',     'exeerrorcode.txt',     data_exe),
    ('pilot error codes',         'piloterrorcode.txt',   data_pil),
    ('dispatcher error codes',    'dipatcherrorcode.txt', data_dis),
    ('ddm error codes',           'ddmerrorcode.txt',     data_ddm),
    ('execution error messages',  'exeerror.txt',         data_exe_err),
    ('pilot error messages',      'piloterror.txt',       data_pil_err),
    ('dispatcher error messages', 'dipatcherror.txt',     data_dis_err),
    ('ddm error messages',        'ddmerror.txt',         data_ddm_err),
]

for label, path, rows in reports:
    print('inf> writing', label)
    _write_table(path, rows)


inf> writing transformation
inf> writing computingsite
inf> writing execution error codes
inf> writing pilot error codes
inf> writing dispatcher error codes
inf> writing ddm error codes
inf> writing execution error messages
inf> writing pilot error messages
inf> writing dispatcher error messages
inf> writing ddm error messages
