# Parameters

In [264]:
# --- Elasticsearch connection -------------------------------------------
db_host = 'atlas-kibana.mwt2.org'
db_port = 9200
# Index pattern; the trailing wildcard is expanded by the query cell.
db_index = 'jobs_archive_2018-04-0*'

# --- Fields fetched for every job ---------------------------------------
# The order matters: job records are stored as lists in this order.
query_source = [
    'jobstatus', 'jobname', 'computingsite', 'taskid', 'pandaid',
    'attemptnr', 'exeerrordiag', 'piloterrordiag',
    'jobdispatchererrordiag', 'transformation',
]

# --- Filters --------------------------------------------------------------
# One 'term' clause per job status that should be selected.
query_should = [
    {'term': {'jobstatus': status}}
    for status in ('finished', 'failed', 'closed')
]

# Clauses a job must not match.
query_must_not = [{'term': {'gShare': 'Analysis'}}]


# Query database

In [265]:
import datetime

from elasticsearch import Elasticsearch, helpers 
from elasticsearch.helpers import scan

# Connect to the Elasticsearch instance configured in the parameters cell.
es = Elasticsearch([{'host':db_host, 'port':db_port}], timeout=60)

# Expand the index pattern into a sorted, comma-separated list of the
# concrete index names (one name per line in the cat API response).
indices = es.cat.indices(index=db_index, h='index',
                         request_timeout=600).split('\n')
indices = sorted(name for name in indices if name != '')
if not indices:
    # Joining an empty list would hand scan() an empty index string;
    # fail early with a clear message instead.
    raise ValueError('no index matches %r' % (db_index,))
indices = ','.join(indices)

# Select jobs matching at least one 'should' clause and none of the
# 'must_not' clauses, fetching only the fields listed in query_source.
# NOTE(review): 'size': 0 is kept from the original; the recorded run shows
# hits being returned through scan(), so it is apparently not limiting the
# scroll -- confirm before relying on it.
job_query = {
    'size'   : 0,
    '_source': query_source,
    'query':   {'bool': {'should': query_should,
                         'minimum_should_match':1,
                         'must_not': query_must_not}}
}

scroll = scan(client=es, index=indices, query=job_query)

# results maps taskid -> list of job records; each record is a list of
# field values in query_source order.
results = {}
count = 1

print ('inf> index: ', db_index)

for res in scroll:
    r = res['_source']

    # Use .get() so a document that lacks one of the requested fields yields
    # None for it instead of aborting the whole scan with a KeyError; the
    # labeling cell already treats None diagnostics as "no error".
    job_info = [r.get(source) for source in query_source]

    # Group the record under its task (jobs without a taskid end up under None).
    results.setdefault(r.get('taskid'), []).append(job_info)

    # Progress report every 100000 processed documents.
    if (count % 100000) == 0:
        print ('inf> processed', count, 'events')
    count += 1

print ('inf> total', len(results), 'tasks queried')

inf> index:  jobs_archive_2018-04-0*
inf> processed 100000 events
inf> processed 200000 events
inf> processed 300000 events
inf> processed 400000 events
inf> processed 500000 events
inf> processed 600000 events
inf> processed 700000 events
inf> processed 800000 events
inf> processed 900000 events
inf> processed 1000000 events
inf> processed 1100000 events
inf> processed 1200000 events
inf> processed 1300000 events
inf> processed 1400000 events
inf> processed 1500000 events
inf> processed 1600000 events
inf> processed 1700000 events
inf> processed 1800000 events
inf> processed 1900000 events
inf> processed 2000000 events
inf> processed 2100000 events
inf> processed 2200000 events
inf> processed 2300000 events
inf> processed 2400000 events
inf> processed 2500000 events
inf> processed 2600000 events
inf> processed 2700000 events
inf> processed 2800000 events
inf> processed 2900000 events
inf> processed 3000000 events
inf> processed 3100000 events
inf> processed 3200000 events
inf> process

# Labeling

In [266]:
print ('inf> start labeling')

# Each job record is a list laid out in query_source order, so resolve the
# field positions by name instead of hard-coding them.
field_pos = {name: pos for pos, name in enumerate(query_source)}

# A per-site job count must exceed this value before it influences a label.
min_jobs = 5

# The with-block guarantees error.txt is flushed and closed, even if the
# labeling loop raises part-way through.
with open('error.txt', 'w') as output:

    for values in results.values():

        # First (lowest attemptnr) and last (highest attemptnr) attempt seen
        # for every job of this task. Each entry is
        # [jobstatus, computingsite, attemptnr, exeerrordiag, piloterrordiag].
        job_initial = {}
        job_final   = {}

        for value in values:

            # Key identifying one job across its attempts.
            # NOTE(review): the original read position 8 here, which is
            # 'jobdispatchererrordiag' in query_source; 'transformation'
            # (position 9) is presumably what was meant -- confirm.
            jobname   = '%s %s' % (value[field_pos['transformation']],
                                   value[field_pos['jobname']])
            attemptnr = value[field_pos['attemptnr']]

            job_info = [value[field_pos['jobstatus']],
                        value[field_pos['computingsite']],
                        attemptnr,
                        value[field_pos['exeerrordiag']],
                        value[field_pos['piloterrordiag']]]

            if jobname not in job_initial:
                job_initial[jobname] = job_info
                job_final[jobname]   = job_info
            else:
                if attemptnr < job_initial[jobname][2]:
                    job_initial[jobname] = job_info
                if attemptnr > job_final[jobname][2]:
                    job_final[jobname] = job_info

        # Per-site tallies, within this task, of jobs whose last attempt
        # ended as 'finished' / 'failed'.
        job_success = {}
        job_fail    = {}

        for final_status, final_site, _, _, _ in job_final.values():
            if final_status == 'finished':
                job_success[final_site] = job_success.get(final_site, 0) + 1
            if final_status == 'failed':
                job_fail[final_site] = job_fail.get(final_site, 0) + 1

        for name, info in job_initial.items():

            jobstatus_init, computing_init, _, exeerr_init, piloterr_init = info
            jobstatus_final, computing_final = job_final[name][:2]

            # good job: the first attempt already finished
            if jobstatus_init == 'finished':
                continue

            # first attempt explicitly reported 'OK'
            if exeerr_init == 'OK':
                continue

            # no error diagnostics recorded on the first attempt
            if exeerr_init is None and piloterr_init is None:
                continue

            # Label is one of: 'resubmit', 'athena' (athena problem),
            # 'site' (site problem).
            if jobstatus_final != 'finished':
                # The job never finished.
                if job_success.get(computing_final, 0) > min_jobs:
                    # The final site finishes enough other jobs of this task.
                    label = 'resubmit'
                elif computing_init != computing_final:
                    # It did not finish on a different site either.
                    label = 'athena'
                else:
                    # Same site throughout: count failures on the other sites.
                    nfails = sum(n for site, n in job_fail.items()
                                 if site != computing_final)
                    label = 'athena' if nfails > min_jobs else 'site'
            else:
                # The job finished on a later attempt.
                if job_success.get(computing_init, 0) > min_jobs:
                    label = 'resubmit'
                elif computing_init != computing_final:
                    # It only finished after moving to another site.
                    label = 'site'
                else:
                    label = 'resubmit'

            # Keep every record on a single output line.
            if exeerr_init is not None:
                exeerr_init = exeerr_init.replace('\n', '')
            if piloterr_init is not None:
                piloterr_init = piloterr_init.replace('\n', '')

            # Fields are separated by ',,,,'.
            output.write('%s,,,,%s,,,,%s,,,,%s\n' % (computing_init, exeerr_init, piloterr_init, label))

print ('done')

inf> start labeling
done
