<h1>This notebook retrieves job information from the jobs_archive indices in Elasticsearch (ES) to study tasks with extremely short or long walltimes</h1>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import re
import json
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from elasticsearch.helpers import scan
from pandas.io.json import json_normalize
from IPython.display import display
from pandas import HDFStore,DataFrame
import pandas as pd
from datetime import datetime, timedelta
#import datetime

%matplotlib inline

<h2>Retrieve all job indexes from ES</h2>

In [14]:
#define function to filter on time
def time_filter(indices, last_days=1, pattern=''):
    """Select which jobs_archive indices should be queried.

    Parameters
    ----------
    indices : iterable of str
        Index names such as ``jobs_archive_2017-10-11`` (an optional
        ``_reindexed`` suffix is accepted). Trailing whitespace is stripped
        from the names that are returned.
    last_days : int
        ``0`` short-circuits to the wildcard ``["jobs_archive_*"]``.
        Otherwise, when no pattern is given, an index is kept when its date
        lies fewer than ``last_days`` days before today.
    pattern : str
        When non-empty (and ``last_days`` is not 0), every index whose name
        contains this substring is kept and no date filtering is applied.

    Returns
    -------
    list of str
        The selected index names, in input order.
    """
    if last_days == 0:
        return ["jobs_archive_*"]
    if pattern:
        return [name.rstrip() for name in indices if pattern in name]

    today = datetime.today().date()
    datefmt = '%Y-%m-%d'
    filtered = []
    for name in indices:
        day = re.sub(r'jobs_archive_', '', name).rstrip()
        if '_reindexed' in day:
            day = re.sub(r'_reindexed', '', day).lstrip()
        try:
            day = datetime.strptime(day, datefmt).date()
        except ValueError:
            # The name carries no parsable date (e.g. 'jobs_archive_2016_status'
            # or an empty line), so it cannot be compared against the time
            # window: leave it out instead of aborting the whole selection.
            continue
        if (today - day).days < last_days:
            filtered.append(name.rstrip())
    return filtered

In [15]:
# Elasticsearch client used by every query in this notebook
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

# Ask ES for the names of all jobs_archive indices: the cat API answers with
# one name per line, so split on newlines, discard blank entries and sort.
index_listing = es.cat.indices(index="jobs_archive_*", h="index", request_timeout=600)
indices = sorted(name for name in index_listing.split('\n') if name != '')

# Drop the 'jobs_archive_2016_status' entry when it is listed
if 'jobs_archive_2016_status' in indices:
    indices.remove('jobs_archive_2016_status')

# Optional: remove data due to central problem
#if 'jobs_archive_2016-12-29' in indices:
#    indices.remove('jobs_archive_2016-12-29')
#print(indices)

<h2>Retrieve job archives of interest from ES</h2>

In [16]:
#define function to create jobs object from scroll
def jobs_list(scroll, max=-1):
    """Collect the hits produced by ``scroll`` into a list.

    Parameters
    ----------
    scroll : iterable
        Source of hits (e.g. the generator returned by
        ``elasticsearch.helpers.scan``).
    max : int
        Maximum number of hits to collect. A negative value (the default)
        collects everything; ``0`` collects nothing.

    Returns
    -------
    list
        The collected hits, in the order they were produced.

    Notes
    -----
    Iteration stops as soon as ``max`` hits have been collected, so a capped
    call no longer pulls the remaining hits out of ``scroll``. A progress line
    is printed every 100000 processed hits.
    """
    jobs = []
    if max == 0:
        # Nothing requested: do not touch the iterable at all
        return jobs

    for count, hit in enumerate(scroll, start=1):
        jobs.append(hit)
        if not count % 100000:
            print('processing hit '+str(count)+'...')
        if 0 < max <= count:
            # Cap reached: stop here instead of draining the rest of the scroll
            break

    return jobs

In [17]:
# Pick the indices to look at and join them into one comma-separated string.
# The window is chosen either by a day count (e.g. NDAYS=7, NDAYS=150) or by a
# name fragment (e.g. PATTERN='2016-02' or '2016-03'; no wildcard!). Here: last 30 days.
NDAYS = 30
PATTERN = ''
ind = ','.join(time_filter(indices, last_days=NDAYS, pattern=PATTERN))
print(ind)

jobs_archive_2017-09-13,jobs_archive_2017-09-14,jobs_archive_2017-09-15,jobs_archive_2017-09-16,jobs_archive_2017-09-17,jobs_archive_2017-09-18,jobs_archive_2017-09-19,jobs_archive_2017-09-20,jobs_archive_2017-09-21,jobs_archive_2017-09-22,jobs_archive_2017-09-23,jobs_archive_2017-09-24,jobs_archive_2017-09-25,jobs_archive_2017-09-26,jobs_archive_2017-09-27,jobs_archive_2017-09-28,jobs_archive_2017-09-29,jobs_archive_2017-09-30,jobs_archive_2017-10-01,jobs_archive_2017-10-02,jobs_archive_2017-10-03,jobs_archive_2017-10-04,jobs_archive_2017-10-05,jobs_archive_2017-10-06,jobs_archive_2017-10-07,jobs_archive_2017-10-08,jobs_archive_2017-10-09,jobs_archive_2017-10-10,jobs_archive_2017-10-11,jobs_archive_2017-10-12


<h2>Retrieve jobs of interest from ES</h2>

In [18]:
def queryES(es, ind):
    """Download the selected jobs from the index (or indices) ``ind``.

    The query keeps jobs with prodsourcelabel ``user`` whose status is
    ``failed`` or ``finished``, excluding produsername ``gangarbt`` and
    processingtype ``pmerge``. Only the fields listed below are fetched.

    Parameters
    ----------
    es : Elasticsearch client handed to ``scan``.
    ind : index name(s) handed to ``scan`` as ``index``.

    Returns
    -------
    The hits flattened by ``json_normalize`` (one row per hit).
    """
    query = "(prodsourcelabel:user) AND (NOT produsername:gangarbt) AND (NOT processingtype:pmerge) AND ((jobstatus:failed) OR (jobstatus:finished))"

    fields = [
        "pandaid", "jeditaskid", "inputfiletype", "produsername", "cpuconsumptiontime", "wall_time", "nevents",
        "ninputdatafiles", "inputfilebytes", "starttime", "endtime", "creationtime", "jobstatus",
    ]

    # scan() scrolls through the complete result set, so the number of hits is not limited
    hits = scan(es, index=ind, q=query, scroll='5m', timeout="5m", size=100, _source=fields)

    # Materialise every hit (jobs_list also accepts a cap, e.g. jobs_list(hits, 10000))
    # and flatten the nested hit dictionaries into DataFrame columns
    return json_normalize(jobs_list(hits))

In [19]:
def get_result(df):
    """Strip the ES bookkeeping columns and give the job fields short names.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened ES hits with columns such as ``_index`` and ``_source.<field>``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Job-level rows (no aggregation is applied) without the ``_id``,
        ``_score``, ``_type`` and ``sort`` columns, with ``_index`` renamed to
        ``date`` and the ``_source.*`` columns renamed as in the mapping below.
        Row labels are converted to strings by ``rename(index=str)``.
    """
    # `axis` is passed by keyword because recent pandas no longer accepts it
    # positionally; errors='ignore' lets frames that lack some of these columns
    # (e.g. an empty result) pass through instead of raising KeyError.
    df = df.drop(['_id', '_score', '_type', 'sort'], axis=1, errors='ignore')

    # Short column names used by the rest of the notebook
    column_names = {
        "_index": "date",
        "_source.starttime": "starttime",
        "_source.creationtime": "creationtime",
        "_source.endtime": "endtime",
        "_source.jobstatus": "jobstatus",
        "_source.inputfiletype": "type",
        "_source.jeditaskid": "taskid",
        "_source.pandaid": "pandaid",
        "_source.produsername": "user",
        "_source.cpuconsumptiontime": "cputime",
        "_source.wall_time": "walltime",
        "_source.nevents": "nevents",
        "_source.ninputdatafiles": "nfiles",
        "_source.inputfilebytes": "size",
    }
    # NOTE(review): index=str turns the row labels into strings; kept for
    # backward compatibility - confirm whether it is actually wanted.
    df = df.rename(index=str, columns=column_names)

    return df

In [88]:
# Download the selected jobs one daily index at a time and combine them into df_save
NDAYS=21
#NDAYS=''
#PATTERN='2017-06-'
PATTERN=''
ind = time_filter(indices, last_days=NDAYS, pattern=PATTERN) #list of indices per day
#print(ind)

if not ind:
    # Fail with a clear message here rather than with a NameError on df_save below
    raise ValueError('no jobs_archive index selected for last_days='+repr(NDAYS)+', pattern='+repr(PATTERN))

daily_frames = []  # one DataFrame per index; merged once after the loop
day = 1
for i in ind:

    print(i)
    df = queryES(es, i)
    daily_frames.append(df)

    if (day == 1): #first index of the selection
        DATE = re.sub(r'jobs_archive_', '', i).rstrip() #save the date
        print(DATE)

    else:
        # Progress message kept unchanged; the frames are actually merged
        # in a single pd.concat call after the loop.
        print('concat df '+str(day))

    day += 1

# A single concat avoids re-copying the accumulated rows on every iteration
df_save = pd.concat(daily_frames)

#store to file
#store = HDFStore('walltimejobs_'+DATE+'.h5')
#store['df_save'] = df_save
#store.close()

display(df_save.head(5))

jobs_archive_2017-09-22
processing hit 100000...
processing hit 200000...
processing hit 300000...
processing hit 400000...
processing hit 500000...
processing hit 600000...
processing hit 700000...
processing hit 800000...
processing hit 900000...
2017-09-22
jobs_archive_2017-09-23
processing hit 100000...
processing hit 200000...
processing hit 300000...
processing hit 400000...
processing hit 500000...
processing hit 600000...
processing hit 700000...
concat df 2
jobs_archive_2017-09-24
processing hit 100000...
processing hit 200000...
processing hit 300000...
processing hit 400000...
processing hit 500000...
concat df 3
jobs_archive_2017-09-25
processing hit 100000...
processing hit 200000...
processing hit 300000...
processing hit 400000...
processing hit 500000...
concat df 4
jobs_archive_2017-09-26
processing hit 100000...
processing hit 200000...
processing hit 300000...
processing hit 400000...
processing hit 500000...
processing hit 600000...
processing hit 700000...
processi

Unnamed: 0,_id,_index,_score,_source.cpuconsumptiontime,_source.creationtime,_source.endtime,_source.inputfilebytes,_source.inputfiletype,_source.jeditaskid,_source.jobstatus,_source.nevents,_source.ninputdatafiles,_source.pandaid,_source.produsername,_source.starttime,_source.wall_time,_type,sort
0,3615781747,jobs_archive_2017-09-22,,38,2017-09-21T16:10:59,2017-09-22T02:43:42,435718100.0,AOD,12169960,finished,830,1.0,3615781747,Helena de Fatima Nunes Casimiro dos Santos,2017-09-22T02:41:22,140,jobs_data,[0]
1,3616970860,jobs_archive_2017-09-22,,6281,2017-09-22T12:22:21,2017-09-22T14:17:25,4880219000.0,DAOD_MUON1,12171548,finished,130000,2.0,3616970860,Johannes Junggeburth,2017-09-22T17:27:10,-11385,jobs_data,[0]
2,3615781740,jobs_archive_2017-09-22,,29,2017-09-21T16:10:58,2017-09-22T02:43:35,515855100.0,AOD,12169960,finished,997,1.0,3615781740,Helena de Fatima Nunes Casimiro dos Santos,2017-09-22T02:41:21,134,jobs_data,[1]
3,3615994479,jobs_archive_2017-09-22,,0,2017-09-21T19:21:46,2017-09-21T19:32:02,9490955000.0,AOD,12169513,failed,37985,1.0,3615994479,Andrey Minaenko,2017-09-21T19:22:37,565,jobs_data,[1]
4,3617741938,jobs_archive_2017-09-22,,40,2017-09-22T22:11:48,2017-09-22T22:33:35,1293158000.0,DAOD_FTAG2,12183802,finished,0,1.0,3617741938,Geisen Jannik,2017-09-22T22:28:10,325,jobs_data,[1]


In [None]:
# Preview the first five downloaded jobs (head() defaults to 5 rows)
display(df_save.head())

<h3>Combine jobs in tasks</h3>

In [87]:
# Reload the jobs previously stored for DATE and convert them to the short-column layout
DATE = '2017-10-11'
df_save = pd.read_hdf('walltimejobs_'+DATE+'.h5', key='df_save')
result = get_result(df_save)

# Show a preview followed by the (rows, columns) shape
display(result.head())
display(result.shape)

Unnamed: 0,date,cputime,creationtime,endtime,size,type,taskid,jobstatus,nevents,nfiles,pandaid,user,starttime,walltime
0,jobs_archive_2017-10-11,255,2017-10-10T22:20:11,2017-10-11T00:06:59,418096400.0,DAOD_SUSY7,12311404,finished,13631,1.0,3652368079,john anders,2017-10-10T23:59:12,467
1,jobs_archive_2017-10-11,119,2017-10-11T02:04:40,2017-10-11T02:19:57,78065250.0,DAOD_EGAM2,12311488,finished,1196,1.0,3652595982,Pavel Podberezko,2017-10-11T02:16:39,198
2,jobs_archive_2017-10-11,214,2017-10-10T22:20:11,2017-10-11T00:06:44,386821200.0,DAOD_SUSY7,12311404,finished,12720,1.0,3652368082,john anders,2017-10-10T23:59:12,452
3,jobs_archive_2017-10-11,1071,2017-10-10T20:07:16,2017-10-11T04:31:34,10136320000.0,DAOD_STDM4,12311453,finished,366604,13.0,3652218501,Christian Johnson,2017-10-11T03:44:39,2815
4,jobs_archive_2017-10-11,743,2017-10-10T23:16:10,2017-10-11T00:07:48,1645204000.0,DAOD_SUSY7,12311334,finished,29163,1.0,3652422368,john anders,2017-10-10T23:51:52,956


(1233753, 14)

In [23]:
# Order the jobs by task id and preview the first five rows
result = result.sort_values(by='taskid')
display(result.head())

Unnamed: 0,date,cputime,creationtime,endtime,size,type,taskid,jobstatus,nevents,nfiles,pandaid,user,starttime,walltime
544676,jobs_archive_2017-10-11,2,2017-10-11T11:03:00,2017-10-11T11:24:29,7542443000.0,AOD,11970061,failed,0,1.0,3653266322,mark sutton,2017-10-11T11:11:54,755
330995,jobs_archive_2017-10-11,5,2017-10-11T10:12:38,2017-10-11T10:45:20,7542443000.0,AOD,11970061,failed,0,1.0,3653201821,mark sutton,2017-10-11T10:37:26,474
3736,jobs_archive_2017-10-11,2,2017-10-10T18:27:16,2017-10-11T06:37:02,7542443000.0,AOD,11970061,failed,0,1.0,3652117826,mark sutton,2017-10-11T06:35:02,120
54789,jobs_archive_2017-10-11,3,2017-10-11T06:53:40,2017-10-11T07:26:03,7542443000.0,AOD,11970061,failed,0,1.0,3653050294,mark sutton,2017-10-11T07:22:03,240
316455,jobs_archive_2017-10-12,2,2017-10-12T05:26:01,2017-10-12T06:13:34,7542443000.0,AOD,11970061,failed,0,1.0,3654762618,mark sutton,2017-10-12T06:11:34,120


In [72]:
# Split the job table into one DataFrame per task: {taskid: jobs of that task}
tasks = dict(iter(result.groupby('taskid')))

print('Number of tasks '+str(len(tasks)))

# Show the id and the jobs of the first task in the dictionary
first_taskid = list(tasks)[0]
print(first_taskid)
display(tasks[first_taskid])

Number of tasks 8680
12320768


Unnamed: 0,date,cputime,creationtime,endtime,size,type,taskid,jobstatus,nevents,nfiles,pandaid,user,starttime,walltime
605662,jobs_archive_2017-10-11,69,2017-10-11T19:33:13,2017-10-11T20:05:55,6192445000.0,DAOD_SUSY1,12320768,failed,0,3.0,3653846727,Pepijn Johannes Bakker,2017-10-11T19:51:21,874
605669,jobs_archive_2017-10-11,70,2017-10-11T19:33:13,2017-10-11T20:00:54,8558777000.0,DAOD_SUSY1,12320768,failed,0,3.0,3653846729,Pepijn Johannes Bakker,2017-10-11T19:51:24,570
605666,jobs_archive_2017-10-11,116,2017-10-11T19:33:13,2017-10-11T19:58:19,2753118000.0,DAOD_SUSY1,12320768,failed,0,3.0,3653846726,Pepijn Johannes Bakker,2017-10-11T19:49:49,510
432409,jobs_archive_2017-10-12,109,2017-10-12T07:43:38,2017-10-12T08:14:45,439084900.0,DAOD_SUSY1,12320768,failed,0,3.0,3654995175,Pepijn Johannes Bakker,2017-10-12T08:11:27,198
429254,jobs_archive_2017-10-12,145,2017-10-12T07:43:38,2017-10-12T08:15:51,353026800.0,DAOD_SUSY1,12320768,failed,0,3.0,3654995162,Pepijn Johannes Bakker,2017-10-12T08:11:30,261
379452,jobs_archive_2017-10-12,72,2017-10-12T05:21:53,2017-10-12T07:19:49,6192445000.0,DAOD_SUSY1,12320768,failed,0,3.0,3654755367,Pepijn Johannes Bakker,2017-10-12T07:16:17,212
522895,jobs_archive_2017-10-11,187,2017-10-11T19:33:14,2017-10-11T19:59:07,2530131000.0,DAOD_SUSY1,12320768,failed,0,3.0,3653846730,Pepijn Johannes Bakker,2017-10-11T19:51:26,461
426139,jobs_archive_2017-10-12,110,2017-10-12T07:43:38,2017-10-12T08:14:43,6220280000.0,DAOD_SUSY1,12320768,failed,0,3.0,3654995166,Pepijn Johannes Bakker,2017-10-12T08:11:26,197
352129,jobs_archive_2017-10-12,96,2017-10-12T05:21:53,2017-10-12T07:36:29,2530131000.0,DAOD_SUSY1,12320768,failed,0,3.0,3654755370,Pepijn Johannes Bakker,2017-10-12T07:16:34,1195
368110,jobs_archive_2017-10-12,48,2017-10-11T19:33:13,2017-10-11T19:53:22,91988480.0,DAOD_SUSY1,12320768,finished,2473,3.0,3653846725,Pepijn Johannes Bakker,2017-10-11T19:49:49,213


In [80]:
# Select the tasks that are too young to be studied: a task is kept only when
# at least one of its jobs was created more than X whole days ago
# (timedelta.days > X). All other task ids are collected in `keys` for deletion.
# NOTE(review): with X = 1, `.days > X` requires an age of at least 2 full days;
# confirm that this is the intended threshold.

X = 1
now = datetime.utcnow()  # one reference time for all tasks (creationtime is presumably UTC - TODO confirm)
keys=[]
for taskid, t in tasks.items():
    # The timestamps share the fixed '%Y-%m-%dT%H:%M:%S' layout, so the smallest
    # string is the oldest job; if the oldest job is not old enough, none is.
    # Parsing one value per task replaces parsing every job's timestamp.
    oldest_creation = t['creationtime'].min()
    time_from_creation = now - datetime.strptime(oldest_creation,'%Y-%m-%dT%H:%M:%S')
    keep = time_from_creation.days>X
    if not(keep):
        keys.append(taskid)

print('Number of tasks to be deleted '+str(len(keys)))

Number of tasks to be deleted 8388


In [None]:
# Remove the tasks selected for deletion. pop() with a default keeps this cell
# safe to run more than once: ids that are already gone are skipped instead of
# raising KeyError.
for stale_taskid in keys:
    tasks.pop(stale_taskid, None)
print('Number of tasks after delection '+str(len(list(tasks))))

In [85]:
# Inspect the second remaining task: print its id, then show its jobs
second_taskid = list(tasks)[1]
print(second_taskid)
display(tasks[second_taskid])

12288019


Unnamed: 0,date,cputime,creationtime,endtime,size,type,taskid,jobstatus,nevents,nfiles,pandaid,user,starttime,walltime
248245,jobs_archive_2017-10-11,10,2017-10-10T15:27:38,2017-10-11T00:13:32,9.666610e+09,AOD,12288019,failed,0,1.0,3651905323,Andrey Minaenko,2017-10-11T00:11:05,147
254853,jobs_archive_2017-10-11,9,2017-10-10T14:13:26,2017-10-11T00:09:43,9.714956e+09,AOD,12288019,failed,0,1.0,3651809473,Andrey Minaenko,2017-10-11T00:08:15,88
740692,jobs_archive_2017-10-11,10,2017-10-11T21:33:41,2017-10-11T21:41:42,9.651500e+09,AOD,12288019,failed,0,1.0,3654028633,Andrey Minaenko,2017-10-11T21:38:14,208
234292,jobs_archive_2017-10-11,5,2017-10-10T17:59:21,2017-10-10T18:08:15,8.382762e+09,AOD,12288019,failed,0,1.0,3652088274,Andrey Minaenko,2017-10-10T18:06:49,86
740700,jobs_archive_2017-10-11,8,2017-10-11T21:33:41,2017-10-11T22:19:06,8.465794e+09,AOD,12288019,failed,0,1.0,3654028625,Andrey Minaenko,2017-10-11T22:16:15,171
238887,jobs_archive_2017-10-11,12,2017-10-10T15:27:46,2017-10-11T00:14:37,9.234948e+09,AOD,12288019,failed,0,1.0,3651905604,Andrey Minaenko,2017-10-11T00:12:59,98
254854,jobs_archive_2017-10-11,10,2017-10-10T14:13:26,2017-10-11T00:09:57,1.019374e+10,AOD,12288019,failed,0,1.0,3651809483,Andrey Minaenko,2017-10-11T00:08:16,101
100400,jobs_archive_2017-10-12,10,2017-10-11T14:32:11,2017-10-11T20:21:22,8.852869e+09,AOD,12288019,failed,0,1.0,3653444786,Andrey Minaenko,2017-10-11T20:13:30,472
30950,jobs_archive_2017-10-11,7936,2017-10-10T17:59:23,2017-10-10T20:41:17,9.007901e+09,AOD,12288019,failed,33402,1.0,3652088296,Andrey Minaenko,2017-10-10T18:06:54,9263
254844,jobs_archive_2017-10-11,8,2017-10-10T14:13:30,2017-10-11T00:12:10,1.031719e+10,AOD,12288019,failed,0,1.0,3651809799,Andrey Minaenko,2017-10-11T00:10:45,85
