# Joins columns from the benchmarks and jobs_archive Elasticsearch indices and writes them out for ML analysis

In [1]:
import pickle
import re
import datetime
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import scan

import numpy as np
import pandas as pd

# Elasticsearch client used by every query below; 60 s default request timeout.
es = Elasticsearch(
    [{'host': 'atlas-kibana.mwt2.org', 'port': 9200}],
    timeout=60,
)

# Benchmark indices that are scanned in full.
benchmark_indices = [
    'benchmarks-2017.04',
    'benchmarks-2017.05',
]
# Size, in days, of the window of jobs_archive indices handed to time_filter.
daysOfData = 25

In [2]:
# get job archive indices: one name per line from the cat API, blank entries
# dropped, sorted by name
index_listing = es.cat.indices(index="jobs_archive_20*", h="index", request_timeout=600)
indices = sorted(name for name in index_listing.split('\n') if name != '')
#for i in indices:
#    print(i)

def time_filter(indices, days=1, until=0):
    """Select the daily ``jobs_archive_YYYY-MM-DD`` indices inside a time window.

    An index is kept when its age in days, counted back from today's local
    date, satisfies ``until <= age < days + until``.

    Args:
        indices: iterable of index names; trailing whitespace is ignored.
        days: length of the window in days. ``0`` disables filtering and
            returns the catch-all pattern ``["jobs_archive_*"]``.
        until: age in days of the most recent index to include
            (``0`` means today).

    Returns:
        List of the matching index names, stripped of trailing whitespace,
        in input order. Names that do not carry a ``%Y-%m-%d`` date after the
        ``jobs_archive_`` prefix are skipped.
    """
    if days == 0:
        return ["jobs_archive_*"]
    today = datetime.date.today()
    datefmt = '%Y-%m-%d'
    filtered = []
    for raw_name in indices:
        name = raw_name.rstrip()
        try:
            day = datetime.datetime.strptime(
                re.sub(r'jobs_archive_', '', name), datefmt
            ).date()
        except ValueError:
            # Not a daily index (blank line, or a differently named index
            # caught by the wildcard): it cannot be dated, so leave it out
            # instead of aborting the whole selection.
            continue
        age = (today - day).days
        if until <= age < days + until:
            filtered.append(name)
    return filtered

# Keep only the indices inside the daysOfData window and join them into the
# comma-separated index string passed to the job scan.
selIndices = time_filter(indices, days=daysOfData)
job_indices = ','.join(selIndices)
print(job_indices)

jobs_archive_2017-04-07,jobs_archive_2017-04-08,jobs_archive_2017-04-09,jobs_archive_2017-04-10,jobs_archive_2017-04-11,jobs_archive_2017-04-12,jobs_archive_2017-04-13,jobs_archive_2017-04-14,jobs_archive_2017-04-15,jobs_archive_2017-04-16,jobs_archive_2017-04-17,jobs_archive_2017-04-18,jobs_archive_2017-04-19,jobs_archive_2017-04-20,jobs_archive_2017-04-21,jobs_archive_2017-04-22,jobs_archive_2017-04-23,jobs_archive_2017-04-24,jobs_archive_2017-04-25,jobs_archive_2017-04-26,jobs_archive_2017-04-27,jobs_archive_2017-04-28,jobs_archive_2017-04-29,jobs_archive_2017-04-30,jobs_archive_2017-05-01


In [3]:
# Match-all query: every document of the benchmark indices is read.
benchmarks_query = {
    "size": 0,
    "query": {"match_all": {}},
}

res = scan(es, query=benchmarks_query, index=benchmark_indices, scroll='5m', timeout="5m", size=1000)

data = []
benchmarksPerSite = {}  # ATLASSite -> number of benchmark documents seen
for count, hit in enumerate(res):
    source = hit['_source']
    pr = source['profiles']
    mt = source['metadata']
    site = mt['ATLASSite']
    benchmarksPerSite[site] = benchmarksPerSite.get(site, 0) + 1
    # A missing benchmark profile is recorded as a score of 0.
    fastBmk = pr['fastBmk']['value'] if 'fastBmk' in pr else 0
    ws = pr['whetstone']['score'] if 'whetstone' in pr else 0
    data.append([
        mt['PanDAID'], mt['bogomips'], mt['ip'], site, mt['mp_num'],
        mt['cpuname'], mt['meminfo'], fastBmk, ws,
    ])
    # Progress report every 10000 documents (starting with 0).
    if count % 10000 == 0:
        print(count)
#     if count>100: break
count = len(data)

print(len(data))

benchmark = pd.DataFrame(
    data,
    columns=['pandaid', 'bogomips', 'ip', 'site', 'mpnum', 'cpuname', 'meminfo', 'fastBmk', 'whetstone'],
)
del data

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
148765


In [4]:
#print(benchmark.dtypes)
benchmark.head()
# Distinct PanDA ids that have a benchmark record; the job scan keeps only
# jobs whose id is in this set.
pandaid_values = benchmark['pandaid'].tolist()
pids = set(pandaid_values)
print('total benchmarks:', len(pids))


total benchmarks: 148755


In [5]:
# Write the benchmark table to benchmark.csv, leaving out the DataFrame index.
benchmark.to_csv('benchmark.csv',index=False)


In [19]:
# select only production jobs as only these can potentially have benchmark done.
# prodsourcelabel = prod_test, rc_test, managed, install
#
# Only finished jobs with prodsourcelabel "managed" are requested. The `should`
# clause currently repeats a `must` term; it is kept as the place to re-enable
# the commented-out labels.
job_query = {
    "size": 0,
    "_source": ["cpuconsumptiontime", "wall_time", "hs06","nevents","inputfilebytes","prodsourcelabel","processingtype","computingsite"],
    'query':{
            'bool':{
                   'must':[
                        { "term": {"jobstatus": "finished" } },
                        { "term": {"prodsourcelabel": "managed" } }
                    ],
                   'should':[
                      # { "term": {"prodsourcelabel": "rc_test" } },
                      # { "term": {"prodsourcelabel": "prod_test" } },
                       { "term": {"prodsourcelabel": "managed" } }
                   ]
            }
    }

}

data=[]
scroll = scan(client=es, index=job_indices, query=job_query, scroll='5m', timeout="5m", size=10000)
count = 0  # every job returned by the scan, benchmarked or not
counttype={'managed':0,'rc_test':0,'prod_test':0,'install':0}  # benchmarked jobs per label
jobsPerSite={}  # computingsite -> number of jobs returned by the scan
for res in scroll:
    count += 1
    #print(res)
    #if count>3: break

    r=res['_source']

    site=r['computingsite']
    jobsPerSite[site] = jobsPerSite.get(site, 0) + 1

    # Progress report every 100000 scanned jobs.
    if not count%100000:
        print(count, ' selected:', counttype)
        #print(data)

    # The document id is the PanDA id; keep only jobs that have a benchmark.
    pid=int(res['_id'])
    if pid not in pids: continue

    counttype[r['prodsourcelabel']]+=1
    cpu=r['cpuconsumptiontime']
    ifb=r['inputfilebytes']
    wall=r['wall_time']
    nevents=r['nevents']
    # Always assign cpueff: without the fallback a job with non-positive wall
    # time would reuse the previous job's value (or raise NameError on the
    # first selected job). 0 matches the per-event fallbacks below.
    if wall>0:
        cpueff = cpu/wall
    else:
        cpueff = 0
    if nevents>0:
        wallPerEvent = wall/nevents
        cpuPerEvent = cpu/nevents
    else:
        wallPerEvent = 0
        cpuPerEvent =0
    doc=[pid,cpu,wall,r['hs06'],r['processingtype'],nevents, cpueff, wallPerEvent, cpuPerEvent,ifb]
    data.append(doc)

job=pd.DataFrame(data,columns=['pandaid','cputime','walltime','hs06','processingtype','nevents','cpueff', 'wallPerEvent', 'cpuPerEvent','inputsize'])
del data

100000  selected: {'managed': 715, 'install': 0, 'rc_test': 0, 'prod_test': 0}
200000  selected: {'managed': 1386, 'install': 0, 'rc_test': 0, 'prod_test': 0}
300000  selected: {'managed': 2077, 'install': 0, 'rc_test': 0, 'prod_test': 0}
400000  selected: {'managed': 2737, 'install': 0, 'rc_test': 0, 'prod_test': 0}
500000  selected: {'managed': 3403, 'install': 0, 'rc_test': 0, 'prod_test': 0}
600000  selected: {'managed': 4040, 'install': 0, 'rc_test': 0, 'prod_test': 0}
700000  selected: {'managed': 4734, 'install': 0, 'rc_test': 0, 'prod_test': 0}
800000  selected: {'managed': 5388, 'install': 0, 'rc_test': 0, 'prod_test': 0}
900000  selected: {'managed': 6064, 'install': 0, 'rc_test': 0, 'prod_test': 0}
1000000  selected: {'managed': 6744, 'install': 0, 'rc_test': 0, 'prod_test': 0}
1100000  selected: {'managed': 7442, 'install': 0, 'rc_test': 0, 'prod_test': 0}
1200000  selected: {'managed': 8128, 'install': 0, 'rc_test': 0, 'prod_test': 0}
1300000  selected: {'managed': 8800, '

In [20]:
# Preview the first rows of the job table.
job.head()

Unnamed: 0,pandaid,cputime,walltime,hs06,processingtype,nevents,cpueff,wallPerEvent,cpuPerEvent,inputsize
0,3352703758,1331,2170,107,recon,300,0.613364,7.233333,4.436667,12659125
1,3354732793,55,282,10,evgen,50,0.195035,5.64,1.1,88453
2,3326375042,29513,11863,96,reprocessing,2338,2.487819,5.073995,12.623182,2620990676
3,3327803396,22947,3502,77,merge,64062,6.552541,0.054666,0.3582,22832203370
4,3330689156,7696,2433,90,merge,22779,3.163173,0.106809,0.337855,7124546461


#### store all the jobs together

In [21]:
# Write the full job table to job.csv, leaving out the DataFrame index.
job.to_csv('job.csv',index=False)

#### store jobs split per processingtype

In [12]:
UniqueProcessingTypes = job.processingtype.unique()
print(UniqueProcessingTypes)

# One DataFrame per processing type, keyed by the type name. Each value is the
# subset of `job` rows with that processingtype (previously every key mapped
# to the pd.DataFrame class itself and was never filled).
ProcessingType = {key: job[job.processingtype == key] for key in UniqueProcessingTypes}

# Write each subset to "<processingtype>.csv" in the working directory.
for key, frame in ProcessingType.items():
    frame.to_csv(key+'.csv',index=False)

['recon' 'evgen' 'reprocessing' 'merge' 'simul' 'pile' 'pmerge'
 'eventIndex' 'deriv' 'overlay']


In [None]:
#full = pd.merge(benchmark, job, on='pandaid')
#full.head()
#full.to_csv('benchmark_job.csv',index=False)

In [None]:
#for ps in benchmarksPerSite:
#    print(ps, benchmarksPerSite[ps])

# List the sites that returned more than 1000 jobs but have no benchmark
# document at all.
for ps, site_jobs in jobsPerSite.items():
    if site_jobs > 1000 and ps not in benchmarksPerSite:
        print(ps, site_jobs)#,'       <<<<<<<<<<<<<')
    #else:
        #print(ps, jobsPerSite[ps],benchmarksPerSite[ps])

In [None]:
# Dump the per-site job counts collected during the job scan.
print (jobsPerSite)
#print (benchmarksPerSite)