# Exports data as CSV. This data should classify tasks as CPU intensive, IO intensive, and generally bad. The data will be used to find optimal CPUs for each class.


In [23]:
import pickle
import re
import datetime
from elasticsearch import Elasticsearch, helpers

# Width, in days, of the look-back window passed to time_filter() below.
daysOfData = 10

In [12]:
# Client for the Elasticsearch cluster that holds the job archive.
es = Elasticsearch(
    [{'host': 'atlas-kibana.mwt2.org', 'port': 9200}],
    timeout=60,
)

# The cat API returns one index name per line; keep the non-empty names,
# sorted alphabetically.
indices = sorted(
    name
    for name in es.cat.indices(
        index="jobs_archive_20*", h="index", request_timeout=600
    ).split('\n')
    if name != ''
)
#for i in indices:
#    print(i)

def time_filter(indices, days=1, until=0):
    """Select the daily ``jobs_archive_<YYYY-MM-DD>`` indices inside a day window.

    An index is kept when its age in whole days, counted back from today's
    local date, satisfies ``until <= age < days + until``.

    Args:
        indices: Iterable of index names. Surrounding whitespace is ignored.
        days: Width of the window in days. ``0`` disables filtering.
        until: Age in days of the newest index to keep (``0`` means today).

    Returns:
        ``["jobs_archive_*"]`` when ``days`` is 0; otherwise the list of
        whitespace-stripped index names that fall inside the window, in
        input order. Names whose suffix is not a ``%Y-%m-%d`` date are
        skipped instead of raising.
    """
    if days == 0:
        return ["jobs_archive_*"]
    today = datetime.date.today()
    datefmt = '%Y-%m-%d'
    filtered = []
    for raw_name in indices:
        name = raw_name.strip()
        try:
            day = datetime.datetime.strptime(
                name.replace('jobs_archive_', ''), datefmt
            ).date()
        except ValueError:
            # Blank entry or an index that is not a plain daily archive
            # (extra suffix, different naming): it cannot be placed in the
            # window, so leave it out rather than abort the whole selection.
            continue
        if until <= (today - day).days < days + until:
            filtered.append(name)
    return filtered

# Restrict the search to the most recent daysOfData daily indices and build
# the comma-separated index list used by the search call.
selIndices = time_filter(indices, days=daysOfData)
ind = ",".join(selIndices)
print(ind)

jobs_archive_2017-03-13,jobs_archive_2017-03-14,jobs_archive_2017-03-15,jobs_archive_2017-03-16,jobs_archive_2017-03-17,jobs_archive_2017-03-18,jobs_archive_2017-03-19,jobs_archive_2017-03-20,jobs_archive_2017-03-21,jobs_archive_2017-03-22


### Aggregate per taskID and CPU type. Collect: average walltime, cputime, cores, input data, output data, execution time, stage-in time

In [25]:
# filter: jobstatus:finished NOT produsername:gangarbt
# aggregations on: jeditaskid, cpuconsumptionunit, ? processingtype, ? prodsourcelabel
# aggregate values (averages): corecount, wall_time, cpuconsumptiontime,
#                              inputfilebytes, hs06sec
#
# "size": 0 suppresses the individual hits; only the aggregation buckets are
# returned. Each terms aggregation is capped at 100 buckets, so at most 100
# task ids and 100 CPU types per task are reported.
my_query = {
    "size": 0,
    'query': {
        'bool': {
            'must': [
                {"term": {"jobstatus": "finished"}}
            ],
            'must_not': [
                {"term": {"produsername": "gangarbt"}}
            ]
        }
    },
    "aggs": {
        "tasks": {
            "terms": {"field": "jeditaskid", "size": 100},
            "aggs": {
                "cputype": {
                    "terms": {"field": "cpuconsumptionunit", "size": 100},
                    "aggs": {
                        "cputime": {"avg": {"field": "cpuconsumptiontime"}},
                        "cores": {"avg": {"field": "corecount"}},
                        "walltime": {"avg": {"field": "wall_time"}},
                        "inputfilebytes": {"avg": {"field": "inputfilebytes"}},
                        # Average the hs06sec field itself; this entry used to
                        # repeat inputfilebytes, so both columns were identical.
                        "hs06sec": {"avg": {"field": "hs06sec"}}
                    }
                }
            }
        }
    }
}


# Run the aggregation over the selected indices and keep the per-task
# terms aggregation (its 'buckets' list is consumed further down).
res = es.search(
    index=ind,
    body=my_query,
    request_timeout=12000,
)
r = res['aggregations']['tasks']

In [55]:
def getCPUinfo(c):
    """Split a CPU description string into model name, cache size and clock.

    Args:
        c: CPU description, e.g.
            ``"Intel(R) Xeon(R) CPU E5-2640 0 @ 2.50GHz 15360 KB"``.

    Returns:
        Tuple ``(model, cache, clock)``:
        ``model`` is the description with the decorations (``" CPU"``,
        ``" @"``, ``(R)``, ``(TM)``, ``(tm)``, ``"Processor "``) and the
        parsed trailing tokens removed, words joined by single spaces;
        ``cache`` is the integer that precedes a trailing ``KB`` token,
        or 0 when there is none;
        ``clock`` is the number in a trailing ``...GHz`` token, or 0 when
        there is none.
    """
    # Same removals, in the same order, as before.
    for noise in (" CPU", " @", "(R)", "(TM)", "(tm)", "Processor "):
        c = c.replace(noise, "")

    # Splitting on whitespace runs *after* the removals, so gaps left behind
    # by a removed token never end up as empty words in the model name.
    w = c.split()
    cache = 0
    clock = 0

    # A cache size needs two tokens ("<n> KB"); the length guard avoids
    # indexing past the start of a short description.
    if len(w) >= 2 and w[-1] == 'KB':
        try:
            cache = int(w[-2])
        except ValueError:
            pass  # not a number: keep both tokens as part of the model name
        else:
            w = w[:-2]

    # w may be empty here when the description held nothing but the cache.
    if w and "GHz" in w[-1]:
        try:
            clock = float(w[-1].replace('GHz', ''))
        except ValueError:
            pass  # mentions GHz but carries no number: keep it in the name
        else:
            w = w[:-1]

    return (' '.join(w), cache, clock)

In [56]:
# One record per (task, CPU type) bucket, later pickled to disk.
allData = []
# Number of (task, CPU type) buckets in which each normalized CPU model occurs.
uniqueCPU = {}

for b in r['buckets']:
    # Task id and the number of jobs in the task bucket.
    print(b['key'], b['doc_count'])
    cpus = b['cputype']['buckets']
    for c in cpus:
        (cpu, cache, clock) = getCPUinfo(c['key'])

        uniqueCPU[cpu] = uniqueCPU.get(cpu, 0) + 1

        # Print the raw CPU string of *this* bucket. The previous code printed
        # CPUstr, a name that is never assigned in this file.
        print(c['doc_count'], c['key'], c['cputime']['value'], c['walltime']['value'],
              c['inputfilebytes']['value'], c['hs06sec']['value'], c['cores']['value'])
        allData.append({
            'taskid': b['key'], 'tjobs': b['doc_count'],
            'cpu': cpu, 'cache': cache, 'clock': clock,
            'cputime': c['cputime']['value'], 'walltime': c['walltime']['value'],
            'jobs': c['doc_count'], 'cores': c['cores']['value'], 'hs06sec': c['hs06sec']['value'],
        })

for uc in uniqueCPU:
    print(uc, uniqueCPU[uc])

10903566 116583
10637 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 49454.21867067782 52103.063457741846 0.0 0.0 1.0
9612 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 48767.5836454432 53143.68778610071 0.0 0.0 1.0
7441 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 36907.09985217041 38366.0956860637 0.0 0.0 1.0
7224 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 45540.916251384275 46688.81243078627 0.0 0.0 1.0
6547 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 63153.82144493661 70414.91538109057 0.0 0.0 1.0
6212 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 43726.06825499034 46543.714906632325 0.0 0.0 1.0
4931 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 41730.16588927195 45030.562563374566 0.0 0.0 1.0
4366 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 55229.126889601466 58032.177049931284 0.0 0.0 1.0
4202 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 46186.27486910995 48731.75392670157 0.0 0.0 1.0
3438 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 59599.004363001746 64088.29726585224 0.0 0.0 1.0
3387 s+Intel Xeon E5-2640 0 2.50GHz 15360 KB 508

In [54]:
# Persist the collected records. The context manager closes (and flushes)
# the file even if pickling fails part-way through.
with open("CPU.pickle", "wb") as f:
    pickle.dump(allData, f)