In [1]:
# Imports only: no inline-plotting or figure-size setup is visible in this cell
# Standard includes
from datetime import date,timedelta
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import matplotlib.pyplot as plt
import timeit
import numpy as np
import pandas as pd
import pickle

In [2]:
# Helper that accumulates values in the results dictionary
def addTo(theDict,jobType,variableName,variable):
    """Append `variable` to the list stored under the (jobType, variableName) key.

    The list is created the first time a key is seen. `theDict` is modified
    in place and nothing is returned.
    """
    theDict.setdefault((jobType, variableName), []).append(variable)

In [3]:
# Upper limit on the number of jobs processed for each job type
maxHits = 2000000
# Time window used by the queries: the nDays days leading up to today
nDays = 500
now = date.today()
then = now - timedelta(nDays)

In [4]:
# Index pattern(s) to search, comma-separated, and the ElasticSearch client
# (120 s request timeout) used for every query below
jobIndex = "jobs_archive_2018*,jobs_archive_2017*"
es = Elasticsearch(hosts=['atlas-kibana-dev.mwt2.org'], timeout=120)

In [5]:
# Quantities that every job document must provide for it to be analysed.
# Each field is listed exactly once.
quantities = [
    'nevents',
    'actualcorecount',
    'wall_time',
    'inputfilebytes',
    'outputfilebytes',
    'IObytesReadRate',
    'IObytesWriteRate',
]

# Fields requested from ElasticSearch via "_source".  This is a copy, so extra
# fields can be fetched without also becoming required quantities.
to_read = list(quantities)
# Uncomment to also fetch the fields used in the job selection:
#to_read.append('homepackage')
#to_read.append('jobstatus')
#to_read.append('transformation')

In [6]:
# Query for EVGEN: finished Generate_tf.py jobs from MCProd/19.2.5.12.2 that
# have no `eventservice` field and ended in [then, now).  Only output-side I/O
# quantities are required to be positive; inputfilebytes and IObytesReadRate
# are not constrained here.
jobQueryEVNT = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesWriteRate",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "MCProd/19.2.5.12.2"}},
                                {"term": {"transformation": "Generate_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [7]:
# Query for EVGEN MERGING: finished EVNTMerge_tf.py jobs from
# AtlasProduction/19.2.5.12 that have no `eventservice` field and ended in
# [then, now).
jobQueryEVNTMerge = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "AtlasProduction/19.2.5.12"}},
                                {"term": {"transformation": "EVNTMerge_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [8]:
# Query for FULL SIMULATION: finished Sim_tf.py jobs from AtlasOffline/21.0.15
# that have no `eventservice` field and ended in [then, now).
jobQueryFullSim = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "AtlasOffline/21.0.15"}},
                                {"term": {"transformation": "Sim_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [9]:
# Query for FAST SIMULATION: finished Sim_tf.py jobs from Athena/21.0.31 that
# have no `eventservice` field and ended in [then, now).
jobQueryFastSim = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "Athena/21.0.31"}},
                                {"term": {"transformation": "Sim_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [10]:
# Query for HITS MERGING (s3136): finished HITSMerge_tf.py jobs from
# AtlasOffline/21.0.15 that have no `eventservice` field and ended in
# [then, now).
jobQueryHitsMerge = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "AtlasOffline/21.0.15"}},
                                {"term": {"transformation": "HITSMerge_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [11]:
# Query for RECONSTRUCTION (MC digi+reco): finished Reco_tf.py jobs from
# AtlasOffline/21.0.20 with processingtype "pile" that have no `eventservice`
# field and ended in [then, now).
jobQueryRec_MCDigi = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "AtlasOffline/21.0.20"}},
                                {"term": {"transformation": "Reco_tf.py"}},
                                {"term": {"processingtype": "pile"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [12]:
# Query for RECONSTRUCTION (RAWtoESD/ESDtoDPD): finished Reco_tf.py jobs with
# processingtype "reprocessing" and inputfiletype "RAW", from Athena/21.0.53
# or Athena/21.0.54, that have no `eventservice` field and ended in
# [then, now).
jobQueryRec_RAWtoESD = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {
                                    "bool": {
                                        # either of the two releases is accepted
                                        "should": [
                                            {"term": {"homepackage": release}}
                                            for release in (
                                                "Athena/21.0.53",
                                                "Athena/21.0.54",
                                            )
                                        ]
                                    }
                                },
                                {"term": {"transformation": "Reco_tf.py"}},
                                {"term": {"processingtype": "reprocessing"}},
                                {"term": {"inputfiletype": "RAW"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [13]:
# Query for AOD/HIST merging + DRAW/DESD: finished jobs from Athena/21.0.51
# with processingtype "reprocessing" that have no `eventservice` field and
# ended in [then, now).  No `transformation` term is applied in this query.
jobQueryAODMerge = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "Athena/21.0.51"}},
                                {"term": {"processingtype": "reprocessing"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [14]:
# Query for DERIVATION: finished Reco_tf.py jobs from AthDerivation/21.2.34.0
# that have no `eventservice` field and ended in [then, now).
jobQueryDer = {
    "_source": to_read,
    "query": {
        "bool": {
            "must": (
                [{"range": {"endtime": {"gte": then, "lt": now}}}]
                # every listed quantity must be strictly positive
                + [
                    {"range": {field: {"gt": 0.0}}}
                    for field in (
                        "nevents",
                        "wall_time",
                        "IObytesReadRate",
                        "IObytesWriteRate",
                        "inputfilebytes",
                        "outputfilebytes",
                        "actualcorecount",
                    )
                ]
                + [
                    {
                        "bool": {
                            "must": [
                                {"term": {"homepackage": "AthDerivation/21.2.34.0"}},
                                {"term": {"transformation": "Reco_tf.py"}},
                                {"term": {"jobstatus": "finished"}},
                            ],
                            "must_not": {"exists": {"field": "eventservice"}},
                        }
                    }
                ]
            )
        }
    },
}

In [15]:
# Set up one scan of the DB per job type, all with the same scroll settings.
# The order of the queries here must match the order of the names they are
# unpacked into below.
allScrolls = [
    scan(es, query=jobQuery, index=jobIndex, scroll='5m', timeout="5m", size=10000)
    for jobQuery in (
        jobQueryEVNT,
        jobQueryEVNTMerge,
        jobQueryFullSim,
        jobQueryFastSim,
        jobQueryHitsMerge,
        jobQueryRec_MCDigi,
        jobQueryRec_RAWtoESD,
        jobQueryAODMerge,
        jobQueryDer,
    )
]
# Individual names for each scan, so later cells can tell them apart
(jobsEVNT,
 jobsEVNTMerge,
 jobsFullSim,
 jobsFastSim,
 jobsHitsMerge,
 jobsRec_MCDigi,
 jobsRec_RAWtoESD,
 jobsAODMerge,
 jobsDer) = allScrolls

In [16]:
# Results container, filled through addTo(): maps (jobType, variableName)
# keys to lists of per-job values
data = dict()

In [17]:
# Loop over the results
def _jobMetrics(source):
    """Derive the per-job values stored in `data` from one hit's `_source`.

    Returns a tuple of (variableName, value) pairs, or None when the job has
    to be skipped: a quantity is missing or null, or wall_time multiplied by
    actualcorecount is zero (it is the divisor of the two rate variables).
    """
    if any(item not in source for item in quantities):
        return None
    nevents = source['nevents']
    inputSize = source['inputfilebytes']
    outputSize = source['outputfilebytes']
    wallclock = source['wall_time']
    ioreadrate = source['IObytesReadRate']
    iowriterate = source['IObytesWriteRate']
    cores = source['actualcorecount']
    # actualcorecount is checked together with the other fields, so a null
    # value skips the job instead of raising in float()
    rawValues = (nevents, inputSize, outputSize, wallclock, ioreadrate, iowriterate, cores)
    if any(value is None for value in rawValues):
        return None
    wallclock = float(wallclock)
    cores = float(cores)
    normalisation = wallclock * cores
    if normalisation == 0.0:
        return None
    # Byte counts and byte rates are scaled by 1e-6 (bytes -> MB, going by
    # the field names)
    inputSize = float(inputSize) / 1000000.0
    outputSize = float(outputSize) / 1000000.0
    return (
        ("I/O intensity", (inputSize + outputSize) / normalisation),
        ("Event rate", float(nevents) / normalisation),
        ("IObytesReadRate", float(ioreadrate) / 1000000.0),
        ("IObytesWriteRate", float(iowriterate) / 1000000.0),
    )

# Label for each scan; matched by identity because the scan objects
# themselves carry no job-type name
_jobTypeLabels = [
    (jobsEVNT, "Event generation"),
    (jobsEVNTMerge, "EVNT merging"),
    (jobsFullSim, "Full simulation"),
    (jobsFastSim, "Fast simulation"),
    (jobsHitsMerge, "HITS merging"),
    (jobsRec_MCDigi, "DigiReco"),
    (jobsRec_RAWtoESD, "RAWtoESD->AOD,perfDPD"),
    (jobsAODMerge, "AOD/HISTMerge, DRAW/DESD"),
    (jobsDer, "Derivation"),
]

for jobs in allScrolls:
    start_time = timeit.default_timer()
    # An unrecognised scan gets its own label rather than silently inheriting
    # the label of the previous job type
    jobType = next((label for scroll, label in _jobTypeLabels if scroll is jobs), "Unknown")
    print(jobType)
    jobCounter = 0  # integer count of accepted jobs for this job type
    for res in jobs: # Loop over jobs from that task
        metrics = _jobMetrics(res['_source'])
        if metrics is None:
            continue
        for variableName, value in metrics:
            addTo(data, jobType, variableName, value)
        jobCounter += 1
        if jobCounter % 100000 == 0:
            print(jobCounter)
        # Stop reading this job type once the configured cap is reached
        if jobCounter == maxHits:
            break
    print(jobType, jobCounter)
    print("Time to extract information = ", timeit.default_timer() - start_time)

Event generation
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
1400000.0
1500000.0
1600000.0
1700000.0
1800000.0
1900000.0
2000000.0
Event generation 2000000.0
Time to extract information =  278.06806116364896
EVNT merging
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
1400000.0
1500000.0
1600000.0
1700000.0
EVNT merging 1799174.0
Time to extract information =  91.58892871811986
Full simulation
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
1400000.0
1500000.0
1600000.0
1700000.0
1800000.0
1900000.0
2000000.0
Full simulation 2000000.0
Time to extract information =  317.7027485985309
Fast simulation
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
1400000.0
1500000.0
1600000.0
1700000.0
1800000.0


In [18]:
# Save as pickle
# The context manager closes the file as soon as the dump is done, instead of
# leaving that to garbage collection of an anonymous file object.
with open("io_test.p", "wb") as pickleFile:
    pickle.dump(data, pickleFile)