In [1]:
# Standard includes (this cell only imports modules; no plotting configuration is applied here)
from datetime import date,timedelta
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import matplotlib.pyplot as plt
import timeit
import numpy as np
import pandas as pd
import pickle

In [2]:
# Small helper for accumulating values in the results dictionary
def addTo(theDict,jobType,variableName,variable):
    """Append ``variable`` to the list stored under ``(jobType, variableName)``.

    The list is created on first use of a key, so callers never need to
    initialise entries themselves. ``theDict`` is modified in place and
    nothing is returned.
    """
    key = (jobType, variableName)
    # setdefault returns the existing list, or inserts (and returns) a new empty one
    theDict.setdefault(key, []).append(variable)

In [3]:
# Time (for setting range): the queries select jobs whose endtime lies in
# [then, now), i.e. the nDays days up to but not including today
nDays = 300
now = date.today()
then = now - timedelta(days=nDays)
# Set upper limit for number of jobs to process
# (the processing loop applies this limit separately to each job type)
maxHits = 5000000

In [4]:
# Define ElasticSearch and the relevant index
# timeout=120 is handed to the client constructor (presumably seconds -- confirm
# against the elasticsearch-py version in use).
es = Elasticsearch(['atlas-kibana-dev.mwt2.org'],timeout=120)
# Comma-separated index patterns: the 2018 and 2017 job archives
jobIndex = "jobs_archive_2018*,jobs_archive_2017*"

In [5]:
# Define the quantities for collection. Every job must provide all of these
# fields to be accepted by the processing loop, so each name is listed once.
quantities = [
    'nevents',
    'actualcorecount',
    'wall_time',
    'inputfilebytes',
    'outputfilebytes',
    'IObytesReadRate',
    'IObytesWriteRate',
]

# Fields requested from Elasticsearch (the "_source" filter of the queries).
# This is a copy, not an alias: extra fields appended here are fetched but do
# not become mandatory in the per-job presence check, which uses `quantities`.
to_read = list(quantities)
#to_read.append('homepackage')
#to_read.append('jobstatus')
#to_read.append('transformation')

In [6]:
# Set up query: AOD
# Selects jobs that ended in [then, now), have strictly positive values for
# every quantity used in the analysis, and match both the processing type
# "panda-client-0.5.96-jedi-run" and input file type "AOD".
# Only the fields listed in to_read are returned for each hit.
jobQueryAOD = {
    "_source": to_read,
    "query": {
        "bool":{
            "must": [
                {'range': {'endtime': {'gte': then, 'lt': now}}},
                {'range': {'nevents': {'gt': 0.0}}},
                {'range': {'wall_time': {'gt': 0.0}}},
                {'range': {'IObytesReadRate': {'gt': 0.0}}},
                {'range': {'IObytesWriteRate': {'gt': 0.0}}},
                {'range': {'inputfilebytes': {'gt': 0.0}}},
                {'range': {'outputfilebytes': {'gt': 0.0}}},
                {'range': {'actualcorecount': {'gt': 0.0}}},
                {
                    "bool": {
                        "must": [
                            {"term":{"processingtype": "panda-client-0.5.96-jedi-run"}},
                            {"term":{"inputfiletype": "AOD"}}
                        ]
                    }
                }
            ]
        }
    }
}

In [7]:
# Set up query: SUSY5
# Same selection as the AOD query (end time in [then, now), strictly positive
# values for every quantity used in the analysis, processing type
# "panda-client-0.5.96-jedi-run") except that the input file type must be
# "DAOD_SUSY5". Only the fields listed in to_read are returned for each hit.
jobQuerySUSY5 = {
    "_source": to_read,
    "query": {
        "bool":{
            "must": [
                {'range': {'endtime': {'gte': then, 'lt': now}}},
                {'range': {'nevents': {'gt': 0.0}}},
                {'range': {'wall_time': {'gt': 0.0}}},
                {'range': {'IObytesReadRate': {'gt': 0.0}}},
                {'range': {'IObytesWriteRate': {'gt': 0.0}}},
                {'range': {'inputfilebytes': {'gt': 0.0}}},
                {'range': {'outputfilebytes': {'gt': 0.0}}},
                {'range': {'actualcorecount': {'gt': 0.0}}},
                {
                    "bool": {
                        "must": [
                            {"term":{"processingtype": "panda-client-0.5.96-jedi-run"}},
                            {"term":{"inputfiletype": "DAOD_SUSY5"}}
                        ]
                    }
                }
            ]
        }
    }
}

In [8]:
# scan the DB: one scan per query, both against jobIndex and with the same
# scroll / timeout / page-size settings.
# NOTE(review): helpers.scan is expected to return a one-shot generator, so this
# cell presumably has to be re-run before the processing loop is re-run -- confirm.
jobsAOD = scan(es, query=jobQueryAOD, index=jobIndex, scroll='5m', timeout="5m", size=10000) 
jobsSUSY5 = scan(es, query=jobQuerySUSY5, index=jobIndex, scroll='5m', timeout="5m", size=10000) 
# The processing loop labels each entry by identity against jobsAOD / jobsSUSY5,
# so anything added here needs a matching label there.
allScrolls = [jobsAOD,jobsSUSY5]

In [9]:
# Dictionary to store results. It is filled through addTo, so keys are
# (jobType, variableName) tuples and each value is a list of collected values.
data = {}

In [10]:
# Walk through every scroll, derive the per-job metrics and collect them in `data`
jobType = ""
for jobs in allScrolls:
    start_time = timeit.default_timer()
    # The scrolls carry no label of their own, so recognise them by identity
    if jobs is jobsAOD:
        jobType = "AOD"
    if jobs is jobsSUSY5:
        jobType = "SUSY5"
    print(jobType)
    jobCounter = 0.0
    for res in jobs: # One hit per job
        source = res['_source']
        # Ignore jobs that lack any of the requested quantities
        if not all(item in source for item in quantities):
            continue
        cores, nevents, inputSize, outputSize, wallclock, ioreadrate, iowriterate = (
            source[field] for field in (
                'actualcorecount',
                'nevents',
                'inputfilebytes',
                'outputfilebytes',
                'wall_time',
                'IObytesReadRate',
                'IObytesWriteRate',
            )
        )
        # A field can be present but null; such jobs are ignored as well
        if any(value is None for value in (wallclock, inputSize, outputSize, nevents, ioreadrate, iowriterate, cores)):
            continue
        # Convert to floats; byte-valued fields are scaled down by 1e6
        # (bytes -> MB, going by the field names)
        scale = 1000000.0
        nevents = float(nevents)
        inputSize = float(inputSize)/scale
        outputSize = float(outputSize)/scale
        wallclock = float(wallclock)
        ioreadrate = float(ioreadrate)/scale
        iowriterate = float(iowriterate)/scale
        cores = float(cores)
        # Data volume moved per unit of (wall time x cores), and events per unit wall time
        ioIntensity = (inputSize+outputSize)/(wallclock*cores)
        eventRate = nevents/wallclock
        for variableName, variable in (
            ("I/O intensity", ioIntensity),
            ("Event rate", eventRate),
            ("IObytesReadRate", ioreadrate),
            ("IObytesWriteRate", iowriterate),
        ):
            addTo(data,jobType,variableName,variable)
        jobCounter += 1
        # Progress report every 100000 accepted jobs
        if jobCounter % 100000 == 0:
            print(jobCounter)
        # Stop once the per-job-type cap has been reached
        if jobCounter == maxHits:
            break
    print(jobType,jobCounter)
    print("Time to extract information = ",timeit.default_timer() - start_time)

AOD
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
AOD 1379514.0
Time to extract information =  110.30352593003772
SUSY5
100000.0
200000.0
300000.0
400000.0
500000.0
600000.0
700000.0
800000.0
900000.0
1000000.0
1100000.0
1200000.0
1300000.0
1400000.0
SUSY5 1412717.0
Time to extract information =  122.61857725982554


In [11]:
# Save as pickle. The context manager guarantees that the file is flushed and
# closed as soon as the dump finishes (or fails), instead of leaving the handle
# open until it happens to be garbage-collected.
with open("io_analysis.p","wb") as pickleFile:
    pickle.dump(data,pickleFile)