In [11]:
#These lines set up inline plotting, and apply a standard size
%matplotlib inline
import matplotlib
matplotlib.rc('font', **{'size': 22})
# Standard includes
import datetime
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import matplotlib.pyplot as plt
from time import time
import timeit
import numpy as np
import pickle

In [4]:
# Task query: a bool/should search with a single term clause on the request id.
# Alternative selector kept for reference:
#   {"term": {"transhome": "AthDerivation-21.2.23.0"}}
requestClause = {"term": {"reqid": "18663"}}
taskQuery = {"query": {"bool": {"should": [requestClause]}}}

In [5]:
# Connect to the Elasticsearch instance and start the scan over the task index
es = Elasticsearch(['atlas-kibana-dev.mwt2.org'], timeout=120)

# Tasks: the scan result is iterated (once) by the cell that matches tasks to trains
taskIndex = "tasks*"
tasks = scan(
    es,
    query=taskQuery,
    index=taskIndex,
    scroll='5m',
    timeout="5m",
    size=1000,
)

In [6]:
# Job quantities to collect for every train (order is preserved from the original list)
quantities = [
    "cpuconsumptiontime", "nevents",
    "starttime", "endtime",
    "timeExe", "timeSetup", "timeGetJob", "timeStageIn", "timeStageOut",
    "actualcorecount", "wall_time",
    "inputfilebytes", "outputfilebytes",
    "Max_PSS_per_core",
]

# used for mc16e validation
# One list of output formats per train.
trainFormats = [
    ['DAOD_STDM5', 'DAOD_EGAM4', 'DAOD_EGAM2', 'DAOD_EXOT12', 'DAOD_EXOT9'],
    ['DAOD_SUSY12', 'DAOD_STDM3', 'DAOD_EXOT15', 'DAOD_JETM3', 'DAOD_EXOT19',
     'DAOD_HIGG4D6', 'DAOD_HIGG6D1', 'DAOD_HIGG1D1'],
    ['DAOD_EXOT22', 'DAOD_SUSY4', 'DAOD_JETM11', 'DAOD_EXOT21', 'DAOD_STDM7',
     'DAOD_SUSY8', 'DAOD_SUSY10'],
    ['DAOD_HIGG2D1'],
    ['DAOD_JETM9', 'DAOD_STDM4', 'DAOD_FTAG4'],
    ['DAOD_MUON2', 'DAOD_HIGG4D4', 'DAOD_JETM7', 'DAOD_BPHY7', 'DAOD_EXOT17',
     'DAOD_BPHY5', 'DAOD_EGAM7', 'DAOD_HIGG1D2'],
    ['DAOD_STDM2', 'DAOD_SUSY18', 'DAOD_EXOT3', 'DAOD_EGAM1', 'DAOD_EGAM5',
     'DAOD_EXOT2', 'DAOD_SUSY3', 'DAOD_EXOT5', 'DAOD_HIGG6D2'],
    ['DAOD_JETM12', 'DAOD_EGAM3', 'DAOD_JETM10'],
    ['DAOD_TOPQ1'],
    ['DAOD_TCAL1', 'DAOD_EXOT10', 'DAOD_HIGG2D5'],
    ['DAOD_SUSY1'],
    ['DAOD_SUSY9'],
    ['DAOD_EXOT13', 'DAOD_SUSY5', 'DAOD_SUSY7', 'DAOD_EXOT8', 'DAOD_EXOT4',
     'DAOD_HIGG4D2'],
    ['DAOD_STDM5', 'DAOD_EGAM4', 'DAOD_EGAM2', 'DAOD_EXOT12', 'DAOD_SUSY9',
     'DAOD_EXOT9'],
    ['DAOD_TAUP1', 'DAOD_HIGG4D5', 'DAOD_TOPQ5', 'DAOD_JETM4', 'DAOD_HIGG4D3',
     'DAOD_SUSY16', 'DAOD_EXOT7'],
    ['DAOD_HIGG8D1', 'DAOD_JETM6', 'DAOD_MUON1', 'DAOD_SUSY6', 'DAOD_JETM1',
     'DAOD_MUON0', 'DAOD_TAUP3'],
    ['DAOD_EGAM9', 'DAOD_EXOT20', 'DAOD_SUSY11', 'DAOD_EXOT6', 'DAOD_SUSY2',
     'DAOD_HIGG4D1', 'DAOD_BPHY1', 'DAOD_BPHY4'],
]

# Each entry is (output formats, task IDs, collected quantities); the ID list and
# the quantity dict start empty and every train gets its own fresh pair.
trainsAndIDs = [(formats, [], {}) for formats in trainFormats]


In [7]:
# Attach each task ID to every train whose output formats equal the task's
# output formats (compared as sets, so order and repeats are ignored)
start_time = timeit.default_timer()
matchingTasks = 0
for hit in tasks:
    source = hit['_source']
    if 'output_formats' not in source:
        continue  # tasks without an output format list cannot match any train
    taskFormats = set(source['output_formats'])
    for formats, taskIds, _ in trainsAndIDs:
        if taskFormats != set(formats):
            continue
        taskIds.append(hit['_id'])
        matchingTasks += 1
print("Total matching tasks = ",matchingTasks)
print("Time to extract information = ",timeit.default_timer() - start_time)

Total matching tasks =  51
Time to extract information =  0.0421378742903471


In [8]:
# Set up query for the jobs relevant to the trains
start_time = timeit.default_timer()
taskCounter = 0
jobIndex = "jobs_archive_2018*"

# Fields requested from (and stored for) every job record.
# Build an independent list: the previous `to_read = quantities` only created an
# alias, so the appends below mutated `quantities` itself and every re-run of this
# cell added duplicate field names. The membership check also keeps the list clean
# if `quantities` was already extended by an earlier run in the same kernel.
to_read = list(quantities)
for extraField in ('jeditaskid', 'transformation'):
    if extraField not in to_read:
        to_read.append(extraField)

for item in trainsAndIDs: # Loop over trains
    collected = item[2] # per-train dict: quantity name -> list of values, one per job
    for theId in item[1]: # Loop over tasks for that train
        taskCounter += 1
        jobQuery = {
            "_source": to_read,
            "query": {
                "bool": {
                    "must": [
                        {"term":{"jeditaskid": theId}},
                        {"term":{"jobstatus": "finished"}}
                    ]
                }
            }
        }
        # query the jobs
        jobs = scan(es, query=jobQuery, index=jobIndex, scroll='5m', timeout="5m", size=1000)
        for res in jobs: # Loop over jobs from that task
            source = res['_source']
            # Iterate over to_read (not quantities) so that jeditaskid and
            # transformation are still stored, exactly as the aliased list did.
            for quantity in to_read:
                if quantity in source: # fields missing from a job record are skipped
                    collected.setdefault(quantity, []).append(source[quantity])
                    

In [12]:
# Save the per-train information
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving an open handle for the garbage collector.
with open("mc_request18663.p", "wb") as pickleFile:
    pickle.dump(trainsAndIDs, pickleFile)