In [1]:
import calendar
import datetime

def convert_to_unix_time(time_string, time_format = "%Y-%m-%dT%H:%M:%S", time_delta_hours = 0):
    """Convert a formatted timestamp string into a unix time (integer seconds).

    Args:
        time_string: Timestamp text to parse.
        time_format: ``strptime`` format describing ``time_string``.
        time_delta_hours: Number of hours ADDED to the parsed time before the
            conversion. It is meant to bring a non-UTC input back to UTC, so
            for an input expressed in UTC+1 pass ``-1``.

    Returns:
        Seconds since 1970-01-01T00:00:00, with the shifted time interpreted
        as UTC.

    Raises:
        ValueError: if ``time_string`` does not match ``time_format``.
    """
    parsed = datetime.datetime.strptime(time_string, time_format)
    # Apply the caller-supplied correction towards UTC.
    shifted = parsed + datetime.timedelta(hours = time_delta_hours)
    # timegm treats the broken-down time as UTC (unlike time.mktime, which
    # would apply the local timezone).
    return calendar.timegm(shifted.timetuple())

# Leftover debug marker: its only effect is to print 'blub' as this cell's output.
print ('blub')

blub


In [2]:
from tables import *

class JobEntry(IsDescription):
    """Row layout of the HDF5 table that stores one record per job.

    String columns are fixed-width; the argument is the maximum stored size.
    All numeric columns are 32-bit signed integers.
    """
    # --- fixed-width string columns ---
    computingsite = StringCol(100)
    jobstatus = StringCol(20)
    proddblock = StringCol(200)
    produsername = StringCol(100)

    # --- integer identifiers / durations ---
    jeditaskid = Int32Col()
    queue_time = Int32Col()

    # --- job timestamps, stored as integers ---
    creationtime = Int32Col()
    starttime = Int32Col()
    endtime = Int32Col()

In [3]:
import os
from elasticsearch import Elasticsearch
from elasticsearch import VERSION
from collections import OrderedDict
from elasticsearch import helpers
import time

# Client for the Elasticsearch node that serves the jobs_archive_* indices.
es = Elasticsearch([{'host':'atlas-kibana.mwt2.org', 'port':9200, 'timeout':200}])

# Timestamp-valued fields; kept in their own list because they need a
# string -> integer conversion before being stored.
timeNames = ['creationtime', 'starttime', 'endtime']

# Every field requested from Elasticsearch (plain fields first, then timestamps).
fieldNames = [
    'computingsite',
    'proddblock',
    'produsername',
    'jeditaskid',
    'jobstatus',
    'queue_time',
] + timeNames

# Search body: only documents whose prodsourcelabel matches "user", and only
# the fields listed above are returned in each hit's _source.
usrc = {
    "query": {"bool": {"must": [{"match": {"prodsourcelabel": "user"}}]}},
    "_source": fieldNames,
}

# Month prefixes (YYYY-MM) selecting which jobs_archive_* indices get exported.
# Only the uncommented entries are active; uncomment a line to include that month.
# NOTE(review): the recorded output of this cell lists 2017-08 indices, so it was
# presumably produced with "2017-08" active rather than the selection below -- confirm.
monthPrefixes = [
#    "2016-08",
#    "2016-09",
#    "2016-10",
#    "2016-11",
#    "2016-12",
#    "2017-01",
#    "2017-02",
#    "2017-03",
#    "2017-04",
#    "2017-05",
#    "2017-06",
#    "2017-07",
#    "2017-08",
    "2017-09"
]

# Fetch the names of all jobs_archive_* indices; the cat API answers with
# plain text, one index name per line.
indices = es.cat.indices(index="jobs_archive_*", h="index", request_timeout=600).split('\n')

# Map every selected month prefix to the index names containing that prefix.
my_indices = {
    monthPrefix: [index_name for index_name in indices if monthPrefix in index_name]
    for monthPrefix in monthPrefixes
}

# Timing bookkeeping: time_write accumulates the time spent flushing tables,
# time_total holds the wall-clock start of the export.
time_write = 0
time_total = time.time()

print ('start')

# Maximum stored size of each string column; longer values are reported below.
_STRING_COLUMN_SIZES = (
    ('proddblock', 200),
    ('produsername', 100),
    ('computingsite', 100),
    ('jobstatus', 20),
)


def _prepare_job_record(source):
    """Turn one Elasticsearch ``_source`` dict into a storable record.

    Args:
        source: The ``_source`` mapping of a search hit. Fields that are
            missing or ``None`` are tolerated.

    Returns:
        A new dict holding a value for every requested field, or ``None``
        when the job must be skipped: no proddblock, a proddblock not starting
        with 'data' or 'mc', or one containing neither 'DAOD' nor 'NTUP'.
    """
    proddblock = source.get('proddblock')
    if not proddblock:
        return None
    if not proddblock.startswith(('data', 'mc')):
        return None
    if 'DAOD' not in proddblock and 'NTUP' not in proddblock:
        return None

    record = dict(source)
    # Keep only the part after the first ':' (whole string if there is none).
    record['proddblock'] = proddblock.split(':', 1)[-1]

    # Timestamps become unix times; missing/empty ones are stored as -1.
    for timeName in timeNames:
        raw_time = source.get(timeName)
        record[timeName] = convert_to_unix_time(raw_time) if raw_time else -1

    # Sentinels for missing numeric values.
    if source.get('queue_time') is None:
        record['queue_time'] = -1
    if source.get('jeditaskid') is None:
        record['jeditaskid'] = 0

    # Missing string values are stored as empty strings instead of failing
    # on len(None) or on assignment to the table row.
    for name in ('produsername', 'computingsite', 'jobstatus'):
        if source.get(name) is None:
            record[name] = ''

    return record


# Export: one HDF5 file per selected month, filled from every matching index.
for monthPrefix in my_indices.keys():
    filters = Filters(complevel=1, complib='lzo')
    # The with-block guarantees the file is closed even if scanning or
    # converting a document raises part-way through.
    with open_file("jobs_archive-{}.h5".format(monthPrefix),
                   mode = "w",
                   title = "jobs_archive_{}".format(monthPrefix),
                   filters = filters) as h5file:
        group = h5file.create_group("/", 'jobs', 'Jobs information')
        table = h5file.create_table(group, 'table', JobEntry, "Jobs table")
        job = table.row

        n_indices = len(my_indices[monthPrefix])

        for i, ind in enumerate(my_indices[monthPrefix]):
            print ('Processing ({}/{})'.format(i + 1, n_indices), ind)

            # Scroll through all matching documents of this index.
            res = helpers.scan(es, query=usrc, index=ind, size=10000)

            for hit in res:
                record = _prepare_job_record(hit['_source'])
                if record is None:
                    continue

                # Warn about values exceeding the column width.
                for name, max_size in _STRING_COLUMN_SIZES:
                    if len(record[name]) > max_size:
                        print(name + ' too long!')

                for fieldName in fieldNames:
                    job[fieldName] = record[fieldName]
                job.append()

            # Only the flush is counted as write time.
            flush_start = time.time()
            table.flush()
            time_write += time.time() - flush_start

print ('done')

time_total = time.time() - time_total
print ('total:', time_total)
print ('write:', time_write)

start
Processing (1/31) jobs_archive_2017-08-15
Processing (2/31) jobs_archive_2017-08-29
Processing (3/31) jobs_archive_2017-08-26
Processing (4/31) jobs_archive_2017-08-27
Processing (5/31) jobs_archive_2017-08-14
Processing (6/31) jobs_archive_2017-08-30
Processing (7/31) jobs_archive_2017-08-25
Processing (8/31) jobs_archive_2017-08-01
Processing (9/31) jobs_archive_2017-08-10
Processing (10/31) jobs_archive_2017-08-07
Processing (11/31) jobs_archive_2017-08-13
Processing (12/31) jobs_archive_2017-08-31
Processing (13/31) jobs_archive_2017-08-04
Processing (14/31) jobs_archive_2017-08-18
Processing (15/31) jobs_archive_2017-08-20
Processing (16/31) jobs_archive_2017-08-19
Processing (17/31) jobs_archive_2017-08-28
Processing (18/31) jobs_archive_2017-08-22
Processing (19/31) jobs_archive_2017-08-24
Processing (20/31) jobs_archive_2017-08-11
Processing (21/31) jobs_archive_2017-08-08
Processing (22/31) jobs_archive_2017-08-06
Processing (23/31) jobs_archive_2017-08-03
Processing (24