## Script to aggregate job data percentiles from Elasticsearch and dump them to a JSON file



In [18]:
import json
import os
import numpy as np
import datetime
import calendar
import argparse
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch import VERSION
from collections import OrderedDict
from sys import stdout


def scan_es():
  """Query Elasticsearch for per-``processingtype`` percentiles and dump them to JSON.

  For every date range in ``dates`` and every entry of ``sec_fields`` one
  aggregation query (built by ``get_query``) is sent to the
  ``jobs_archive_*`` indices.  The raw responses are collected as
  ``out_dict[date_name][field]`` and written to ``es_scan.json``.  The file
  is rewritten after each date range, so it always holds everything
  gathered so far.

  Each field entry is a ``(name, script, filter_string)`` tuple that is
  passed straight to ``get_query``: ``name`` labels the aggregation,
  ``script`` is ``None`` or a script expression computing the value, and
  ``filter_string`` is a query-string filter selecting the documents to
  include.

  Progress is reported on stdout on a single line that is redrawn with a
  carriage return.
  """
  prim_field = 'processingtype'
  time_fields = [
    ('timeStageIn', None, "_exists_:timeStageIn AND NOT timeStageIn:<0"),
    ('timeStageOut', None, "_exists_:timeStageOut AND NOT timeStageOut:<0"),
    ('timeSetup', None, "_exists_:timeSetup AND NOT timeSetup:<0"),
    ('timeExe', None, "_exists_:timeExe AND NOT timeExe:<0"),
    ('timeGetJob', None, "_exists_:timeGetJob AND NOT timeGetJob:<0"),
    ('wall_time', None, "_exists_:wall_time AND NOT wall_time:<0"),
    ('walltime_times_cores', "(doc['actualcorecount'].value) ? doc['wall_time'].value*doc['actualcorecount'].value : doc['wall_time'].value", "_exists_:wall_time AND NOT wall_time:<0"),
    ('dbTime', None, "_exists_:dbTime AND NOT dbTime:<0"),
    ('cputimeperevent', "(doc['nevents'].value) ? doc['cpuconsumptiontime'].value/doc['nevents'].value : 0",
     "_exists_:cpuconsumptiontime"),
  ]
  mem_fields = [
    ('max_pss_per_core', "(doc['actualcorecount'].value) ? doc['maxpss'].value*1024/doc['actualcorecount'].value : doc['maxpss'].value*1024", "_exists_:maxpss AND NOT maxpss:<0"),
    # Disabled:
    # ('avgswap', "doc['avgswap'].value*1024", "_exists_:avgswap AND NOT avgswap:<0"),
  ]
  eff_fields = [
    ('cpu_eff', None, "_exists_:cpu_eff AND NOT cpu_eff:<0"),
    ('cpu_eff_per_core', "(doc['actualcorecount'].value) ? doc['cpu_eff'].value/doc['actualcorecount'].value : doc['cpu_eff'].value", "_exists_:cpu_eff AND NOT cpu_eff:<0"),
    ('cpueff_per_core_over_timeExe', "(doc['timeExe'].value && doc['actualcorecount'].value) ? doc['cpuconsumptiontime'].value/doc['actualcorecount'].value/doc['timeExe'].value:0",
     "_exists_:timeExe AND NOT timeExe:<0"),
    # Disabled alternative that reads a stored field instead of a script:
    # ('cpueff_per_core_over_timeExe', None, "_exists_:cpueff_per_core_over_timeExe AND NOT cpueff_per_core_over_timeExe:<0"),
  ]
  data_fields = [
    ('dbData', None, "_exists_:dbData AND NOT dbData:<0"),
    ('inputfilebytes', None, "_exists_:inputfilebytes AND NOT inputfilebytes:<0"),
    ('outputfilebytes', None, "_exists_:outputfilebytes AND NOT outputfilebytes:<0"),
    ('IObytesReadRate', None, "_exists_:IObytesReadRate AND NOT IObytesReadRate:<0"),
    ('IObytesWriteRate', None, "_exists_:IObytesWriteRate AND NOT IObytesWriteRate:<0"),
    ('IO_Intensity',"((doc['inputfilebytes'].value && doc['corecount'].value && doc['timeExe'].value ) ? (doc['inputfilebytes'].value+doc['outputfilebytes'].value)/((doc['timeStageIn'].value+doc['timeStageOut'].value+doc['timeExe'].value)*doc['corecount'].value) :   0) ",
     "_exists_:inputfilebytes AND NOT inputfilebytes:<0"),
  ]
  sec_fields = time_fields + mem_fields + eff_fields + data_fields
  indices = 'jobs_archive_*'

  # Each entry is (label, start string, end string).
  dates = [
    # Disabled full-year range:
    #GD    ('2017',)+get_date_range_year('2017'),
    # Combined range: start of 2017-11 joined with the end of 2018-01.
    ('2017-11_2018-01',)+get_date_range_month('2017-11')[0:1]+get_date_range_month('2018-01')[1:2],
    ('2017-11',)+get_date_range_month('2017-11'),
    ('2017-12',)+get_date_range_month('2017-12'),
    ('2018-01',)+get_date_range_month('2018-01'),
  ]
  # Disabled: one entry per month of 2017.
  #GD  for date in ['2017-{:02.0f}'.format(l) for l in range(1, 13)]:
  #GD    dates.append((date,)+get_date_range_month(date))

  es = Elasticsearch([{'host': 'atlas-kibana.mwt2.org', 'port': 9200, 'timeout': 300}])

  out_dict = {}

  # Width of the widest progress message written so far.
  shown_width = 0

  for date_name, start_date, end_date in dates:
    message = 'Running ES query for {}'.format(date_name)
    # '\r' only moves the cursor back; it does not erase the line.  Pad with
    # spaces so a shorter label fully overwrites a longer earlier one
    # (otherwise the output ends up as e.g. "2018-01_2018-01").
    stdout.write('\r' + message.ljust(shown_width))
    stdout.flush()
    shown_width = max(shown_width, len(message))

    out_dict[date_name] = {}
    # The time window is the same for every field of this date range.
    start_time = get_utc_timestamp(start_date)
    end_time = get_utc_timestamp(end_date)
    for field, script, filter_string in sec_fields:
      query = get_query(prim_field, field, script, filter_string, start_time = start_time, end_time = end_time)
      res = es.search(index = indices, body = query, size = 1)
      out_dict[date_name][field] = res
    # Rewrite the dump after every date range.
    with open('es_scan.json', 'w') as out_file:
      json.dump(out_dict, out_file)

  stdout.write('\n')

def get_query(primary_field, field, script, filter_string, start_time, end_time, percents=(50, 75, 95, 99)):
  """Build the Elasticsearch request body for one percentile aggregation.

  The query selects documents whose ``prodsourcelabel`` is "managed", whose
  ``jobstatus`` is "finished" and whose ``modificationtime`` lies between
  ``start_time`` and ``end_time`` (both inclusive).  The matches are
  bucketed by a terms aggregation on ``primary_field`` (up to 20 buckets,
  ordered by term, descending), and each bucket gets one percentiles
  sub-aggregation produced by ``get_agg_entry``.

  Args:
    primary_field: Field used for the terms aggregation; also its name.
    field: Name of the percentile sub-aggregation (and the document field
      to read when ``script`` is None).
    script: Script expression computing the value, or None to read ``field``.
    filter_string: Query-string filter added to the ``must`` clauses, or
      None to add a ``match_all`` clause instead.
    start_time: Lower bound of ``modificationtime`` in epoch seconds.
    end_time: Upper bound of ``modificationtime`` in epoch seconds.
    percents: Percentile ranks to compute.

  Returns:
    The request body as a dict.
  """
  # Copy, so the returned query never shares a list with the caller or with
  # the default value; mutating one query must not affect later ones.
  percents = list(percents)

  if filter_string is not None:
    extra_clause = get_filter_entry(filter_string)
  else:
    extra_clause = {"match_all": {}}

  query = {
    "query": {
      "bool": {
        "must": [
          {
            "match_phrase": {
              "prodsourcelabel": {
                "query": "managed"
              }
            }
          },
          {
            "match_phrase": {
              "jobstatus": {
                "query": "finished"
              }
            }
          },
          {
            "range": {
              "modificationtime": {
                "gte": start_time,
                "lte": end_time,
                "format": "epoch_second"
              }
            }
          },
          extra_clause
        ],
        "must_not": []
      }
    },
    "size": 0,
    "_source": {
      "excludes": []
    },
    "aggs": {
      primary_field: {
        "terms": {
          "field": primary_field,
          "size": 20,
          "order": {
            "_term": "desc"
          }
        },
        # Single percentile sub-aggregation per bucket.
        "aggs": get_agg_entry(field=field, script=script, percents=percents)
      }
    }
  }

  return query

def get_filter_entry(filter_string):
  """Wrap ``filter_string`` in a ``query_string`` clause.

  Wildcard analysis is switched on and lower-casing of expanded terms is
  switched off.
  """
  options = dict(
    query=filter_string,
    analyze_wildcard=True,
    lowercase_expanded_terms=False,
  )
  return {"query_string": options}

def get_agg_entry(field=None, script=None, percents=(50, 75, 95, 99)):
  """Build a percentiles aggregation entry named ``field``.

  Args:
    field: Name of the aggregation.  When ``script`` is None it is also the
      document field the percentiles are computed from.
    script: Optional script (language "expression") computing the value; when
      given it is used instead of reading ``field``.
    percents: Percentile ranks to compute.

  Returns:
    A one-entry dict ``{field: {"percentiles": {...}}}``.
  """
  percentiles = {
    # Always a fresh list: the result must not alias the default value or
    # the caller's sequence, or editing one entry would change others.
    "percents": list(percents),
    "keyed": False
  }

  if script is not None:
    percentiles["script"] = {
      "inline": script,
      "lang": "expression"
    }
  else:
    percentiles["field"] = field

  return {field: {"percentiles": percentiles}}

def get_utc_timestamp(date_string, format_string='%Y-%m-%dT%H:%M:%S'):
  """Convert ``date_string`` to Unix time, reading it as a UTC date.

  ``date_string`` is parsed with ``format_string``; the result is the number
  of seconds since the epoch as an integer.
  """
  parsed = datetime.datetime.strptime(date_string, format_string)
  # calendar.timegm treats the time tuple as UTC (no local-time conversion).
  return calendar.timegm(parsed.timetuple())

def get_duration_string(time_seconds):
  """Format a duration in seconds as ``H:MM:SS``.

  The duration is rounded to whole seconds once, before it is split into
  hours, minutes and seconds.  Rounding each component separately would let
  59.6 s come out as "0:00:60" instead of "0:01:00".  Negative durations are
  rendered as the magnitude with a leading "-".

  Args:
    time_seconds: Duration in seconds (int or float).

  Returns:
    The formatted string; hours are not zero-padded and may exceed 24.
  """
  total = int(round(time_seconds))
  sign = '-' if total < 0 else ''
  minutes, seconds = divmod(abs(total), 60)
  hours, minutes = divmod(minutes, 60)

  return '{}{:d}:{:02d}:{:02d}'.format(sign, hours, minutes, seconds)

def get_date_range_year(date_name):
  """Return ``(start, end)`` strings for the year ``date_name``.

  ``start`` is January 1st 00:00:00 and ``end`` is December 31st 23:59:59,
  both written as ``<year>-MM-DDTHH:MM:SS``.
  """
  templates = ('{}-01-01T00:00:00', '{}-12-31T23:59:59')
  return tuple(template.format(date_name) for template in templates)

def get_date_range_month(date_name, date_format='%Y-%m-%dT%H:%M:%S'):
  """Return ``(start, end)`` strings covering the month ``date_name``.

  ``start`` is the first second of the month and ``end`` is its last second
  (one second before the next month begins).  Both are rendered with
  ``date_format``.

  Args:
    date_name: Month as ``'YYYY-MM'``.
    date_format: ``strftime`` format used for both returned strings.

  Returns:
    A ``(start_string, end_string)`` tuple.

  Raises:
    ValueError: If ``date_name`` is not two integers separated by ``-`` or
      the month is outside 1..12.
  """
  year, month = [int(part) for part in date_name.split('-', 1)]
  # Build the dates directly instead of round-tripping through strings, so a
  # non-default date_format only affects the output.
  start_date = datetime.datetime(year, month, 1)
  # First instant of the following month; December rolls over into January.
  if month == 12:
    next_month_start = datetime.datetime(year + 1, 1, 1)
  else:
    next_month_start = datetime.datetime(year, month + 1, 1)
  ## Subtract one second
  end_date = next_month_start - datetime.timedelta(seconds=1)
  return start_date.strftime(date_format), end_date.strftime(date_format)

def get_date_range(in_string, date_format):
  """Parse ``in_string`` with ``date_format``; currently returns None.

  NOTE(review): this looks like an unfinished stub -- the parsed date is
  discarded, so the only observable effect is the ValueError raised by
  ``strptime`` when the string does not match the format.
  """
  datetime.datetime.strptime(in_string, date_format)
  return None




In [19]:
# Run the scan; scan_es writes the collected responses to es_scan.json.
scan_es()

Running ES query for 2018-01_2018-01


In [12]:
# Scratch check: tuple() applied to a string yields a tuple object.
type(tuple('a'))

tuple

In [17]:
# Scratch check: slicing a tuple with [1:2] gives a one-element tuple, which
# is what allows scan_es to concatenate slices of two date ranges with "+".
a=(1,2)
a[1:2]

(2,)