Checks percentage of time consuming queries (TEST)
====
This notebook checks whether the percentage of queries with high completion times (>**nsec** seconds), computed over consecutive windows of **interval** minutes, exceeds a given value (**percentlimit**) at any time during the selected period (the last **nhours** hours). If it does, a mail is sent to all the people subscribed to that alert. It is meant to be run every half an hour from a cron job (not yet set up).
In this way we can detect spikes that tend to cause server malfunctions.

In [77]:
from subscribers import subscribers
import alerts
import es_query

import datetime
import re
import json
import sys
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from elasticsearch.helpers import scan

### Variables for this script:
1. Maximum allowed percentage of queries consuming more than 10s with respect to the total number of queries. The alert goes off when it is surpassed
2. Time interval to calculate the percentage
3. Time period for the scan

In [78]:
# Alert threshold: highest tolerated share (in %) of slow queries within one window
percentlimit = 10
# A query counts as slow when its completion time exceeds this many seconds
nsec = 10
# Width, in minutes, of each window over which the share is evaluated
interval = 5
# How far back from the current time the scan reaches, in hours
nhours = 1

### Get starting and current time for query interval 

We need :
1. Current UTC time (as set in timestamp on ES DB)
2. Previous date stamp (**nhours** ago) obtained from a time delta

In order to subtract the time difference we need **ct** to be a datetime object

In [79]:
# Reference "now" in UTC (the ES DB timestamps are in UTC). It is kept as a
# datetime object so that a timedelta can be subtracted from it.

# To test against a fixed reference time, use these two lines instead:
#curtime = '20170126T120000.000Z'
#ct = datetime.datetime.strptime(curtime, "%Y%m%dT%H%M%S.%fZ")

ct = datetime.datetime.utcnow()

# The scan window starts nhours before the reference time
td = datetime.timedelta(hours=nhours)
st = ct - td

# Render both ends of the window with millisecond precision: %f yields
# microseconds, so the last three digits are dropped before appending 'Z'
starttime, curtime = (
    stamp.strftime('%Y%m%dT%H%M%S.%f')[:-3] + 'Z' for stamp in (st, ct)
)

# Name of the monthly index matching the reference time (informational only here)
ind = 'frontier-{:d}-{:02d}'.format(ct.year, ct.month)

print('INDEX: ',ind)
print('start time', starttime)
print('current time',curtime)


INDEX:  frontier-2017-10
start time 20171010T120458.698Z
current time 20171010T130458.698Z


### Establish connection to ES-DB and submit query

Send a query to the ES-DB to get the Frontier servers which served queries taking more than **nsec** seconds

In [80]:
# Connect to the Elasticsearch instance holding the Frontier records
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

myquery = es_query.es_query()

# Select only those Frontier servers having served queries which took more than
# the given time limit within [starttime, curtime].
# NOTE(review): nsec is multiplied by 1000, so 'querytime' is presumably stored
# in milliseconds -- confirm against the index mapping.
querytxt = myquery.setquery('querytime:>'+str(nsec*1000),starttime,curtime)
res_page = es.search(index='frontier-*', body=querytxt, request_timeout=600, scroll='2m')

sid = res_page['_scroll_id']
res_total = res_page['hits']['total']
print(res_total)
# Start from the total so that the first scroll request is issued whenever there
# is at least one hit; afterwards this holds the size of the last fetched page.
scroll_size = res_total

# Distinct server names, in order of first appearance
frontierservers = []
while True:
    # Harvest the server names of the current page (first page included)
    for hit in res_page['hits']['hits']:
        frontierserver = str(hit['_source']['frontierserver'])
        if frontierserver not in frontierservers:
            frontierservers.append(frontierserver)

    # An empty page (or no hits at all) means everything has been read
    if scroll_size <= 0:
        break

    print("Scrolling...")
    res_page = es.scroll(scroll_id = sid, scroll = '2m')
    # Update the scroll ID
    sid = res_page['_scroll_id']
    # Get the number of results that we returned in the last scroll
    scroll_size = len(res_page['hits']['hits'])
    print("scroll size: ", str(scroll_size))

# Release the server-side scroll context right away instead of leaving it open
# until the 2m keep-alive expires. This is best effort: a failure to clean up
# must not prevent the alert from being evaluated.
try:
    es.clear_scroll(scroll_id=sid)
except es_exceptions.TransportError as err:
    print('Could not clear scroll context: ', err)

print (frontierservers)

0
[]


### Query for intervals of some minutes during the whole search period

We get the percentages of queries taking more than **nsec** seconds in intervals of **interval** minutes for the **nhours** period

In [81]:
# Walk the scan period from st up to ct in consecutive windows of `interval`
# minutes and, for every Frontier server collected before, work out the share
# of queries that took longer than nsec seconds within each window.
td_min = datetime.timedelta(minutes=interval)
it = st
et = st + td_min
# percmat: server -> highest percentage (string with two decimals) that reached percentlimit
# timdict: server -> start time of the window in which that maximum was observed
percmat = {}
timdict = {}
while it < ct:
    itime = it.strftime('%Y%m%dT%H%M%S.%f')[:-3]+'Z'
    etime = et.strftime('%Y%m%dT%H%M%S.%f')[:-3]+'Z'

    for frserver in frontierservers:
        # Number of long queries served by this server in the window. Only
        # hits.total is read, so no scroll context is requested: asking for one
        # on every count would leave unused search contexts open on the cluster.
        querytxt = myquery.setquery('querytime:>'+str(nsec*1000)+' AND frontierserver:"'+frserver+'"',itime,etime)
        res_page = es.search(index='frontier-*', body=querytxt, request_timeout=600)
        res_lq = res_page['hits']['total']
        if res_lq <= 0:
            # No long queries in this window: the total is not needed
            continue

        # Total number of queries served by this server in the same window
        querytxt = myquery.setquery('frontierserver:"'+frserver+'"',itime,etime)
        res_page = es.search(index='frontier-*', body=querytxt, request_timeout=600)
        res_total = res_page['hits']['total']
        if res_total <= 0:
            # Defensive: avoid a division by zero should the two counts disagree
            continue

        perc = '{:02.2f}'.format(res_lq/res_total*100)
        # Compare as a float so that a non-integer percentlimit is honoured too
        if float(perc) < percentlimit:
            continue

        # Keep only the worst window seen so far for each server
        if frserver not in percmat or float(perc) > float(percmat[frserver]):
            percmat[frserver] = perc
            timdict[frserver] = it.strftime('%Y-%m-%d %H:%M:%S')

#            print('Percentage of long time queries ', perc, ' for frontier server ', frserver, ' at ',
#                  it.strftime('%Y-%m-%d %H:%M:%S'))

    # Advance to the next window
    it = et
    et = it + td_min


### Submit an alert if any server had a percentage of long-running queries beyond the established limit

Send the Frontier server name and the maximum percentage of long time queries observed for any given **interval** in minutes above the limit **percentlimit**

In [82]:
# Mail every immediate subscriber of the 'Long queries' test, but only when at
# least one server reached the limit (percmat is empty otherwise).
if percmat:
    S = subscribers()
    A = alerts.alerts()

    test_name = 'Long queries'
    users = S.get_immediate_subscribers(test_name)
    for user in users:
        # Greeting, one report line per offending server (maximum percentage
        # and the start time of the window it was seen in), then the sign-off
        # and the subscriber's preferences link.
        body = (
            'Dear ' + user.name + ',\n\n'
            + '\tthis mail is to let you know that the percentage of long time queries (>'
            + str(nsec) + 's) is\n\n'
            + ''.join(
                fkey + ' : ' + str(percmat[fkey]) + '%' + ' on ' + timdict[fkey] + ' UTC time\n'
                for fkey in percmat
            )
            + '\nBest regards,\nATLAS AAS'
            + '\n\n To change your alerts preferences please you the following link:\n'
            + user.link
        )
        A.sendMail(test_name, user.email, body)
        # NOTE: recording the alert through A.addAlert(test_name, user.name, ...)
        # is currently disabled.
