Checks operation of logstash instances (TEST)
====
This notebook checks if the logstash instances are processing events in real time. If that's not the case, there will be a delay resulting in no records for a given recent period of time.

In [72]:
from subscribers import subscribers
import alerts
import es_query

import datetime
import re
import json
import sys
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from elasticsearch.helpers import scan

### Variables for this script
1. Period being checked (**nhours**)
2. Period to get the potentially working logstash instances (**nhoursall**)

In [73]:
# Period to submit alarm if no records found (from now backwards)
nhours=1
# Period to get all previously running server instances (from now backwards)
nhoursall=3

### Get current and starting time for the check
The frontier index of the current year and month is being used

In [74]:
# Get current UTC time (as set in timestamp on ES DB)
# In order to subtract the time difference we need ct to be a datetime object
ct = datetime.datetime.utcnow()
ind = 'frontier-%d-%02d' % (ct.year, ct.month)
print(ind)
curtime = ct.strftime('%Y%m%dT%H%M%S.%f')[:-3]+'Z'

td = datetime.timedelta(hours=nhoursall)
st = ct - td
starttime = st.strftime('%Y%m%dT%H%M%S.%f')[:-3]+'Z'

print('start time', starttime)
print('current time',curtime)

frontier-2017-10
start time 20171010T121110.940Z
current time 20171010T151110.940Z


### Set the query to get potentially running logstash instances (one per server)
This period is establish as **nhoursall** since the current time

(Avoid the still existing records for the metrics)

In [75]:
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

myquery = es_query.es_query()
querytxt = myquery.setquery('NOT tags:metric',starttime,curtime)

res_page = es.search(index='frontier-*', body=querytxt, request_timeout=600, scroll='2m')

### First run for 10000 events (default) 
* Get frontier servers and timestamp of latest recorded entry from first search


In [76]:
ifrontiersrvr = {}

print (res_page['hits']['hits'][0]['_source']['@timestamp'])
timestmp = res_page['hits']['hits'][0]['_source']['@timestamp'] 
tstmp = datetime.datetime.strptime(timestmp,('%Y-%m-%dT%H:%M:%S.%f'+'Z'))

for i in range(len(res_page['hits']['hits'])):
   frontier = str(res_page['hits']['hits'][i]['_source']['frontierserver'])
   timestmp = res_page['hits']['hits'][i]['_source']['@timestamp'] 
   tstmp = datetime.datetime.strptime(timestmp,('%Y-%m-%dT%H:%M:%S.%f'+'Z'))
   if frontier not in ifrontiersrvr.keys() or tstmp > ifrontiersrvr[frontier]:
     ifrontiersrvr[frontier] = tstmp

print (ifrontiersrvr)

2017-10-10T12:11:47.080Z
{'aiatlas037.cern.ch': datetime.datetime(2017, 10, 10, 12, 28, 59, 943000), 'aiatlas038.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 1, 113000), 'frontier-atlas1.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 29, 6, 313000), 'frontier-atlas3.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 59, 110000), 'aiatlas073.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 2, 987000), 'ccosvms0014': datetime.datetime(2017, 10, 10, 12, 29, 5, 275000), 'frontier-atlas2.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 58, 507000), 'aiatlas149.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 5, 172000), 'aiatlas034.cern.ch': datetime.datetime(2017, 10, 10, 12, 24, 32, 819000)}


* Second search without those servers already found to limit the number of records

In [78]:
searchtxt = 'NOT tags:metric'
for frontier in frontiersrvr:
   searchtxt += ' AND NOT frontierserver:'+'"'+str(frontier)+'"'

querytxt = myquery.setquery(searchtxt,starttime,curtime)

res_page = es.search(index='frontier-*', body=querytxt, request_timeout=600, scroll='2m')

sid = res_page['_scroll_id']
res_total = res_page['hits']['total']
print(res_total)
scroll_size=res_page['hits']['total']

for i in range(len(res_page['hits']['hits'])):
   frontier = str(res_page['hits']['hits'][i]['_source']['frontierserver'])
   timestmp = res_page['hits']['hits'][i]['_source']['@timestamp'] 
   tstmp = datetime.datetime.strptime(timestmp,('%Y-%m-%dT%H:%M:%S.%f'+'Z'))
   if frontier not in frontiersrvr.keys() or tstmp > ifrontiersrvr[frontier]:
      ifrontiersrvr[frontier] = tstmp

# Start scrolling
while (scroll_size > 0):
  print("Scrolling...")
  res_page = es.scroll(scroll_id = sid, scroll = '2m')
  # Update the scroll ID
  sid = res_page['_scroll_id']
  # Get the number of results that we returned in the last scroll
  scroll_size = len(res_page['hits']['hits'])
  print("scroll size: ", str(scroll_size))
  # Do something with the obtained page

  for i in range(len(res_page['hits']['hits'])):
     frontier = str(res_page['hits']['hits'][i]['_source']['frontierserver'])
     timestmp = res_page['hits']['hits'][i]['_source']['@timestamp'] 
     tstmp = datetime.datetime.strptime(timestmp,('%Y-%m-%dT%H:%M:%S.%f'+'Z'))
     if frontier not in frontiersrvr.keys() or tstmp > ifrontiersrvr[frontier]:
        ifrontiersrvr[frontier] = tstmp

print(ifrontiersrvr)


88015
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  10000
Scrolling...
scroll size:  8015
Scrolling...
scroll size:  0
{'aiatlas037.cern.ch': datetime.datetime(2017, 10, 10, 12, 28, 59, 943000), 'aiatlas038.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 1, 113000), 'frontier-atlas1.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 29, 6, 313000), 'frontier-atlas3.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 59, 110000), 'aiatlas073.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 2, 987000), 'ccosvms0014': datetime.datetime(2017, 10, 10, 12, 29, 5, 275000), 'frontier-atlas2.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 58, 507000), 'aiatlas149.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 5, 172000), 'aiatlas148.cern.ch': datetime.datetime(2017, 10, 10, 15, 11, 2, 936000), 'aiatl

### Check if timestamp of last record is within the checking period
Cheking which logstash instances are still alive in the last period of **nhours** hours

In [79]:
td = datetime.timedelta(hours=nhours)
st = ct - td

ffrontiersrvr={}

for frontier in ifrontiersrvr.keys():
   if ifrontiersrvr[frontier] < st:
      ffrontiersrvr[frontier] = ifrontiersrvr[frontier]

print (len(ffrontiersrvr), ffrontiersrvr)

8 {'aiatlas037.cern.ch': datetime.datetime(2017, 10, 10, 12, 28, 59, 943000), 'aiatlas038.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 1, 113000), 'frontier-atlas1.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 29, 6, 313000), 'frontier-atlas3.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 59, 110000), 'aiatlas073.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 2, 987000), 'ccosvms0014': datetime.datetime(2017, 10, 10, 12, 29, 5, 275000), 'frontier-atlas2.lcg.triumf.ca': datetime.datetime(2017, 10, 10, 12, 28, 58, 507000), 'aiatlas149.cern.ch': datetime.datetime(2017, 10, 10, 12, 29, 5, 172000)}


In [None]:
if res_page['hits']['total'] > 0:
    S = subscribers()
    A = alerts.alerts()

    test_name = 'Long queries'
    users =  S.get_immediate_subscribers(test_name)
    for user in users:
        body = 'Dear ' + user.name +',\n\n'
        body += '\tthis mail is to let you know that the number of simultaneous threads went beyond '
        body += str(threadlimit) + ' on some servers \n\n' 
        for fkey in frontiersrvr:
          body += fkey
          body += ' : '
          body += str(frontiersrvr[fkey])
          body += '\n'
        body += '\nBest regards,\nATLAS AAS'
        body += '\n\n To change your alerts preferences please you the following link:\n' + user.link
        A.sendMail(test_name, user.email, body)
##        A.addAlert(test_name, user.name, str(res_page))