# Check if the packet loss data is abnormal

This notebook finds out all the links which have at least five packet loss measurements in the past one hour and the average value of the packet loss measurements is greater than 2%. It is run by a cron job every hour, and it will write the detailed information of every alarm into Elastic Search with the _index: alarms-year-month and _type: packetloss.

### import all the packages needed for this task

In [1]:
from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers
import sys
import datetime

### If needed to calculate packet loss for other time moment than "now", overwrite it bellow

In [2]:
cdt = datetime.datetime.utcnow()
#cdt = datetime.datetime(2017,1,21,9,0,0)

GT = (cdt - datetime.timedelta(hours=3)).strftime("%Y%m%dT%H%m%S+0000")
LT = cdt.strftime("%Y%m%dT%H%m%S+0000")
print('between: ', GT, ' and ', LT)

between:  20180320T010316+0000  and  20180320T040316+0000


### establish the Elastic Search connection

In [3]:
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

### define functions to write an alarm record into ES with detailed info

In [4]:
ipSite={}  # for mapping IP to Site name
toAlertOn=[]

def generate_doc(src_site_ip, dest_site_ip, measurements, avgpl):
   if src_site_ip not in ipSite:
        print('serious source mapping issue')
        return
   if dest_site_ip not in ipSite:
        print('serious destination mapping issue')
        return
   
   doc = {
        '_index':get_index_name(),
        '_type' : 'doc',
        'type' : 'packetloss',
        'src' : src_site_ip,
        'dest' : dest_site_ip,
        'srcSite' : ipSite[src_site_ip],
        'destSite' : ipSite[dest_site_ip],
        'alarmTime' : int( (cdt-datetime.datetime(1970,1,1) ).total_seconds() * 1000 ),
        'measurements' : measurements,
        'packetLossAvg' : avgpl
   }
   return doc

def get_index_name():
    date = cdt.strftime("%Y-%m")   # date format is yyyy-mm
    index_name = 'alarms-'+date
    return index_name



### get aggregated data for the past 2 hours
This query is composed of 3 parts: a) filter - takes only packet loss data, and production servers in last 1h. b) aggregation -  finds average packet loss per source and destination c) finds IP to site name mapping (both source and destination) 

In [5]:
query={
   "size": 0,
   "query": {
    "bool": {
      "must": [
        {"term": { "src_production" : True }},
        {"term": { "dest_production" : True }}
      ],
      "filter" : {
        "range" : {
          "timestamp" :{ "gt" : GT, "lt" : LT }
        }
      }
    }
   },
    "aggs" : {
      "src" : {
        "terms" : { "field" : "src", "size": 1000 },
        "aggs" : {
          "dest" : {
            "terms" : {"field" : "dest", "size": 1000},
            "aggs" : {
              "avgpl" : {
                "avg" :{
                  "field" : "packet_loss"
              }
            }
          }
        }
      }
    },
    "srcSites" : {
      "terms" : { "field" : "src", "size": 1000 },
        "aggs" : {
          "srcsitename" : {
            "terms" : { "field" : "src_site" }
        }
      }
    },
    "destSites" : {
      "terms" : { "field" : "dest", "size": 1000 },
        "aggs" : {
          "destsitename" : {
            "terms" : { "field" : "dest_site" }
        }
      }
    }
  }
}

#execute query
res = es.search(index="ps_packet_loss", body=query, request_timeout=120)
#print(res)

### proces IP to site name mapping data

In [6]:
srcsites=res['aggregations']['srcSites']['buckets']
print(srcsites)
for sS in srcsites:
   #print(sS)
   siteName=sS['srcsitename']['buckets']
   if len(siteName)==0:
      siteName='UnknownSite'
   else:
      siteName=siteName[0]['key']
   ipSite[sS['key']]=siteName

destsites=res['aggregations']['destSites']['buckets']
#print(destsites)
for dS in destsites:
   #print(dS)
   siteName=dS['destsitename']['buckets']
   if len(siteName)==0:
      siteName='UnknownSite'
   else:
      siteName=siteName[0]['key']
   ipSite[dS['key']]=siteName

print(ipSite)


[{'doc_count': 17101, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 17101, 'key': 'INFN-NAPOLI-ATLAS'}], 'doc_count_error_upper_bound': 0}, 'key': '90.147.67.252'}, {'doc_count': 14897, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 14897, 'key': 'UKI-SCOTGRID-GLASGOW'}], 'doc_count_error_upper_bound': 0}, 'key': '130.209.239.124'}, {'doc_count': 14368, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 14368, 'key': 'RAL-LCG2'}], 'doc_count_error_upper_bound': 0}, 'key': '130.246.176.109'}, {'doc_count': 14150, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 14150, 'key': 'DESY-HH'}], 'doc_count_error_upper_bound': 0}, 'key': '131.169.98.30'}, {'doc_count': 13966, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 13966, 'key': 'IN2P3-CC'}], 'doc_count_error_upper_bound': 0}, 'key': '193.48.99.76'}, {'doc_count': 13411, 'srcsitename': {'sum_other_doc_count': 0, 'buckets': [{'doc_count': 13

### process packet loss averages

In [7]:
src=res['aggregations']['src']['buckets']
#print(src)

for s in src:
   #print(s)
   source=s['key']
   for d in s['dest']['buckets']:
      destination=d['key']
      avgpl=d['avgpl']['value']
      docs=d['doc_count']
#      print(source, destination, docs, avgpl)
      if avgpl > 0.02 and docs > 4:
         toAlertOn.append(generate_doc(source, destination, docs, avgpl))

for alert in toAlertOn:
   print(alert)

{'_index': 'alarms-2018-03', 'alarmTime': 1521518476093, 'srcSite': 'INFN-NAPOLI-ATLAS', 'packetLossAvg': 1.0, '_type': 'doc', 'measurements': 120, 'type': 'packetloss', 'destSite': 'AGLT2', 'src': '90.147.67.252', 'dest': '192.41.230.59'}
{'_index': 'alarms-2018-03', 'alarmTime': 1521518476093, 'srcSite': 'UKI-SCOTGRID-GLASGOW', 'packetLossAvg': 0.8713450292397661, '_type': 'doc', 'measurements': 171, 'type': 'packetloss', 'destSite': 'INFN-ROMA1', 'src': '130.209.239.124', 'dest': '141.108.35.18'}
{'_index': 'alarms-2018-03', 'alarmTime': 1521518476093, 'srcSite': 'UKI-SCOTGRID-GLASGOW', 'packetLossAvg': 1.0, '_type': 'doc', 'measurements': 122, 'type': 'packetloss', 'destSite': 'AGLT2', 'src': '130.209.239.124', 'dest': '192.41.230.59'}
{'_index': 'alarms-2018-03', 'alarmTime': 1521518476093, 'srcSite': 'RAL-LCG2', 'packetLossAvg': 1.0, '_type': 'doc', 'measurements': 130, 'type': 'packetloss', 'destSite': 'AGLT2', 'src': '130.246.176.109', 'dest': '192.41.230.59'}
{'_index': 'alarm

### write alarms to Elasticsearch

In [8]:
try:
   res = helpers.bulk(es, toAlertOn, raise_on_exception=True,request_timeout=60)
   print("inserted:",res[0], '\tErrors:',res[1])
except es_exceptions.ConnectionError as e:
   print('ConnectionError ', e)
except es_exceptions.TransportError as e:
   print('TransportError ', e)
except helpers.BulkIndexError as e:
   print(e[0])
   for i in e[1]:
      print(i)
except:
   print('Something seriously wrong happened.')

inserted: 106 	Errors: []
