# Check if the packet loss data is abnormal

This notebook finds out all the links which have at least five packet loss measurements in the past one hour and the average value of the packet loss measurements is greater than 2%. It is run by a cron job every hour, and it will write the detailed information of every alarm into Elastic Search with the _index: alarms-year.month and _type: packetloss.

### import all the packages needed for this task

In [1]:
from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers
import sys
import datetime

### establish the Elastic Search connection

In [2]:
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

### define functions to write an alarm record into ES with detailed info

In [3]:
ipSite={}  # for mapping IP to Site name
toAlertOn=[]

def generate_doc(src_site_ip, dest_site_ip, measurements, avgpl):
   if src_site_ip not in ipSite:
        print('serious source mapping issue')
        return
   if dest_site_ip not in ipSite:
        print('serious destination mapping issue')
        return

   doc = {
        '_index':get_index_name(),
        '_type' : 'packetloss',
        'src' : src_site_ip,
        'dest' : dest_site_ip,
        'srcSite' : ipSite[src_site_ip],
        'destSite' : ipSite[dest_site_ip],
        'alarmTime' : int( (datetime.datetime.utcnow()-datetime.datetime(1970,1,1) ).total_seconds() * 1000 ),
        'measurements' : measurements,
        'packetLossAvg' : avgpl
   }
   return doc

def get_index_name():
    date = datetime.datetime.utcnow().strftime("%Y.%m")   # date format is yyyy.mm
    index_name = 'alarms-'+date
    return index_name



### list all network\_weather* indices

In [4]:
indices = es.cat.indices(index="network_weather-*", h="index", request_timeout=600).split('\n')
indices = [x for x in indices if x != '']
indices = [x.strip() for x in indices]
#print(indices)

### find indices to be used

In [5]:

cday  = datetime.datetime.utcnow()
pday  = cday - datetime.timedelta(days=1)
ind1 = 'network_weather-%d.%02d.%02d' % (cday.year, cday.month, cday.day)
ind2 = 'network_weather-%d.%02d.%02d' % (pday.year, pday.month, pday.day)

print ('checking for indices:', ind1, ind2)

ind=[]
if ind1 in indices :
   ind.append(ind1)
if ind2 in indices and cday.hour<3:
   ind.append(ind2)

if len(ind)==0:
   print ('no current indices found. Aborting.')
   sys.exit(1)
else:
   print('will use indices:', ind)

checking for indices: network_weather_2-2016.8.22 network_weather_2-2016.8.21
will use indices: ['network_weather_2-2016.8.22']


### get aggregated data for the past one hour
This query is composed of 3 parts: a) filter - takes only packet loss data, and production servers in last 1h. b) aggregation -  finds average packet loss per source and destination c) finds IP to site name mapping (both source and destination) 

In [13]:
query={
   "size": 0,
   "query": {
    "bool": {
      "must": [
        {"term": { "_type" : "packet_loss_rate"}},
        {"term": { "srcProduction" : True }},
        {"term": { "destProduction" : True }}
      ],
      "filter" : {
        "range" : {
          "timestamp" : {
            "gt": "now-10h"
          }
        }
      }
    }
   },
    "aggs" : {
      "src" : {
        "terms" : { "field" : "src", "size": 1000 },
        "aggs" : {
          "dest" : {
            "terms" : {"field" : "dest", "size": 1000},
            "aggs" : {
              "avgpl" : {
                "avg" :{
                  "field" : "packet_loss"
              }
            }
          }
        }
      }
    },
    "srcSites" : {
      "terms" : { "field" : "src", "size": 1000 },
        "aggs" : {
          "srcsitename" : {
            "terms" : { "field" : "srcSite" }
        }
      }
    },
    "destSites" : {
      "terms" : { "field" : "dest", "size": 1000 },
        "aggs" : {
          "destsitename" : {
            "terms" : { "field" : "destSite" }
        }
      }
    }
  }
}

#execute query
res = es.search(index=ind, body=query, request_timeout=120)
#print(res)

### proces IP to site name mapping data

In [14]:
srcsites=res['aggregations']['srcSites']['buckets']
#print(srcsites)
for sS in srcsites:
   #print(sS)
   siteName=sS['srcsitename']['buckets']
   if len(siteName)==0:
      siteName='UnknownSite'
   else:
      siteName=siteName[0]['key']
   ipSite[sS['key']]=siteName

destsites=res['aggregations']['destSites']['buckets']
#print(destsites)
for dS in destsites:
   #print(dS)
   siteName=dS['destsitename']['buckets']
   if len(siteName)==0:
      siteName='UnknownSite'
   else:
      siteName=siteName[0]['key']
   ipSite[dS['key']]=siteName

print(ipSite)


{'129.107.255.26': 'UnknownSite', '198.124.80.201': 'UnknownSite', '131.154.254.12': 'INFN-T1', '130.246.176.109': 'RAL-LCG2', '192.54.207.250': 'GRIF', '131.225.205.12': 'UnknownSite', '62.40.126.129': 'UnknownSite', '161.116.81.235': 'UnknownSite', '192.108.47.12': 'FZK-LCG2', '134.158.20.192': 'IN2P3-CPPM', '192.12.15.111': 'BNL-ATLAS', '137.222.74.15': 'UKI-SOUTHGRID-BRIS-HEP', '147.213.204.117': 'IEPSAS-Kosice', '132.230.202.235': 'UNI-FREIBURG', '193.170.243.215': 'Hephy-Vienna', '192.5.207.251': 'BU_ATLAS_Tier2', '198.124.80.193': 'UnknownSite', '132.195.125.213': 'wuppertalprod', '131.243.24.11': 'UnknownSite', '195.194.105.178': 'UKI-NORTHGRID-MAN-HEP', '134.158.103.10': 'IN2P3-LAPP', '72.36.96.4': 'MWT2', '109.105.124.86': 'NDGF-T1', '158.195.14.26': 'FMPhI-UNIBA', '144.206.237.142': 'RRC-KI', '144.16.111.26': 'INDIACMS-TIFR', '129.215.213.70': 'UKI-SCOTGRID-ECDF', '131.111.66.196': 'UKI-SOUTHGRID-CAM-HEP', '72.36.96.15': 'MWT2', '212.191.227.174': 'CYFRONET-LCG2', '193.136.7

### process packet loss averages

In [15]:
src=res['aggregations']['src']['buckets']
#print(src)

for s in src:
   #print(s)
   source=s['key']
   for d in s['dest']['buckets']:
      destination=d['key']
      avgpl=d['avgpl']['value']
      docs=d['doc_count']
#      print(source, destination, docs, avgpl)
      if avgpl > 0.02 and docs > 4:
         toAlertOn.append(generate_doc(source, destination, docs, avgpl))

for alert in toAlertOn:
   print(alert)

{'_index': 'alarms-2016.08', 'packetLossAvg': 0.8952380952380954, 'destSite': 'INFN-ROMA1', 'alarmTime': 1471902109616, 'src': '195.194.105.178', 'dest': '141.108.35.18', '_type': 'packetloss', 'srcSite': 'UKI-NORTHGRID-MAN-HEP', 'measurements': 105}
{'_index': 'alarms-2016.08', 'packetLossAvg': 1.0, 'destSite': 'UnknownSite', 'alarmTime': 1471902109616, 'src': '195.194.105.178', 'dest': '161.116.81.235', '_type': 'packetloss', 'srcSite': 'UKI-NORTHGRID-MAN-HEP', 'measurements': 11}
{'_index': 'alarms-2016.08', 'packetLossAvg': 1.0, 'destSite': 'UnknownSite', 'alarmTime': 1471902109616, 'src': '141.34.200.28', 'dest': '161.116.81.235', '_type': 'packetloss', 'srcSite': 'DESY-ZN', 'measurements': 14}
{'_index': 'alarms-2016.08', 'packetLossAvg': 0.8923076923076925, 'destSite': 'INFN-ROMA1', 'alarmTime': 1471902109616, 'src': '193.146.75.138', 'dest': '141.108.35.18', '_type': 'packetloss', 'srcSite': 'IFCA-LCG2', 'measurements': 13}
{'_index': 'alarms-2016.08', 'packetLossAvg': 0.919999

### write alarms to Elasticsearch

In [16]:
try:
   res = helpers.bulk(es, toAlertOn, raise_on_exception=True,request_timeout=60)
   print("inserted:",res[0], '\tErrors:',res[1])
except es_exceptions.ConnectionError as e:
   print('ConnectionError ', e)
except es_exceptions.TransportError as e:
   print('TransportError ', e)
except helpers.BulkIndexError as e:
   print(e[0])
   for i in e[1]:
      print(i)
except:
   print('Something seriously wrong happened.')

inserted: 109 	Errors: []
