# Send alert emails about packet loss based on alarms and user subscribing

This notebook is run by a cron job every hour, and its purpose is to send alert emails about packet loss for user specified site(s) based on alarms and user subscribing records. 

This notebook works following this procedure: 

(1) Get all the alarms of type packetloss for the past hour (call it NEW) and past past hour (call it OLD) from Elasticsearch

(2) Get the user subscribing records from Google Sheets calling APIs in subscribers.py

(3) Process the alarms data and subscribing data to make them easier to use for this monitoring task

(4) TN_old means total number of alarmed links involving a specific site ip (no matter from it or to it) for OLD time period

(5) TN_new means total number of alarmed links involving a specific site ip (no matter from it or to it) for NEW time period

(6) TN_delta means the change of value from TN_old to TN_new. We need to compare TN_delta v.s. +N and v.s. -N (tune N later)

(7) If a site ip never occurs in NEW and OLD, then it must be totally fine, and we do not care about it at all (TN_old == TN_new == TN_delta == 0)

(8) If a site ip occurs in NEW or OLD or both, then we may have TN_delta > 0 or == 0 or < 0 for this site ip, so we want to take a closer look at this site ip, so we do (9) (10) (11)

(9) If TN_delta >= +N, then overall the links connected to this site are becoming worse, so we send email

(10) If TN_delta <= -N, then overall the links connected to this site are becoming better, so we send email

(11) Otherwise, the overall status for this site is not changing or just changing slightly, so we do not send email

(12) In order to send email, we need a dictionary whose key is site ip and value is a list of relevant user emails



## Import necessary packages and classes

In [1]:
# Retrieve user subscribing records from google sheets.
from subscribers import subscribers
import alerts

S = subscribers()
A = alerts.alerts()

# Related to Elasticsearch queries
from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers
import datetime

# Regular Expression
import re

## Establish Elasticsearch connection

In [2]:
es = Elasticsearch(hosts=[{'host':'atlas-kibana.mwt2.org', 'port':9200}],timeout=60)

## Queries to find all the alarms of type Packet Loss for the past hour and past past hour

In [3]:
query_new = {
    "size": 1000,
    "query": {
        "bool": {
            "must": [
                {"term": { "type": "packetloss" }}
            ],
            "filter": {
                "range": {
                    "alarmTime": {
                        "gt": "now-3h"
                    }
                }
            }
        }
    }
}

query_old = {
    "size": 1000,
    "query": {
        "bool": {
            "must": [
                {"term": { "type": "packetloss" }}
            ],
            "filter": {
                "range": {
                    "alarmTime": {
                        "gt": "now-6h",
                        "lt": "now-3h"
                    }
                }
            }
        }
    }
}

print(query_new)
print(query_old)

{'query': {'bool': {'filter': {'range': {'alarmTime': {'gt': 'now-3h'}}}, 'must': [{'term': {'type': 'packetloss'}}]}}, 'size': 1000}
{'query': {'bool': {'filter': {'range': {'alarmTime': {'lt': 'now-3h', 'gt': 'now-6h'}}}, 'must': [{'term': {'type': 'packetloss'}}]}}, 'size': 1000}


## Execute the query

In [4]:
result_new = es.search(index='alarms', body=query_new, request_timeout=120)
print('Number of hits of new alarms:', result_new['hits']['total'] )

result_old = es.search(index='alarms', body=query_old, request_timeout=120)
print('Number of hits of old alarms:', result_old['hits']['total'] )

hits_new = result_new['hits']['hits']
hits_old = result_old['hits']['hits']

Number of hits of new alarms: 104
Number of hits of old alarms: 0


## Generate the two dictionaries for sites, one is from ip to name, one is from name to ip

In [5]:
site_ip_name = {}

for hit in hits_new:
    info = hit['_source']
    site_ip_name[info['src']] = info['srcSite']
    site_ip_name[info['dest']] = info['destSite']

for hit in hits_old:
    info = hit['_source']
    site_ip_name[info['src']] = info['srcSite']
    site_ip_name[info['dest']] = info['destSite']

print(site_ip_name)

{'129.93.5.165': 'Nebraska', '141.34.200.28': 'DESY-ZN', '192.41.236.31': 'AGLT2', '90.147.67.252': 'INFN-NAPOLI-ATLAS', '129.15.40.231': 'OU_OCHEP_SWT2', '192.111.108.112': 'Vanderbilt', '134.158.123.183': 'IN2P3-LPC', '192.41.230.59': 'AGLT2', '192.54.207.250': 'GRIF', '147.213.204.112': 'IEPSAS-Kosice', '192.41.236.35': 'UnknownSite', '138.253.60.82': 'UKI-NORTHGRID-LIV-HEP', '163.1.5.210': 'UKI-SOUTHGRID-OX-HEP', '129.215.213.70': 'UKI-SCOTGRID-ECDF', '193.48.99.76': 'IN2P3-CC', '192.5.207.251': 'BU_ATLAS_Tier2', '192.114.101.125': 'TECHNION-HEP', '206.12.154.60': 'CA-VICTORIA-WESTGRID-T2', '148.187.64.25': 'CSCS-LCG2', '192.170.227.163': 'UnknownSite', '131.243.24.11': 'UnknownSite', '192.68.51.219': 'UnknownSite', '134.219.225.13': 'UKI-LT2-RHUL', '192.231.127.41': 'Australia-ATLAS', '2a00:139c:5:4102::12': 'FZK-LCG2', '212.193.96.29': 'ru-PNPI', '134.158.159.85': 'GRIF', '129.107.255.29': 'UTA_SWT2', '194.36.11.38': 'UKI-LT2-QMUL', '134.158.20.192': 'IN2P3-CPPM', '131.111.66.196

In [6]:
site_name_ip = {}

for ip in site_ip_name:
    name = site_ip_name[ip]
    if name in site_name_ip:
        site_name_ip[name].append(ip)
    else:
        site_name_ip[name] = [ip]

print(site_name_ip)

{'UKI-SCOTGRID-ECDF': ['129.215.213.70'], 'UKI-SCOTGRID-GLASGOW': ['130.209.239.124'], 'DESY-HH': ['131.169.98.30'], 'CA-SCINET-T2': ['142.150.19.61'], 'UKI-LT2-QMUL': ['194.36.11.38'], 'BNL-ATLAS': ['192.12.15.111', '192.12.15.26'], 'UAM-LCG2': ['150.244.246.85'], 'RU-Protvino-IHEP': ['194.190.165.192'], 'LUCILLE': ['164.58.29.121'], 'UnknownSite': ['192.41.236.35', '192.170.227.163', '131.243.24.11', '192.68.51.219', '192.41.230.61', '149.165.224.247', '18.12.1.171', '129.107.255.26', '89.145.160.212'], 'UKI-NORTHGRID-LIV-HEP': ['138.253.60.82'], 'UKI-SOUTHGRID-CAM-HEP': ['131.111.66.196'], 'TECHNION-HEP': ['192.114.101.125'], 'CERN-PROD': ['128.142.223.247'], 'MWT2': ['149.165.225.223', '72.36.96.4', '192.170.227.160', '72.36.96.15'], 'UKI-LT2-RHUL': ['134.219.225.13'], 'OU_OCHEP_SWT2': ['129.15.40.231'], 'AGLT2': ['192.41.236.31', '192.41.230.59'], 'BU_ATLAS_Tier2': ['192.5.207.251'], 'pic': ['193.109.172.188'], 'DESY-ZN': ['141.34.200.28'], 'IN2P3-CPPM': ['134.158.20.192'], 'INFN-

## Calculate TN_old, the total number of alarmed links involving a specific site ip (either as source site or as destination site) for the OLD time period

In [7]:
TN_old = {}

def TN_old_add_one(ip):
    if ip in TN_old:
        TN_old[ip] += 1
    else:
        TN_old[ip] = 1

for alarm in hits_old:
    TN_old_add_one(alarm['_source']['src'])
    TN_old_add_one(alarm['_source']['dest'])

#TN_old

## Calculate TN_new, the total number of alarmed links involving a specific site ip (either as source site or as destination site) for the NEW time period

In [8]:
TN_new = {}

def TN_new_add_one(ip):
    if ip in TN_new:
        TN_new[ip] += 1
    else:
        TN_new[ip] = 1

for alarm in hits_new:
    TN_new_add_one(alarm['_source']['src'])
    TN_new_add_one(alarm['_source']['dest'])

#TN_new

## Calculate TN_delta, which is equal to ( TN_new - TN_old )

In [9]:
TN_delta = {}

for ip in TN_old:
    if ip in TN_new:
        TN_delta[ip] = TN_new[ip] - TN_old[ip]
    else:
        TN_delta[ip] = -TN_old[ip]

for ip in TN_new:
    if ip not in TN_old:
        TN_delta[ip] = TN_new[ip]

TN_delta

{'117.103.105.191': 1,
 '128.142.223.247': 2,
 '129.107.255.26': 10,
 '129.107.255.29': 1,
 '129.15.40.231': 5,
 '129.215.213.70': 2,
 '129.93.5.165': 2,
 '130.209.239.124': 2,
 '130.246.176.109': 3,
 '131.111.66.196': 3,
 '131.154.254.12': 1,
 '131.169.98.30': 2,
 '131.243.24.11': 3,
 '134.158.123.183': 2,
 '134.158.132.200': 2,
 '134.158.159.85': 3,
 '134.158.20.192': 1,
 '134.158.73.243': 1,
 '134.219.225.13': 3,
 '138.253.60.82': 3,
 '141.108.35.18': 14,
 '141.34.200.28': 2,
 '142.150.19.61': 1,
 '143.167.3.116': 1,
 '147.213.204.112': 16,
 '148.187.64.25': 3,
 '149.165.224.247': 3,
 '149.165.225.223': 3,
 '150.244.246.85': 1,
 '163.1.5.210': 2,
 '164.58.29.121': 1,
 '18.12.1.171': 2,
 '192.111.108.112': 2,
 '192.114.101.125': 3,
 '192.12.15.111': 3,
 '192.12.15.26': 1,
 '192.170.227.160': 1,
 '192.170.227.163': 3,
 '192.231.127.41': 3,
 '192.41.230.59': 32,
 '192.41.230.61': 11,
 '192.41.236.31': 1,
 '192.41.236.35': 9,
 '192.5.207.251': 2,
 '192.54.207.250': 1,
 '192.68.51.219': 

## Look at the distribution of TN_delta, so that we can tune the parameter N

In [10]:
for N in range(10):
    count_worse = 0
    count_better = 0
    count_stable = 0
    for ip in TN_delta:
        if TN_delta[ip] > N:
            count_worse += 1
        elif TN_delta[ip] < -N:
            count_better += 1
        else:
            count_stable += 1
    print('N=%d     links went bad=%d     links went good=%d     unchanged=%d' % (N, count_worse, count_better, count_stable))

N=0     links went bad=63     links went good=0     unchanged=0
N=1     links went bad=41     links went good=0     unchanged=22
N=2     links went bad=25     links went good=0     unchanged=38
N=3     links went bad=8     links went good=0     unchanged=55
N=4     links went bad=8     links went good=0     unchanged=55
N=5     links went bad=7     links went good=0     unchanged=56
N=6     links went bad=6     links went good=0     unchanged=57
N=7     links went bad=6     links went good=0     unchanged=57
N=8     links went bad=6     links went good=0     unchanged=57
N=9     links went bad=5     links went good=0     unchanged=58


## Let's use N=6 for now, and we will tune later

In [11]:
N = 6

ip_list_worse = []
ip_list_better = []

for ip in TN_delta:
    if TN_delta[ip] >= N:
        ip_list_worse.append(ip)
    elif TN_delta[ip] <= -N:
        ip_list_better.append(ip)

print('--- The ip of the site(s) which got worse:')
print(ip_list_worse)
print('--- The ip of the site(s) which got better:')
print(ip_list_better)

--- The ip of the site(s) which got worse:
['192.41.230.59', '147.213.204.112', '192.41.236.35', '89.145.160.212', '192.41.230.61', '141.108.35.18', '129.107.255.26']
--- The ip of the site(s) which got better:
[]


## Generate the dictionary: key = site name, value = a list of relevant user emails

In [12]:
user_interest_site_name = {}

def reg_user_interest_site_name(sitename, email):
    if sitename in user_interest_site_name:
        user_interest_site_name[sitename].append(email)
    else:
        user_interest_site_name[sitename] = [email]

test_name = 'PerfSONAR [Packet loss change for link(s) where your site is a source or destination]'
emailSubject = 'Significant change in the number of network paths with large packet loss where your subscribed site(s) are the source or destination'

users = S.get_immediate_subscribers(test_name)

# Handle blank answer, one site, several sites separated by comma, wildcard such as prefix* etc.
for user in users:
    sitenames = user.sites
    print(user.to_string(), sitenames)
    if len(sitenames) == 0:
        sitenames = ['.']  # Handle blank answer, so match all site names
    sitenames = [x.replace('*', '.') for x in sitenames]  # Handle several site names, and wildcard
    for sn in sitenames:
        p = re.compile(sn, re.IGNORECASE)
        for sitename in site_name_ip:
            if p.match(sitename):
                reg_user_interest_site_name(sitename, user)


user name:Ilija Vukotic  email:ilijav@gmail.com ['MWT2']
user name:  email:duncan.rand@imperial.ac.uk ['UKI*']


## Generate the dictionary: key = site ip, value = a list of relevant user emails

In [13]:
user_interest_site_ip = {}

def reg_user_interest_site_ip(siteip, email):
    if siteip in user_interest_site_ip:
        user_interest_site_ip[siteip].append(email)
    else:
        user_interest_site_ip[siteip] = [email]

for sitename in user_interest_site_name:
    for siteip in site_name_ip[sitename]:
        for user in user_interest_site_name[sitename]:
            reg_user_interest_site_ip(siteip, user)

print(user_interest_site_ip)

{'131.111.66.196': [<subscribers.user object at 0x7fbe55080668>], '192.170.227.160': [<subscribers.user object at 0x7fbe551738d0>], '194.80.35.169': [<subscribers.user object at 0x7fbe55080668>], '130.209.239.124': [<subscribers.user object at 0x7fbe55080668>], '138.253.60.82': [<subscribers.user object at 0x7fbe55080668>], '134.219.225.13': [<subscribers.user object at 0x7fbe55080668>], '72.36.96.15': [<subscribers.user object at 0x7fbe551738d0>], '143.167.3.116': [<subscribers.user object at 0x7fbe55080668>], '163.1.5.210': [<subscribers.user object at 0x7fbe55080668>], '72.36.96.4': [<subscribers.user object at 0x7fbe551738d0>], '129.215.213.70': [<subscribers.user object at 0x7fbe55080668>], '195.194.105.178': [<subscribers.user object at 0x7fbe55080668>], '194.36.11.38': [<subscribers.user object at 0x7fbe55080668>], '149.165.225.223': [<subscribers.user object at 0x7fbe551738d0>]}


## Generate info for sending alert emails (for the sites getting worse)

In [17]:
for ip in ip_list_worse:
    text = "The site %s (%s)'s network paths have worsened, the count of src-destination paths with packet-loss went from %d to %d.\n" % (site_ip_name[ip], ip, TN_old.get(ip,0), TN_new.get(ip,0))
    text += "These are all the problematic src-destination paths for the past hour:\n"
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if src_ip == ip:
            text += '    %s (%s)  --->  %s (%s) \n' % (site_ip_name[src_ip], src_ip, site_ip_name[dest_ip], dest_ip)
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if dest_ip == ip:
            text += '    %s (%s)  --->  %s (%s) \n' % (site_ip_name[src_ip], src_ip, site_ip_name[dest_ip], dest_ip)
    print(text)
    if ip not in user_interest_site_ip: continue
    for user in user_interest_site_ip[ip]:
        user.alerts.append(text)

The site AGLT2 (192.41.230.59)'s network paths have worsened, the count of src-destination paths with packet-loss went from 0 to 32.
These are all the problematic src-destination paths for the past hour:
    INFN-NAPOLI-ATLAS (90.147.67.252)  --->  AGLT2 (192.41.230.59) 
    UKI-SCOTGRID-GLASGOW (130.209.239.124)  --->  AGLT2 (192.41.230.59) 
    RAL-LCG2 (130.246.176.109)  --->  AGLT2 (192.41.230.59) 
    DESY-HH (131.169.98.30)  --->  AGLT2 (192.41.230.59) 
    IN2P3-CC (193.48.99.76)  --->  AGLT2 (192.41.230.59) 
    UKI-NORTHGRID-LIV-HEP (138.253.60.82)  --->  AGLT2 (192.41.230.59) 
    CERN-PROD (128.142.223.247)  --->  AGLT2 (192.41.230.59) 
    GRIF (134.158.159.85)  --->  AGLT2 (192.41.230.59) 
    RU-Protvino-IHEP (194.190.165.192)  --->  AGLT2 (192.41.230.59) 
    GRIF (192.54.207.250)  --->  AGLT2 (192.41.230.59) 
    UKI-SCOTGRID-ECDF (129.215.213.70)  --->  AGLT2 (192.41.230.59) 
    GRIF (134.158.132.200)  --->  AGLT2 (192.41.230.59) 
    UKI-SOUTHGRID-OX-HEP (163.1.5.210

## Generate info for sending alert emails (for the sites getting better)

In [18]:
for ip in ip_list_better:
    text = "The site %s (%s)'s network paths have improved, the count of src-destination paths with packet-loss went from %d to %d.\n" % (site_ip_name[ip], ip, TN_old.get(ip,0), TN_new.get(ip,0))
    wtext=""
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if src_ip == ip:
            text += '    %s (%s)  --->  %s (%s) \n' % (site_ip_name[src_ip], src_ip, site_ip_name[dest_ip], dest_ip)
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if dest_ip == ip:
            text += '    %s (%s)  --->  %s (%s) \n' % (site_ip_name[src_ip], src_ip, site_ip_name[dest_ip], dest_ip)   
    if len(wtext)>0:
        text += "These are the remaining problematic src-destination paths for the past hour:\n"
        text += wtext
#    print(text)
    for user in user_interest_site_ip[ip]:
        user.alerts.append(text)

# user_alert_all

## Send out alert email customized for each user

In [19]:
for user in users:
    if len(user.alerts)>0:
        body = 'Dear ' + user.name + ',\n\n'
        body = body + '\tThis mail is to let you know that there are significant changes in the number of paths with large packet-loss detected by perfSONAR for sites you requested alerting about.\n\n'
        for a in user.alerts:
            body = body + a + '\n'
   
        # Add in two items: 1) Where to go for more information and 2) who to contact to pursue fixing this   +SPM 20-Apr-2017
        body += '\n To get more information about this alert message and its interpretation, please visit:\n'
        body += '  http://twiki.opensciencegrid.org/bin/view/Documentation/NetworkingInOSG/PacketLossAlert\n'
        body += '\n If you suspect a network problem and wish to follow up on it please email the appropriate support list:\n'
        body += '     For OSG sites:  goc@opensciencegrid.org using Subject: Possible network issue\n'
        body += '     For WLCG sites:  wlcg-network-throughput@cern.ch using Subject: Possible network issue\n'
        body += ' Please include this alert email to help expedite your request for network debugging support.\n'
        body += '\n To change your alerts preferences please use the following link:\n' + user.link
        body += '\n\nBest regards,\nATLAS Networking Alert Service'
        #print(body)
        A.sendMail(emailSubject, user.email, body)
        A.addAlert(test_name, user.name,'change in packet loss')