# Send alert emails about packet loss based on alarms and user subscribing

This notebook is run by a cron job every hour, and its purpose is to send alert emails about packet loss for user specified site(s) based on alarms and user subscribing records. 

This notebook works following this procedure: 

(1) Get all the alarms of type packetloss for the past hour (call it NEW) and past past hour (call it OLD) from Elasticsearch

(2) Get the user subscribing records from Google Sheets calling APIs in subscribers.py

(3) Process the alarms data and subscribing data to make them easier to use for this monitoring task

(4) TN_old means total number of alarmed links involving a specific site ip (no matter from it or to it) for OLD time period

(5) TN_new means total number of alarmed links involving a specific site ip (no matter from it or to it) for NEW time period

(6) TN_delta means the change of value from TN_old to TN_new. We need to compare TN_delta v.s. +N and v.s. -N (tune N later)

(7) If a site ip never occurs in NEW and OLD, then it must be totally fine, and we do not care about it at all (TN_old == TN_new == TN_delta == 0)

(8) If a site ip occurs in NEW or OLD or both, then we may have TN_delta > 0 or == 0 or < 0 for this site ip, so we want to take a closer look at this site ip, so we do (9) (10) (11)

(9) If TN_delta >= +N, then overall the links connected to this site are becoming worse, so we send email

(10) If TN_delta <= -N, then overall the links connected to this site are becoming better, so we send email

(11) Otherwise, the overall status for this site is not changing or just changing slightly, so we do not send email

(12) In order to send email, we need a dictionary whose key is site ip and value is a list of relevant user emails



## Import necessary packages and classes

In [1]:
# Retrieve user subscribing records from google sheets. Using Xinran version based on Ilija version.
from subscribers import subscribers
google = subscribers()

# Related to Elasticsearch queries
from elasticsearch import Elasticsearch, exceptions as es_exceptions, helpers
import datetime

# Regular Expression
import re

## Establish Elasticsearch connection

In [2]:
es = Elasticsearch(hosts=[{'host':'cl-analytics.mwt2.org', 'port':9200}],timeout=60)

## List all alarms-yyyy.mm indices

In [3]:
indices = es.cat.indices(index="alarms-*", h="index", request_timeout=600).split('\n')
indices = [x for x in indices if x != '']
indices = [x.strip() for x in indices]
print(indices)

['alarms-2016.10', 'alarms-2016.09', 'alarms-2016.08']


## Find indices to be used

In [4]:
cday = datetime.datetime.utcnow()
pday = cday - datetime.timedelta(days=1)
ind1 = 'alarms-%d.%02d' % (cday.year, cday.month)
ind2 = 'alarms-%d.%02d' % (pday.year, pday.month)

print('checking for indices:', ind1, ind2)

ind=[]
if ind1 in indices:
    ind.append(ind1)
if ind2 != ind1 and ind2 in indices and cday.hour<3:   # not necessarily 3, just indicate it is the beginning period of new day
    ind.append(ind2)

if len(ind)==0:
    print('no current indices found. Aborting.')
    exit
else:
    print('will use indices:', ind)

checking for indices: alarms-2016.10 alarms-2016.10
will use indices: ['alarms-2016.10']


## Queries to find all the alarms of type Packet Loss for the past hour and past past hour

In [5]:
query_new = {
    "size": 1000,
    "query": {
        "bool": {
            "must": [
                {"term": { "_type": "packetloss" }}
            ],
            "filter": {
                "range": {
                    "alarmTime": {
                        "gt": "now-1h"
                    }
                }
            }
        }
    }
}

query_old = {
    "size": 1000,
    "query": {
        "bool": {
            "must": [
                {"term": { "_type": "packetloss" }}
            ],
            "filter": {
                "range": {
                    "alarmTime": {
                        "gt": "now-2h",
                        "lt": "now-1h"
                    }
                }
            }
        }
    }
}

print(query_new)
print(query_old)

{'query': {'bool': {'filter': {'range': {'alarmTime': {'gt': 'now-1h'}}}, 'must': [{'term': {'_type': 'packetloss'}}]}}, 'size': 1000}
{'query': {'bool': {'filter': {'range': {'alarmTime': {'gt': 'now-2h', 'lt': 'now-1h'}}}, 'must': [{'term': {'_type': 'packetloss'}}]}}, 'size': 1000}


## Execute the query

In [6]:
result_new = es.search(index=ind, body=query_new, request_timeout=120)
print('Number of hits of new alarms:', result_new['hits']['total'] )

result_old = es.search(index=ind, body=query_old, request_timeout=120)
print('Number of hits of old alarms:', result_old['hits']['total'] )

hits_new = result_new['hits']['hits']
hits_old = result_old['hits']['hits']

Number of hits of new alarms: 105
Number of hits of old alarms: 0


## Generate the two dictionaries for sites, one is from ip to name, one is from name to ip

In [7]:
site_ip_name = {}

for hit in hits_new:
    info = hit['_source']
    site_ip_name[info['src']] = info['srcSite']
    site_ip_name[info['dest']] = info['destSite']

for hit in hits_old:
    info = hit['_source']
    site_ip_name[info['src']] = info['srcSite']
    site_ip_name[info['dest']] = info['destSite']

print(site_ip_name)

{'130.246.47.129': 'UKI-SOUTHGRID-RALPP', '192.41.230.59': 'AGLT2', '134.158.20.192': 'IN2P3-CPPM', '147.213.204.117': 'IEPSAS-Kosice', '194.190.165.192': 'RU-Protvino-IHEP', '193.205.76.76': 'UnknownSite', '132.195.125.213': 'wuppertalprod', '149.165.224.247': 'UnknownSite', '144.206.236.189': 'RRC-KI-T1', '141.108.35.18': 'INFN-ROMA1', '192.231.127.41': 'Australia-ATLAS', '194.80.35.169': 'UKI-NORTHGRID-LANCS-HEP', '143.167.3.116': 'UKI-NORTHGRID-SHEF-HEP', '193.144.80.12': 'UnknownSite', '145.100.17.8': 'SARA-MATRIX', '165.91.55.4': 'UnknownSite', '192.41.236.35': 'UnknownSite', '134.158.103.10': 'IN2P3-LAPP', '192.12.15.111': 'BNL-ATLAS', '148.187.64.25': 'CSCS-LCG2', '149.165.225.223': 'MWT2', '193.60.193.3': 'UKI-SCOTGRID-DURHAM', '202.122.32.170': 'BEIJING-LCG2', '134.61.24.193': 'UnknownSite', '131.111.66.196': 'UKI-SOUTHGRID-CAM-HEP', '163.1.5.210': 'UKI-SOUTHGRID-OX-HEP', '131.154.254.12': 'INFN-T1', '130.246.176.109': 'RAL-LCG2', '193.136.75.146': 'NCG-INGRID-PT', '137.222.7

In [8]:
site_name_ip = {}

for ip in site_ip_name:
    name = site_ip_name[ip]
    if name in site_name_ip:
        site_name_ip[name].append(ip)
    else:
        site_name_ip[name] = [ip]

print(site_name_ip)

{'wuppertalprod': ['132.195.125.213'], 'RO-02-NIPNE': ['81.180.86.38'], 'BU_ATLAS_Tier2': ['192.5.207.251'], 'UKI-NORTHGRID-LANCS-HEP': ['194.80.35.169'], 'UTA_SWT2': ['129.107.255.29'], 'UKI-SOUTHGRID-OX-HEP': ['163.1.5.210'], 'UNI-FREIBURG': ['132.230.202.235'], 'INFN-T1': ['131.154.254.12'], 'EELA-UTFSM': ['146.83.90.7'], 'AGLT2': ['192.41.230.59'], 'RO-07-NIPNE': ['81.180.86.64'], 'UKI-SCOTGRID-ECDF': ['129.215.213.70'], 'RRC-KI': ['144.206.237.142'], 'CA-VICTORIA-WESTGRID-T2': ['206.12.154.60'], 'pic': ['193.109.172.188'], 'MWT2': ['149.165.225.223', '72.36.96.4', '192.170.227.160'], 'UnknownSite': ['193.205.76.76', '149.165.224.247', '193.144.80.12', '165.91.55.4', '192.41.236.35', '134.61.24.193', '152.84.101.141', '192.101.161.186', '161.116.81.235', '192.101.107.152', '90.147.66.50', '143.215.129.69', '193.206.93.45', '18.12.1.171'], 'Hephy-Vienna': ['193.170.243.215'], 'INFN-NAPOLI-ATLAS': ['90.147.67.252'], 'FZK-LCG2': ['192.108.47.12'], 'UKI-SOUTHGRID-CAM-HEP': ['131.111.66

## Calculate TN_old, the total number of alarmed links involving a specific site ip (either as source site or as destination site) for the OLD time period

In [9]:
TN_old = {}

def TN_old_add_one(ip):
    if ip in TN_old:
        TN_old[ip] += 1
    else:
        TN_old[ip] = 1

for alarm in hits_old:
    TN_old_add_one(alarm['_source']['src'])
    TN_old_add_one(alarm['_source']['dest'])

TN_old

{}

## Calculate TN_new, the total number of alarmed links involving a specific site ip (either as source site or as destination site) for the NEW time period

In [10]:
TN_new = {}

def TN_new_add_one(ip):
    if ip in TN_new:
        TN_new[ip] += 1
    else:
        TN_new[ip] = 1

for alarm in hits_new:
    TN_new_add_one(alarm['_source']['src'])
    TN_new_add_one(alarm['_source']['dest'])

TN_new

{'128.142.223.247': 1,
 '129.107.255.29': 1,
 '129.215.213.70': 2,
 '130.246.176.109': 2,
 '130.246.47.129': 1,
 '131.111.66.196': 2,
 '131.154.254.12': 1,
 '131.169.98.30': 2,
 '132.195.125.213': 1,
 '132.230.202.235': 10,
 '134.158.103.10': 2,
 '134.158.123.183': 1,
 '134.158.132.200': 1,
 '134.158.150.245': 10,
 '134.158.159.85': 2,
 '134.158.20.192': 1,
 '134.158.73.243': 4,
 '134.61.24.193': 4,
 '137.222.74.15': 4,
 '141.108.35.18': 15,
 '141.34.200.28': 1,
 '143.167.3.116': 1,
 '143.215.129.69': 2,
 '144.206.236.189': 2,
 '144.206.237.142': 1,
 '144.92.180.75': 1,
 '145.100.17.8': 2,
 '146.83.90.7': 1,
 '147.213.204.117': 1,
 '148.187.64.25': 2,
 '149.165.224.247': 1,
 '149.165.225.223': 2,
 '152.84.101.141': 1,
 '158.195.14.26': 1,
 '161.116.81.235': 37,
 '163.1.5.210': 2,
 '165.91.55.4': 1,
 '18.12.1.171': 4,
 '192.101.107.152': 3,
 '192.101.161.186': 1,
 '192.108.47.12': 2,
 '192.12.15.111': 1,
 '192.135.14.32': 4,
 '192.170.227.160': 4,
 '192.231.127.41': 1,
 '192.41.230.59':

## Calculate TN_delta, which is equal to ( TN_new - TN_old )

In [11]:
TN_delta = {}

for ip in TN_old:
    if ip in TN_new:
        TN_delta[ip] = TN_new[ip] - TN_old[ip]
    else:
        TN_delta[ip] = -TN_old[ip]

for ip in TN_new:
    if ip not in TN_old:
        TN_delta[ip] = TN_new[ip]

TN_delta

{'128.142.223.247': 1,
 '129.107.255.29': 1,
 '129.215.213.70': 2,
 '130.246.176.109': 2,
 '130.246.47.129': 1,
 '131.111.66.196': 2,
 '131.154.254.12': 1,
 '131.169.98.30': 2,
 '132.195.125.213': 1,
 '132.230.202.235': 10,
 '134.158.103.10': 2,
 '134.158.123.183': 1,
 '134.158.132.200': 1,
 '134.158.150.245': 10,
 '134.158.159.85': 2,
 '134.158.20.192': 1,
 '134.158.73.243': 4,
 '134.61.24.193': 4,
 '137.222.74.15': 4,
 '141.108.35.18': 15,
 '141.34.200.28': 1,
 '143.167.3.116': 1,
 '143.215.129.69': 2,
 '144.206.236.189': 2,
 '144.206.237.142': 1,
 '144.92.180.75': 1,
 '145.100.17.8': 2,
 '146.83.90.7': 1,
 '147.213.204.117': 1,
 '148.187.64.25': 2,
 '149.165.224.247': 1,
 '149.165.225.223': 2,
 '152.84.101.141': 1,
 '158.195.14.26': 1,
 '161.116.81.235': 37,
 '163.1.5.210': 2,
 '165.91.55.4': 1,
 '18.12.1.171': 4,
 '192.101.107.152': 3,
 '192.101.161.186': 1,
 '192.108.47.12': 2,
 '192.12.15.111': 1,
 '192.135.14.32': 4,
 '192.170.227.160': 4,
 '192.231.127.41': 1,
 '192.41.230.59':

## Look at the distribution of TN_delta, so that we can tune the parameter N

In [12]:
for N in range(10):
    count_worse = 0
    count_better = 0
    count_stable = 0
    for ip in TN_delta:
        if TN_delta[ip] > N:
            count_worse += 1
        elif TN_delta[ip] < -N:
            count_better += 1
        else:
            count_stable += 1
    print('N=%d     links went bad=%d     links went good=%d     unchanged=%d' % (N, count_worse, count_better, count_stable))

N=0     links went bad=68     links went good=0     unchanged=0
N=1     links went bad=35     links went good=0     unchanged=33
N=2     links went bad=17     links went good=0     unchanged=51
N=3     links went bad=16     links went good=0     unchanged=52
N=4     links went bad=9     links went good=0     unchanged=59
N=5     links went bad=9     links went good=0     unchanged=59
N=6     links went bad=7     links went good=0     unchanged=61
N=7     links went bad=7     links went good=0     unchanged=61
N=8     links went bad=5     links went good=0     unchanged=63
N=9     links went bad=5     links went good=0     unchanged=63


## Let's use N=3 for now, and we will tune later

In [13]:
N = 3

ip_list_worse = []
ip_list_better = []

for ip in TN_delta:
    if TN_delta[ip] >= N:
        ip_list_worse.append(ip)
    elif TN_delta[ip] <= -N:
        ip_list_better.append(ip)

print('--- The ip of the site(s) which got worse:')
print(ip_list_worse)
print('--- The ip of the site(s) which got better:')
print(ip_list_better)

--- The ip of the site(s) which got worse:
['141.108.35.18', '134.158.150.245', '192.41.236.35', '134.61.24.193', '193.136.75.146', '18.12.1.171', '137.222.74.15', '132.230.202.235', '192.135.14.32', '193.170.243.215', '193.62.56.9', '134.158.73.243', '81.180.86.64', '161.116.81.235', '81.180.86.38', '192.101.107.152', '192.170.227.160']
--- The ip of the site(s) which got better:
[]


## Generate the dictionary: key = site name, value = a list of relevant user emails

In [14]:
user_interest_site_name = {}

def reg_user_interest_site_name(sitename, email):
    if sitename in user_interest_site_name:
        user_interest_site_name[sitename].append(email)
    else:
        user_interest_site_name[sitename] = [email]

taskName = 'Packet loss increase for link(s) where your site is a source or destination'

subscribe_records = google.getSubscribers_withSiteName(taskName)

# Handle blank answer, one site, several sites separated by comma, wildcard such as prefix* etc.
for record in subscribe_records:
    email = record[1]
    sitenames = record[3].strip()
    if len(sitenames) == 0:
        sitenames = '.'  # Handle blank answer, so match all site names
    sitenames = [x.strip().replace('*', '.') for x in sitenames.split(',')]  # Handle several site names, and wildcard
    for sn in sitenames:
        p = re.compile(sn, re.IGNORECASE)
        for sitename in site_name_ip:
            if p.match(sitename):
                reg_user_interest_site_name(sitename, email)

user_interest_site_name

AttributeError: 'subscribers' object has no attribute 'getSubscribers_withSiteName'

## Generate the dictionary: key = site ip, value = a list of relevant user emails

In [None]:
user_interest_site_ip = {}

def reg_user_interest_site_ip(siteip, email):
    if siteip in user_interest_site_ip:
        user_interest_site_ip[siteip].append(email)
    else:
        user_interest_site_ip[siteip] = [email]

for sitename in user_interest_site_name:
    for siteip in site_name_ip[sitename]:
        for email in user_interest_site_name[sitename]:
            reg_user_interest_site_ip(siteip, email)

user_interest_site_ip

## The variable user_alert_all holds all the needed info to send an email to a specific user

In [None]:
user_alert_all = {}

for user in google.getAllUserBasicInfo():
    user_info = {}
    user_info['email'] = user[0]
    user_info['fullname'] = user[1]
    user_info['link'] = user[2]
    user_info['alerts'] = []
    user_alert_all[user[0]] = user_info    # email should be unique globally, so it is used as key

user_alert_all

## Generate info for sending alert emails (for the sites getting worse)

In [None]:
for ip in ip_list_worse:
    text = "The site %s (%s)'s links got worse, total number from %d to %d links.\n" % (site_ip_name[ip], ip, TN_old.get(ip,0), TN_new.get(ip,0))
    text += "These are all the bad links for the past hour:\n"
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if src_ip == ip:
            text += '    This site  --->  %s (%s) \n' % (site_ip_name[dest_ip], dest_ip)
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if dest_ip == ip:
            text += '    %s (%s)  --->  This site \n' % (site_ip_name[src_ip], src_ip)
    print(text)
    for email in user_interest_site_ip[ip]:
        user_alert_all[email]['alerts'].append(text)

# user_alert_all

## Generate info for sending alert emails (for the sites getting better)

In [None]:
for ip in ip_list_better:
    text = "The site %s (%s)'s links got improved, total number from %d to %d links.\n" % (site_ip_name[ip], ip, TN_old.get(ip,0), TN_new.get(ip,0))
    text += "These are all the bad links for the past hour:\n"
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if src_ip == ip:
            text += '    This site  --->  %s (%s) \n' % (site_ip_name[dest_ip], dest_ip)
    for alarm in hits_new:
        src_ip = alarm['_source']['src']
        dest_ip = alarm['_source']['dest']
        if dest_ip == ip:
            text += '    %s (%s)  --->  This site \n' % (site_ip_name[src_ip], src_ip)
    print(text)
    for email in user_interest_site_ip[ip]:
        user_alert_all[email]['alerts'].append(text)

# user_alert_all

## Dummy sendMail function for development purpose

In [None]:
def sendMailDummy(subject, to, body):
    if len(body['alerts']) == 0:
        print('======== Do not send alert email to %s as there is no alert for this user ========' % to)
    else:
        print('========= Send the following email to a user =========')
        print('------ Email subject ------')
        subject = 'Alert email customized for %s' % body['fullname']
        print(subject)
        print('------ Email to -----------')
        print(to)
        print('------ Email body ---------')
        text = 'Hi %s,\n\n' % body['fullname']
        text += '    The following are all the alerts about packet loss that you are interested in:\n\n\n'
        for alert in body['alerts']:
            text += alert
            text += '\n\n'
        text += 'Thank you for using this system. If you want to update your settings or unsubscribe, please use this link: %s' % body['link']
        text += '\n\nBest,\nThe team\n\n\n'
        print(text)
        print('======================================================')
    print()
    print()

## Send out alert email customized for each user

In [None]:
for email in user_alert_all:
    sendMailDummy('auto_subject', email, user_alert_all[email])