In [1]:
%matplotlib inline

# DataFrame
import pandas as pd

# Elasticsearch
from elasticsearch import Elasticsearch, helpers

# datetime
from datetime import datetime

# isnan()
import math

# plot
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# OneClassSVM
# from sklearn.svm import OneClassSVM

# stats
# from scipy import stats

plt.style.use('ggplot')

## 3 src sites (MWT2) and 75 dest sites, collect and clean data!

In [6]:
es = Elasticsearch(['atlas-kibana.mwt2.org:9200'], timeout=6000)

In [41]:
list_src_ip = ['192.170.227.160', '149.165.225.223', '72.36.96.4']
list_dest_ip = ['192.41.230.59', '192.41.236.31', '192.231.127.41', '202.122.32.170', '192.12.15.26', 
                '192.5.207.251', '132.206.245.252', '142.150.19.61', '206.12.154.60', '128.142.223.247', 
                '148.187.64.25', '212.191.227.174', '131.169.98.30', '141.34.200.28', '146.83.90.7', 
                '109.105.125.232', '158.195.14.26', '192.108.47.12', '144.92.180.75', '134.158.132.200', 
                '134.158.159.85', '192.54.207.250', '147.213.204.112', '193.146.75.138', '147.156.116.40', 
                '193.48.99.76', '134.158.20.192', '134.158.103.10', '134.158.123.183', '193.48.83.165', 
                '144.16.111.26', '90.147.66.50', '192.84.128.112', '141.108.35.18', '131.154.254.12', 
                '159.93.229.151', '164.58.29.121', '109.105.124.86', '129.93.239.148', '129.15.40.231', 
                '193.109.172.188', '147.231.25.192', '130.246.176.109', '81.180.86.38', '81.180.86.64', 
                '85.122.31.74', '144.206.237.142', '144.206.236.189', '194.190.165.192', '145.100.17.8', 
                '206.12.24.251', '200.136.80.20', '203.185.96.100', '117.103.105.191', '157.82.112.68', 
                '206.12.9.2', '150.244.247.2', '128.227.221.44', '194.36.11.38', '128.40.4.25', 
                '194.80.35.169', '138.253.60.82', '195.194.105.178', '193.60.193.3', '129.215.213.70', 
                '130.209.239.124', '193.62.56.9', '131.111.66.196', '163.1.5.210', '130.246.47.129', 
                '139.184.80.18', '129.107.255.29', '192.111.108.112', '134.79.118.72', '132.195.125.213']
print('The size of list_src_ip is {}'.format(len(list_src_ip)))
print('The size of list_dest_ip is {}'.format(len(list_dest_ip)))

The size of list_src_ip is 3
The size of list_dest_ip is 75


In [42]:
raw_data_pool = {}

def make_sure(src_site_name, src_site_ip, dest_site_name, dest_site_ip):
    if src_site_ip not in raw_data_pool:
        raw_data_pool[src_site_ip] = {}
        print("In raw_data_pool, created new src_site_ip {} representing src_site_name {}".format(src_site_ip, src_site_name))
    if dest_site_ip not in raw_data_pool[src_site_ip]:
        raw_data_pool[src_site_ip][dest_site_ip] = pd.DataFrame()
        print("In raw_data_pool, created source site {}'s new dest_site_ip {} representing dest_site_name {}".format(src_site_name, dest_site_ip, dest_site_name))

def put_data(src_site_name, src_site_ip, dest_site_name, dest_site_ip, timestamp_epoch, column_type, value):
    make_sure(src_site_name, src_site_ip, dest_site_name, dest_site_ip)
    raw_data_pool[src_site_ip][dest_site_ip].set_value(timestamp_epoch, column_type, value)

In [43]:
# src_site_name = 'source_site_name'
# dest_site_name = 'destination_site_name'
src_site_ip = 'xxxxxx'
dest_site_ip = 'xxxxxx'

# timestamp = { 'gte': '2016-01-01', 'lt': '2016-06-01' }
timestamp = { 'gte': '2016-01-01' }

my_query = {}
my_query['query'] = {}
my_query['query']['filtered'] = {}
my_query['query']['filtered']['query'] = { "match_all": {} }
my_query['query']['filtered']['filter'] = {}
my_query['query']['filtered']['filter']['bool'] = {}
my_query['query']['filtered']['filter']['bool']['must'] = []
# my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'srcSite': src_site_name } })
# my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'destSite': dest_site_name } })
# my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'src': src_site_ip } })
# my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'dest': dest_site_ip } })
my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'srcProduction': True } })
my_query['query']['filtered']['filter']['bool']['must'].append({ 'term': { 'destProduction': True } })
my_query['query']['filtered']['filter']['bool']['must'].append({ 'range': { 'timestamp': timestamp } })
my_query['fielddata_fields'] = [ 'timestamp' ]

print(my_query)

my_index = "network_weather_2-*"

print(my_index)

{'query': {'filtered': {'query': {'match_all': {}}, 'filter': {'bool': {'must': [{'term': {'srcProduction': True}}, {'term': {'destProduction': True}}, {'range': {'timestamp': {'gte': '2016-01-01'}}}]}}}}, 'fielddata_fields': ['timestamp']}
network_weather_2-*


In [44]:
# Note!!! here name actually is IP address!!!

set_timestamp = set()

def collect_and_clean(src_site_name, dest_site_name):
    
#     set_src = set()
#     set_dest = set()
    
    # Set the source/destination pair
#     my_query['query']['filtered']['filter']['bool']['must'][0]['term']['src'] = src_site_name
#     my_query['query']['filtered']['filter']['bool']['must'][1]['term']['dest'] = dest_site_name
#     my_query['query']['filtered']['filter']['bool']['must'][0]['term']['src'] = '*'
#     my_query['query']['filtered']['filter']['bool']['must'][1]['term']['dest'] = '*'
    
    print(my_query)
    print()
    
    # Collect traceroute raw data
    my_type = 'traceroute'
    scroll = list(helpers.scan(client=es, query=my_query, index=my_index, doc_type=my_type, request_timeout=6000))
    count = 0
    for result in scroll:
        count += 1
        timestamp_epoch = result['fields']['timestamp'][0]
        set_timestamp.add(timestamp_epoch)
#         set_src.add(result['_source']['src'])
#         set_dest.add(result['_source']['dest'])
        info = result['_source']
#         print('{}.  {}  {}  {}  {}'.format(count, info.get('src', 'NoSrc'), info.get('dest', 'NoDest'), info.get('hash', 'NoHash'), info.get('hops', 'NoHops')))
#         put_data(src_site_name, info.get('src', 'NoSrc'), dest_site_name, info.get('dest', 'NoDest'), timestamp_epoch, 'iso_8601', info.get('timestamp', 'NoTimestamp'))
#         put_data(src_site_name, info.get('src', 'NoSrc'), dest_site_name, info.get('dest', 'NoDest'), timestamp_epoch, 'hash', info.get('hash', 'NoHash'))
#         put_data(src_site_name, info.get('src', 'NoSrc'), dest_site_name, info.get('dest', 'NoDest'), timestamp_epoch, 'hops', info.get('hops', 'NoHops'))
        if count % 100 == 0:
            print('Collecting raw data, count == {}'.format(count))
    if count == 0:
        print('No raw data found. Finish this src/dest pair now.')
        print()
        print()
        print('===============================================================')
        return
    print('Number of raw records of {} is {}'.format(my_type, count))
    
#     # Collect packet_loss_rate raw data
#     my_type = 'packet_loss_rate'
#     scroll = list(helpers.scan(client=es, query=my_query, index=my_index, doc_type=my_type, request_timeout=6000))
#     count = 0
#     for result in scroll:
#         count += 1
#         timestamp_epoch = result['fields']['timestamp'][0]
# #         set_src.add(result['_source']['src'])
# #         set_dest.add(result['_source']['dest'])
#         put_data(src_site_name, result['_source']['src'], dest_site_name, result['_source']['dest'], timestamp_epoch, 'iso_8601', result['_source']['timestamp'])
#         put_data(src_site_name, result['_source']['src'], dest_site_name, result['_source']['dest'], timestamp_epoch, 'packet_loss', result['_source']['packet_loss'])
#     print('Number of raw records of {} is {}'.format(my_type, count))
    
#     # Collect throughput raw data
#     my_type = 'throughput'
#     scroll = list(helpers.scan(client=es, query=my_query, index=my_index, doc_type=my_type, request_timeout=6000))
#     count = 0
#     for result in scroll:
#         count += 1
#         timestamp_epoch = result['fields']['timestamp'][0]
#         set_src.add(result['_source']['src'])
#         set_dest.add(result['_source']['dest'])
#         put_data(src_site_name, result['_source']['src'], dest_site_name, result['_source']['dest'], timestamp_epoch, 'iso_8601', result['_source']['timestamp'])
#         put_data(src_site_name, result['_source']['src'], dest_site_name, result['_source']['dest'], timestamp_epoch, 'throughput', result['_source']['throughput'])
#     print('Number of raw records of {} is {}'.format(my_type, count))
    
    print()
#     for s in set_src:
#         print('src_site_name {} has src_site_ip {}'.format(src_site_name, s))
#     for s in set_dest:
#         print('dest_site_name {} has dest_site_ip {}'.format(dest_site_name, s))
#     print('Please manually count(), sort_index(), to_pickle() using src_site_ip and dest_site_ip !!!!!!')
    
#     # View statistics of raw_data_pool (already de-duplicated)
    print()
    print('De-duplication result:')
    print(raw_data_pool[src_site_name][dest_site_name].count(axis='index'))
    
#     # Sort in-place
    raw_data_pool[src_site_name][dest_site_name].sort_index(inplace=True)
    
#     # Store this DataFrame to disk file
    raw_data_pool[src_site_name][dest_site_name].to_pickle('My_data_Path/Path_clean_from_{}_to_{}_sorted.pkl'.format(src_site_name, dest_site_name))
    
#     # How to get this DataFrame
#     # raw_data_pool[src_site][dest_site]

    print('========================== Data done and stored on disk. ==============================================')


print('Successfully defined the collect_and_clean(src_site_name, dest_site_name) function')

Successfully defined the collect_and_clean(src_site_name, dest_site_name) function


In [45]:
count = 0
for src_site_name in list_src_ip:
    if count == 1:
        break
    for dest_site_name in list_dest_ip:
        count += 1
        print('=============================================================================')
        print('{}. Now the pair is {} / {}'.format(count, src_site_name, dest_site_name))
        print()
        collect_and_clean(src_site_name, dest_site_name)
        print('=============================================================================')
        if count == 1:
            break

1. Now the pair is 192.170.227.160 / 192.41.230.59

{'query': {'filtered': {'query': {'match_all': {}}, 'filter': {'bool': {'must': [{'term': {'srcProduction': True}}, {'term': {'destProduction': True}}, {'range': {'timestamp': {'gte': '2016-01-01'}}}]}}}}, 'fielddata_fields': ['timestamp']}

Collecting raw data, count == 100
Collecting raw data, count == 200
Collecting raw data, count == 300
Collecting raw data, count == 400
Collecting raw data, count == 500
Collecting raw data, count == 600
Collecting raw data, count == 700
Collecting raw data, count == 800
Collecting raw data, count == 900
Collecting raw data, count == 1000
Collecting raw data, count == 1100
Collecting raw data, count == 1200
Collecting raw data, count == 1300
Collecting raw data, count == 1400
Collecting raw data, count == 1500
Collecting raw data, count == 1600
Collecting raw data, count == 1700
Collecting raw data, count == 1800
Collecting raw data, count == 1900
Collecting raw data, count == 2000
Collecting raw 

KeyError: '192.170.227.160'

In [54]:
a = pd.DataFrame(list(set_timestamp))
a.min()   # 07/28/2016  ==  1469724756000
a.max()   # 08/05/2016  ==  1470398661000  just now
print('No path records involving MWT2 three sites')
# 1) 'srcProduction': True
# 2) 'destProduction': True
# 3) 'gte': '2016-01-01'
# Number of raw records of traceroute is 68268

No path records involving MWT2 three sites
