In [1]:
%matplotlib inline

# DataFrame
import pandas as pd

# Elasticsearch
from elasticsearch import Elasticsearch, helpers

# datetime
from datetime import datetime

# isnan()
import math

# plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
# this is just to make plots look nice
mpl.rc('text', usetex=False)
mpl.rc('font', family='serif')
figsize = 12

# Style overrides applied on top of matplotlib's defaults.
s = {
    'axes.edgecolor': '#333333',
    'axes.facecolor': '#eeeeee',
    'axes.grid': True,
    'axes.labelsize': 'large',
    'axes.labelcolor': '#333333',
    'axes.titlesize': 'x-large',
    'examples.directory': '',
    'legend.fancybox': True,
    'lines.linewidth': 2.0,
    'patch.edgecolor': '#eeeeee',
    'patch.linewidth': 0.5,
    'figure.figsize': (10, 7),
}

# rcParams.update() raises KeyError on names the installed matplotlib does not
# define (e.g. 'examples.directory' no longer exists in recent releases), so
# only forward the keys that are still valid for this version.
mpl.rcParams.update({key: value for key, value in s.items() if key in mpl.rcParams})

In [3]:
# Nested store of collected measurements:
# raw_data_pool[src_site][dest_site] is a DataFrame whose row labels are the
# timestamp keys passed to put_data() and whose columns are the column_type
# names passed to put_data().
raw_data_pool = {}


def make_sure(src_site, dest_site):
    """Ensure raw_data_pool[src_site][dest_site] exists.

    Creates the per-source dict and an empty DataFrame for the pair when
    either is missing; an existing DataFrame is left untouched.
    """
    per_source = raw_data_pool.setdefault(src_site, {})
    if dest_site not in per_source:
        per_source[dest_site] = pd.DataFrame()


def put_data(src_site, dest_site, timestamp_epoch, column_type, value):
    """Write a single cell into the DataFrame of a source/destination pair.

    Parameters:
        src_site, dest_site: keys selecting the DataFrame in raw_data_pool
            (created on demand).
        timestamp_epoch: row label of the cell.
        column_type: column label of the cell.
        value: value to store; an existing cell with the same row and column
            labels is overwritten.
    """
    make_sure(src_site, dest_site)
    # DataFrame.set_value() was deprecated in pandas 0.21 and removed in 1.0.
    # .at is its replacement and, like set_value(), enlarges the frame in
    # place when the row or column label does not exist yet.
    raw_data_pool[src_site][dest_site].at[timestamp_epoch, column_type] = value

In [4]:
# Shared Elasticsearch client; collect_and_clean() passes it to helpers.scan().
# NOTE(review): timeout=6000 is presumably in seconds (elasticsearch-py
# convention), i.e. a very generous limit for slow queries — confirm.
es = Elasticsearch(['atlas-kibana.mwt2.org:9200'], timeout=6000)

In [6]:
# Placeholder endpoints for the query template below.
src_site = "source_site"
dest_site = "destination_site"

# Time window applied to the 'timestamp' field (gte inclusive, lt exclusive).
timestamp = {'gte': '2016-01-01', 'lt': '2016-06-01'}

# Query template shared by all document types. The order of the 'must'
# entries matters: collect_and_clean() rewrites must[0] (srcSite) and
# must[1] (destSite) by position before each scan.
my_query = {
    'query': {
        'filtered': {
            'query': {"match_all": {}},
            'filter': {
                'bool': {
                    'must': [
                        {'term': {'srcSite': src_site}},
                        {'term': {'destSite': dest_site}},
                        {'range': {'timestamp': timestamp}},
                    ],
                },
            },
        },
    },
    # Also return the indexed value of 'timestamp' under result['fields'].
    'fielddata_fields': ['timestamp'],
}

print(my_query)

# Index pattern searched by every scan.
my_index = "network_weather_2-*"

print(my_index)

{'fielddata_fields': ['timestamp'], 'query': {'filtered': {'filter': {'bool': {'must': [{'term': {'srcSite': 'source_site'}}, {'term': {'destSite': 'destination_site'}}, {'range': {'timestamp': {'lt': '2016-06-01', 'gte': '2016-01-01'}}}]}}, 'query': {'match_all': {}}}}}
network_weather_2-*


In [5]:
def _collect_type(src_site, dest_site, my_type, value_fields):
    """Scan all documents of one doc type and store their fields via put_data().

    For every hit of `my_query` against `my_index` with doc_type `my_type`,
    the ISO timestamp from '_source' is stored in column 'iso_8601' and each
    name in `value_fields` is copied from '_source' into a column of the same
    name. Rows are keyed by the first entry of the hit's 'fields' -> 'timestamp'
    list, so hits sharing that key overwrite each other.
    """
    scroll = list(helpers.scan(client=es, query=my_query, index=my_index, doc_type=my_type, request_timeout=6000))
    for result in scroll:
        timestamp_epoch = result['fields']['timestamp'][0]
        source = result['_source']
        put_data(src_site, dest_site, timestamp_epoch, 'iso_8601', source['timestamp'])
        for field in value_fields:
            put_data(src_site, dest_site, timestamp_epoch, field, source[field])
    print('Number of raw records of {} is {}'.format(my_type, len(scroll)))


def collect_and_clean(src_site, dest_site):
    """Download latency, packet-loss and throughput records for one site pair.

    Rewrites the srcSite/destSite term filters of the shared `my_query` in
    place, scans the three document types, prints per-type record counts and
    the per-column non-null counts of the merged frame, sorts the frame by
    index and pickles it under 'My_data/'.

    The resulting DataFrame is available afterwards as
    raw_data_pool[src_site][dest_site].
    """
    # Local import keeps this cell runnable without touching the import cell.
    import os

    # Set the source/destination pair
    must = my_query['query']['filtered']['filter']['bool']['must']
    must[0]['term']['srcSite'] = src_site
    must[1]['term']['destSite'] = dest_site
    print(my_query)
    print()

    # Guarantee the frame exists even when every scan returns zero hits;
    # otherwise the lookups below would raise KeyError.
    make_sure(src_site, dest_site)

    # Collect raw data, one document type at a time
    _collect_type(src_site, dest_site, 'latency', ('delay_median', 'delay_mean', 'delay_sd'))
    _collect_type(src_site, dest_site, 'packet_loss_rate', ('packet_loss',))
    _collect_type(src_site, dest_site, 'throughput', ('throughput',))

    frame = raw_data_pool[src_site][dest_site]

    # View statistics of raw_data_pool (already de-duplicated)
    print()
    print('De-duplication result:')
    print(frame.count(axis='index'))

    # Sort in-place
    frame.sort_index(inplace=True)

    # Store this DataFrame to disk file (create the target directory if needed)
    os.makedirs('My_data', exist_ok=True)
    frame.to_pickle('My_data/My_data_from_{}_to_{}_sorted.pkl'.format(src_site, dest_site))


print('Successfully defined the collect_and_clean(src_site, dest_site) function')

Successfully defined the collect_and_clean(src_site, dest_site) function
