In [8]:
from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import numpy as np
import pandas as pd
import pprint
from __future__ import division
from __future__ import print_function
from datetime import datetime
import calendar
from zipfile import ZipFile
import os
import sys
import dateutil
import zipfile
# Elasticsearch client that is handed to the collection calls in this notebook.
es = Elasticsearch(hosts=['atlas-kibana.mwt2.org:9200'])

In [2]:
# Legacy storage, used only by collect_and_clean_Scroll and collect_and_cleanRawData_RealTime.
# Newer functions return the data directly and do not store it globally.
# Nested mapping: raw_data_pool[src_site][dest_site] -> pandas.DataFrame of collected values.
raw_data_pool = {}

def make_sure(src_site, dest_site):
    """
    Guarantee that raw_data_pool[src_site][dest_site] exists.

    A missing entry is created as an empty DataFrame; an existing entry is
    left untouched.
    """
    per_source = raw_data_pool.setdefault(src_site, {})
    if dest_site not in per_source:
        per_source[dest_site] = pd.DataFrame()

def put_data(src_site, dest_site, timestamp_epoch, column_type, value):
    """
    Store a single value in the pooled frame of the src_site -> dest_site pair.

    timestamp_epoch is used as the row label and column_type as the column
    label. Missing rows/columns are created on the fly; writing to an
    existing cell overwrites it, which is how duplicate records collapse
    into one row.
    """
    make_sure(src_site, dest_site)
    # DataFrame.set_value was removed in pandas 1.0; `.at` is its documented
    # replacement and likewise enlarges the frame for unknown labels.
    raw_data_pool[src_site][dest_site].at[timestamp_epoch, column_type] = value

## Define search functions 

In [3]:
# Legacy collector: gathers the raw measurements of one site pair from Elasticsearch
# into the global raw_data_pool and pickles the result.
def collect_and_clean_Scroll(es, src_site, dest_site, timeframe):
    '''
    Collect latency, packet-loss and throughput records for src_site -> dest_site.

    Every matching document is written into raw_data_pool[src_site][dest_site]
    through put_data(). Rows are keyed by the first value of the document's
    "timestamp" fielddata field, so documents sharing that value end up in the
    same row. The frame is then sorted by index in descending order and pickled
    below raw_data/. Nothing is returned; read the result from raw_data_pool.

    Parameters:
        es        -- client object handed to elasticsearch.helpers.scan
        src_site  -- value matched against the 'srcSite' field
        dest_site -- value matched against the 'destSite' field
        timeframe -- dict used as the body of the 'timestamp' range filter; its
                     "gte" and "lt" entries are also used in the output file name
    '''
    print("--- Starting data collection for {} to {}".format(src_site, dest_site))
    es_query = \
    {
        "fielddata_fields":[
            "timestamp"
        ],
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "bool": {
                        "must":[
                            { 'term': { 'srcSite': src_site } },
                            { 'term': { 'destSite': dest_site } },
                            { 'range': { 'timestamp': timeframe } }
                        ]
                    }
                }
            }
        }
    }
    es_index = "network_weather_2-*"

    # Create the pooled frame up front: put_data() is never reached when no
    # document matches, and the summary below would then fail with a KeyError.
    make_sure(src_site, dest_site)

    # (document type, value columns copied from '_source') in collection order.
    event_columns = [
        ('latency', ['delay_median', 'delay_mean', 'delay_sd']),
        ('packet_loss_rate', ['packet_loss']),
        ('throughput', ['throughput']),
    ]
    for event_type, value_columns in event_columns:
        # The scroll is drained completely before any value is stored.
        scroll = list(helpers.scan(client=es, query=es_query, index=es_index, doc_type=event_type, request_timeout=6000))
        for dataPoint in scroll:
            # NOTE(review): presumably epoch-based fielddata of the date field -- confirm.
            timestamp_epoch = dataPoint['fields']['timestamp'][0]
            source = dataPoint['_source']
            put_data(src_site, dest_site, timestamp_epoch, 'iso_8601', source['timestamp'])
            for column in value_columns:
                put_data(src_site, dest_site, timestamp_epoch, column, source[column])
        print('Number of raw records of {} is {}'.format(event_type, len(scroll)))

    frame = raw_data_pool[src_site][dest_site]

    # View statistics of raw_data_pool (already de-duplicated)
    print()
    print('De-duplication result:')
    print(frame.count(axis='index'))

    # Sort in place so the pooled frame itself ends up ordered (newest label first)
    frame.sort_index(inplace=True, ascending=False)

    # Store this DataFrame to disk; create the target folder when it is missing
    if not os.path.isdir('raw_data'):
        os.makedirs('raw_data')
    frame.to_pickle('raw_data/raw_data_from_{}_to_{}_and_time_{}_to_{}_sorted.pkl'.format(src_site, dest_site, timeframe["gte"], timeframe["lt"]))

    # How to get this DataFrame
    # raw_data_pool[src_site][dest_site]

    
    
    
# This method should only be used for "small" timeframes containing fewer than 10,000 results;
# according to the original authors it crashes otherwise.
# A good cache size could be 1000.
def collect_and_cleanRawData_RealTime(es, src_site, dest_site, timeframe, cache_size):
    '''
    Page through the latency, packet-loss and throughput records of
    src_site -> dest_site with from/size requests and store them in
    raw_data_pool[src_site][dest_site] via put_data().

    Results are requested sorted by "timestamp:desc". Rows are keyed by the
    '_source' timestamp value of each document. Nothing is returned and
    nothing is written to disk.

    Parameters:
        es         -- client object providing search()
        src_site   -- value matched against the 'srcSite' field
        dest_site  -- value matched against the 'destSite' field
        timeframe  -- dict used as the body of the 'timestamp' range filter
        cache_size -- number of documents requested per page; must be positive

    Raises:
        ValueError -- if cache_size is not positive (the paging loop would
                      otherwise never advance)
    '''
    if cache_size <= 0:
        raise ValueError("cache_size must be a positive page size, got {!r}".format(cache_size))

    print("--- Starting data collection for {} to {}".format(src_site, dest_site))
    es_sorting = "timestamp:desc"
    es_size = cache_size
    es_query = \
    {
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "bool": {
                        "must":[
                            { 'term': { 'srcSite': src_site } },
                            { 'term': { 'destSite': dest_site } },
                            { 'range': { 'timestamp': timeframe } }
                        ]
                    }
                }
            }
        }
    }
    es_index = "network_weather_2-*"

    # Create the pooled frame up front so the summary at the end also works
    # when no document matched (put_data() is never called in that case).
    make_sure(src_site, dest_site)

    # (document type, value columns copied from '_source') in collection order.
    event_columns = [
        ('latency', ['delay_median', 'delay_mean', 'delay_sd']),
        ('packet_loss_rate', ['packet_loss']),
        ('throughput', ['throughput']),
    ]
    for event_type, value_columns in event_columns:
        print("--- Collectiong {} data".format(event_type))
        es_from = 0
        # The total is taken from the first page, so no separate probe request is needed.
        numDataPoints = None
        # Strict '<': a page starting exactly at the total is empty, and asking
        # for it needlessly pushes from+size further past the end of the results.
        while numDataPoints is None or es_from < numDataPoints:
            response = es.search(index=es_index, body=es_query, size=es_size, from_=es_from, sort=es_sorting, doc_type=event_type, request_timeout=600)
            numDataPoints = response['hits']['total']
            for dataPoint in response['hits']['hits']:
                source = dataPoint['_source']
                # NOTE(review): despite its name this is the '_source' timestamp value
                # (stored as 'iso_8601' as well), not the fielddata value used by
                # collect_and_clean_Scroll -- confirm before mixing both in one pool.
                timestamp_epoch = source['timestamp']
                put_data(src_site, dest_site, timestamp_epoch, 'iso_8601', source['timestamp'])
                for column in value_columns:
                    put_data(src_site, dest_site, timestamp_epoch, column, source[column])
            es_from = es_from + es_size
        print('Number of raw records of {} is {}'.format(event_type, numDataPoints))

    # View statistics of raw_data_pool (already de-duplicated)
    print("")
    print('De-duplication result:')
    print(raw_data_pool[src_site][dest_site].count(axis='index'))
    
    

In [4]:
def collect_and_clean_byIP_Scroll(es, src_data, dest_data, timeframe, saveToDisk = True):
    '''
    Collect the raw data stored in Elasticsearch for the connection from a
    source to a destination within the given timeframe.

    Latency and packet-loss documents are selected by the "latency" addresses
    of both endpoints, throughput documents by their "throughput" addresses.
    All records are merged into one DataFrame that is indexed by the epoch
    second of the document timestamp (records sharing a second collapse into
    one row, later values overwriting earlier ones) and sorted newest first.

    Parameters:
        es         -- client object handed to elasticsearch.helpers.scan
        src_data   -- dict with the keys 'name', 'latency' and 'throughput'
        dest_data  -- dict with the keys 'name', 'latency' and 'throughput'
        timeframe  -- dict used as the body of the 'timestamp' range filter; its
                      "gte" and "lt" entries also appear in the log and file name
        saveToDisk -- when true (default) the frame is also pickled below raw_data/

    Returns:
        The collected pandas DataFrame.
    '''
    # `import dateutil` alone does not guarantee that the parser submodule is
    # loaded, so import it explicitly where it is needed.
    from dateutil import parser as date_parser

    print("--- Starting data collection for {} to {}, in the time of {} to {}".format(
            src_data['name'], dest_data['name'], timeframe["gte"], timeframe["lt"]))
    es_index = "network_weather_2-*"

    # (document type, endpoint key selecting the address, value columns copied from '_source')
    event_specs = [
        ('latency', 'latency', ['delay_median', 'delay_mean', 'delay_sd']),
        ('packet_loss_rate', 'latency', ['packet_loss']),
        ('throughput', 'throughput', ['throughput']),
    ]

    # Records are gathered in plain dicts and turned into a DataFrame once at
    # the end: DataFrame.set_value no longer exists in current pandas, and
    # enlarging a frame cell by cell is quadratic in the number of rows.
    rows = {}           # epoch second -> {column: value}
    column_order = []   # columns in the order in which they first receive data

    for event_type, endpoint_key, value_columns in event_specs:
        es_query = \
        {
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "bool": {
                            "must":[
                                { 'term': { 'src': src_data[endpoint_key] } },
                                { 'term': { 'dest': dest_data[endpoint_key] } },
                                { 'range': { 'timestamp': timeframe } }
                            ]
                        }
                    }
                }
            }
        }
        scroll = list(helpers.scan(client=es, query=es_query, index=es_index, doc_type=event_type, request_timeout=6000))
        for dataPoint in scroll:
            source = dataPoint['_source']
            parsed = date_parser.parse(source['timestamp'])
            # utctimetuple() honours an explicit UTC offset in the timestamp;
            # for naive or UTC values it is identical to timetuple().
            timestamp_epoch = calendar.timegm(parsed.utctimetuple())
            row = rows.setdefault(timestamp_epoch, {})
            row['iso_8601'] = source['timestamp']
            for column in value_columns:
                row[column] = source[column]
        if scroll:
            for column in ['iso_8601'] + value_columns:
                if column not in column_order:
                    column_order.append(column)
        print('Number of raw records of {} is {}'.format(event_type, len(scroll)))
    print(" ")

    data = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=column_order)

    # View statistics of the collected data (already de-duplicated)
    print()
    print('De-duplication result:')
    print(data.count(axis='index'))

    # Newest timestamp first
    data = data.sort_index(ascending=False)

    # Store this DataFrame to disk file
    if saveToDisk:
        print("Saving results to disk.")
        print("")
        # Create the target folder when it is missing instead of failing in to_pickle
        if not os.path.isdir('raw_data'):
            os.makedirs('raw_data')
        data.to_pickle('raw_data/raw_data_by_IP_from_{}_to_{}_and_time_{}_to_{}_sorted.pkl'.format(src_data['name'], dest_data['name'], timeframe["gte"], timeframe["lt"]))

    # return the data
    return data


In [13]:
def starlikeConnector(es, center_point_data, out_points_list, timeframe, saveToDisk=True):
    '''
    Collect the data of every connection between a central endpoint and a list
    of outer endpoints, in both directions.

    For each entry of out_points_list, collect_and_clean_byIP_Scroll is called
    once for center -> outer and once for outer -> center. The returned frames
    are not kept; with saveToDisk enabled (default) the collector writes them
    to disk.

    Parameters:
        es                -- client object passed through to the collector
        center_point_data -- endpoint description of the centre of the star
        out_points_list   -- iterable of endpoint descriptions
        timeframe         -- timeframe passed through to the collector
        saveToDisk        -- passed through to the collector
    '''
    # The default used to be `saveToDisk=saveToDisk`, which is evaluated when the
    # function is defined and fails with a NameError unless a global of that name
    # happens to exist in the kernel.
    for dest_data in out_points_list:
        collect_and_clean_byIP_Scroll(es, center_point_data, dest_data, timeframe, saveToDisk=saveToDisk)
        collect_and_clean_byIP_Scroll(es, dest_data, center_point_data, timeframe, saveToDisk=saveToDisk)

        
def zipFolder(path="raw_data/", archive_name='folder_zipped.zip'):
    '''
    Pack every file found below `path` into a deflate-compressed zip archive.

    Parameters:
        path         -- folder that is walked recursively (default "raw_data/")
        archive_name -- file name of the archive to create; the default keeps
                        the previous fixed name 'folder_zipped.zip'

    Files are stored under the path they were found at (joined with `path`).
    '''
    archive_abs = os.path.abspath(archive_name)
    # The context manager closes the archive even if adding a file fails.
    with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                # Never add the archive to itself when it lives inside `path`.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                zipf.write(file_path)
        
            

## Collect raw data

In [6]:
# TODO: also extract the traceroute data, which lives in the
# throughput <-> throughput connections and is typed as "traceroute".

# One description per endpoint: a display name plus the addresses that are
# looked up under the "latency" and "throughput" keys by the collector.
CERN_data = dict(name="CERN", latency="128.142.223.247", throughput="128.142.223.246")
RAL_data = dict(name="RAL", latency="130.246.176.109", throughput="130.246.176.110")
PIC_data = dict(name="PIC", latency="193.109.172.188", throughput="193.109.172.187")
TRIUMF_data = dict(name="TRIUMF", latency="206.12.9.2", throughput="206.12.9.1")
BNL_data = dict(name="BNL", latency="192.12.15.26", throughput="192.12.15.23")
KIT_data = dict(name="KIT", latency="192.108.47.12", throughput="192.108.47.6")

# Outer endpoints of the star; CERN is passed as its centre below.
out_points_list = [
    RAL_data,
    PIC_data,
    TRIUMF_data,
    BNL_data,
    KIT_data,
]

# Range filter for the collection: from 'gte' up to, but excluding, 'lt'.
timeframe = dict(gte='2016-05-01', lt='2016-09-06')
starlikeConnector(es, CERN_data, out_points_list, timeframe, saveToDisk=True)

--- Starting data collection for CERN to RAL, in the time of 2016-05-01 to 2016-09-06
Number of raw records of latency is 18810
Number of raw records of packet_loss_rate is 18908
Number of raw records of throughput is 661
 

De-duplication result:
iso_8601        13164
delay_median    12314
delay_mean      12314
delay_sd        12314
packet_loss     12345
throughput        639
dtype: int64
Saving results to disk.

--- Starting data collection for RAL to CERN, in the time of 2016-05-01 to 2016-09-06
Number of raw records of latency is 17954
Number of raw records of packet_loss_rate is 17906
Number of raw records of throughput is 665
 

De-duplication result:
iso_8601        12521
delay_median    11688
delay_mean      11688
delay_sd        11688
packet_loss     11704
throughput        646
dtype: int64
Saving results to disk.

--- Starting data collection for CERN to PIC, in the time of 2016-05-01 to 2016-09-06
Number of raw records of latency is 16611
Number of raw records of packet_loss