# Exports perfsonar data in one frame per src site

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from time import time

import numpy as np
import pandas as pd

es = Elasticsearch(['atlas-kibana.mwt2.org:9200'],timeout=60)
indices = "network_weather-2017.*"

## set parameters

In [2]:
type = 'packet_loss_rate'
start_date = '2017-05-10 00:00:00'
end_date = '2017-10-10 23:59:59'

start = pd.Timestamp(start_date)
end   = pd.Timestamp(end_date)

## get unique sources

In [3]:
sources={}
my_query = {
    "size": 0,
    "aggs" : { 
        "sources" : { 
            "terms" : { 
              "field" : "src",
               "size": 500
            }
        }
    },
    'query': {
       'bool':{
            'must':[
                {'range': {'timestamp': {'gte': start.strftime('%Y%m%dT%H%M00Z'), 'lt': end.strftime('%Y%m%dT%H%M00Z')}}},
                {'term': {'_type': type}}
            ]
        }
    }
}

res = es.search(index=indices, body=my_query)

print(res['hits'])
for s in res['aggregations']['sources']['buckets']:
    print(s)
    sources[s['key']]=s['doc_count']

{'hits': [], 'max_score': 0.0, 'total': 1166905324}
{'key': '130.246.176.109', 'doc_count': 25168431}
{'key': '192.108.47.12', 'doc_count': 24046952}
{'key': '193.48.99.76', 'doc_count': 23976170}
{'key': '192.12.15.26', 'doc_count': 21604925}
{'key': '134.158.159.85', 'doc_count': 19901298}
{'key': '134.158.132.200', 'doc_count': 19532659}
{'key': '192.170.227.160', 'doc_count': 19500717}
{'key': '149.165.225.223', 'doc_count': 19490807}
{'key': '129.93.183.249', 'doc_count': 19432427}
{'key': '202.122.32.170', 'doc_count': 19355050}
{'key': '72.36.96.4', 'doc_count': 19194927}
{'key': '134.158.73.243', 'doc_count': 18500934}
{'key': '206.12.9.2', 'doc_count': 18268913}
{'key': '145.100.17.8', 'doc_count': 18251442}
{'key': '147.231.25.192', 'doc_count': 17761374}
{'key': '192.41.230.59', 'doc_count': 17648629}
{'key': '193.109.172.188', 'doc_count': 17472010}
{'key': '194.190.165.192', 'doc_count': 17230895}
{'key': '130.209.239.124', 'doc_count': 17193950}
{'key': '192.54.207.250', 

## get actual data

In [4]:

for k,v in sources.items():
    print ('source server:',k,'\tdocuments', v)
    my_query = {
        'query': { 
           'bool':{
                'must':[
                    {'range': {'timestamp': {'gte': start.strftime('%Y%m%dT%H%M00Z'), 'lt': end.strftime('%Y%m%dT%H%M00Z')}}},
                    {'term': {'src': k}},
                    {'term': {'_type': type}}
                ]
            }
        }
    }
    
    scroll = scan(client=es, index=indices, query=my_query)
    
    count = 0
    
    allData={} # will be like this: {'dest_host':[[timestamp],[value]], ...} 
    
    for res in scroll:
    #     if count<2: print(res) 
        if not count%1000000: print(count)
        dst = 'd_'+res['_source']['dest'].replace('.','_').replace(':','_') # old data - dest, new data - dest_host
        if dst not in allData: allData[dst]=[[],[]]
        allData[dst][0].append(res['_source']['timestamp'] )
        allData[dst][1].append(res['_source']['packet_loss'])

        count=count+1

    dfs=[]
    for dest,data in allData.items():
        ts=pd.to_datetime(data[0],unit='ms')
        df=pd.DataFrame({dest:data[1]}, index=ts )
        df.sort_index(inplace=True)
        df.index = df.index.map(lambda t: t.replace(second=0))
        df = df[~df.index.duplicated(keep='last')]
        dfs.append(df)
        #print(df.head(2))
    
    full_df = pd.concat(dfs, axis=1)
    print(full_df.shape)
    
    hdf = pd.HDFStore( k + '.h5')
    hdf.put('data', full_df, format='table', data_columns=True)
    hdf.close()
    
#     break

source server: 164.113.255.2 	documents 374505
0
(218035, 2)
source server: 2607:f388:107c:502::b 	documents 148
0
(147, 1)
source server: 206.166.0.130 	documents 34272
0
(17076, 1)
source server: 195.194.105.178 	documents 16832504
0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
(220219, 73)
source server: 131.243.187.37 	documents 206856
0
(205866, 1)
source server: 192.101.161.186 	documents 1294098
0
1000000
(221098, 7)
source server: 109.105.125.232 	documents 11598107
0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000


GET http://atlas-kibana.mwt2.org:9200/_search/scroll?scroll=5m [status:N/A request:60.045s]
Traceback (most recent call last):
  File "/usr/lib/python3.4/site-packages/urllib3/connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "/usr/lib/python3.4/site-packages/urllib3/connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.4/http/client.py", line 1227, in getresponse
    response.begin()
  File "/usr/lib64/python3.4/http/client.py", line 386, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.4/http/client.py", line 348, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib64/python3.4/socket.py", line 378, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):


ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='atlas-kibana.mwt2.org', port=9200): Read timed out. (read timeout=60))