## Import packages and connect to Elasticsearch

In [1]:
from elasticsearch import Elasticsearch, exceptions as es_exceptions
from elasticsearch.helpers import scan
es = Elasticsearch([{'host':'uct2-es-door.mwt2.org', 'port':9200}])

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

## Set time range and index in query

In [11]:
# Monthly index to search; change it to match the month being analysed.
ind="stashcp-2016.8"


def build_time_range_query(gte_ms, lte_ms):
    """Return a match-everything ES query restricted to a `timestamp` window.

    Uses the legacy ``filtered`` query syntax that the rest of this notebook
    relies on. A fresh dict is built on every call, so callers may mutate the
    result without affecting other queries.

    Parameters
    ----------
    gte_ms : int
        Inclusive lower bound of `timestamp`, in epoch milliseconds.
    lte_ms : int
        Inclusive upper bound of `timestamp`, in epoch milliseconds.

    Returns
    -------
    dict
        Request body suitable for ``es.search(body=...)`` / ``helpers.scan``.
    """
    return {
      "query": {
        "filtered": {
          "query": {
            "query_string": {
              "query": "*",
              "analyze_wildcard": True
            }
          },
          "filter": {
            "bool": {
              "must": [
                {
                  "query": {
                    "query_string": {
                      "analyze_wildcard": True,
                      "query": "*"
                    }
                  }
                },
                {
                  "range": {
                    "timestamp": {
                      "gte": gte_ms,
                      "lte": lte_ms,
                      "format": "epoch_millis"
                    }
                  }
                }
              ],
            }
          }
        }
      },
    }


# Window on 2016-07-01 (UTC).
# NOTE(review): this falls outside August, so it presumably needs a different
# `ind` than "stashcp-2016.8" -- confirm before using it.
myquery1 = build_time_range_query(1467383312083, 1467387756827)

# Window on 2016-08-12 (UTC); matches the August index selected above.
myquery2 = build_time_range_query(1471010364437, 1471036424501)

## Pull data from ES and convert to a pandas DataFrame

In [13]:
# Stream every matching document with the scan helper (imported at the top of
# the notebook). It drives the scroll API itself and yields individual hits,
# so no hit is dropped when a scroll page carries more than one document, and
# pages are fetched in batches instead of one document per round trip.
Res = [
    hit['_source']
    for hit in scan(es, query=myquery2, index=ind, scroll='2m', size=1000)
]

# Number of records actually retrieved; plot_jobs() reads this global.
scroll_size = len(Res)

print('Number of records: ', scroll_size)

df = pd.DataFrame(Res)

#ignore the following fields
df = df.drop(
    ['destination_space', 'xrdcp_version', 'tries',
     'download_size', 'filesize', 'IP'],
    axis=1,
)

# Start of each transfer: `timestamp` minus `download_time`.
# The recorded output shows both fields as millisecond values, so `begin`
# stays in epoch milliseconds (no unit conversion happens here).
df['begin'] = df['timestamp'].astype('int64') - df['download_time'].astype('int64')

print(df.head(2))
print(list(df.columns.values))

Number of records:  4188
                      cache download_time           end1 end2  \
0  root://hcc-stash.unl.edu         36342  1471011229225    0   
1  root://hcc-stash.unl.edu         37140  1471011229936    0   

                            filename                      host    sitename  \
0  user/taburaad/public/2gb_file.tar  root://hcc-stash.unl.edu  Tusker-CE1   
1  user/taburaad/public/2gb_file.tar  root://hcc-stash.unl.edu   Crane-CE1   

          start1 start2 start3   status      timestamp xrdexit1 xrdexit2  \
0  1471011192883      0      0  Success  1471011229000        0       -1   
1  1471011192796      0      0  Success  1471011229000        0       -1   

  xrdexit3          begin  
0       -1  1471011192658  
1       -1  1471011191860  
['cache', 'download_time', 'end1', 'end2', 'filename', 'host', 'sitename', 'start1', 'start2', 'start3', 'status', 'timestamp', 'xrdexit1', 'xrdexit2', 'xrdexit3', 'begin']


In [14]:
def _epoch_ms_to_local(values):
    """Convert a Series of epoch-millisecond values to a naive local-time DatetimeIndex."""
    # Truncate to whole seconds, then use the machine's local timezone
    # (datetime.fromtimestamp), which is how the time axis has always been drawn.
    seconds = values.astype('int64') // 1000
    return pd.DatetimeIndex(pd.to_datetime(seconds.map(datetime.datetime.fromtimestamp)))


def _site_timelines(site_jobs):
    """Return per-minute (running jobs, 'Trunk Success' count, 'Timeout' count) series."""
    begin_times = _epoch_ms_to_local(site_jobs['begin'])
    end_times = _epoch_ms_to_local(site_jobs['timestamp'])

    # +1 at every start and -1 at every end; the running total of the
    # per-minute sums is the number of jobs in flight.
    starts = pd.Series(1, index=begin_times)
    ends = pd.Series(-1, index=end_times)
    jobs = pd.concat([starts, ends]).resample('1Min').sum().cumsum()

    # Status flags are counted in the minute in which the job ended.
    status = site_jobs['status']
    trunk_flags = pd.Series((status == 'Trunk Success').astype(int).values, index=end_times)
    timeout_flags = pd.Series((status == 'Timeout').astype(int).values, index=end_times)
    trunk = trunk_flags.resample('1Min').sum()
    timeout = timeout_flags.resample('1Min').sum()
    return jobs, trunk, timeout


def plot_jobs(site, name, data=None):
    """Plot concurrent jobs against per-minute completion status for one host.

    Draws the number of running jobs on the left axis and, on the right axis,
    the per-minute counts of jobs that ended with status 'Trunk Success' and
    'Timeout'. The figure is saved to ``name + '.png'`` and then shown.

    Parameters
    ----------
    site : str
        Value of the `host` column to select (also used in the title).
    name : str
        Output file name without the '.png' extension.
    data : pandas.DataFrame, optional
        Records with `host`, `status`, `begin` and `timestamp` columns, the
        last two in epoch milliseconds. Defaults to the notebook-level `df`.

    Raises
    ------
    ValueError
        If no record in `data` has `host == site`.
    """
    records = df if data is None else data

    # Boolean mask instead of appending matching rows one at a time.
    site_jobs = records[records['host'] == site]
    if site_jobs.empty:
        raise ValueError('No records found for host ' + repr(site))

    jobs, trunk, timeout = _site_timelines(site_jobs)

    plt.figure(figsize=[18, 8])
    plt.title('Concurrent jobs vs. completion status of ' + site)
    left_ax = jobs.plot(label="# Jobs", legend=True)
    left_ax.set_ylabel('# jobs running', color='b', fontsize=18)
    # With secondary_y=True pandas returns the right-hand (twin) axes.
    right_ax = trunk.plot(secondary_y=True, label="# Jobs ending at origin", legend=True)
    right_ax.set_ylabel('# jobs ending', color='k', fontsize=18)
    timeout.plot(secondary_y=True, label="# Jobs Timeout", legend=True)
    # The x label goes on the primary axes: the twin axes hide their x-axis,
    # so a label set there is never drawn.
    left_ax.set_xlabel('Time', color='k', fontsize=18)
    plt.savefig(name+'.png')
    plt.show()
