In [15]:
# Automating support for ATLAS Distributed Computing Operations

#Three-sentence description: This project delivers a system to automate workflows usually done by
#human shifters. The first milestone is to provide a recommendation
#system with an integrated feedback loop. The second milestone is to
#automate recommendations and decisions based on machine-learning algorithms.

In [16]:
#These lines set up inline plotting, and apply a standard size
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import scipy
import re
from elasticsearch import Elasticsearch
# Default font size for every figure drawn in this notebook.
matplotlib.rc('font', size=15)
from elasticsearch.helpers import scan
from time import time

In [17]:
## Converting to epoch time
## Each value is multiplied by 1000 because the time format specified in the
## queries is milliseconds (epoch_millis).

import time
import datetime

_DAY_FORMAT = "%d/%m/%Y"


def _days_ago(days):
    """Return the local date `days` days before today as a dd/mm/YYYY string."""
    return (datetime.date.today() - datetime.timedelta(days)).strftime(_DAY_FORMAT)


def _epoch_millis(day_string):
    """Convert a dd/mm/YYYY string to epoch milliseconds (local time, 00:00:00)."""
    parsed = datetime.datetime.strptime(day_string, _DAY_FORMAT)
    return int(time.mktime(parsed.timetuple())) * 1000


# Day strings for the reference dates.
today = time.strftime(_DAY_FORMAT)
yesterday = _days_ago(1)
last_week = _days_ago(7)
last_month = _days_ago(31)

# The same dates as epoch milliseconds.
today_epoch, yesterday_epoch, last_week_epoch, last_month_epoch = (
    _epoch_millis(day) for day in (today, yesterday, last_week, last_month)
)

print(today_epoch, yesterday_epoch, last_week_epoch, last_month_epoch)
print(today_epoch - yesterday_epoch, today_epoch - last_week_epoch)
 

1487743200000 1487656800000 1487138400000 1485064800000
86400000 604800000


In [18]:
# Query for the completed transfers: "transfer-done" events in the
# data16_13TeV scope whose @timestamp lies between last_month_epoch and
# today_epoch (both given in epoch milliseconds).
new_query = {
    "size": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "range": {
                        "@timestamp": {
                            "gte": last_month_epoch,
                            "lte": today_epoch,
                            "format": "epoch_millis",
                        }
                    }
                },
                {"term": {"event_type": "transfer-done"}},
                {"term": {"payload.scope": "data16_13TeV"}},
            ]
        }
    },
}

In [19]:
# Query for the failed transfers: "transfer-failed" events in the
# data16_13TeV scope whose @timestamp lies between last_month_epoch and
# today_epoch (both given in epoch milliseconds).
my_query = {
    "size": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "range": {
                        "@timestamp": {
                            "gte": last_month_epoch,
                            "lte": today_epoch,
                            "format": "epoch_millis",
                        }
                    }
                },
                {"term": {"event_type": "transfer-failed"}},
                {"term": {"payload.scope": "data16_13TeV"}},
            ]
        }
    },
}





In [20]:
# Elasticsearch client and the index pattern that is searched.
es = Elasticsearch(['atlas-kibana.mwt2.org:9200'], timeout=60)
my_index = "rucio-events-2017*"

# Both scans use the same index and scroll settings; only the query differs.
_scan_settings = {"index": my_index, "scroll": '5m', "timeout": "5m", "size": 100}
scroll = scan(es, query=my_query, **_scan_settings)       # transfer-failed events
new_scroll = scan(es, query=new_query, **_scan_settings)  # transfer-done events


In [None]:
#for res in new_scroll:
#    print(res['_source']['event_type'])
    

In [None]:

# Put this on ice for now
"""
error_list = ['SOURCE CHECKSUM MISMATCH User defined checksum and source checksum do not match',
              'TRANSFER SOURCE CHECKSUM MISMATCH User defined checksum and source checksum do not match',
              'TRANSFER CHECKSUM MISMATCH Source and destination checksums do not match',
              'the server responded with an error 421',
              'the server responded with an error 451',
              'the server responded with an error 500',
              'the server responded with an error 530',
              'TRANSFER SOURCE CHECKSUM srm-ifce err: Communication error on send',
              'Transfer process died with:',
              'TRANSFER  globus_xio: Unable to connect to',
              'TRANSFER DESTINATION MAKE_PARENT srm-ifce err: Permission denied',
              'DESTINATION SRM_PUTDONE call to srm_ifce error:',
              'DESTINATION OVERWRITE srm-ifce err: Communication error on send',
              'TRANSFER DESTINATION OVERWRITE srm-ifce err: Communication error on send',
              'TRANSFER SOURCE SRM_GET_TURL error on the turl request ',
              'TRANSFER DESTINATION SRM_PUTDONE call to srm_ifce error:',
              'Operation timed out, operation timeout',
              'User specified source file size is', #X but stat returned Y
             ]
"""

# Error messages that are counted verbatim (matched as substrings of the
# failure reason) instead of being reduced to their leading upper-case words.
exceptions = ('[gfalt_copy_file][perform_copy][srm_plugin_filecopy][srm_resolve_turls][srm_resolve_put_turl] DESTINATION OVERWRITE [srm_plugin_prepare_dest_put][srm_plugin_delete_existing_copy][gfal_srm_unlinkG][gfal_srm_rm_srmv2_internal] error reported from srm_ifce, [SE][srmRm][SRM_AUTHORIZATION_FAILURE] No approachable VFS found for user!',
              'error on the bring online request',
              'bring-online timeout has been exceeded',
              'Transfer process died with:')

# Data-format tags looked for in the file name. Defined once here because the
# list never changes between events.
format_list = ['HIST','log','AOD','DAOD','RAW','DRAW','ESD','DESDM']

# Accumulators filled from the "transfer-failed" events:
count=0
size_failures = []          # file size of every failed transfer
endpointPairs = {}          # (source, destination) -> number of failures
problem_sources = {}        # source RSE -> number of failures
problem_destinations = {}   # destination RSE -> number of failures
duration_failures = []      # duration of every failed transfer
reasons = {}                # error category -> number of failures
failure_types = {}          # src-type -> number of failures
data_types = {}             # data-format tag -> number of failures

for res in scroll:
    # Progress output, plus one full record so the event layout can be inspected.
    if not count%10000:  print(count)
    if count<1: print(res)
    count += 1
    if res['_source']['event_type'] != 'transfer-failed':
        continue

    payload = res['_source']['payload']
    name = payload['name']
    size_failures.append(payload['file-size'])
    duration_failures.append(payload['duration'])
    source = payload['src-rse']
    destination = payload['dst-rse']
    reason = payload['reason']
    src_type = payload['src-type']
    pair = (source,destination)

    # dict.get(key, 0) + 1 counts every event exactly once. The previous
    # "set to 1, then increment if present" pattern counted the first
    # occurrence of each key twice.
    endpointPairs[pair] = endpointPairs.get(pair, 0) + 1
    failure_types[src_type] = failure_types.get(src_type, 0) + 1
    problem_sources[source] = problem_sources.get(source, 0) + 1
    problem_destinations[destination] = problem_destinations.get(destination, 0) + 1

    #categorising the error messages
    known_phrases = [phrase for phrase in exceptions if phrase in reason]
    if known_phrases:
        # The reason contains at least one known message: count each of them.
        for phrase in known_phrases:
            reasons[phrase] = reasons.get(phrase, 0) + 1
    else:
        # Otherwise the category is the run of leading upper-case words, i.e.
        # everything before the first word that is not upper-case. A reason
        # made only of upper-case words is not recorded.
        words = reason.split(' ')
        for position, word in enumerate(words):
            if not word.isupper():
                category = " ".join(words[:position])
                reasons[category] = reasons.get(category, 0) + 1
                break

    #data format
    # The tag is looked for in the first and the fifth dot-separated field of
    # the name (the part before the first underscore). The fields are addressed
    # by position: list.index() returns the first occurrence, so a repeated
    # field was either counted again or skipped.
    fields = name.split('.')
    counter = 0
    for position in (0, 4):
        if position >= len(fields):
            continue
        tag = fields[position].split('_')[0]
        if tag in format_list:
            data_types[tag] = data_types.get(tag, 0) + 1
            counter += 1

    if counter == 0:
        print('format ',name,' not in format_list' )
        
       
                    
                
        
        
        
           # if reason not in reasons.keys():
           #     reasons[reason] = 1
           # if reason in reasons.keys():
           #     reasons[reason] = reasons[reason]+1
           
   
# Python notes:
# Check whether a string occurs inside any string of a list, e.g. looking for
# "or" in ["red", "orange"] gives True:
#     result = any(word_to_check in word for word in wordlist)
# Conversely,
#     any(substring in string for substring in substring_list)
# returns True if any of the substrings in substring_list is contained in string.
#
# String manipulation for the reasons:
#     word.split(' ')                            # split on single spaces
#     word.startswith("H"), word.endswith("H")   # prefix / suffix tests
#     string.upper(), string.lower()             # upper-case / lower-case copy
#     word.isdigit()                             # True if the string is non-empty and all digits
          
# Accumulators filled from the "transfer-done" events:
count = 0
pairs = {}          # (source, destination) -> number of completed transfers
sources = {}        # source RSE -> number of completed transfers
destinations = {}   # destination RSE -> number of completed transfers
size_all = []       # file size of every completed transfer
duration_all = []   # duration of every completed transfer
all_types = {}      # src-type -> number of completed transfers

for res in new_scroll:
    # Progress output, plus one full record so the event layout can be inspected.
    if not count%10000: print(count)
    if count<1: print(res)
    count += 1
    if res['_source']['event_type'] != 'transfer-done':
        continue

    payload = res['_source']['payload']
    size_all.append(payload['file-size'])
    duration_all.append(payload['duration'])
    new_source = payload['src-rse']
    new_destination = payload['dst-rse']
    src_type = payload['src-type']
    new_pair = (new_source,new_destination)

    # dict.get(key, 0) + 1 counts every event exactly once. The previous
    # "set to 1, then increment if present" pattern counted the first
    # occurrence of each key twice.
    pairs[new_pair] = pairs.get(new_pair, 0) + 1
    sources[new_source] = sources.get(new_source, 0) + 1
    destinations[new_destination] = destinations.get(new_destination, 0) + 1
    all_types[src_type] = all_types.get(src_type, 0) + 1
            
    


0
{'_source': {'@timestamp': '2017-02-09T00:36:16.290Z', 'created_at': '2017-02-09 00:36:15', 'payload': {'started_at': '2017-02-08 20:15:28', 'transferred_at': '2017-02-09 00:33:12', 'transfer-link': 'https://fts3.cern.ch:8449/fts3/ftsmon/#/job/04db112d-2f61-5550-b0ee-c64327fcdc7b', 'created_at': None, 'submitted_at': '2017-02-08 20:11:37', 'protocol': 'srm', 'transfer-id': '04db112d-2f61-5550-b0ee-c64327fcdc7b', 'src-type': 'TAPE', 'previous-request-id': '2a371460bbba492f800f977d94605bf2', 'checksum-adler': '0a8e1498', 'src-rse': 'IN2P3-CC_DATATAPE', 'scope': 'data16_13TeV', 'name': 'data16_13TeV.00301932.physics_Main.daq.RAW._lb0651._SFO-4._0002.data', 'file-size': 2621880440, 'guid': None, 'bytes': 2621880440, 'activity': 'User Subscriptions', 'tool-id': 'rucio-conveyor', 'dst-rse': 'IN2P3-CC_DATADISK', 'src-url': 'srm://ccsrm.in2p3.fr:8443/srm/managerv2?SFN=/pnfs/in2p3.fr/data/atlas/atlasdatatape/data16_13TeV/RAW/other/data16_13TeV.00301932.physics_Main.daq.RAW/data16_13TeV.003019

In [None]:
# Histogram of the failed transfers by file size. The integer bin width is
# the largest failed file size divided by `bins`.
bins = 50
largest_failed_size = max(size_failures)
bin_width = int(largest_failed_size/bins)
size_failure_edges = np.arange(0, largest_failed_size + bin_width, bin_width)

plt.hist(size_failures, bins=size_failure_edges)
plt.xlabel('File Size')
plt.ylabel('Failures')
plt.title('Failures as a Function of File Size')


In [None]:
# Histogram of the failed transfers by duration. `bins` is the bin count set
# in an earlier cell.
longest_failed_duration = max(duration_failures)
bin_width = int(longest_failed_duration/bins)
duration_failure_edges = np.arange(0, longest_failed_duration + bin_width, bin_width)

plt.hist(duration_failures, bins=duration_failure_edges)
plt.xlabel('Duration')
plt.ylabel('Failures')
plt.title('Failures as a Function of Event Duration')

In [None]:
# Histogram of the completed transfers by file size.
bins = 50
largest_size = max(size_all)
bin_width = int(largest_size/bins)
# The upper edge has to come from size_all as well. It used max(size_failures)
# before, which did not match the bin width computed from size_all and cut
# the histogram off at the largest *failed* file size.
plt.hist(size_all,bins=np.arange(0, largest_size + bin_width, bin_width))
plt.xlabel('File Size')
plt.ylabel('Transfers')
plt.title('Transfers as a Function of File Size')

In [None]:
# Histogram of the completed transfers by duration. `bins` is the bin count
# set in an earlier cell.
longest_duration = max(duration_all)
bin_width = int(longest_duration/bins)
duration_edges = np.arange(0, longest_duration + bin_width, bin_width)

plt.hist(duration_all, bins=duration_edges)
plt.xlabel('Duration')
plt.ylabel('Transfers')
plt.title('Transfers as a Function of Duration')




In [None]:
#normalising transfer duration plot
# Failure rate per duration bin = failures / (failures + completed transfers).

# Both histograms must be built on the same bin edges; with different ranges,
# bin i of one histogram covers a different duration interval than bin i of
# the other and the ratio is meaningless. np.histogram ignores values outside
# the range, so completed transfers longer than the longest failure are left out.
duration_range = (0, max(duration_failures))
hist_duration_failures = np.histogram(duration_failures, bins, range=duration_range)
hist_duration_all      = np.histogram(duration_all, bins, range=duration_range)

failed_per_bin = hist_duration_failures[0]
total_per_bin = failed_per_bin + hist_duration_all[0]

# Bins without any transfer get a rate of 0 instead of 0/0 (nan plus a RuntimeWarning).
duration_values = list(np.divide(failed_per_bin, total_per_bin,
                                 out=np.zeros(len(total_per_bin)),
                                 where=total_per_bin > 0))

# Use the histogram's own left edges as bar positions. A float-step np.arange
# can produce one element more than there are bins.
duration_edges = hist_duration_failures[1]
duration_norm = (duration_values, duration_edges[:-1])

# align='edge' makes each bar span its bin instead of being centred on the left edge.
plt.bar(duration_norm[1], duration_norm[0], width=np.diff(duration_edges), align='edge')
plt.xlabel('Duration')
plt.ylabel('Failure Rate')
plt.title('Transfer Duration Failure Rate')

In [None]:
#normalising file size plot
# Failure rate per file-size bin = failures / (failures + completed transfers).
# syntax: numpy.histogram(a, bins=10, range=None, ...) returns (counts, bin_edges)

# Both histograms share the same range so that their bins line up.
# np.histogram ignores values outside the range, so completed transfers larger
# than the largest failed file are left out.
size_range = (0, max(size_failures))
failures = np.histogram(size_failures, bins, range=size_range)
all_transfers = np.histogram(size_all, bins, range=size_range)

failed_per_bin = failures[0]
total_per_bin = failed_per_bin + all_transfers[0]

# Bins without any transfer get a rate of 0 instead of 0/0 (nan plus a RuntimeWarning).
values = list(np.divide(failed_per_bin, total_per_bin,
                        out=np.zeros(len(total_per_bin)),
                        where=total_per_bin > 0))

# Use the histogram's own left edges as bar positions. A float-step np.arange
# can produce one element more than there are bins.
size_edges = failures[1]
failures_norm = (values, size_edges[:-1])

# align='edge' makes each bar span its bin instead of being centred on the left edge.
plt.bar(failures_norm[1], failures_norm[0], width=np.diff(size_edges), align='edge')
plt.xlabel('File Size')
plt.ylabel('Failure Rate')
plt.title('File Size Failure Rate')

#for i in range(len(values)):
#    print(values[i],'\t\t',failures[0][i],all_transfers[0][i],)

  

In [None]:
import operator

# Rank site pairs, sources and destinations by their failure count (ascending).
by_count = operator.itemgetter(1)
sorted_endpointPairs = sorted(endpointPairs.items(), key=by_count)
sorted_problem_sources = sorted(problem_sources.items(), key=by_count)
sorted_problem_destinations = sorted(problem_destinations.items(), key=by_count)

#for item in sorted_endpointPairs:
    #if item[1] > 100:
    #    print(item[0],'\t\t',item[1])
        
        
        
#for site in sorted_problem_sources:
    #sorted_problem_sources[site[0]] = site[1]/count
#    print(site[0],site[1])
        
#for site in sorted_problem_destinations:
#    print(site[0],site[1])
    
        

    

In [None]:
## failure rates for the site pairs, the sources and the destinations
## rate = failures / (completed transfers + failures)

## Number of failed transfers needed
threshold = 100
## Minimum failure rate for a site pair to be listed
ratio_threshold = 0.3

# Site pairs: both the failure count and the failure rate must reach their threshold.
ratios = {}
for pair, failed in sorted_endpointPairs:
    if pair not in pairs or failed < threshold:
        continue
    rate = failed/(pairs[pair]+failed)
    if rate >= ratio_threshold:
        ratios[pair] = rate

sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1))

print('Failure Rates for Site Pairs:\n')
for pair, rate in sorted_ratios:
    print(pair,'\t\t',rate)
print('\n')

# Sources: only the failure-count threshold applies.
source_ratios = [[site, failed/(sources[site]+failed)]
                 for site, failed in sorted_problem_sources
                 if site in sources and failed >= threshold]

sorted_problem_source_ratios = sorted(source_ratios, key=operator.itemgetter(1))

print('Source Failure Rates:\n')
for site, rate in sorted_problem_source_ratios:
    print(site,'\t\t',rate)
print('\n')

# Destinations: only the failure-count threshold applies.
destination_ratios = [[site, failed/(destinations[site]+failed)]
                      for site, failed in sorted_problem_destinations
                      if site in destinations and failed >= threshold]

sorted_problem_destination_ratios = sorted(destination_ratios, key=operator.itemgetter(1))

print('Destination Failure Rates:\n')
for site, rate in sorted_problem_destination_ratios:
    print(site,'\t\t',rate)
print('\n')

In [None]:
#2d histogram: duration vs file size for failures
# x values: size_failures, y values: duration_failures
# Both axes run from 0 to the largest observed value; cmin=1 leaves bins
# without any entry undrawn.
failure_extent = [[0, max(size_failures)], [0, max(duration_failures)]]

plt.hist2d(size_failures, duration_failures, bins=25, range=failure_extent, cmin=1)
plt.colorbar()
plt.xlabel('size')
plt.ylabel('duration')
plt.title('duration vs file size for failures')







In [None]:
#2d histogram: duration vs file size for transfers
# x values: size_all, y values: duration_all
# Both axes run from 0 to the largest observed value; cmin=1 leaves bins
# without any entry undrawn.
transfer_extent = [[0, max(size_all)], [0, max(duration_all)]]

plt.hist2d(size_all, duration_all, bins=25, range=transfer_extent, cmin=1)
plt.colorbar()
plt.xlabel('size')
plt.ylabel('duration')
plt.title('duration vs file size for transfers')



In [None]:
##heatmap of failure rates
#import pandas as pd

##used for debugging (matrices are hard)
test_pairs = {('a','b'): 1, ('b','c'): 2, ('a','c'): 1, ('c','d'): 3}
sorted_pairs = sorted(test_pairs.items(), key=operator.itemgetter(1))

# Axis labels in order of first appearance in sorted_ratios. Each entry of
# sorted_ratios is ((sending site, receiving site), failure rate).
# The loop variables are deliberately not called `pairs`: that name holds the
# per-pair transfer counts needed by the ratio cell, and rebinding it here
# broke that cell when it was re-run.
rows = []      # receiving sites (y axis)
columns = []   # sending sites (x axis)
for (sender, receiver), _rate in sorted_ratios:
    if sender not in columns:
        columns.append(sender)
    if receiver not in rows:
        rows.append(receiver)

# Matrix of failure rates; site pairs that are not in sorted_ratios stay at 0.
values = np.zeros((len(rows),len(columns)))
for (sender, receiver), rate in sorted_ratios:
    # Plain index assignment; ndarray.itemset no longer exists in NumPy 2.
    values[rows.index(receiver), columns.index(sender)] = rate


fig, ax = plt.subplots()
im = ax.pcolor(values, cmap='Reds', edgecolor='black')
fig.colorbar(im)

# Put one tick in the middle of every cell and label it with the site name.
ax.set_xticks(np.arange(0.5, len(columns)))
ax.set_xticklabels(columns, rotation=90)
ax.set_yticks(np.arange(0.5, len(rows)))
ax.set_yticklabels(rows)
ax.set_aspect('equal')

ax.set_xlabel('Sending Site')
ax.set_ylabel('Receiving Site')
ax.set_title('Failure Rates')

plt.show()



##kept for posterity
#p = pd.DataFrame(values, rows, columns, dtype=None, copy=False)

#todo: compare top and bottom failure rate pairs for noticeable differences. file size etc?




In [None]:
#Reasons for errors

# Error categories ordered by how often they occurred (ascending).
sorted_reasons = sorted(reasons.items(), key=lambda entry: entry[1])

# Number of distinct categories, followed by one (category, count) pair per line.
print(len(sorted_reasons))
for reason_entry in sorted_reasons:
    print(reason_entry)


In [None]:
# Failure rate per source type: failures / (completed transfers + failures).
print(failure_types)
print(all_types)
for key in failure_types:
    failed = failure_types[key]
    # A source type may have failures but no completed transfer at all;
    # treat that as 0 completed instead of raising KeyError.
    completed = all_types.get(key, 0)
    print(key,' failure rate: ',failed/(completed+failed))

In [None]:
#file formats

# Data-format tags of the failed transfers ordered by count (ascending).
sorted_types = sorted(data_types.items(), key=lambda entry: entry[1])

# Number of distinct formats, followed by one (format, count) pair per line.
print(len(sorted_types))
for type_entry in sorted_types:
    print(type_entry)