# Triage Workflow Run Error

This notebook looks through Tibanna's Step Functions interface on AWS, shows some graphs, then determines the last failed execution and shows you what the error for that execution was.


In [1]:
# first we just load up some stuff
# %load init.py
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from core import utils
import boto3
import json
# raise pandas' display limits: show up to 100 columns and 1000 rows
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)
%matplotlib inline


ImportError: No module named pandas

In [None]:
# Ask the Step Functions service for the executions of the workflow state
# machine and load them into a DataFrame, newest first.
client = boto3.client('stepfunctions', region_name='us-east-1')
base_arn = 'arn:aws:states:us-east-1:643366669028:%s:%s'
STEP_FUNCTION_ARN = base_arn % ('stateMachine', 'run_sbg_workflow_2')

# a single request is made, so at most `maxResults` executions are returned
executions = client.list_executions(
    stateMachineArn=STEP_FUNCTION_ARN,
    maxResults=1000,
)
exec_df = pd.DataFrame(executions.get('executions')).sort_values(
    by='startDate', ascending=False)
exec_df.head()






In [None]:
# Clean up the names of runs so we can better group things for graphing:
# everything from the first '-', ',' or '_' to the end of the name is dropped,
# leaving only the leading prefix as the run "type".
exec_df['type'] = exec_df.name.replace(to_replace='[-,_].*$', value='', regex=True)

fig, axs = plt.subplots(1, 2, figsize=(15, 3))

# left panel: number of executions per run type, stacked by status
group_by_status = exec_df.groupby(['type', 'status'])
group_by_status.size().unstack().plot(kind='bar', stacked=True, ax=axs[0])
axs[0].set_title('Executions by type and status')

# right panel: number of executions per status across all run types
overview = exec_df.groupby(['status'])
overview.size().plot(kind='bar', stacked=True, ax=axs[1])
axs[1].set_title('Executions by status');


Now on to the diagnostics. Below you can use `num_days` to limit how far back the query looks; it will show you all the errors found in that time period.

In [None]:
# Failures in last num_days
import datetime

num_days = 10  # how many days back to look for failed executions

# the cutoff is derived from num_days so that changing it above takes effect
cutoff = datetime.datetime.today() - datetime.timedelta(days=num_days)

# NOTE(review): `cutoff` is a naive datetime; if the startDate values are
# timezone-aware this comparison may need a tz-aware cutoff -- confirm.
is_recent = exec_df.startDate > cutoff
is_failed = exec_df.status.isin(['FAILED'])
latest = exec_df.loc[is_recent & is_failed]
print("%s errors found for all step functions since %s" %
      (len(latest), cutoff.date()))

By default we will get the most recent error, but you can set exec_name if you want to look at a particular execution.

In [None]:
# unless overridden below, inspect the run in the first row of `latest`
exec_name = latest['name'].iloc[0]

# to look at one specific step-function run instead, set its name here
# exec_name = 'fastqc-0-11-4-1-1_4DNFIWJZJ6JS.fastq.gz'

def get_exec_arn(name, executions=None):
    """Return the execution ARN of the run called ``name``.

    Args:
        name: value of the ``name`` column identifying the run.
        executions: DataFrame with ``name`` and ``executionArn`` columns to
            search. Defaults to the notebook-level ``latest`` frame.

    Returns:
        The ``executionArn`` of the first matching row, or ``''`` (after
        printing a notice) when no row matches.

    Only the "no matching row" case is reported as not found; other problems
    (for example a missing column) propagate instead of being hidden.
    """
    if executions is None:
        executions = latest
    matches = executions.loc[executions['name'] == name, 'executionArn']
    if matches.empty:
        print("run %s not found " % name)
        return ''
    return matches.iloc[0]
       
def get_exec_url(name, executions=None):
    """Return the AWS console URL for the run called ``name``.

    Args:
        name: value of the ``name`` column identifying the run.
        executions: DataFrame with ``name`` and ``executionArn`` columns to
            search. Defaults to the notebook-level ``latest`` frame.

    Returns:
        The console details URL built from the first matching row's
        ``executionArn``, or ``''`` (after printing a notice) when no row
        matches.

    Only the "no matching row" case is reported as not found; other problems
    (for example a missing column) propagate instead of being hidden.
    """
    if executions is None:
        executions = latest
    matches = executions.loc[executions['name'] == name, 'executionArn']
    if matches.empty:
        print("run %s not found" % name)
        return ''
    baseurl = 'https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/'
    return '%s%s' % (baseurl, matches.iloc[0])
    
# resolve the console link and the ARN of the selected run, then show the link
url, arn = get_exec_url(exec_name), get_exec_arn(exec_name)
print("you can view the url for the failing run by clicking on the link below:\n")
print(url)

In [None]:
# now let's see which lambda for the run failed
# reverseOrder=True asks for the history newest-first, so the first event
# is the last thing that happened in the execution
resp = client.get_execution_history(executionArn=arn, reverseOrder=True)
events = resp.get('events') or []

# find the failing lambda: the first LambdaFunctionScheduled event we meet
# in the reversed history
for i, event in enumerate(events):
    if event.get('type') == 'LambdaFunctionScheduled':
        break
else:
    # covers both an empty history and one where no lambda was ever scheduled
    raise RuntimeError(
        "no LambdaFunctionScheduled event found in the history of %s" % arn)

# events from the end of the run back to (and including) the scheduling event
failing_lambda = events[:i + 1]
scheduled_details = failing_lambda[-1]['lambdaFunctionScheduledEventDetails']
lambda_arn = scheduled_details['resource']
input_json = scheduled_details
lambda_time = failing_lambda[-1]['timestamp']

# the terminal event carries the failure details under a key that depends on
# how the execution ended; take the first one that is present
failure = failing_lambda[0].get('executionFailedEventDetails')
if failure is None:
    failure = failing_lambda[0].get('executionAbortedEventDetails')
if failure is None:
    failure = failing_lambda[0].get('executionTimedOutEventDetails')

print("lambda %s \nRunning at %s failed \nFailure is %s" %
      (lambda_arn, str(lambda_time), failure))
# print("") gives a blank line on both Python 2 and Python 3; a bare `print`
# does nothing on Python 3
print("")
print("")
print("input json for the lambda was:")
print("")
print(input_json)


In [None]:
# Show the error message reported for the failure. `cause` is expected to be
# a JSON document with an 'errorMessage' field; when it is missing or is not
# such a document, fall back to showing the raw cause text instead of crashing.
cause = (failure or {}).get('cause', '')
try:
    error_message = json.loads(cause)['errorMessage']
except (ValueError, KeyError, TypeError):
    error_message = cause
error_message

What follows are the CloudWatch logs, if we can find any.

In [None]:
# fetch the configuration of the lambda identified above and report when it
# was last updated
awslambda = boto3.client('lambda', region_name='us-east-1')
lambda_details = awslambda.get_function_configuration(FunctionName=lambda_arn)
last_modified = lambda_details['LastModified']
print("lambda was last updated %s UTC time" % last_modified)


# times need to be converted to the format aws likes, which is done relative
# to a timezone-aware Unix epoch
import datetime
import pytz

epoch = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)

def unix_time_millis(dt):
    """Convert a datetime to integer milliseconds since the Unix epoch.

    Args:
        dt: a ``datetime.datetime``. Timezone-aware values are converted to
            UTC first; naive values are interpreted as already being in UTC.

    Returns:
        int: milliseconds since 1970-01-01T00:00:00Z, keeping the millisecond
        part of ``dt`` (sub-millisecond precision is dropped).
    """
    import calendar  # local import so the function does not rely on other cells

    # timegm() gives whole seconds in UTC; the fractional second is added
    # separately so it is not truncated away before scaling to milliseconds
    whole_seconds = calendar.timegm(dt.utctimetuple())
    return whole_seconds * 1000 + dt.microsecond // 1000

cloudwatch = boto3.client('logs', region_name='us-east-1')

# despite its name, this value is passed as the log *group* name below
log_stream = "/aws/lambda/%s" % lambda_details['FunctionName']

# the query window runs from the last event in `failing_lambda` (start) to
# the first one (end)
start_time_local = failing_lambda[-1]['timestamp']
end_time_local = failing_lambda[0]['timestamp']
start_time = unix_time_millis(start_time_local.astimezone(pytz.utc))
end_time = unix_time_millis(end_time_local.astimezone(pytz.utc))

cloudwatch.filter_log_events(
    logGroupName=log_stream,
    startTime=start_time,
    endTime=end_time,
)





Some of the checks below should help identify common problems, such as a file that doesn't exist...

In [None]:
# TODO: ensure workflow exists on target server

# TODO: check the input files exists on s3

# TODO: validate input for step
