# In [None]:
import os

import boto3
import numpy as np
import pandas as pd
from tqdm import tqdm

# ARN of the AWS Batch job queue whose jobs are inspected by the functions below.
JOB_QUEUE_ARN = 'arn:aws:batch:us-west-2:176245438256:job-queue/BatchJobQueue-fzRFEEEvL4l1jLRp'

# Set the profile before creating any client: the clients below are built from boto3's default session,
# which picks up AWS_PROFILE from the environment when it is created.
os.environ['AWS_PROFILE'] = 'saml-pub'
# Module-level clients shared by every function in this file (Batch for jobs, CloudWatch Logs for job output).
batch = boto3.client('batch')
log_client = boto3.client('logs')


def chunk(itr, n: int = 100):
    """Yield consecutive slices of ``itr`` that each hold at most ``n`` items.

    ``itr`` must support ``len()`` and slicing. The final slice may be shorter than ``n``; an empty
    ``itr`` yields nothing.
    """
    offsets = range(0, len(itr), n)
    yield from (itr[offset:offset + n] for offset in offsets)

def get_all_batch_jobs():
    """Collect the summaries of every FAILED job in the ``JOB_QUEUE_ARN`` queue.

    Keeps requesting pages from ``list_jobs`` for as long as the response carries a ``nextToken``.

    Returns:
        A list holding the ``jobSummaryList`` entries of every page, in the order they were returned.
    """
    query = {'jobQueue': JOB_QUEUE_ARN, 'jobStatus': 'FAILED'}
    summaries = []

    while True:
        page = batch.list_jobs(**query)
        summaries.extend(page['jobSummaryList'])

        if 'nextToken' not in page:
            return summaries
        # Ask for the following page on the next iteration.
        query['nextToken'] = page['nextToken']

def get_last_log_event(log_stream):
    """Return the message of the most recent event in a Batch job's log stream.

    This is a best-effort lookup: any failure while talking to CloudWatch Logs is reported as ``None``
    rather than raised, so one unreadable stream does not abort the caller's loop.

    Args:
        log_stream: Name of a log stream inside the ``/aws/batch/job`` log group.

    Returns:
        The ``message`` of the last event found, or ``None`` if the stream has no events or the
        lookup failed.
    """
    log_query = {
        'logGroupName': '/aws/batch/job',
        'logStreamName': log_stream,
    }
    next_token = None
    events = []

    # See https://github.com/boto/boto3/issues/3718
    # This implementation is what was recommended in the boto3 issue above, but it is HTTP-request intensive
    # because it may page through a pile of empty pages of events.
    # It should be possible to refactor it such that you:
    #    - describe the log stream: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/logs/client/describe_log_streams.html
    #    - use the lastEventTimestamp in the response above to set a startTime and endTime in the get_log_events call: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/logs/client/get_log_events.html
    # which would make it only 2 HTTP calls ever.
    try:
        while not events:
            if next_token:
                log_query['nextToken'] = next_token

            response = log_client.get_log_events(**log_query)
            events.extend(response['events'])

            backward_token = response.get('nextBackwardToken')
            # A repeated token means we have reached the start of the stream; a missing one means there is
            # nothing further to page to. Either way stop, instead of re-issuing the same request forever.
            if not backward_token or backward_token == next_token:
                break
            next_token = backward_token
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` this lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return None

    if not events:
        return None
    return events[-1].get('message')


def get_attempts(batch_jobs):
    """Describe the given Batch jobs and collect one record per job attempt.

    Jobs are described in groups via ``describe_jobs``; for every attempt that reports a log stream, the
    last log line is fetched with ``get_last_log_event``. Progress is shown with ``tqdm``.

    Args:
        batch_jobs: Sequence of job summaries (dicts with a ``jobId`` key), e.g. the result of
            ``get_all_batch_jobs``.

    Returns:
        A list of ``(job_name, job_id, status_reason, container_reason, last_log_line)`` tuples, one per
        attempt. Fields that Batch did not report are ``None``. Jobs without attempts add no records.
    """
    # describe_jobs accepts at most 100 job IDs per request.
    jobs_per_request = 100
    # Ceiling division keeps the progress-bar total an int (np.ceil would hand tqdm a float).
    n_requests = -(-len(batch_jobs) // jobs_per_request)

    batch_attempts = []
    for jobs in tqdm(chunk(batch_jobs, jobs_per_request), total=n_requests):
        response = batch.describe_jobs(jobs=[job['jobId'] for job in jobs])
        for job in response['jobs']:
            for attempt in job.get('attempts', []):
                # Batch does not guarantee container details or a status reason on every attempt, so read
                # them defensively instead of letting one sparse attempt abort the whole run with KeyError.
                container = attempt.get('container') or {}

                log_stream = container.get('logStreamName')
                log_line = get_last_log_event(log_stream) if log_stream else None

                batch_attempts.append(
                    (job['jobName'], job['jobId'], attempt.get('statusReason'), container.get('reason'), log_line)
                )
    return batch_attempts

if __name__ == '__main__':
    # Gather the FAILED jobs from the queue, then expand them into one record per job attempt.
    batch_jobs = get_all_batch_jobs()
    batch_attempts = get_attempts(batch_jobs)

    # NOTE: We *name* batch jobs using their associated HyP3 job_id; this job_id is the batch job_id
    # Column order mirrors the tuples built by get_attempts: 'status' holds the attempt's statusReason and
    # 'reason' holds the container's reason.
    df = pd.DataFrame(batch_attempts, columns=['job_name', 'job_id', 'status', 'reason', 'last_log_line'])
    # Relative path, resolved against the current working directory. With pandas' defaults the DataFrame
    # index is written as the first CSV column.
    df.to_csv('scripts/2024-08-28-ARIA-failures.csv')