# Utility script

In [None]:
def get_starttime_ms():
    """
    Return the current time as a timestamp expressed in milliseconds.

    The value is a float: the result of `datetime.now().timestamp()`
    scaled from seconds to milliseconds.
    """
    now_in_seconds = datetime.now().timestamp()
    return now_in_seconds * 1000

def get_elapsed_time(st_time: datetime, display:bool = True):
    """
    Return the number of seconds elapsed since `st_time`, rounded to 2 decimals.

    params:
        st_time (datetime): reference point the elapsed time is measured from
        display (bool): when True, the elapsed time is also printed
    """
    delta = datetime.now() - st_time
    rounded_secs = round(delta.total_seconds(), 2)
    if not display:
        return rounded_secs
    print(f"execution time: {rounded_secs:.2f} seconds")
    return rounded_secs

In [None]:
def get_events_from_cloudwatch_logs(
    log_client: 'CloudWatchLogsClient',
    log_groupname: str,
    start_time_ms: int=None,
    pattern: str=None) -> list:
    """ 
    Function to get the lambda log events, following pagination until exhausted.

    params:
        log_client: client object exposing `filter_log_events(**kwargs)`
        log_groupname (str): name of the log group to query
        start_time_ms (int): start time in milliseconds. Filters the logs where log_time >= starttime_ms.
            Only omitted from the request when it is None (an explicit 0 is forwarded).
        pattern (str): pattern to filter the logs. Should match cloudWatch Patterns
        https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html

    returns:
        list with the events of every page, in the order the pages were returned

    raises:
        any exception raised by the client is printed and re-raised unchanged
    """
    request_kwargs = {'logGroupName': log_groupname}
    # compare against None so that an explicit start time of 0 is not dropped
    if start_time_ms is not None:
        request_kwargs['startTime'] = int(start_time_ms)
    if pattern:
        request_kwargs['filterPattern'] = pattern

    all_events = []

    try:
        while True:
            page = log_client.filter_log_events(**request_kwargs)
            # a page without an 'events' entry contributes nothing instead of crashing
            all_events.extend(page.get('events') or [])
            next_token = page.get('nextToken')
            # stop when there is no (non-empty) token for a following page
            if not next_token:
                break
            request_kwargs['nextToken'] = next_token
    except Exception as e:
        # log error, then let the caller deal with it
        error_message = str(e)
        print(error_message)
        raise

    return all_events

## DuckDB Lambda Functions

In [None]:
# variables used in the DuckDBLambda related classes

# specify the logging pattern using the duckdb lambda function script. 
# note: the filter pattern is '{ $.request_id = "*" && $.status = "*" }'. there might be rendering issues when viewing on github
duckdb_lambda_function_log_pattern = r'{ $.request_id = "*" && $.status = "*" }' # used in DuckDBLambda_Logs class

# specify the lambda function name associated with your AWS account. 
duckdb_lambda_function_name = 'duckdb_qry_v2' # used in DuckDBLambda_Invoke class

# specify the CloudWatch Logs path associated with the lambda function
duckdb_cloudwatch_log_group = '/aws/lambda/duckdb_qry_v2' # used in DuckDBLambda_Logs class

### Lambda Invoke
* sync and async lambda calls

In [None]:
class DuckDBLambda_Invoke:
    """ 
    Helpers for sync and async invocation of the DuckDB lambda function.

    All members are static; the class is used as a namespace.
    """
    
    # name of the lambda function that every invoke call targets
    function_name = duckdb_lambda_function_name
    
    @staticmethod
    def _create_payload(qry: str) -> str:
        """ 
        Serialize the query into the JSON payload `{"qry": <qry>}` sent to the lambda function.
        """ 
        return json.dumps({'qry':qry})
        
    @staticmethod
    def _lambda_invoke(lambda_client: 'LambdaClient',
                      qry: str,
                      invocation_type: str) -> dict:
        """ 
        Invoke the lambda function with the query as payload.

        params:
            lambda_client: client object exposing `invoke(FunctionName=..., InvocationType=..., Payload=...)`
            qry (str): query forwarded to the lambda function
            invocation_type (str): RequestResponse or Event

        returns:
            the 'ResponseMetadata' entry of the invoke response (None if the response has no such entry).
            Only the metadata is returned; any other entry of the response is discarded.
        """
        _payload = DuckDBLambda_Invoke._create_payload(qry)
        response = lambda_client.invoke(
            FunctionName=DuckDBLambda_Invoke.function_name,
            InvocationType=invocation_type,
            Payload=_payload
        )

        return response.get('ResponseMetadata')
    
    # At this point in the class body `_lambda_invoke` is a staticmethod object, which is
    # only callable from Python 3.10 on. `__func__` unwraps the plain function so that
    # `partial` accepts it on older interpreters as well.
    # sync lambda invoke
    sync_lambda_invoke = staticmethod(partial(_lambda_invoke.__func__,invocation_type='RequestResponse'))
    # async lambda invoke
    async_lambda_invoke = staticmethod(partial(_lambda_invoke.__func__,invocation_type='Event'))
    
    @staticmethod
    def async_invoke_tasks(lambda_client,tasks: dict[str, dict], task_qry_key: str) -> dict[str, dict]:
        """ 
        Asynchronously invoke the lambda function once per task.

        params:
            lambda_client: client passed through to `async_lambda_invoke`
            tasks (dict): mapping of task name -> task dict
            task_qry_key (str): key of the task dict that holds the query

        returns:
            mapping of task name -> value returned by `async_lambda_invoke` for that task

        raises:
            KeyError if a task dict does not contain `task_qry_key`
        """
        print(f"invoking #{len(tasks)} tasks")
        return {
            task_name: DuckDBLambda_Invoke.async_lambda_invoke(lambda_client, task[task_qry_key])
            for task_name, task in tasks.items()
        }

### Lambda_Logs
* Get CloudWatch Logs and Parse events

In [None]:
class DuckDBLambda_Logs: 
    """ 
    Functions to get and parse the CloudWatch logs for DuckDBLambda.

    All members are static; the class is used as a namespace.
    """
    
    # log group queried for events
    group_name = duckdb_cloudwatch_log_group
    # https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html
    # filter pattern applied when fetching events (attribute name keeps its original spelling)
    event_status_patten = duckdb_lambda_function_log_pattern
    
    # these should match the logging values in the lambda function 
    status_started = 'started'
    status_success = 'success'
    status_error = 'error'
    
    ## Functions for getting the CloudWatchLogs
    @staticmethod
    def _get_cloudwatch_logs(log_client,start_time_ms) -> list[dict]:
        """ 
        Fetch the events of `group_name` matching `event_status_patten`, 
        starting at `start_time_ms`.
        """
        return get_events_from_cloudwatch_logs(
                log_client=log_client,
                log_groupname=DuckDBLambda_Logs.group_name,
                start_time_ms=start_time_ms,
                pattern = DuckDBLambda_Logs.event_status_patten
        )

    @staticmethod
    def get_cloudwatch_logs(log_client: 'CloudWatchLogsClient',start_time_ms) -> list[dict]:
        """
        Function to get the cloudwatch events.
        If no events found, wait for 10 seconds and try again.
        Raise AssertionError if no events found after that. 
        """
        print("Getting cloudwatch events")
        
        _events = DuckDBLambda_Logs._get_cloudwatch_logs(log_client,start_time_ms)
        if not _events:
            # retry once, and only when the first attempt came back empty
            sleep(10)
            _events = DuckDBLambda_Logs._get_cloudwatch_logs(log_client,start_time_ms)
        # raised explicitly (rather than via `assert`) so the check survives `python -O`
        if not _events:
            raise AssertionError("No events found")
        
        return _events
    
    
    ## Functions to parse the logs
    @staticmethod
    def parse_event_message(event_message: str) -> dict:
        """ 
        Extract the JSON object embedded in the event message.

        The span from the first '{' to the last '}' on a line is decoded with json.loads.
        Returns an empty dict when the message contains no such span.
        """
        _match = re.search(r'\{.*\}', event_message)
        if _match is None:
            return {}
        return json.loads(_match.group(0))
    
    @staticmethod
    def parse_event(event:dict) -> dict:
        """ 
        Reduce a raw log event to request_id, log_stream_name, status and message.

        request_id, status and message are read from the JSON embedded in event['message'];
        log_stream_name is read from event['logStreamName'].
        Raises AssertionError when request_id or status is missing or empty.
        """
        parsed_event_message = DuckDBLambda_Logs.parse_event_message(event.get('message'))
        request_id = parsed_event_message.get('request_id')
        status = parsed_event_message.get('status')
        message = parsed_event_message.get('message')
        log_stream_name = event.get('logStreamName')
        # raised explicitly (rather than via `assert`) so the check survives `python -O`
        if not (request_id and status):
            raise AssertionError(f"{event} does not contains request_id and/or status ")
        
        return dict(
            request_id=request_id,
            log_stream_name=log_stream_name,
            status=status,
            message=message,
        )
        
    @staticmethod
    def get_default_status(log_stream_name: str) -> dict:
        """ 
        Return the initial status record of a request: not started, not successful, no error message.
        """
        return {
            'log_stream_name':log_stream_name,
            'started':False,
            'success':False,
            'error_message':None
        }
    
    @staticmethod
    def validate_event(events: list[dict]) -> None:
        """ 
        Function to validate the events. 
        Every event must contain the keys 'message' and 'logStreamName';
        otherwise the offending events are printed and ValueError is raised.
        """
        required_keys = ('message','logStreamName')
        invalid_events = [
            _event for _event in events
            if not all(_key in _event for _key in required_keys)
        ]
                
        if invalid_events:
            print("The following events are invalid")
            print(invalid_events)
            raise ValueError("Invalid events")

        return None
    
    @staticmethod
    def update_request_id_status(current_status: dict,parsed_event: dict) -> dict:
        """ 
        Function to update the request_id status based on the parsed event.

        `current_status` is modified in place and also returned.
        A status value other than started/success/error leaves it untouched.
        """
        status = parsed_event['status']
        
        if status == DuckDBLambda_Logs.status_started:
            current_status['started'] = True
        elif status == DuckDBLambda_Logs.status_success:
            current_status['success'] = True
        elif status == DuckDBLambda_Logs.status_error:
            current_status['success'] = False
            current_status['error_message'] = parsed_event['message']
        
        return current_status
        
    @staticmethod
    def get_request_id_status(request_ids_status: dict,parsed_event: dict) -> dict:
        """ 
        Function to get the request_id status. 
        If not present, initiate default status (the new record is NOT stored 
        in `request_ids_status`; the caller does that).
        """
        request_id = parsed_event['request_id']
        
        # key exists, return value for key
        if request_id in request_ids_status:
            return request_ids_status[request_id]
        
        # key does not exist, create default status
        return DuckDBLambda_Logs.get_default_status(parsed_event['log_stream_name'])
        
    @staticmethod
    def process_events(events:list[dict]) -> dict:
        """ 
        Function to process all events.

        Validates the events, then folds them in order into a mapping of 
        request_id -> status record (see get_default_status for the record layout).
        """
        # validate the events list
        DuckDBLambda_Logs.validate_event(events)
        
        request_ids_status = {}
        for event in events:
            parsed_event = DuckDBLambda_Logs.parse_event(event)
            id_status = DuckDBLambda_Logs.get_request_id_status(request_ids_status,parsed_event)
            id_status = DuckDBLambda_Logs.update_request_id_status(id_status,parsed_event)
            request_ids_status[parsed_event['request_id']] = id_status
        
        return request_ids_status

### Lambda Tasks Status
* Track and Summarize task status

In [None]:
class DuckDBLambda_Status:
    """ 
    Functions to get and summarize tasks status.

    All members are static; the class is used as a namespace.
    """
    
    @staticmethod
    def _get_default_status():
        """ 
        If the request_id is not logged in the Cloudwatch, then return default status 
        with log_stream_name as Missing
        """
        return DuckDBLambda_Logs.get_default_status(log_stream_name='Missing')
        
    @staticmethod
    def _get_request_id_status(lambda_response: dict, processed_events: dict) -> dict:
        """ 
        Function to get the status of the task. 
        The lambda_response contains the request_id ('RequestId') associated with the task.

        Returns a new dict holding 'request_id' plus the entries of the status record
        found in processed_events, or of the default status when none is found.
        """
        request_id = lambda_response.get('RequestId')
        request_id_status = processed_events.get(request_id)
        if not request_id_status:
            request_id_status = DuckDBLambda_Status._get_default_status()
        
        # start from the request id, then copy the status info on top of it
        status = {'request_id':request_id}
        status.update(request_id_status)
        
        return status
    
    @staticmethod
    def get_tasks_status(tasks_lambda_response,processed_events) -> dict[str, dict]:
        """ 
        For each task, get the request_id and look up its status.
        Returns a mapping of task name -> status dict.
        """
        return {
            task_name: DuckDBLambda_Status._get_request_id_status(lambda_response,processed_events)
            for task_name,lambda_response in tasks_lambda_response.items()
        }

    
    ## Functions to summarize status
    @staticmethod
    def _get_task_status(tasks_status,status_key,display_info,invert=False):
        """ 
        Get the names of the tasks whose `status_key` flag is True 
        (or False when `invert` is set). 
        When `display_info` is true, a count line is printed as well.
        """
        ntasks = len(tasks_status)
        _bool = not invert
        _invert_prefix = 'Not ' if invert else ''
        task_status=[k for k,v in tasks_status.items() if v.get(status_key) == _bool]
        if display_info:
            print(f"{_invert_prefix}{status_key} tasks #{len(task_status)}/{ntasks} tasks")
        
        return task_status
    
    # At this point in the class body `_get_task_status` is a staticmethod object, which is
    # only callable from Python 3.10 on. `__func__` unwraps the plain function so that
    # `partial` accepts it on older interpreters as well.
    # function for various status
    get_started_tasks = staticmethod(partial(_get_task_status.__func__,status_key='started'))
    get_completed_tasks = staticmethod(partial(_get_task_status.__func__,status_key='success'))
    get_not_started_tasks = staticmethod(partial(_get_task_status.__func__,status_key='started',invert=True))
    # note: this selects every task whose 'success' flag is False, which includes
    # tasks that have not started or have not finished yet
    get_failed_tasks = staticmethod(partial(_get_task_status.__func__,status_key='success',invert=True))
    
    @staticmethod
    def get_status_summary(tasks_status,display_info):
        """ 
        Group the task names into started / not_started / failed / completed lists.

        `display_info` only controls the printing of the started and completed counts;
        the counts of the inverted groups (not_started, failed) are never printed.
        """
        _invert_display = False
        started = DuckDBLambda_Status.get_started_tasks(tasks_status,display_info=display_info)
        not_started = DuckDBLambda_Status.get_not_started_tasks(tasks_status,display_info=_invert_display)
        completed = DuckDBLambda_Status.get_completed_tasks(tasks_status,display_info=display_info)
        failed = DuckDBLambda_Status.get_failed_tasks(tasks_status,display_info=_invert_display)
        
        return dict(
            started=started,
            not_started=not_started,
            failed=failed,
            completed=completed,
        )
    

### Lambda Tasks
* Orchestrator for invoking the lambda, getting and parsing the logs, and tracking and summarizing task status

In [None]:
class DuckdbLambda_Tasks:
    """ 
    Orchestrates the execution and monitoring of the tasks executed on Duckdb Lambda
    """
    
    def __init__(self,
                 tasks: dict,
                 lambda_client: 'LambdaClient',
                 log_client: 'CloudWatchLogsClient'):
        """ 
        params:
            tasks (dict): mapping of task name -> task dict (the task dict holds the query)
            lambda_client: client used to invoke the lambda function
            log_client: client used to read the CloudWatch logs
        """
        self.tasks = tasks
        self.lambda_client = lambda_client
        self.log_client = log_client    
        
        self.cloudwatch_event = None
        # both are set when the tasks are invoked; initialised here so that
        # reading them before the first invoke does not raise AttributeError
        self.start_time = None
        self.start_time_ms = None
        self.processed_events = None
        self.lambda_response_meta = {} 
        self.tasks_status = {}
        
    
    
    def event_invoke_tasks(self,qry_key: str ='qry', re_run: bool = False):
        """ 
        Function to invoke the tasks asynchronously.

        Does nothing when the tasks were already invoked, unless `re_run` is set.
        `qry_key` is the key of each task dict that holds the query.
        """
        
        # if not already invoked
        if not self.lambda_response_meta or re_run:
            # log the time. We use this to filter the cloudWatchLogs
            self.start_time = datetime.now()
            self.start_time_ms = get_starttime_ms()
            self.lambda_response_meta = DuckDBLambda_Invoke.async_invoke_tasks(
                lambda_client=self.lambda_client,
                tasks=self.tasks,
                task_qry_key=qry_key
            )

    
    def get_cloudwatch_events(self) -> None:
        """
        Function to get the cloudwatch events logged since the tasks were invoked 
        and store them in `self.cloudwatch_event`.

        Fetching, the single retry and the error raised when no events are found 
        are handled by DuckDBLambda_Logs.get_cloudwatch_logs.
        """

        self.cloudwatch_event = DuckDBLambda_Logs.get_cloudwatch_logs(
            log_client=self.log_client,
            start_time_ms=self.start_time_ms
        )
    
    def process_cloudwatch_events(self) -> None:
        """ 
        Process the stored cloudwatch events into `self.processed_events`, 
        fetching the events first when none are stored yet.
        """
        if not self.cloudwatch_event:
            self.get_cloudwatch_events()
        
        self.processed_events = DuckDBLambda_Logs.process_events(self.cloudwatch_event)

    
    def update_tasks_status(self,refresh: bool) -> None:
        """ 
        Recompute `self.tasks_status` from the invoke responses and the processed events.

        When `refresh` is true, the cloudwatch events are fetched and processed again first.
        """
        
        if refresh:
            self.get_cloudwatch_events()
            self.process_cloudwatch_events()
        
        self.tasks_status = DuckDBLambda_Status.get_tasks_status(
            tasks_lambda_response=self.lambda_response_meta,
            processed_events=self.processed_events
        )
        
        
    def summarize_tasks_status(self,display_info) -> dict:
        """ 
        Function to summarize the tasks status
        """
        
        return DuckDBLambda_Status.get_status_summary(self.tasks_status,display_info)
    
    def wait_until_tasks_complete(self,
                                  total_wait_time_secs: int,
                                  interval_check_time_sec: int) -> None:
        """ 
        Function to wait until tasks complete.

        Refreshes the tasks status every `interval_check_time_sec` seconds until the 
        summary reports no entries under 'failed' or `total_wait_time_secs` has elapsed.
        The status is checked at least once, even with a non-positive wait time.

        Raises ValueError when the last summary still has entries under 'failed'.
        """
        
        start_time = datetime.now()
        iteration = 1        
        _str = f""" 
        Waiting until Tasks comeplete. 
        Max wait time: {total_wait_time_secs} seconds. 
        Check interval: {interval_check_time_sec} seconds
        ******************
        """
        print(textwrap.dedent(_str))
        
        summary_info = None
        # total_seconds() is used because timedelta.seconds is only the seconds
        # component of the delta (it excludes whole days)
        while (datetime.now() - start_time).total_seconds() < total_wait_time_secs:
            print(f"iteration: {iteration}. cummulative time: {get_elapsed_time(start_time,display=False)} seconds")
            # refresh the cloud watch events and process the events
            self.update_tasks_status(refresh=True)
            summary_info = self.summarize_tasks_status(display_info=True)
            
            # break if there are no failed tasks
            if not summary_info['failed']:
                break
            
            sleep(interval_check_time_sec)
            iteration +=1
            print("")
        
        if summary_info is None:
            # the loop never ran (non-positive wait time): still check the status once
            self.update_tasks_status(refresh=True)
            summary_info = self.summarize_tasks_status(display_info=True)
        
        _=get_elapsed_time(start_time)
        print("******************")
        if summary_info['failed']:
            raise ValueError("Not all tasks have completed")

        return None