# Download and analyse sbeacon log events

## Downloading log events

We are going to analyse the log events of user `admin@example.com` (an admin/manager user).

To perform this task, you must have AWS access: the boto3 library needs your AWS access keys to read the logs.

Please ensure the keys are added to the Jupyter environment before you run the code.


In [1]:
# E-mail address of the user whose requests are reported by the analysis cell.
user = "admin@example.com"
# File-name prefix the loading cell globs for (f"{scope}_*.json").
scope = "individuals"
# Log group the events are downloaded from.
log_group_name = "/aws/lambda/sbeacon-backend-getIndividuals"
region = "ap-southeast-2"
# File-name prefix the download cell writes with (f"{name}_<stream>.json").
# Derived from `scope` so the files that are written are always the files that
# get loaded, even when only `scope` is edited.
name = scope


# Inclusive UTC time window, formatted as YYYY-MM-DDTHH:MM:SSZ.
START_TIME = "2025-05-12T00:00:00Z"
END_TIME = "2025-05-27T23:59:59Z"

In [2]:
from datetime import datetime, timezone
import json
import boto3

# boto3 "logs" client for the configured region; get_all_log_events and
# get_log_streams in this cell use it as a module-level global.
client = boto3.client('logs', region_name=region)


def iso_to_epoch_millis(iso_str):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` string to epoch milliseconds.

    The string is parsed with that exact format and interpreted as UTC.

    Args:
        iso_str: Timestamp text such as ``"2025-05-12T00:00:00Z"``.

    Returns:
        Integer number of milliseconds since the Unix epoch.

    Raises:
        ValueError: If ``iso_str`` does not match the expected format.
    """
    parsed_utc = datetime.strptime(
        iso_str, "%Y-%m-%dT%H:%M:%SZ"
    ).replace(tzinfo=timezone.utc)
    return int(parsed_utc.timestamp() * 1000)

def epoch_millis_to_iso(epoch_millis):
    """Format epoch milliseconds as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Args:
        epoch_millis: Milliseconds since the Unix epoch.

    Returns:
        The corresponding UTC timestamp text; sub-second precision is not
        included in the output.
    """
    seconds = epoch_millis / 1000
    moment = datetime.fromtimestamp(seconds, timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%SZ')


def get_all_log_events(log_group_name, log_stream_name, region, start_time, end_time):
    """Collect every event of one log stream within a time window.

    Calls ``get_log_events`` on the module-level ``client`` repeatedly,
    following ``nextForwardToken`` until the token is missing or stops
    changing.

    Args:
        log_group_name: Passed as ``logGroupName``.
        log_stream_name: Passed as ``logStreamName``.
        region: Accepted for interface compatibility; not used by this
            function (the module-level ``client`` is used as-is).
        start_time: Passed as ``startTime``.
        end_time: Passed as ``endTime``.

    Returns:
        A list with the ``events`` of every response, in the order received.
    """
    request = dict(
        logGroupName=log_group_name,
        logStreamName=log_stream_name,
        startFromHead=True,
        startTime=start_time,
        endTime=end_time,
    )

    collected = []
    previous_token = None
    while True:
        page = client.get_log_events(**request)
        collected += page['events']

        forward_token = page.get('nextForwardToken')
        # A missing token, or the same token we just sent, means no more data.
        if forward_token and forward_token != previous_token:
            previous_token = forward_token
            request['nextToken'] = forward_token
            continue
        return collected


def get_log_streams(log_group_name, start_time, end_time):
    """Return names of log streams whose events overlap a time window.

    Streams are listed through the module-level ``client`` ordered by
    ``LastEventTime``, newest first. Listing stops at the first stream whose
    last event is older than ``start_time``.

    Args:
        log_group_name: Log group whose streams are listed.
        start_time: Window start, compared against ``lastEventTimestamp``.
        end_time: Window end, compared against ``firstEventTimestamp``.

    Returns:
        List of ``logStreamName`` values for streams that have both
        timestamps and overlap the window.
    """
    pages = client.get_paginator('describe_log_streams').paginate(
        logGroupName=log_group_name,
        orderBy='LastEventTime',
        descending=True
    )

    matching = []
    for page in pages:
        for stream in page['logStreams']:
            stream_name = stream['logStreamName']
            first_seen = stream.get('firstEventTimestamp')
            last_seen = stream.get('lastEventTimestamp')

            # Streams without a last-event timestamp can neither match nor
            # end the scan.
            if last_seen is None:
                continue

            # Every stream from here on ended before the window starts, so
            # stop scanning altogether.
            if last_seen < start_time:
                return matching

            # last_seen >= start_time is guaranteed at this point.
            if first_seen is not None and first_seen <= end_time:
                matching.append(stream_name)
    return matching


# Convert the configured window bounds to epoch milliseconds.
start_time, end_time = iso_to_epoch_millis(START_TIME), iso_to_epoch_millis(END_TIME)
streams = get_log_streams(log_group_name, start_time, end_time)

# Write one JSON file per matching log stream into the working directory.
for stream in streams:
    # Stream names contain "/", which cannot appear in a file name.
    safe_stream_name = stream.replace("/", "_")
    events = get_all_log_events(log_group_name, stream, region, start_time, end_time)
    with open(f"{name}_{safe_stream_name}.json", "w+") as fo:
        json.dump(events, fo)

Alternatively, you can change the script to download logs related to other beacon endpoints. The relevant log groups are as follows.

- /aws/lambda/sbeacon-backend-admin
- /aws/lambda/sbeacon-backend-dataPortal
- /aws/lambda/sbeacon-backend-deidentifyFiles
- /aws/lambda/sbeacon-backend-generateCohortVCfs
- /aws/lambda/sbeacon-backend-generateReports
- /aws/lambda/sbeacon-backend-getAnalyses
- /aws/lambda/sbeacon-backend-getBiosamples
- /aws/lambda/sbeacon-backend-getConfiguration
- /aws/lambda/sbeacon-backend-getDatasets
- /aws/lambda/sbeacon-backend-getEntryTypes
- /aws/lambda/sbeacon-backend-getFilteringTerms
- /aws/lambda/sbeacon-backend-getGenomicVariants
- /aws/lambda/sbeacon-backend-getIndividuals
- /aws/lambda/sbeacon-backend-getInfo
- /aws/lambda/sbeacon-backend-getMap
- /aws/lambda/sbeacon-backend-getProjects
- /aws/lambda/sbeacon-backend-getRuns
- /aws/lambda/sbeacon-backend-indexer
- /aws/lambda/sbeacon-backend-performQuery
- /aws/lambda/sbeacon-backend-splitQuery
- /aws/lambda/sbeacon-backend-submitDataset
- /aws/lambda/sbeacon-backend-updateFiles


## Loading the events


In [3]:
from glob import glob
import json

def iterate_log_entries(pattern=None):
    """Yield downloaded log events grouped into runs ending with a REPORT line.

    Each matching file is expected to hold a JSON array of event dicts with a
    ``"message"`` key. Events are accumulated in file order and a group is
    yielded each time a message starting with ``"REPORT"`` is seen.

    Grouping restarts for every file, so events left over after the last
    ``REPORT`` of one file are dropped rather than being merged into the first
    group of the next file.

    Args:
        pattern: Glob pattern of the files to read. Defaults to
            ``f"{scope}_*.json"`` using the module-level ``scope``.

    Yields:
        Lists of event dicts; the last element of each list is the REPORT
        event.
    """
    if pattern is None:
        pattern = f"{scope}_*.json"

    # Sorted so groups come out in a reproducible order across runs.
    for file in sorted(glob(pattern)):
        with open(file, "r") as f:
            data = f.read()

        # Drop empty-array lines as the original loader did, then skip files
        # that have nothing left (json.loads("") would raise).
        data = data.replace("[]\n", "").strip()
        if not data:
            continue

        log_entry = []
        for entry in json.loads(data):
            log_entry.append(entry)
            if entry["message"].startswith("REPORT"):
                yield log_entry
                log_entry = []


## sBeacon individuals scoped events for the user admin@example.com


In [4]:
from textwrap import indent
import re
from urllib.parse import unquote

# Matches paths of the form /individuals/<something>/biosamples.
re_individuals_id_biosamples = re.compile(r"^/individuals/.*/biosamples$")

for log_entry in iterate_log_entries():
    # The request payload is logged on the line starting with "Event Received".
    # Groups without such a line are skipped instead of raising IndexError.
    log_event = next(
        (x for x in log_entry if x["message"].startswith("Event Received")),
        None,
    )
    if log_event is None:
        continue

    # Strip only the leading label; an unbounded replace would also alter the
    # payload if the same text appeared inside it.
    event = json.loads(log_event["message"].replace("Event Received: ", "", 1))

    # Events without authorizer claims cannot belong to `user`; skip them
    # rather than failing with KeyError.
    claims = (
        (event.get("requestContext") or {}).get("authorizer") or {}
    ).get("claims") or {}
    if claims.get("email") != user:
        continue

    method = event["httpMethod"]
    path = event["path"]
    when = epoch_millis_to_iso(log_event["timestamp"])

    if method == "GET" and path == "/individuals/filtering_terms":
        print(f"User {user} listed filtering terms at {when}")
        print("\tQuery params:")
        print(indent(json.dumps(event["queryStringParameters"], indent=4), "\t"))

    # Membership test instead of `A and B or C`, which matched any HTTP method
    # on "/individuals/" because `and` binds tighter than `or`.
    elif method == "POST" and path in ("/individuals", "/individuals/"):
        print(f"User {user} listed individuals at {when}")
        print("\tQuery body:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    elif method == "POST" and re_individuals_id_biosamples.match(path):
        # Extracted first: reusing the same quote type inside an f-string is a
        # syntax error before Python 3.12.
        individual_id = path.split("/")[-2]
        print(f'User {user} listed individual: "{individual_id}" biosamples at {when}')
        print("\tQuery body:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    else:
        print("MISSED EVENT", method, path)




User admin@example.com listed individuals at 2025-05-12T02:16:35Z
	Query body:
	{
	    "projects": [
	        "Example Query Project"
	    ],
	    "query": {
	        "filters": [],
	        "requestedGranularity": "record",
	        "pagination": {
	            "skip": 0,
	            "limit": 100
	        }
	    },
	    "meta": {
	        "apiVersion": "v2.0"
	    }
	}
User admin@example.com listed individuals at 2025-05-12T02:22:15Z
	Query body:
	{
	    "projects": [
	        "Example Query Project"
	    ],
	    "query": {
	        "filters": [],
	        "requestedGranularity": "record",
	        "pagination": {
	            "skip": 0,
	            "limit": 100
	        }
	    },
	    "meta": {
	        "apiVersion": "v2.0"
	    }
	}
User admin@example.com listed individuals at 2025-05-12T02:22:24Z
	Query body:
	{
	    "projects": [
	        "Example Query Project"
	    ],
	    "query": {
	        "filters": [
	            {
	                "scope": "individuals",
	               