# Download and analyse dataportal log events

## Downloading log events

We are going to analyse the log events of user `admin@example.com` (an admin/manager user).

To perform this task you need AWS access keys (the same account you use for AWS console access), because the boto3 library uses these keys to call the AWS APIs.

Please ensure the keys are added to the Jupyter environment before you run the code.



In [1]:
# Email address of the user whose activity is analysed in this notebook.
user = "admin@example.com"
# AWS region handed to the boto3 client.
region = "ap-southeast-2"

In [2]:
from datetime import datetime, timezone
import json
import boto3

# Shared boto3 'logs' client used by the helper functions in this cell.
client = boto3.client('logs', region_name=region)


def iso_to_epoch_millis(iso_str):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp into epoch milliseconds.

    The string is parsed as a naive datetime and then interpreted as UTC.
    Returns an ``int``.
    """
    parsed = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%SZ")
    utc_seconds = parsed.replace(tzinfo=timezone.utc).timestamp()
    return int(utc_seconds * 1000)

def epoch_millis_to_iso(epoch_millis):
    """Format epoch milliseconds as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Sub-second precision is not included in the output.
    """
    seconds = epoch_millis / 1000
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%SZ')


def get_all_log_events(log_group_name, log_stream_name, region, start_time, end_time):
    """Fetch every event of one log stream between ``start_time`` and ``end_time``.

    Pages through ``client.get_log_events`` from the head of the stream and
    returns the concatenated ``events`` lists. ``start_time`` and ``end_time``
    are passed through as ``startTime`` / ``endTime`` (epoch milliseconds in
    this notebook). ``region`` is accepted but not used here; the module-level
    ``client`` decides the region.
    """
    request = dict(
        logGroupName=log_group_name,
        logStreamName=log_stream_name,
        startFromHead=True,
        startTime=start_time,
        endTime=end_time,
    )

    collected = []
    previous_token = None
    while True:
        page = client.get_log_events(**request)
        collected += page['events']
        token = page.get('nextForwardToken')
        # Stop when no token is returned or the token did not move since the
        # last request, i.e. there is nothing further to read.
        if not token or token == previous_token:
            return collected
        request['nextToken'] = previous_token = token


def get_log_streams(log_group_name, start_time, end_time):
    """Return names of the log streams whose events overlap the time window.

    Streams are listed newest-first by last event time, so listing stops at
    the first stream whose last event is older than ``start_time``. Streams
    missing either timestamp are never selected.
    """
    pages = client.get_paginator('describe_log_streams').paginate(
        logGroupName=log_group_name,
        orderBy='LastEventTime',
        descending=True
    )

    selected = []
    for page in pages:
        for stream in page['logStreams']:
            stream_name = stream['logStreamName']
            first_ts = stream.get('firstEventTimestamp')
            last_ts = stream.get('lastEventTimestamp')

            # Everything from here on ended before the window opened.
            if last_ts is not None and last_ts < start_time:
                return selected

            if first_ts is None or last_ts is None:
                continue
            if last_ts >= start_time and first_ts <= end_time:
                selected.append(stream_name)
    return selected




In [3]:
# Prefix of the local dump files and the log group they are downloaded from.
name = "dataportal"
log_group_name = "/aws/lambda/sbeacon-backend-dataPortal"

# UTC time window to download.
START_TIME = "2025-06-03T00:00:00Z"
END_TIME = "2025-06-05T23:59:59Z"

start_time = iso_to_epoch_millis(START_TIME)
end_time = iso_to_epoch_millis(END_TIME)
streams = get_log_streams(log_group_name, start_time, end_time)

# One JSON file per log stream; "/" in the stream name is not usable in a file name.
for stream in streams:
    safe_stream_name = stream.replace("/", "_")
    events = get_all_log_events(log_group_name, stream, region, start_time, end_time)
    with open(f"{name}_{safe_stream_name}.json", "w+") as fo:
        json.dump(events, fo)

## Loading the events for dataportal log group


In [4]:
from glob import glob
import json

def iterate_log_entries(pattern="dataportal_*.json"):
    """Yield the downloaded dataportal log events grouped by ``REPORT`` line.

    Each file matching ``pattern`` (relative to the current directory) is
    expected to hold a JSON array of log events, each a dict with a
    ``"message"`` key. Events are accumulated until one whose message starts
    with ``"REPORT"`` is seen; that group (REPORT line included) is yielded
    as a list.

    Files are processed independently and in sorted order. Events left over
    at the end of a file without a closing ``REPORT`` are dropped, so an
    incomplete group from one file can never be merged into the first group
    of the next file.
    """
    for file in sorted(glob(pattern)):
        with open(file, "r") as f:
            # NOTE(review): the "[]\n" strip is kept from the original code;
            # presumably it removes empty-array lines from older dumps.
            data = f.read().replace("[]\n", "")
        if not data.strip():
            # Nothing left to parse; json.loads("") would raise.
            continue

        log_entry = []
        for entry in json.loads(data):
            log_entry.append(entry)
            if entry["message"].startswith("REPORT"):
                yield log_entry
                log_entry = []


Admins can use the `sub` of this user to track their login and logout activities in CloudTrail. You can get the `sub` of this user with the following command.


In [5]:
# aws cognito-idp list-users --user-pool-id <user-pool-id> --filter "email = \"<email>\""

# for example
# aws cognito-idp list-users --user-pool-id ap-southeast-2_3ZrrcagIG --filter "email = \"admin@example.com\"" --region ap-southeast-2

The output will look like the following:

```json
{
  "Users": [
    {
      "Username": "admin@example.com",
      "Attributes": [
        {
          "Name": "email",
          "Value": "admin@example.com"
        },
        {
          "Name": "email_verified",
          "Value": "true"
        },
        {
          "Name": "family_name",
          "Value": "Admin"
        },
        {
          "Name": "given_name",
          "Value": "Admin"
        },
        {
          "Name": "custom:terraform",
          "Value": "true"
        },
        {
          "Name": "custom:identity_id",
          "Value": "ap-southeast-2:099e873d-80b5-cb64-b9b4-0f64c663bd46"
        },
        {
          "Name": "sub",
          "Value": "f98e24c8-2011-70ae-9d93-084eb3f4b282"
        }
      ],
      "UserCreateDate": "2024-11-20T15:58:30.157000+10:30",
      "UserLastModifiedDate": "2025-03-13T15:09:11.817000+10:30",
      "Enabled": true,
      "UserStatus": "CONFIRMED"
    }
  ]
}
```


## Dataportal notebook events for the user admin@example.com


In [6]:
from textwrap import indent
import re
from urllib.parse import unquote

# Routes of the user-facing notebook endpoints.
re_notebook_start = re.compile(r"^/dportal/notebooks/.*?/start$")
re_notebook_stop = re.compile(r"^/dportal/notebooks/.*?/stop$")
re_notebook = re.compile(r"^/dportal/notebooks/[a-zA-Z0-9-]+$")

for log_entry in iterate_log_entries():
    # The request is logged as "Event Received: <json>". A group cut off by the
    # download window may lack that line; skip it instead of raising IndexError.
    log_event = next(
        (x for x in log_entry if x["message"].startswith("Event Received")), None
    )
    if log_event is None:
        continue
    # Remove only the leading prefix so identical text inside the payload is kept.
    event = json.loads(log_event["message"].replace("Event Received: ", "", 1))

    try:
        email = event["requestContext"]["authorizer"]["claims"]["email"]
    except (KeyError, TypeError):
        # No authorizer claims on this event, so it cannot be attributed to `user`.
        continue
    if email != user:
        continue

    method = event["httpMethod"]
    path = event["path"]
    when = epoch_millis_to_iso(log_event["timestamp"])

    if method == "POST" and path == "/dportal/notebooks":
        print(f"User {user} created a notebook at {when}")
        print("\tNotebook properties:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    elif re_notebook_start.match(path):
        # .../<notebook>/start -> the notebook name is the second-to-last segment.
        print(f"User {user} started notebook: {path.split('/')[-2]}, at {when}")

    elif re_notebook_stop.match(path):
        print(f"User {user} stopped notebook: {path.split('/')[-2]}, at {when}")

    elif re_notebook.match(path):
        print(f"User {user} listed details of notebook: {path.split('/')[-1]}, at {when}")

    elif "/dportal/notebooks" == path:
        print(f"User {user} listed notebooks at {when}")


User admin@example.com listed notebooks at 2025-06-04T02:27:47Z


## Dataportal manager tasks performed by user admin@example.com


In [7]:
from textwrap import indent
import re
from urllib.parse import unquote

# Routes of the admin (manager) endpoints. Project names are percent-encoded in
# the path; "-" is accepted as well, matching the project pattern used by the
# file-delete report, so hyphenated names are not reported as MISSED EVENT.
re_projects = re.compile(r"^/dportal/admin/projects$")
re_project = re.compile(r"^/dportal/admin/projects/[a-zA-Z%0-9-]+$")
re_projects_ingest = re.compile(r"^/dportal/admin/projects/[a-zA-Z%0-9-]+/ingest/[a-zA-Z%0-9-]+$")
re_notebook_delete = re.compile(r"^/dportal/admin/notebooks/[a-zA-Z-0-9]+/delete$")
re_notebook = re.compile(r"^/dportal/admin/notebooks/[a-zA-Z-0-9]+$")

for log_entry in iterate_log_entries():
    # The request is logged as "Event Received: <json>". A group cut off by the
    # download window may lack that line; skip it instead of raising IndexError.
    log_event = next(
        (x for x in log_entry if x["message"].startswith("Event Received")), None
    )
    if log_event is None:
        continue
    # Remove only the leading prefix so identical text inside the payload is kept.
    event = json.loads(log_event["message"].replace("Event Received: ", "", 1))

    try:
        email = event["requestContext"]["authorizer"]["claims"]["email"]
    except (KeyError, TypeError):
        # No authorizer claims on this event, so it cannot be attributed to `user`.
        continue

    path = event["path"]
    if email != user or "/dportal/admin" not in path:
        continue

    method = event["httpMethod"]
    when = epoch_millis_to_iso(log_event["timestamp"])

    #
    # Projects
    #

    if method == "POST" and re_projects.match(path):
        print(f"User {user} created a project at {when}")
        print("\tProject properties:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    elif method == "GET" and re_projects.match(path):
        print(f"User {user} listed projects at {when}")

    elif method == "GET" and re_project.match(path):
        # Decode the name like the PUT branch does, so both print the same project name.
        print(f"User {user} listed details of project: {unquote(path.split('/')[-1])}, at {when}")

    elif method == "PUT" and re_project.match(path):
        print(f"User {user} updated details of project: {unquote(path.split('/')[-1])}, at {when}")
        print("\tProject properties:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    elif method == "POST" and re_projects_ingest.match(path):
        # .../projects/<project>/ingest/<item> -> the project is third from the end.
        print(f"User {user} ingested data into project: {unquote(path.split('/')[-3])}, at {when}")
        print("\tIngest properties:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))

    #
    # sBeacon
    #

    elif method == "POST" and path == "/dportal/admin/sbeacon/index":
        print(f"User {user} indexed data into sBeacon at {when}")

    #
    # notebooks
    #

    elif method == "GET" and path == "/dportal/admin/notebooks":
        print(f"User {user} listed notebooks at {when}")

    elif method == "GET" and re_notebook.match(path):
        print(f"User {user} listed details of notebook: {unquote(path.split('/')[-1])}, at {when}")

    elif method == "POST" and re_notebook_delete.match(path):
        # The path ends in "/delete", so the notebook name is the second-to-last
        # segment; the last segment is always the literal "delete".
        print(f"User {user} deleted notebook: {unquote(path.split('/')[-2])}, at {when}")

    elif method == "GET" and path == "/dportal/admin/folders":
        print(f"User {user} listed folders at {when}")

    else:
        print("MISSED EVENT", method, path)



## Dataportal file delete events for the user admin@example.com


In [8]:
from textwrap import indent
import re
from urllib.parse import unquote

# PUT on this route updates a project; file deletions are logged during that update.
re_admin_projects = re.compile(r"^/dportal/admin/projects/[a-zA-Z0-9%-]+$")


for complete_log_entry in iterate_log_entries():
    # The request is logged as "Event Received: <json>". A group cut off by the
    # download window may lack that line; skip it instead of raising IndexError.
    log_event = next(
        (x for x in complete_log_entry if x["message"].startswith("Event Received")),
        None,
    )
    if log_event is None:
        continue
    # Remove only the leading prefix so identical text inside the payload is kept.
    event = json.loads(log_event["message"].replace("Event Received: ", "", 1))

    try:
        email = event["requestContext"]["authorizer"]["claims"]["email"]
    except (KeyError, TypeError):
        # No authorizer claims on this event, so it cannot be attributed to `user`.
        continue
    if email != user:
        continue

    if event["httpMethod"] == "PUT" and re_admin_projects.match(event["path"]):
        # The pattern ends right after the project segment, so the project name
        # is the LAST path segment ([-2] would always be the literal "projects").
        project = unquote(event["path"].split("/")[-1])
        # Timestamp is shown in ISO form, like the other dataportal reports.
        print(f"User {user} updated project: {project}, at {epoch_millis_to_iso(log_event['timestamp'])}")

        print("\tUpdate payload:")
        print(indent(json.dumps(json.loads(event["body"]), indent=4), "\t"))
        # Every "Deleting ..." line logged in the same group is reported as an action.
        for delete_event in complete_log_entry:
            if delete_event["message"].startswith("Deleting"):
                print(f"\tAction: {delete_event['message'].strip()}")



## Dataportal file add events for the user admin@example.com

This tracks all file uploads, regardless of whether the uploaded files are valid.


In [9]:
# Log group to download and the prefix of the local dump files.
log_group_name = "/aws/lambda/sbeacon-backend-deidentifyFiles"
name = "deidentify"

# UTC time window to download.
START_TIME = "2025-05-01T00:00:00Z"
END_TIME = "2025-05-30T23:59:59Z"

start_time = iso_to_epoch_millis(START_TIME)
end_time = iso_to_epoch_millis(END_TIME)
streams = get_log_streams(log_group_name, start_time, end_time)

# One JSON file per log stream; "/" in the stream name is not usable in a file name.
for stream in streams:
    safe_stream_name = stream.replace("/", "_")
    events = get_all_log_events(log_group_name, stream, region, start_time, end_time)
    with open(f"{name}_{safe_stream_name}.json", "w+") as fo:
        json.dump(events, fo)

## Loading the events for deidentify log group

This is the log group that records file uploads immediately after the uploads are completed.


In [10]:
from glob import glob
import json

def iterate_log_entries(pattern="deidentify_*.json"):
    """Yield the downloaded deidentify log events grouped by ``REPORT`` line.

    Each file matching ``pattern`` (relative to the current directory) is
    expected to hold a JSON array of log events, each a dict with a
    ``"message"`` key. Events are accumulated until one whose message starts
    with ``"REPORT"`` is seen; that group (REPORT line included) is yielded
    as a list.

    Files are processed independently and in sorted order. Events left over
    at the end of a file without a closing ``REPORT`` are dropped, so an
    incomplete group from one file can never be merged into the first group
    of the next file.
    """
    for file in sorted(glob(pattern)):
        with open(file, "r") as f:
            # NOTE(review): the "[]\n" strip is kept from the original code;
            # presumably it removes empty-array lines from older dumps.
            data = f.read().replace("[]\n", "")
        if not data.strip():
            # Nothing left to parse; json.loads("") would raise.
            continue

        log_entry = []
        for entry in json.loads(data):
            log_entry.append(entry)
            if entry["message"].startswith("REPORT"):
                yield log_entry
                log_entry = []


In [11]:
from textwrap import indent
import re
from urllib.parse import unquote

# Compiled once instead of on every loop iteration.
re_file_owner = re.compile(r'File owner for "(.*?)" of project "(.*?)" is "(.*?)"')

for complete_log_entry in iterate_log_entries():
    # Only groups that logged a "File owner ..." line describe an upload.
    file_event = next(
        (x for x in complete_log_entry if x["message"].startswith("File owner")), None
    )
    if file_event is None:
        continue
    match = re_file_owner.match(file_event["message"])
    if not match:
        continue

    # The "Backend Event Received:" line supplies the reported timestamp. If the
    # group was cut off before it, fall back to the "File owner" line itself
    # rather than failing with IndexError.
    log_event = next(
        (x for x in complete_log_entry if x["message"].startswith("Backend Event Received:")),
        file_event,
    )

    file_name, project, user_sub = match.groups()
    print(f'User: "{user_sub}" created file: "{file_name}" in project: "{project}" at {log_event["timestamp"]}')




User: "admin@example.com" created file: "wrong.vcf.gz" in project: "My test project" at 1747871597084
