In [None]:
import json, os
# Copy every entry of the "Values" section of local.settings.json into the process
# environment so the following cells can read their configuration from os.environ.
# The file is opened in a `with` block so the handle is closed as soon as the JSON
# has been parsed instead of lingering until garbage collection.
with open("local.settings.json") as settings_file:
    for k, v in json.load(settings_file)["Values"].items():
        os.environ[k] = v

In [None]:
import time

import boto3, pandas as pd

# Run configuration: AWS credentials, Athena database/workgroup, results bucket and
# the SQL to execute. Every environment-backed entry is read eagerly from os.environ,
# so a missing DISTILLED_* variable raises KeyError right here.
_ENV_BACKED_SETTINGS = (
    ("access_key", "DISTILLED_ACCESS_KEY"),  # AWS access key
    ("secret_key", "DISTILLED_SECRET_KEY"),  # AWS secret access key
    ("region", "DISTILLED_REGION"),          # AWS region
    ("database", "DISTILLED_DATABASE"),      # Database name in Athena
    ("bucket", "DISTILLED_BUCKET"),          # S3 bucket where query results will be stored
    ("workgroup", "DISTILLED_WORKGROUP"),    # Athena workgroup
)
ingress = {setting: os.environ[env_name] for setting, env_name in _ENV_BACKED_SETTINGS}

# SQL query to execute
ingress["query"] = """
        SELECT
            json_extract(
                json_headers, 
                '$["User-Agent"][0]'
            ) AS user_agent
        FROM "pixel"."b2c"
    """

# Build the boto3 session. Each setting is first tried as the *name* of an environment
# variable; when no variable of that name exists, the setting itself is used as the value.
session_kwargs = {
    "aws_access_key_id": os.getenv(ingress["access_key"], ingress["access_key"]),
    "aws_secret_access_key": os.getenv(ingress["secret_key"], ingress["secret_key"]),
    "region_name": os.getenv(ingress.get("region", ""), ingress.get("region", None)),
}
session = boto3.Session(**session_kwargs)

# Athena client bound to the session above; used to start and poll the query
athena_client = session.client("athena")

# Submit the query to Athena. Database, bucket and workgroup go through the same
# "environment variable name, else literal value" lookup used for the session settings.
query_context = {
    "Database": os.getenv(ingress["database"], ingress["database"]),  # database the query runs against
}
result_configuration = {
    "OutputLocation": "s3://" + os.getenv(ingress["bucket"], ingress["bucket"]),  # where Athena writes the results
}
start_response = athena_client.start_query_execution(
    QueryString=ingress["query"],
    QueryExecutionContext=query_context,
    ResultConfiguration=result_configuration,
    WorkGroup=os.getenv(ingress["workgroup"], ingress["workgroup"]),  # Athena workgroup
)

# Identifier used afterwards to poll the execution status
execution_id = start_response["QueryExecutionId"]

# Poll Athena until the query reaches a terminal state (SUCCEEDED, FAILED or CANCELLED).
# `status` keeps the last response because the result location is read from it afterwards.
POLL_INTERVAL_SECONDS = 1  # pause between polls so the loop does not hammer the Athena API
while True:
    status = athena_client.get_query_execution(QueryExecutionId=execution_id)
    query_status = status["QueryExecution"]["Status"]
    state = query_status["State"]
    if state in ("FAILED", "CANCELLED"):
        # Include Athena's own explanation when it provides one; fall back to the state name
        raise Exception(
            "Athena query [{}] failed or was cancelled: {}".format(
                status["QueryExecution"]["Query"],
                query_status.get("StateChangeReason", state),
            )
        )
    # Exact comparison: `in ("SUCCEEDED")` would be a substring test against a plain string
    if state == "SUCCEEDED":
        break  # Exit the loop if the query succeeded
    time.sleep(POLL_INTERVAL_SECONDS)

# Load the query result CSV into a pandas DataFrame via a presigned S3 URL.
# Athena reports where it wrote the results as "s3://<bucket>/<key>". Both bucket and key
# are taken from that location, so the download always targets the object Athena actually
# produced, even if the bucket setting carries a prefix or was resolved through another
# environment variable when the query was started.
output_location = status["QueryExecution"]["ResultConfiguration"]["OutputLocation"]
result_bucket, _separator, result_key = output_location.split("://", 1)[-1].partition("/")

df = pd.read_csv(
    session.client("s3").generate_presigned_url(
        "get_object",
        Params={
            "Bucket": result_bucket,  # bucket part of the reported output location
            "Key": result_key,        # object key of the result file
        },
        ExpiresIn=ingress.get("expires_in", 60 * 60 * 48),  # Presigned URL expiration time (seconds)
    )
)