# Query MyDataHelps Export Database

Use this notebook to run any query against the MyDataHelps (MDH) Export Database and obtain the result as a pandas DataFrame.

An AWS profile from the AWS credentials file (specified in the config cell) will be used for authentication.

In [None]:
# One-time setup. Uncomment the following line if these libraries have not been installed in your Python kernel/virtual environment.

#!pip install --upgrade boto3 pandas

### Gather Your Export Database Configuration Settings and Credentials

To obtain the configuration settings for your project's Export Database, open MyDataHelps Designer and navigate to the `Settings` tab for your project. Click `Export Explorer`. The `External Applications` tab will provide the required configuration settings. 

In [None]:
# Athena database that execute_query runs its queries against.
PROJECT_SCHEMA_NAME = "{YOUR PROJECT SCHEMA NAME}"

# Named profile from the AWS credentials file used to create the boto3 session.
AWS_PROFILE_NAME = "{YOUR AWS PROFILE NAME}"

# S3 location that Athena writes query results to.
ATHENA_OUTPUT_BUCKET_LOCATION = "{YOUR S3 OUTPUT LOCATION}"

# Athena workgroup the queries are submitted under.
ATHENA_WORKGROUP = "{YOUR WORKGROUP}"

# Default local path the query result file is downloaded to
# (preferred location on your local machine).
QUERY_RESULT_TEMP_FILE_LOCATION = "athena_query_results.csv"

In [None]:
import boto3, pandas as pd, numpy as np, time

def execute_query(query_string: str, download_file_location: str = None) -> pd.DataFrame:
    """Run a query in Athena and return its result as a pandas DataFrame.

    The query is started in ATHENA_WORKGROUP against the PROJECT_SCHEMA_NAME
    database, polled every 5 seconds while it is RUNNING or QUEUED, and its
    result object is then downloaded from S3 to a local file and read with
    ``pd.read_csv``.

    Args:
        query_string: Query text to execute.
        download_file_location: Local path for the downloaded result file.
            When None, QUERY_RESULT_TEMP_FILE_LOCATION is used.

    Returns:
        The DataFrame parsed from the downloaded result file.

    Raises:
        RuntimeError: If the query ends in any state other than SUCCEEDED
            (for example FAILED or CANCELLED).
    """
    session = boto3.session.Session(profile_name=AWS_PROFILE_NAME)
    athena_client = session.client("athena", region_name="us-east-1")

    # Strip trailing slashes first so a configured value that already ends in
    # "/" does not produce an output location ending in "//".
    output_location = f"{ATHENA_OUTPUT_BUCKET_LOCATION.rstrip('/')}/"

    query_start = athena_client.start_query_execution(
        QueryString=query_string,
        QueryExecutionContext={"Database": PROJECT_SCHEMA_NAME},
        WorkGroup=ATHENA_WORKGROUP,
        ResultConfiguration={"OutputLocation": output_location},
    )
    query_execution_id = query_start["QueryExecutionId"]

    # Poll until the query leaves the in-progress states, logging each status.
    while True:
        query_execution = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        status = query_execution["QueryExecution"]["Status"]
        state = status["State"]
        print(time.strftime("%H:%M:%S", time.localtime()), f"query status: {state}")
        if state not in {"RUNNING", "QUEUED"}:
            break
        time.sleep(5)

    # Only a SUCCEEDED query has a result object to download; surface Athena's
    # own explanation instead of failing later with an unrelated S3 error.
    if state != "SUCCEEDED":
        reason = status.get("StateChangeReason", "no reason reported")
        raise RuntimeError(f"Athena query {query_execution_id} ended in state {state}: {reason}")

    file_location = download_file_location if download_file_location is not None else QUERY_RESULT_TEMP_FILE_LOCATION

    # "s3://bucket/some/key.csv" -> ["s3:", "", "bucket", "some/key.csv"];
    # limiting the split keeps the object key intact.
    result_uri = query_execution["QueryExecution"]["ResultConfiguration"]["OutputLocation"]
    _, _, bucket_name, object_key = result_uri.split("/", 3)

    s3_client = session.client("s3")
    s3_client.download_file(bucket_name, object_key, file_location)

    return pd.read_csv(file_location)

In [None]:
# Select every row of the surveydefinitions table.
# Plain (non-f) string: the SQL has no placeholders, and an f-string would
# treat any literal braces added to the query later as format fields.
query_string = """
select 
    surveydefinitionkey, 
    surveykey,
    surveyname,
    versionnumber,
    inserteddate,
    definition
from
    surveydefinitions
"""

surveydefinitions = execute_query(query_string)
surveydefinitions

In [None]:
# Select the resting heart rate and step columns for every row of the
# fitbitdailydata table.
# Plain (non-f) string: the SQL has no placeholders, and an f-string would
# treat any literal braces added to the query later as format fields.
query_string = """
select
    participantidentifier,
    date, 
    restingheartrate,
    steps
from
    fitbitdailydata
"""

fitbitdailydata = execute_query(query_string)
fitbitdailydata

In [None]:
# Mean resting heart rate and step count for a single participant.
(
    fitbitdailydata
    .loc[lambda daily: daily["participantidentifier"] == "{YOUR PARTICIPANT IDENTIFIER}"]
    .groupby("participantidentifier")[["restingheartrate", "steps"]]
    .mean()
)