# Create a federated table in BigQuery

- Create a BigQuery federated table that connects to and directly reads from the RDR bucket
    
    - Fitbit
    
        - DAILY_ACTIVITY_SUMMARY: /NA/ACTIVITY_MEASUREMENTS
        - DAILY_FAT
        - DAILY_SLEEP_SUMMARY
        - DAILY_WEIGHT
        - FOOD
        - HEARTRATE
        - INTRADAY_STEPS
        - WATER
       
    - Apple Healthkit
    
    
- To create a table that also records the source filename, create another empty table, change the query options so that results are saved into that newly created table, and then run this query:

```
SELECT
*,
_FILE_NAME as filename
FROM
`R2019q4r3_deid_io_f.test3`
order by 1, 2
```

- Or load into BigQuery using `bq load --source_format=CSV -F '^' R2019q4r3_deid_io_f.test2 gs://ptsc-fitbit-data-all-of-us-rdr-prod/HEARTRATE/*.json`

- To delete: run `bq rm -f -t R2019q4r3_deid_io_f.test3` in the Cloud Shell terminal

- Parse the raw data into minute-level data by running:
```
SELECT
 CAST(JSON_EXTRACT_SCALAR(data,"$.activities-heart[0]['dateTime']") AS DATE) AS day,
 JSON_EXTRACT_SCALAR(params ,"$.time") as time,
 JSON_EXTRACT_SCALAR(params,"$.value") as heart_rate,
REGEXP_EXTRACT(filename, '/([0-9]+)/') as userid
FROM
`R2019q4r3_deid_io_f.heart_raw`, unnest(JSON_EXTRACT_ARRAY(data,"$.activities-heart-intraday['dataset']")) as params
```

## Directory structure

`source/datatype/payloadtype/category/YYYY/MM/DD/pid/startdate_enddate-uuid.[ext]`

- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/DAILY_ACTIVITY_SUMMARY/NA/ACTIVITY_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/DAILY_FAT/NA/PHYSICAL_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/DAILY_SLEEP_SUMMARY/NA/ACTIVITY_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/DAILY_WEIGHT/NA/PHYSICAL_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/FOOD/NA/NUTRITION_TRACKING
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/HEARTRATE/NA/VITAL_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/INTRADAY_STEPS/NA/ACTIVITY_MEASUREMENTS
- ptsc-health-data-prod/raw/health/2020/03/15/FITBIT/WATER/NA/NUTRITION_TRACKING

YYYY: 2020, 2021


Step 1: Create federated table

Step 2: Create staging table with data as string + add filename

Step 3: Parse the data

# Federated table

In [None]:
from google.cloud import bigquery

In [None]:
def create_federated_table(dataset_name, federated_table_name, uris, project_id='aou-res-curation-prod'):
    """(Re)create a BigQuery external ("federated") table over files in GCS.

    The files are read with the CSV reader and a one-column schema, so each
    input line is exposed as a single STRING column named ``data``. Any
    existing table with the same name is dropped first.

    Args:
        dataset_name: Dataset that will hold the external table.
        federated_table_name: Name of the external table to (re)create.
        uris: One ``gs://`` URI (wildcards allowed) or an iterable of them.
        project_id: Project used for the client and for the table.
    """
    bq_client = bigquery.Client(project=project_id)
    table_ref = bq_client.dataset(dataset_name).table(federated_table_name)
    table = bigquery.Table(table_ref)

    # ExternalConfig.source_uris is a list; accept a bare string for convenience.
    if isinstance(uris, str):
        uris = [uris]

    extconfig = bigquery.ExternalConfig('CSV')
    extconfig.schema = [bigquery.SchemaField('data', 'STRING')]
    # autodetect / ignore_unknown_values are ExternalConfig settings. They were
    # previously assigned on `extconfig.options` (CSVOptions), which has no such
    # properties, so they never reached the API.
    extconfig.autodetect = False
    # NOTE(review): with a one-column schema this presumably drops any text
    # after a '|' on a line instead of rejecting the row -- confirm that this
    # is the intended trade-off.
    extconfig.ignore_unknown_values = True
    extconfig.max_bad_records = 0
    extconfig.source_uris = list(uris)

    # CSV-reader specific options.
    extconfig.options.field_delimiter = '|'  # a character not expected inside the payload
    extconfig.options.allow_jagged_rows = True
    extconfig.options.allow_quoted_newlines = True

    table.external_data_configuration = extconfig
    # Drop-and-create so the table always points at the URIs given in this call.
    bq_client.delete_table(table, not_found_ok=True)
    bq_client.create_table(table)

In [None]:
def delete_table(project_id, dataset_name, table):
    """Drop ``dataset_name.table`` in ``project_id``; a missing table is not an error.

    Args:
        project_id: Project that owns the table (also used for the client).
        dataset_name: Dataset containing the table.
        table: Name of the table to delete.
    """
    client = bigquery.Client(project=project_id)
    target = bigquery.Table(client.dataset(dataset_name).table(table))
    client.delete_table(target, not_found_ok=True)

# Staging

In [None]:
def load_federated_table_to_bigquery(federated_table_id, destination_table_id, project_id = 'aou-res-curation-prod'):
    """Append every row of an external table, plus its source file name, to a staging table.

    Args:
        federated_table_id: Id of the external table to read.
        destination_table_id: Id of the table that receives the rows (WRITE_APPEND).
        project_id: Project used for the BigQuery client.

    Returns:
        The result of the finished query job.
    """
    client = bigquery.Client(project=project_id)
    append_config = bigquery.QueryJobConfig(
        destination=destination_table_id,
        write_disposition='WRITE_APPEND',
    )
    # _FILE_NAME is selected alongside the data so later steps can parse ids
    # and dates out of the object path.
    sql = """
        SELECT
            *,
            _FILE_NAME as filename
        FROM `{federated_table_id}`
    """.format(federated_table_id=federated_table_id)
    job = client.query(sql, job_config=append_config)
    # Block until the job has finished.
    return job.result()

In [None]:
# GCS wildcard URI templates for the Fitbit categories that get ingested, keyed by
# category name. `{yyyy}`, `{mm}` and `{dd}` are filled in with str.format by the
# staging loop, producing one URI per day.
URIS = {
    'DAILY_ACTIVITY_SUMMARY': "gs://ptsc-health-data-prod/raw/health/{yyyy}/{mm}/{dd}/FITBIT/DAILY_ACTIVITY_SUMMARY/NA/ACTIVITY_MEASUREMENTS/*",
    'HEARTRATE': "gs://ptsc-health-data-prod/raw/health/{yyyy}/{mm}/{dd}/FITBIT/HEARTRATE/NA/VITAL_MEASUREMENTS/*",
    'INTRADAY_STEPS': "gs://ptsc-health-data-prod/raw/health/{yyyy}/{mm}/{dd}/FITBIT/INTRADAY_STEPS/NA/ACTIVITY_MEASUREMENTS/*",
    'DAILY_SLEEP_SUMMARY': "gs://ptsc-health-data-prod/raw/health/{yyyy}/{mm}/{dd}/FITBIT/DAILY_SLEEP_SUMMARY/NA/ACTIVITY_MEASUREMENTS/*",
}

In [None]:
# cur_project_id = 'aou-res-curation-prod'
# dataset_name = 'fitbit_ingest'

# for cat in URIS:
#     print("Uploading ", cat)
#     cat_lower_case = cat.lower()
#     federated_table_name = 'dev_' + cat_lower_case
#     federated_table_id = f"{cur_project_id}.{dataset_name}.dev_{cat_lower_case}"
#     destination_table_id = f"{cur_project_id}.{dataset_name}.staging_{cat_lower_case}"
#     project_id, dataset_name, table = destination_table_id.split(".")
#     delete_table(project_id, dataset_name, table)
#     yyyy = '2022'
#     print('Uploading year', yyyy)
#     for mm in ['01']:
#         print("Uploading month", mm)
#         for day in range(1, 32):
#             dd = str(day).zfill(2)
#             if dd <= "30":
#                 continue
#             if dd > "31":
#                 break
#             print("Uploading day", dd)
#             uris = URIS[cat].format(yyyy=yyyy, mm=mm, dd=dd)
#             create_federated_table(dataset_name, federated_table_name, uris)
#             load_federated_table_to_bigquery(federated_table_id, destination_table_id)
              

In [None]:
# Staging driver: for every category in URIS, drop its staging table, then for
# each selected day point the category's external table at that day's files and
# append them to the staging table.
cur_project_id = 'aou-res-curation-prod'
dataset_name = 'fitbit_ingest'

for cat, uri_template in URIS.items():
    print("Uploading ", cat)
    cat_lower_case = cat.lower()
    federated_table_name = f"dev_{cat_lower_case}"
    federated_table_id = f"{cur_project_id}.{dataset_name}.{federated_table_name}"
    destination_table_id = f"{cur_project_id}.{dataset_name}.staging_{cat_lower_case}"
    # The staging table is rebuilt from scratch for each category.
    project_id, dataset_name, table = destination_table_id.split(".")
    delete_table(project_id, dataset_name, table)
    yyyy = '2022'
    print('Uploading year', yyyy)
    for mm in ['07']:
        print("Uploading month", mm)
        # Days 22 through 31 of the month; change the range bounds to process
        # a different window.
        for day in range(22, 32):
            dd = f"{day:02d}"
            print("Uploading day", dd)
            uris = uri_template.format(yyyy=yyyy, mm=mm, dd=dd)
            create_federated_table(dataset_name, federated_table_name, uris)
            load_federated_table_to_bigquery(federated_table_id, destination_table_id)
              

In [None]:
# Print the current local time so the cell output records when it was run.
import datetime
print(datetime.datetime.now())

next:

    - mm = 02, dd<=09, dd>31 -- Done
    `Run .4 staging`
    
    - mm = 03, dd<=00, dd>31 -- Done
    `Run .4 staging`
    
    - mm = 04, dd<=00, dd>31 -- Done
    `Run .4 staging`
    
    - mm = 05, dd<=00, dd>31-- Done
    `Run .4 staging`
    
    - mm = 06, dd<=00, dd>31-- Done
    `Run .4 staging`
    
    - mm = 07, dd<=00, dd>21-- Done
    `Run .4 staging`

# Parsing

In [None]:
def parse_table_to_bigquery(sql_query, destination_table_id, project_id):
    """Run ``sql_query`` and append its output to ``destination_table_id``.

    Args:
        sql_query: Fully formatted SQL text to execute.
        destination_table_id: Id of the table that receives the rows (WRITE_APPEND).
        project_id: Project used for the BigQuery client.

    Returns:
        The result of the finished query job.
    """
    bq = bigquery.Client(project=project_id)
    append_config = bigquery.QueryJobConfig(
        destination=destination_table_id,
        write_disposition='WRITE_APPEND',
    )
    job = bq.query(sql_query, job_config=append_config)
    # Block until the job has finished.
    return job.result()

In [None]:
# For each parsed output table (same keys as `queries`), the staging table its
# query reads from. The parse loop looks the value up but does not otherwise use
# it: the staging table names are also written directly in the SQL templates.
src_table_id = {
    'activity_summary': 'staging_daily_activity_summary',
    'heart_rate_minute_level': 'staging_heartrate',
    'heart_rate_summary': 'staging_heartrate',
    'steps_intraday': 'staging_intraday_steps',
    'sleep_level': 'staging_daily_sleep_summary',
    'sleep_daily_summary': 'staging_daily_sleep_summary'
}

# SQL templates, keyed by destination table name, that parse the raw JSON text in
# the staging tables (`data` column) together with the source path (`filename`).
# Each template is passed through str.format(dataset=...), which is why regex
# quantifiers are written with doubled braces (e.g. `{{4}}`).
# In every query vibrent_id and upload_date are extracted from the file path.
queries = {
# Daily activity totals: the scalar fields of `$.summary`; `date` is taken from
# the file name.
'activity_summary': """
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) AS vibrent_id,
    SAFE_CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    SAFE_CAST(REGEXP_EXTRACT(filename, '/([0-9]{{4}}-[0-9]{{2}}-[0-9]{{2}})[T|-]?') AS DATE) as date,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data, "$.summary['activityCalories']") AS FLOAT64) as activity_calories,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['caloriesBMR']") AS FLOAT64) as calories_bmr,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['caloriesOut']") AS FLOAT64) as calories_out,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['elevation']") AS FLOAT64) as elevation,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['fairlyActiveMinutes']") AS FLOAT64) as fairly_active_minutes,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['floors']") AS INT64) as floors,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['lightlyActiveMinutes']") AS FLOAT64) as lightly_active_minutes,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['marginalCalories']") AS FLOAT64) as marginal_calories,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['sedentaryMinutes']") AS FLOAT64) as sedentary_minutes,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['steps']") AS INT64) as steps,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['veryActiveMinutes']") AS FLOAT64) as very_active_minutes
FROM `{dataset}.staging_daily_activity_summary`""", 

# Intraday heart rate: one row per element of `activities-heart-intraday.dataset`,
# combining the day from `activities-heart[0].dateTime` with the element's time.
'heart_rate_minute_level': """
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'VITAL_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    SAFE_CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    DATETIME(CAST(JSON_EXTRACT_SCALAR(data,"$.activities-heart[0]['dateTime']") as DATE)
    , CAST(JSON_EXTRACT_SCALAR(params ,"$.time") AS TIME)) as datetime,
    SAFE_CAST(JSON_EXTRACT_SCALAR(params,"$.value") AS INT64) as heart_rate_value
FROM `{dataset}.staging_heartrate`, unnest(JSON_EXTRACT_ARRAY(data,"$.activities-heart-intraday['dataset']")) as params""", 


# Heart-rate zones: one row per element of `activities-heart[0].value.heartRateZones`.
'heart_rate_summary': """
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'VITAL_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    CAST(JSON_EXTRACT_SCALAR(data,"$.activities-heart[0]['dateTime']") AS DATE) AS date,
    JSON_EXTRACT_SCALAR(zone,"$.name") as zone_name,
    SAFE_CAST(JSON_EXTRACT_SCALAR(zone,"$.min") AS INT64) as min_heart_rate,
    SAFE_CAST(JSON_EXTRACT_SCALAR(zone,"$.max") AS INT64) as max_heart_rate,
    SAFE_CAST(JSON_EXTRACT_SCALAR(zone,"$.minutes") AS INT64) as minute_in_zone,
    SAFE_CAST(JSON_EXTRACT_SCALAR(zone,"$.caloriesOut") AS FLOAT64) as calorie_count
FROM `{dataset}.staging_heartrate`, 
unnest(JSON_EXTRACT_ARRAY(data, "$.activities-heart[0]['value']['heartRateZones']")) as zone""",


# Intraday steps: one row per element of `activities-steps-intraday.dataset`.
'steps_intraday': """
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) AS vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    DATETIME(CAST(JSON_EXTRACT_SCALAR(data,"$.activities-steps[0]['dateTime']") as DATE), CAST(JSON_EXTRACT_SCALAR(params ,"$.time") AS TIME)) as datetime,
    SAFE_CAST(JSON_EXTRACT_SCALAR(params,"$.value") AS NUMERIC) as steps
FROM `{dataset}.staging_intraday_steps`, 
unnest(JSON_EXTRACT_ARRAY(data,"$.activities-steps-intraday['dataset']")) as params""",

# Sleep stages: one row per element of `levels.data`, read separately for
# `sleep[0]` and `sleep[1]` and then unioned. Rows are ranked within
# (vibrent_id, upload_date, sleep_date, log_id) by minute_in_bed and only rank 1
# is kept.
# NOTE(review): only the first two entries of `$.sleep` are read; any further
# entries in the array are not parsed -- confirm that is acceptable.
'sleep_level': """
-- first sleep data
with first_long_data AS (
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['dateOfSleep']") AS DATE) AS sleep_date,
    JSON_EXTRACT_SCALAR(data,"$.sleep[0]['isMainSleep']") AS is_main_sleep,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['logId']") AS NUMERIC) AS log_id,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['totalTimeInBed']") AS INTEGER) AS minute_in_bed,
    JSON_EXTRACT_SCALAR(level,"$.level") as level,
    SAFE_CAST(JSON_EXTRACT_SCALAR(level,"$.dateTime") AS DATETIME) as start_datetime,
    SAFE_CAST(JSON_EXTRACT_SCALAR(level,"$.seconds") AS FLOAT64)/60 as duration_in_min
FROM `{dataset}.staging_daily_sleep_summary`,
unnest(JSON_EXTRACT_ARRAY(data, "$.sleep[0]['levels']['data']")) as level
),

-- second sleep data
second_long_data AS (
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['dateOfSleep']") AS DATE) AS sleep_date,
    JSON_EXTRACT_SCALAR(data,"$.sleep[1]['isMainSleep']") AS is_main_sleep,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['logId']") AS NUMERIC) AS log_id,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.summary['totalTimeInBed']") AS INTEGER) AS minute_in_bed,
    JSON_EXTRACT_SCALAR(level,"$.level") as level,
    SAFE_CAST(JSON_EXTRACT_SCALAR(level,"$.dateTime") AS DATETIME) as start_datetime,
    SAFE_CAST(JSON_EXTRACT_SCALAR(level,"$.seconds") AS FLOAT64)/60 as duration_in_min
FROM `{dataset}.staging_daily_sleep_summary`,
unnest(JSON_EXTRACT_ARRAY(data, "$.sleep[1]['levels']['data']")) as level
),

long_data AS (
SELECT *
FROM first_long_data
UNION ALL
SELECT *
FROM second_long_data
)

-- remove duplicates
SELECT * EXCEPT(rank, log_id, minute_in_bed)
FROM (
    SELECT * , rank() OVER(PARTITION BY vibrent_id, upload_date, sleep_date, log_id ORDER BY minute_in_bed DESC)  AS rank
    FROM long_data
    )
WHERE rank = 1
AND start_datetime IS NOT NULL
AND sleep_date IS NOT NULL
""",


# Per-sleep summary: scalar fields and `levels.summary.*.minutes` of `sleep[0]`
# and `sleep[1]`, unioned, then reduced to the rank-1 row (largest
# minute_in_bed) per (vibrent_id, upload_date, sleep_date, is_main_sleep).
'sleep_daily_summary': """
WITH first_sleep_data AS (
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['dateOfSleep']") AS DATE) AS sleep_date,
    JSON_EXTRACT_SCALAR(data,"$.sleep[0]['isMainSleep']") AS is_main_sleep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['timeInBed']") AS INTEGER) AS minute_in_bed,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['minutesAsleep']") AS INTEGER) AS minute_asleep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['minutesAfterWakeup']") AS INTEGER) AS minute_after_wakeup,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['minutesAwake']") AS INTEGER) AS minute_awake,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['levels']['summary']['restless']['minutes']") AS INTEGER) AS minute_restless,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['levels']['summary']['deep']['minutes']") AS INTEGER) AS minute_deep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['levels']['summary']['light']['minutes']") AS INTEGER) AS minute_light,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['levels']['summary']['rem']['minutes']") AS INTEGER) AS minute_rem,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[0]['levels']['summary']['wake']['minutes']") AS INTEGER) AS minute_wake
FROM `{dataset}.staging_daily_sleep_summary`
),

second_sleep_data AS (
SELECT DISTINCT
    CAST(REGEXP_EXTRACT(filename, 'ACTIVITY_MEASUREMENTS/([0-9]+)/') AS INT64) as vibrent_id,
    CAST(REPLACE(REGEXP_EXTRACT(filename, 'health/([0-9]{{4}}/[0-9]{{2}}/[0-9]{{2}})/'), '/', '-') AS DATE) as upload_date,
    CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['dateOfSleep']") AS DATE) AS sleep_date,
    JSON_EXTRACT_SCALAR(data,"$.sleep[1]['isMainSleep']") AS is_main_sleep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['timeInBed']") AS INTEGER) AS minute_in_bed,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['minutesAsleep']") AS INTEGER) AS minute_asleep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['minutesAfterWakeup']") AS INTEGER) AS minute_after_wakeup,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['minutesAwake']") AS INTEGER) AS minute_awake,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['levels']['summary']['restless']['minutes']") AS INTEGER) AS minute_restless,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['levels']['summary']['deep']['minutes']") AS INTEGER) AS minute_deep,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['levels']['summary']['light']['minutes']") AS INTEGER) AS minute_light,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['levels']['summary']['rem']['minutes']") AS INTEGER) AS minute_rem,
    SAFE_CAST(JSON_EXTRACT_SCALAR(data,"$.sleep[1]['levels']['summary']['wake']['minutes']") AS INTEGER) AS minute_wake
FROM `{dataset}.staging_daily_sleep_summary`
),

long_data AS (
    SELECT * FROM first_sleep_data
    UNION ALL 
    SELECT * FROM second_sleep_data
)

SELECT * EXCEPT(rank)
FROM (
    SELECT * , RANK() OVER(PARTITION BY vibrent_id, upload_date, sleep_date, is_main_sleep ORDER BY minute_in_bed DESC) AS rank
    FROM long_data
    )
WHERE rank = 1
AND sleep_date IS NOT NULL
"""

}

In [None]:
# Parsing driver: run every SQL template in `queries` against the staging
# dataset and append its output to the table named after the template's key.
project_id = 'aou-res-curation-prod'
dataset_name = 'fitbit_ingest'

for table_name in queries:
    query = queries[table_name]
    # Looked up but not used further below.
    source_table_id = src_table_id[table_name]
    destination_table_id = '.'.join((project_id, dataset_name, table_name))
    sql_query = query.format(dataset=dataset_name)
    print("Loading", table_name)
    result = parse_table_to_bigquery(sql_query, destination_table_id, project_id)
    print(result)

In [None]:
# Print the current local time so the cell output records when it was run.
print(datetime.datetime.now())

# Mapping Vibrent ID - Participant ID

In [None]:
import pandas as pd
import numpy as np
import mysql.connector
import sys
# Put the directory two levels up at the front of the import path
# (presumably where the `config` module lives -- confirm).
sys.path.insert(0, '../..')
from config import connect_options

# Open the MySQL connection used by the following cell; all connection
# settings come from `connect_options`.
con = mysql.connector.connect(**connect_options)

In [None]:
# Distinct (participant_id, vibrent_id) pairs from rdr.participant, where
# vibrent_id is external_id cast to an unsigned integer; rows without an
# external_id are excluded.
query = """
     SELECT DISTINCT
        participant_id,
        CAST(external_id AS UNSIGNED) as vibrent_id
    FROM rdr.participant
    WHERE external_id IS NOT NULL
"""

# Read the whole result into a DataFrame over the MySQL connection `con`.
rdr_pids = pd.read_sql_query(query, con=con)

In [None]:
# Publish the participant_id / vibrent_id pairs to BigQuery, replacing any
# existing `fitbit_ingest.id_mapping` table.
project_id = "aou-res-curation-prod"
destination_table = "fitbit_ingest.id_mapping"

rdr_pids.to_gbq(
    destination_table=destination_table,
    project_id=project_id,
    if_exists="replace",
)

In [None]:
# Print the current local time as the "last refreshed" marker for the Fitbit tables.
print('Latest update to fitbit data: ' + str(datetime.datetime.now()))

# Creating Views

In [None]:
import pandas as pd
# Defaults used by create_view below.
cur_project_id = 'aou-res-curation-prod'
dataset_name = 'fitbit_ingest'


def create_view(view_query, destination_view_name, dataset_name = dataset_name, project_id = cur_project_id):
    """Format a view DDL template and execute it in BigQuery.

    Args:
        view_query: DDL template containing the ``{cur_project_id}``,
            ``{dataset_name}`` and ``{destination_view_name}`` placeholders.
        destination_view_name: Name of the view to create.
        dataset_name: Dataset substituted into the template.
        project_id: Project used both for the BigQuery client and for the
            ``{cur_project_id}`` placeholder.
    """
    client = bigquery.Client(project_id)

    # Fill the template from the function's own arguments. The global
    # `cur_project_id` was used here before, so a caller-supplied `project_id`
    # changed the client but not the project the view was created in.
    ddl = view_query.format(
        cur_project_id=project_id,
        dataset_name=dataset_name,
        destination_view_name=destination_view_name,
    )
    query_job = client.query(ddl)
    # Block until the job has finished.
    query_job.result()

In [None]:
# DDL template for a participant-keyed, de-duplicated view over `{dataset_name}.device`.
# The placeholders (cur_project_id, dataset_name, destination_view_name) are filled in
# by create_view via str.format. Rows are joined to `id_mapping` on vibrent_id, which is
# replaced by participant_id (exposed as person_id); for each (vibrent_id, date) only the
# row with the most recent upload_date is kept.
# NOTE(review): plain CREATE VIEW -- presumably this fails if the view already exists;
# confirm whether CREATE OR REPLACE VIEW is wanted.
device_view_query = ''' 
    CREATE VIEW `{cur_project_id}.{dataset_name}.{destination_view_name}`
    AS (SELECT * EXCEPT (rn, upload_date)
        FROM ( SELECT m.participant_id AS person_id, 
                t.* EXCEPT (vibrent_id),
                ROW_NUMBER() OVER(PARTITION BY vibrent_id, date ORDER BY upload_date DESC) AS rn
                FROM `{dataset_name}.device` t
                JOIN `{dataset_name}.id_mapping` m USING(vibrent_id)
                )
        WHERE rn = 1)
    '''

In [None]:
# Create the `v_device` view using create_view's default dataset and project.
create_view(device_view_query, 'v_device')

In [None]:
## QC: CHECK IF THERE ARE DUP ROWS
# Re-run the SELECT behind the device view and count fully identical rows.
# The `{dataset_name}` placeholders must be substituted before the query is
# sent; previously the literal braces reached BigQuery.
qc_sql = '''
        SELECT * EXCEPT (rn, upload_date)
        FROM (
          SELECT 
            m.participant_id AS person_id, 
            t.* EXCEPT (vibrent_id),
            ROW_NUMBER() OVER(PARTITION BY vibrent_id, date ORDER BY upload_date DESC) AS rn
          FROM `{dataset_name}.device` t
          JOIN `{dataset_name}.id_mapping` m USING(vibrent_id)
        )
        WHERE rn = 1 
        '''.format(dataset_name=dataset_name)
df = pd.read_gbq(qc_sql, project_id=cur_project_id)

# Number of occurrences of each distinct row. dropna=False keeps rows that
# contain NULLs in the count; with the default they are silently skipped.
n_rows = df.groupby(df.columns.to_list(), dropna=False).size().to_frame('n_rows')
# Any row shown here appears more than once.
n_rows[n_rows.n_rows > 1]