# Set Up

In [None]:
import pandas as pd
import os

In [None]:
# BigQuery dataset (project.dataset) that every query in this notebook is run against.
# The commented-out line is an alternative dataset kept for quick switching.
#dataset = 'aou-res-curation-output-prod.C2022Q4R11'
dataset = 'aou-res-curation-output-prod.C2024Q3R4'
# CDR version label; used in the name of the output Excel file.
version = 'v8'

In [None]:
# Fetch a single row of the dataset's __TABLES__ metadata for a quick look at its columns.
pd.read_gbq(f"""SELECT DISTINCT * FROM `{dataset}`.__TABLES__ LIMIT 1""")

## Get table sizes and estimate the size per row

In [None]:
# List the tables whose name contains 'era' or 'period'.
pd.read_gbq(f"""SELECT table_id as table_name
FROM `{dataset}`.__TABLES__
WHERE table_id LIKE '%era%' or table_id like '%period%'
 """)

In [None]:
# Row count, total size and average bytes per row for every non-empty table
# in the dataset selected above.
# Tables whose name contains prep_, ds_, cb_, achilles_ or _cdr are excluded,
# as is drug_strength.
# STRPOS is used rather than LIKE because `_` is a single-character wildcard
# in LIKE patterns: '%ds_%' matches "ds" followed by ANY character, not only a
# literal underscore.
tables_size = pd.read_gbq(f"""SELECT table_id as table_name
                                    , CAST(row_count AS INTEGER) as total_row_count
                                    , CAST(size_bytes AS INTEGER) as total_size_bytes
                                    , CAST(size_bytes/row_count  AS NUMERIC) as byte_per_row
                                    , CAST(size_bytes/1000000000 AS NUMERIC) as total_size_GB
                                FROM `{dataset}`.__TABLES__
                                WHERE row_count > 0
                                AND STRPOS(LOWER(table_id), 'prep_') = 0
                                AND STRPOS(LOWER(table_id), 'ds_') = 0
                                AND STRPOS(LOWER(table_id), 'cb_') = 0
                                AND STRPOS(LOWER(table_id), 'achilles_') = 0
                                AND STRPOS(LOWER(table_id), '_cdr') = 0
                                AND table_id != 'drug_strength' # it is a concept table
                                """)

In [None]:
tables_size

In [None]:
all_tables = sorted(tables_size.table_name.unique())
all_tables

## Function - estimate total data size

In [None]:
def size_estimator(row_count_df, data_type, tables_size= tables_size):
    """Estimate the storage size of the rows counted in ``row_count_df``.

    Each table's row count is multiplied by that table's average bytes per
    row (``byte_per_row`` in ``tables_size``).

    Parameters
    ----------
    row_count_df : pandas.DataFrame
        One row per table, with a ``data_table`` column (table name) and a
        ``data_row_count`` column. Any other columns are carried through and
        summed in the grouped output.
    data_type : str
        Label written to the ``Data Type`` column of the grouped output.
    tables_size : pandas.DataFrame
        Must contain ``table_name`` and ``byte_per_row``. The default is
        bound once, when this definition is executed, so re-run this cell
        after re-querying ``tables_size``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``tbs_row_counts``: the input rows plus ``byte_per_row``,
        ``est_size_byte`` and ``est_size_GB`` (rounded to whole GB).
        ``data_size``: the same columns summed into a single row indexed by
        (``Data Type``, ``Tables``).
    """
    bytes_per_row = tables_size[['table_name', 'byte_per_row']].rename(columns={'table_name': 'data_table'})
    # Inner join: tables that are absent from tables_size are dropped.
    tbs_row_counts = row_count_df.merge(bytes_per_row, on='data_table')

    # Round BEFORE casting to int64; casting first truncates the fraction and
    # turns the rounding into a no-op.
    est_bytes = tbs_row_counts['data_row_count'] * tbs_row_counts['byte_per_row'].astype('float')
    tbs_row_counts['est_size_byte'] = est_bytes.round().astype('int64')
    tbs_row_counts['est_size_GB'] = (tbs_row_counts['est_size_byte'] / 1000000000).astype('float64').round()

    data_size = tbs_row_counts.drop('data_table', axis=1)
    data_size['Data Type'] = data_type
    data_size['Tables'] = ', '.join(tbs_row_counts['data_table'].unique())
    # NOTE: every remaining column is summed across tables, so per-table
    # figures such as byte_per_row (or a per-table participant count) become
    # sums of per-table values in the grouped row.
    data_size = data_size.groupby(['Data Type', 'Tables']).sum()
    display(data_size)
    print('returning data_tables_sizes, grouped_data_size')
    return tbs_row_counts, data_size

-------------

# Query row counts per data type

## EHR

In [None]:
# Sorted names of the tables containing '_ext', leaving out person_ext and survey_conduct_ext.
EHR_ext_tables = sorted({
    name for name in all_tables
    if '_ext' in name and name not in ('person_ext', 'survey_conduct_ext')
})

In [None]:
#list(sorted(set([i for i in all_tables if '_ext' not in i and i not in EHR_ext_tables+EHR_tables])))

In [None]:
# Tables added to the EHR list by hand rather than derived from an *_ext table name.
add_EHR_table = ['death', 'condition_era', 'drug_era', 'observation_period']
# Every *_ext name with '_ext' removed, plus the hand-added tables, de-duplicated and sorted.
EHR_tables = sorted({ext_name.replace('_ext', '') for ext_name in EHR_ext_tables} | set(add_EHR_table))

In [None]:
# Column names of every EHR *_ext table, src_id excluded.
ehr_ext_tbs = tuple(EHR_ext_tables)
# Build the IN list explicitly: interpolating the tuple itself renders a
# one-element tuple as ('x',), leaving a trailing comma inside the IN list.
ehr_ext_in_list = ', '.join(f"'{tb}'" for tb in ehr_ext_tbs)
ehr_ext_schemas = pd.read_gbq(f"""SELECT DISTINCT table_name, column_name
                                FROM `{dataset}`.INFORMATION_SCHEMA.COLUMNS
                                WHERE table_name in ({ehr_ext_in_list})
                                and column_name != 'src_id'
                                """)

In [None]:
# Build one SELECT per EHR table (distinct participants and row count) and
# glue them together with UNION ALL.
ehr_selects = []

# Tables with an *_ext companion: join to it and keep only the rows whose
# src_id contains "ehr". The match is case-insensitive and identical for
# every table.
for ehr_tb in [t for t in EHR_tables if t not in add_EHR_table]:
    # First non-src_id column listed for the *_ext table; used as the join key.
    ehr_ext_col = ehr_ext_schemas[ehr_ext_schemas.table_name == ehr_tb+'_ext'].column_name.values[0]
    ehr_selects.append(f'''
        SELECT '{ehr_tb}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count
        FROM `{dataset}.{ehr_tb}`
        JOIN `{dataset}.{ehr_tb}_ext` USING({ehr_ext_col})
        WHERE LOWER(src_id) LIKE '%ehr%' group by 1 ''')

# Hand-added tables are counted in full: no src_id filter is applied to them.
# NOTE(review): presumably they have no usable *_ext companion - confirm.
for ehr_full_tb in add_EHR_table:
    ehr_selects.append(f'''
        SELECT '{ehr_full_tb}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count
        FROM `{dataset}.{ehr_full_tb}` ''')

QUERY = '''
        UNION ALL'''.join(ehr_selects)

In [None]:
# Run the combined EHR query built above.
ehr_tbs_row_counts = pd.read_gbq(QUERY)

In [None]:
# Per-table and grouped size estimates for the EHR tables.
ehr_tbs_row_counts1, ehr_data_size = size_estimator(ehr_tbs_row_counts, data_type = 'EHR')

In [None]:
ehr_data_size

In [None]:
ehr_tbs_row_counts1

In [None]:
# Tag the per-table estimates as EHR and index them by (Data Type, data_table).
ehr_tbs_row_counts1 = (
    ehr_tbs_row_counts1
    .assign(**{'Data Type': 'EHR'})
    .set_index(['Data Type', 'data_table'])
)
ehr_tbs_row_counts1

In [None]:
# Sanity check: put the estimates next to the full-table figures from tables_size.
# Once the (Data Type, data_table) index has been set, data_table is no longer a
# column, and an implicit merge would silently match on byte_per_row alone.
# Restore the column when needed and name the join keys explicitly.
ehr_check = ehr_tbs_row_counts1
if 'data_table' not in ehr_check.columns:
    ehr_check = ehr_check.reset_index()
ehr_check.merge(
    tables_size.rename(columns = {'table_name':'data_table'}),
    on = ['data_table', 'byte_per_row'],
)

## Survey

In [None]:
# Survey rows are counted from the observation table.
survey_tb = 'observation'

In [None]:
# List the distinct src_id values present in the table's *_ext companion.
pd.read_gbq(f'''SELECT distinct src_id FROM `{dataset}.{survey_tb}_ext`''')

In [None]:
# Participants and rows of the survey table whose src_id does not contain 'EHR'.
s_query = f'''
        SELECT '{survey_tb}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count

        FROM `{dataset}.{survey_tb}`
        JOIN `{dataset}.{survey_tb}_ext` USING(observation_id)
        WHERE src_id NOT LIKE '%EHR%' group by 1 
        
        
        
        '''

In [None]:
# Run the survey count query.
survey_tbs_row_counts = pd.read_gbq(s_query)

In [None]:
# Per-table and grouped size estimates for the survey data.
survey_tbs_row_counts1, s_data_size = size_estimator(survey_tbs_row_counts, data_type = 'Survey')

In [None]:
# Put the estimate next to the full-table figures from tables_size.
survey_tbs_row_counts1.merge(tables_size.rename(columns = {'table_name':'data_table'})) 

In [None]:
# Tag the per-table estimates as Survey and index them by (Data Type, data_table).
survey_tbs_row_counts1 = (
    survey_tbs_row_counts1
    .assign(**{'Data Type': 'Survey'})
    .set_index(['Data Type', 'data_table'])
)
survey_tbs_row_counts1

## Fitbit

In [None]:
# Tables counted as Fitbit data, in the order they are queried.
fitbit_tables = [
    'heart_rate_minute_level',
    'heart_rate_summary',
    'sleep_level',
    'sleep_daily_summary',
    'activity_summary',
    'steps_intraday',
    'device',
]
fitbit_tables

In [None]:
# Seed the query with the first Fitbit table ...
fb_tb0 = fitbit_tables[0]
QUERY = f"""
        SELECT '{fb_tb0}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count
        FROM `{dataset}.{fb_tb0}` group by 1      
        """

# ... then append one UNION ALL branch for each of the other tables.
for fb_tb in (name for name in fitbit_tables if name != fb_tb0):
    query = f'''
        UNION ALL
        SELECT '{fb_tb}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count
        FROM `{dataset}.{fb_tb}` group by 1 '''
    QUERY += query

In [None]:
# Run the combined Fitbit query built above.
fb_tbs_row_counts = pd.read_gbq(QUERY)

In [None]:
# Per-table and grouped size estimates for the Fitbit tables.
fb_tbs_row_counts1, fb_data_size = size_estimator(fb_tbs_row_counts, data_type = 'Fitbit')

In [None]:
# Put the estimates next to the full-table figures from tables_size.
fb_tbs_row_counts1.merge(tables_size.rename(columns = {'table_name':'data_table'})) 

In [None]:
fb_tbs_row_counts1.columns

In [None]:
# Tag the per-table estimates as Fitbit and index them by (Data Type, data_table).
fb_tbs_row_counts1 = (
    fb_tbs_row_counts1
    .assign(**{'Data Type': 'Fitbit'})
    .set_index(['Data Type', 'data_table'])
)
fb_tbs_row_counts1

In [None]:
# Distinct participants that appear in the device table and/or are flagged
# has_fitbit = 1 in cb_search_person (the FULL OUTER JOIN keeps ids from either side).
display(pd.read_gbq(f'''SELECT count(distinct person_id) n
        FROM (SELECT DISTINCT person_id FROM `{dataset}.device`)
        FULL OUTER JOIN (SELECT DISTINCT person_id FROM `{dataset}.cb_search_person` WHERE has_fitbit =1) using(person_id)
        '''))

## Physical Measurement

In [None]:
# Physical measurements are counted from the measurement table: rows whose
# src_id does not contain 'EHR'. The line starting with ### inside the query
# is a disabled alternative filter (src_id containing 'ppi').
pm_tb = 'measurement'
pm_query = f'''
        SELECT '{pm_tb}' AS data_table
        , count(distinct person_id) as unique_participants_count, count(*) as data_row_count
        FROM `{dataset}.{pm_tb}`
        JOIN `{dataset}.{pm_tb}_ext` USING(measurement_id)
        ###WHERE LOWER(src_id) LIKE '%ppi%' 
        WHERE src_id NOT LIKE '%EHR%' group by 1 '''

# Run the physical-measurement count query.
pm_tbs_row_counts = pd.read_gbq(pm_query)

In [None]:
# Per-table and grouped size estimates for the physical measurements.
pm_tbs_row_counts1, pm_data_size = size_estimator(pm_tbs_row_counts, data_type = 'Physical Measurements')

In [None]:
# Put the estimate next to the full-table figures from tables_size.
pm_tbs_row_counts1.merge(tables_size.rename(columns = {'table_name':'data_table'})) 

In [None]:
# Tag the per-table estimates as Physical Measurements and index them by (Data Type, data_table).
pm_tbs_row_counts1 = (
    pm_tbs_row_counts1
    .assign(**{'Data Type': 'Physical Measurements'})
    .set_index(['Data Type', 'data_table'])
)
pm_tbs_row_counts1

# Final DF

In [None]:
# Output workbook, named after the CDR version.
filename = f'CDR{version}_data_types_sizes.xlsx'
# The writer stays open across the following cells and is closed by writer.close() in the last cell.
writer = pd.ExcelWriter(filename)

In [None]:
# Stack the grouped size estimates of the four data types (EHR, Survey, Fitbit, Physical Measurements).
data_size_df = pd.concat([ehr_data_size, s_data_size, fb_data_size, pm_data_size])
data_size_df

In [None]:
# Distinct participants per data-type flag in cb_search_person, one query per flag.
for _alias, _flag in [
    ('n_ehr', 'has_ehr_data'),
    ('n_fitbit', 'has_fitbit'),
    ('n_pm', 'has_physical_measurement_data'),
    ('n_ppi', 'has_ppi_survey_data'),
]:
    display(pd.read_gbq(f'''SELECT count(distinct person_id) {_alias}
        FROM `{dataset}.cb_search_person` WHERE {_flag} = 1  '''))

In [None]:
# Write the per-data-type totals to their own sheet. sheet_name is passed by
# keyword: passing it positionally is deprecated in pandas.
data_size_df.to_excel(writer, sheet_name='By Data Type')

In [None]:
# Stack the per-table estimates of all four data types and drop the byte_per_row column.
data_table_size_df = pd.concat(
    [ehr_tbs_row_counts1, survey_tbs_row_counts1, fb_tbs_row_counts1, pm_tbs_row_counts1]
).drop(columns='byte_per_row')
data_table_size_df

In [None]:
# Write the per-table breakdown to a second sheet. sheet_name is passed by
# keyword: passing it positionally is deprecated in pandas.
data_table_size_df.to_excel(writer, sheet_name='By Data Type and Table')

In [None]:
# Close the ExcelWriter opened earlier in this section.
writer.close()