In [None]:
import pandas as pd
# Read the CSV, then discard rows that are exact duplicates of an earlier row.
genomics_pids_raw = pd.read_csv('genomics_pids_not_in_cdr.csv')
genomics_pids_not_in_cdr = genomics_pids_raw.drop_duplicates()

In [None]:
import mysql.connector as mysql
# Open the MySQL connection that the pd.read_sql query further down runs against.
# NOTE(review): `connect_options` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel unless it is defined elsewhere (presumably a dict of
# connection keyword arguments) — confirm where it comes from.
con = mysql.connect(**connect_options)

In [None]:
# Unique person_id values from the CSV, kept as a set for the set-difference checks below.
genomics_pids_not_in_cdr_set = set(genomics_pids_not_in_cdr.person_id)

# `pids` is interpolated directly into SQL as `IN {pids}`, so its text form must be a valid
# SQL list. The values are cast to built-in int because NumPy >= 2 renders its scalars as
# `np.int64(123)` inside a tuple, which would corrupt the generated SQL. Sorting keeps the
# query text identical from run to run.
# NOTE(review): a one-element tuple renders as `(123,)` and an empty one as `()`, which are
# unlikely to be accepted in an IN clause — confirm the CSV always yields at least two ids.
pids = tuple(sorted(int(pid) for pid in genomics_pids_not_in_cdr_set))

In [None]:
# Cutoff date used to compute `age_at_cdr` in the query below.
c2022q456_cutoff_date = '2022-09-30'

# Consent, status and age columns for every participant whose research_id is in `pids`.
# Ages are day differences divided by 365.25.
# The sort key is named explicitly (it used to be the positional `ORDER BY 7`), so adding or
# reordering SELECT columns can no longer silently change the ordering.
consented_pids_query = f'''SELECT DISTINCT research_id
            , participant_id
            , consent_for_study_enrollment as primary_consent
            , consent_for_study_enrollment_authored as primary_consent_date
            , consent_for_electronic_health_records as ehr_consent
            , consent_for_electronic_health_records_authored as ehr_consent_date
            , questionnaire_on_the_basics
            , p.withdrawal_status
            , p.suspension_status
            , deceased_status
            , date_of_birth
            , (DATEDIFF('{c2022q456_cutoff_date}', date_of_birth)/365.25) as age_at_cdr
            , (DATEDIFF(consent_for_study_enrollment_authored, date_of_birth)/365.25) as age_at_consent
            FROM participant p
            JOIN participant_summary using(participant_id)
            WHERE research_id in {pids}
            ORDER BY questionnaire_on_the_basics
            '''

consented_pids = pd.read_sql(consented_pids_query, con)

In [None]:
# Display the result of the participant / participant_summary query.
consented_pids

In [None]:
# research_ids returned by the query that are not in the requested set.
set(consented_pids.research_id).difference(genomics_pids_not_in_cdr_set)

In [None]:
# Requested ids for which the query returned no row.
genomics_pids_not_in_cdr_set.difference(consented_pids.research_id)

In [None]:
# participant_id values of the matched participants, interpolated into SQL as `IN {pids2}`.
# Cast to built-in int: NumPy >= 2 renders its scalars as `np.int64(123)` inside a tuple,
# which would corrupt the generated SQL. `.unique()` already de-duplicates, and sorting
# keeps the query text identical from run to run.
pids2 = tuple(sorted(int(pid) for pid in consented_pids.participant_id.unique()))

def pids_in_cdr(dataset):
    """Count the participants of interest that have a row in `<dataset>.person`.

    A person is counted when its person_id appears in the module-level `pids2` or `pids`
    tuple.

    Args:
        dataset: BigQuery dataset name. A name containing an uppercase 'C' is queried in
            project 'aou-res-curation-output-prod'; any other name in
            'aou-res-curation-prod'.

    Returns:
        The DataFrame returned by pd.read_gbq, with the single column `n_in_<dataset>`.
    """
    project = 'aou-res-curation-output-prod' if 'C' in dataset else 'aou-res-curation-prod'

    count_query = f'''SELECT COUNT(DISTINCT person_id) as n_in_{dataset}
            FROM `{dataset}.person`
            where person_id IN {pids2} or person_id in {pids}'''
    return pd.read_gbq(count_query, project_id = project)

In [None]:
# Show, for each dataset, how many of the participants of interest appear in its person table.
check_cur_datasets = [
    'rdr20220908',
    '2022q4r3_rdr',
    'C2022Q4R6',
    '2022q4r3_unioned_ehr',
]
for dataset in check_cur_datasets:
    n_in_dataset = pids_in_cdr(dataset = dataset)
    display(n_in_dataset)


In [None]:
# Every distinct observation row in rdr20220908 whose person_id is in either id tuple.
obs_query = f'''SELECT DISTINCT *
            FROM `rdr20220908.observation`
            where person_id IN {pids2} or person_id in {pids}'''
obs = pd.read_gbq(obs_query, project_id = 'aou-res-curation-prod')

In [None]:
dataset = 'rdr20220908'

# Number of participants of interest with at least one observation under each of the three
# survey modules named below. Lines starting with `##` are SQL comments (disabled filters).
ppi_query = f"""
    SELECT
    count(distinct person_id) as n_participants
    ,concept_name as survey
    FROM
    `{dataset}.concept`
    join `{dataset}.concept_ancestor` on (concept_id=ancestor_concept_id)
    join `{dataset}.observation` on (descendant_concept_id=observation_concept_id)
    ##join `{dataset}.observation_ext` using(observation_id)
    WHERE observation_concept_id not in (40766240,43528428,1585389) --hcau vocab issues
    and concept_class_id='Module'
    and concept_name IN ('The Basics', 'Overall Health', 'Lifestyle')
    ##, 'Family History', 'Personal Medical History', 'Healthcare Access & Utilization')
    ##and src_id='PPI/PM'
    ##and questionnaire_response_id is not null
    and (person_id IN {pids2} or person_id in {pids})
    GROUP BY 2"""

# The project is passed explicitly, matching the other rdr20220908 queries in this notebook,
# instead of relying on whatever default project the environment supplies.
# Note: this overwrites any earlier `obs` frame with the per-survey counts.
obs = pd.read_gbq(ppi_query, dialect = 'standard', project_id = 'aou-res-curation-prod')
obs

In [None]:
dataset = 'rdr20220908'

# Distinct (person_id, observation_date) pairs for observations that descend from the
# 'Consent PII' module concept, limited to the participants of interest.
consent_query = f"""
    SELECT DISTINCT person_id, observation_date AS primary_consent_date
    FROM  `{dataset}.concept`
    inner join `{dataset}.concept_ancestor` on concept_id = ancestor_concept_id
    inner join `{dataset}.observation` on descendant_concept_id = observation_concept_id

    WHERE concept_name = 'Consent PII' AND concept_class_id = 'Module'
    and (person_id IN {pids2} or person_id in {pids})
"""

# The project is passed explicitly, matching the other rdr20220908 queries in this notebook,
# instead of relying on whatever default project the environment supplies.
consent = pd.read_gbq(consent_query, dialect = 'standard', project_id = 'aou-res-curation-prod')
consent

In [None]:
import pandas as pd

## List of participants with any EHR data
def get_ehr_df1(dataset, cutoff_date=None):
    """Return the earliest and latest record date per participant of interest.

    Gathers (person_id, date) pairs from seven tables of `dataset` (measurement,
    condition_occurrence, device_exposure, drug_exposure, observation,
    procedure_occurrence, visit_occurrence) and reduces them to one row per person.
    Only persons whose person_id is in the module-level `pids2` or `pids` tuple are kept.

    Args:
        dataset: BigQuery dataset name. A name containing an uppercase 'C' is queried in
            project 'aou-res-curation-output-prod'; any other name in
            'aou-res-curation-prod'.
        cutoff_date: Optional ISO date (YYYY-MM-DD string or datetime.date). When given,
            only records dated on or before it are considered. The default (None) applies
            no date restriction.

    Returns:
        DataFrame with columns person_id, min_ehr_data_date and max_ehr_data_date,
        ordered by min_ehr_data_date.

    Raises:
        ValueError: if `cutoff_date` is given but is not a valid ISO date.
    """
    from datetime import date

    project = 'aou-res-curation-output-prod' if 'C' in dataset else 'aou-res-curation-prod'

    # (table, date column) pairs that feed the union; one SELECT is generated per pair.
    ehr_date_columns = (
        ('measurement', 'measurement_date'),
        ('condition_occurrence', 'condition_start_date'),
        ('device_exposure', 'device_exposure_start_date'),
        ('drug_exposure', 'drug_exposure_start_date'),
        ('observation', 'observation_date'),
        ('procedure_occurrence', 'procedure_date'),
        ('visit_occurrence', 'visit_start_date'),
    )
    ehr_union = "\n    UNION DISTINCT\n".join(
        f"    SELECT DISTINCT person_id, {date_column} AS ehr_data_date\n"
        f"    FROM `{dataset}.{table}`"
        for table, date_column in ehr_date_columns
    )

    cutoff_clause = ''
    if cutoff_date is not None:
        # Round-trip through date so that only a real ISO date can reach the SQL text.
        cutoff_iso = date.fromisoformat(str(cutoff_date)).isoformat()
        cutoff_clause = f"\n    AND ehr_data_date <= '{cutoff_iso}'"

    # GROUP BY person_id already yields one row per person, so no outer DISTINCT is needed.
    query = f"""
    WITH ehr AS (
{ehr_union}
    )
    SELECT person_id, DATE(MIN(ehr_data_date)) AS min_ehr_data_date
    , DATE(MAX(ehr_data_date)) AS max_ehr_data_date
    FROM ehr
    WHERE (person_id IN {pids2} OR person_id IN {pids}){cutoff_clause}
    GROUP BY 1
    ORDER BY min_ehr_data_date
    """

    ehr_df = pd.read_gbq(query, dialect = 'standard', project_id = project)

    return ehr_df

In [None]:
!pip install sqldf

In [None]:
!pip install pandasql

In [None]:
# Import libraries
import pandas as pd
from pandasql import sqldf

# Create a dummy pd.Dataframe
df = ehr_data_dates

# Define a SQL (SQLite3) query
query = """
SELECT *
FROM df
where min_ehr_data_date > '1990-07-01'
"""

# Run the query
#sqldf.run(query)
sqldf(query)

In [None]:
ehr_data_dates = get_ehr_df1('2022q4r3_unioned_ehr')
ehr_data_dates

In [None]:
pd.to_datetime(ehr_data_dates_and_consent['max_ehr_data_date']) > pd.to_datetime(ehr_data_dates_and_consent['ehr_consent_date'])

In [None]:
ehr_data_dates_and_consent = ehr_data_dates.rename(columns = {'person_id':"participant_id"}).merge(consented_pids[['participant_id','ehr_consent_date']])
ehr_data_dates_and_consent['min_ehr_data_date > ehr_consent_date'] = pd.to_datetime(ehr_data_dates_and_consent['min_ehr_data_date']) > pd.to_datetime(ehr_data_dates_and_consent['ehr_consent_date'])
ehr_data_dates_and_consent['max_ehr_data_date > ehr_consent_date'] = pd.to_datetime(ehr_data_dates_and_consent['max_ehr_data_date']) > pd.to_datetime(ehr_data_dates_and_consent['ehr_consent_date'])
ehr_data_dates_and_consent

In [None]:
#EHR data before the EHR cut off
def get_ehr_df3(dataset, cutoff_date='2022-07-01'):
    """Return the earliest and latest record date per participant, up to a cutoff date.

    Gathers (person_id, date) pairs from seven tables of `dataset` (measurement,
    condition_occurrence, device_exposure, drug_exposure, observation,
    procedure_occurrence, visit_occurrence), keeps the records dated on or before
    `cutoff_date`, and reduces them to one row per person. Only persons whose person_id is
    in the module-level `pids2` or `pids` tuple are kept.

    Args:
        dataset: BigQuery dataset name. A name containing an uppercase 'C' is queried in
            project 'aou-res-curation-output-prod'; any other name in
            'aou-res-curation-prod'.
        cutoff_date: ISO date (YYYY-MM-DD string or datetime.date); records after this date
            are ignored. Defaults to '2022-07-01', the value that used to be hard-coded.

    Returns:
        DataFrame with columns person_id, min_ehr_data_date and max_ehr_data_date,
        ordered by min_ehr_data_date.

    Raises:
        ValueError: if `cutoff_date` is not a valid ISO date.
    """
    from datetime import date

    project = 'aou-res-curation-output-prod' if 'C' in dataset else 'aou-res-curation-prod'

    # Round-trip through date so that only a real ISO date can reach the SQL text.
    cutoff_iso = date.fromisoformat(str(cutoff_date)).isoformat()

    # (table, date column) pairs that feed the union; one SELECT is generated per pair.
    ehr_date_columns = (
        ('measurement', 'measurement_date'),
        ('condition_occurrence', 'condition_start_date'),
        ('device_exposure', 'device_exposure_start_date'),
        ('drug_exposure', 'drug_exposure_start_date'),
        ('observation', 'observation_date'),
        ('procedure_occurrence', 'procedure_date'),
        ('visit_occurrence', 'visit_start_date'),
    )
    ehr_union = "\n    UNION DISTINCT\n".join(
        f"    SELECT DISTINCT person_id, {date_column} AS ehr_data_date\n"
        f"    FROM `{dataset}.{table}`"
        for table, date_column in ehr_date_columns
    )

    # GROUP BY person_id already yields one row per person, so no outer DISTINCT is needed.
    query = f"""
    WITH ehr AS (
{ehr_union}
    )
    SELECT person_id, DATE(MIN(ehr_data_date)) AS min_ehr_data_date
    , DATE(MAX(ehr_data_date)) AS max_ehr_data_date
    FROM ehr
    WHERE (person_id IN {pids2} OR person_id IN {pids})
    AND ehr_data_date <= '{cutoff_iso}'
    GROUP BY 1
    ORDER BY min_ehr_data_date
    """

    ehr_df = pd.read_gbq(query, dialect = 'standard', project_id = project)

    return ehr_df

In [None]:
# Per-participant EHR date range, restricted by the date cutoff applied inside get_ehr_df3.
ehr_data_before_cutoff = get_ehr_df3(dataset = '2022q4r3_unioned_ehr')
ehr_data_before_cutoff

In [None]:
def check_actual_ehr_date(ehr_table, date_field, dataset = '2022q4r3_unioned_ehr', cutoff_date = '2022-07-01'):
    """Return rows of one table dated on/after the participant's EHR consent date.

    Pulls the distinct rows of `<dataset>.<ehr_table>` for persons in the module-level
    `pids2` or `pids` tuple whose `date_field` is on or before `cutoff_date`, joins the
    EHR consent date from the module-level `consented_pids` frame, and keeps the rows
    whose `date_field` is on or after that consent date. The unique person_ids of the
    kept rows are displayed as a side effect.

    Args:
        ehr_table: Table name inside `dataset`, e.g. 'condition_occurrence'.
        date_field: Name of the date column of `ehr_table` to compare.
        dataset: BigQuery dataset name. A name containing an uppercase 'C' is queried in
            project 'aou-res-curation-output-prod'; any other name in
            'aou-res-curation-prod'.
        cutoff_date: ISO date (YYYY-MM-DD string or datetime.date); rows after this date
            are not fetched. Defaults to '2022-07-01', the value that used to be
            hard-coded.

    Returns:
        DataFrame of the kept rows, with the added columns `ehr_consent_date` and
        `ehr_data_after_ehr_consent` (True on every returned row).

    Raises:
        ValueError: if `cutoff_date` is not a valid ISO date.
    """
    from datetime import date

    project = 'aou-res-curation-output-prod' if 'C' in dataset else 'aou-res-curation-prod'

    # Round-trip through date so that only a real ISO date can reach the SQL text.
    cutoff_iso = date.fromisoformat(str(cutoff_date)).isoformat()

    query = f"""SELECT DISTINCT *
            FROM `{dataset}.{ehr_table}`
            where (person_id IN {pids2} or person_id in {pids})
            and {date_field} <= '{cutoff_iso}'
            """
    ehr_df = pd.read_gbq(query, dialect = 'standard', project_id = project)

    # consented_pids is keyed by participant_id; rename so it joins on person_id.
    consent_dates = (
        consented_pids[['participant_id', 'ehr_consent_date']]
        .rename(columns = {'participant_id': 'person_id'})
    )
    ehr_df = ehr_df.merge(consent_dates, on = 'person_id')

    ehr_df['ehr_data_after_ehr_consent'] = (
        pd.to_datetime(ehr_df[date_field]) >= pd.to_datetime(ehr_df['ehr_consent_date'])
    )
    # The flag column is already boolean, so it can be used directly as the row mask.
    ehr_df = ehr_df[ehr_df['ehr_data_after_ehr_consent']]

    display(ehr_df.person_id.unique())
    return ehr_df

In [None]:
# condition_occurrence rows whose condition_start_date is on or after the EHR consent date.
check_actual_ehr_date(
    ehr_table = 'condition_occurrence',
    date_field = 'condition_start_date',
    dataset = '2022q4r3_unioned_ehr',
)

In [None]:
# `get_ehr_df` is not defined in any visible cell (only get_ehr_df1 and get_ehr_df3 are), so
# the original call raised NameError on a fresh kernel.
# NOTE(review): get_ehr_df1 is assumed to be the intended function — confirm.
get_ehr_df1('2022q4r3_unioned_ehr')