Common tables used across analyses. We save them to the bucket here so that they are downloaded only once.

In [1]:
# Common imports and constants
import os
import pandas as pd

# Workspace settings are taken from the environment; a missing variable
# raises KeyError right here instead of failing later in a query or upload.
DATASET = os.environ["WORKSPACE_CDR"]      # interpolated into the SQL table names
BUCKET = os.environ["WORKSPACE_BUCKET"]    # base location for the saved tables
DATA_DIR = BUCKET + '/data_v1'             # versioned sub-folder for this notebook's outputs

## Demographics and event count table for all WGS participants

In [2]:
# Get event count statistics
#
# The first subquery summarizes condition_occurrence per person (number of
# distinct condition start dates plus the earliest and latest date), restricted
# to people flagged with has_whole_genome_variant = 1. The second subquery
# supplies sex_at_birth, dob and age_at_cdr from cb_search_person.
# NOTE: the two are combined with an INNER JOIN, so a participant with no rows
# in condition_occurrence does not appear in the result at all.

count_events_sql = f"""
    SELECT *
    FROM (
        SELECT
            person_id,
            COUNT(DISTINCT condition_start_date) AS n_events,
            MIN(condition_start_date) AS first_event,
            MAX(condition_start_date) AS last_event
        FROM `{DATASET}.condition_occurrence`
        WHERE
            person_id IN (
                SELECT person_id FROM `{DATASET}.cb_search_person`
                WHERE has_whole_genome_variant = 1
            )
        GROUP BY person_id
    ) INNER JOIN (
        SELECT
            person_id,
            sex_at_birth,
            dob,
            age_at_cdr
        FROM `{DATASET}.cb_search_person`
    ) USING (person_id)
"""

count_events_df = pd.read_gbq(
    count_events_sql,
    dialect='standard',
    index_col='person_id',
    progress_bar_type='tqdm_notebook',
    # Enabled whenever the variable is present; its value is not inspected.
    use_bqstorage_api='BIGQUERY_STORAGE_API_ENABLED' in os.environ,
)

# Convert the date-valued columns to pandas datetimes, one column at a time.
for date_column in ('first_event', 'last_event', 'dob'):
    count_events_df[date_column] = pd.to_datetime(count_events_df[date_column])

In [3]:
# Show the assembled table as the cell output for a quick visual check.
count_events_df

In [4]:
# Write the demographics / event-count table as a tab-separated file under
# DATA_DIR. The person_id index is written as the first column.
demo_path = f'{DATA_DIR}/demo.tsv.gz'
count_events_df.to_csv(demo_path, sep='\t')

## Ancestry PCA table

In [5]:
# Load the ancestry predictions TSV straight from cloud storage; the read is
# made with the requester-pays option switched on.
ancestry_df = pd.read_csv(
    'gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv',
    sep='\t',
    storage_options={'requester_pays': True},
)

In [6]:
# Ancestry PCA
import json

# Each `pca_features` value is a JSON string (presumably an array of principal
# component scores -- confirm against the source table). Every string is parsed
# and expanded into one column per element, named pca_0, pca_1, ...
#
# The parsed values are collected into a single DataFrame in one step rather
# than building a pandas Series per row via `apply`, which is very slow on
# large tables and breaks on an empty table (where `apply` returns a Series
# that has no `columns` to rename).
pca_df = pd.concat(
    [
        ancestry_df[['research_id']],
        pd.DataFrame(
            ancestry_df['pca_features'].map(json.loads).tolist(),
            # Keep the original row labels so the column-wise concat aligns.
            index=ancestry_df.index,
        ).rename(columns=lambda c: f'pca_{c}'),
    ],
    axis='columns',
)

In [7]:
# Show the expanded PCA table as the cell output for a quick visual check.
pca_df

In [8]:
# Write the PCA table as a tab-separated file under DATA_DIR. The row index is
# omitted, so research_id is the first column.
pca_path = f'{DATA_DIR}/ancestry_pca.tsv.gz'
pca_df.to_csv(pca_path, sep='\t', index=False)