We found that the condition code for Long QT has a surprisingly small overlap with the set of participants that appear to have long QT according to their vitals measurement data.
Consequently, we are going to compute Schwartz scores for all participants and include all participants that have a high probability of having long QT based on their Schwartz score in the long QT case pool as well.
Moreover, we will exclude those participants that have an intermediate or higher probability of having long QT from the control pool for long QT.

## To review

This notebook is a WIP.
Action items

- [x] Copy over relevant code from Nistha's project: https://workbench.researchallofus.org/workspaces/aou-rw-873402f7/networkbasedhypothesisforlongqtpatientsv8/analysis/preview/define_cohort.ipynb
- [x] Remove dependence on ChY detection (we'll go by sex assigned at birth)
- [x] Nistha uses ~~last~~ worst QT measurement for scoring, this is what we should use.
- [x] Clean up code to only what we need.
- [ ] Ensure that only participants with elongated QT measurements get a Schwartz score
- [ ] Propagate the timestamp from the earliest LQT measurement that brings a participant over the threshold to the output "time of diagnosis"

# Schwartz calculation

In [1]:
# Common imports
import os
import pandas as pd

In [2]:
# Conditions for cohorts.
# Pulls condition_occurrence rows (with concept names joined in) whose
# condition_concept_id belongs to a standard, selectable cb_criteria entry whose `path`
# contains the criteria id of one of the concepts 135360, 314664, 4008859, 4135823,
# 4142566, 4169095, 4318712.
# Participants are limited to those with has_whole_genome_variant = 1 who also have at
# least one standard event for concept 3026258 or 46235174 in cb_search_all_events
# (presumably the QTc measurement concepts -- the same ids are read from the
# measurement table in the QTc cell; TODO confirm).
conds_sql = """
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_concept_id,
        c_standard_concept.concept_name as standard_concept_name,
        c_standard_concept.concept_code as standard_concept_code,
        c_standard_concept.vocabulary_id as standard_vocabulary,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_end_datetime,
        c_occurrence.condition_type_concept_id,
        c_type.concept_name as condition_type_concept_name,
        c_occurrence.stop_reason,
        c_occurrence.visit_occurrence_id,
        visit.concept_name as visit_occurrence_concept_name,
        c_occurrence.condition_source_value,
        c_occurrence.condition_source_concept_id,
        c_source_concept.concept_name as source_concept_name,
        c_source_concept.concept_code as source_concept_code,
        c_source_concept.vocabulary_id as source_vocabulary,
        c_occurrence.condition_status_source_value,
        c_occurrence.condition_status_concept_id,
        c_status.concept_name as condition_status_concept_name 
    FROM
        ( SELECT
            * 
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                    WHERE
                        concept_id IN (135360, 314664, 4008859, 4135823, 4142566, 4169095, 4318712)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                c_occurrence.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        person_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
                    WHERE
                        has_whole_genome_variant = 1 ) 
                    AND cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        WHERE
                            (concept_id IN (3026258, 46235174) 
                            AND is_standard = 1 )) criteria ) )
            )) c_occurrence 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_standard_concept 
            ON c_occurrence.condition_concept_id = c_standard_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_type 
            ON c_occurrence.condition_type_concept_id = c_type.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
            ON c_occurrence.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` visit 
            ON v.visit_concept_id = visit.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_source_concept 
            ON c_occurrence.condition_source_concept_id = c_source_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_status 
            ON c_occurrence.condition_status_concept_id = c_status.concept_id"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
conds = pd.read_gbq(
    conds_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
conds.head(5)

In [3]:
# Split the condition table by standard concept name into the two subsets used below.

# Clinical-history conditions that contribute points to the Schwartz score.
schwartz_conds = conds.loc[
    conds['standard_concept_name'].isin([
        'Syncope and collapse',
        'Syncope',
        'Cardiac syncope',
        'Stokes-Adams syncope',
        'Bradycardia',
        'Torsades de pointes',
    ])
]

# Conditions that directly name LQTS or QT prolongation.
lqts = conds.loc[
    conds['standard_concept_name'].isin(['Long QT syndrome', 'Prolonged QT interval'])
]

# Preview both subsets, Schwartz conditions first.
display(schwartz_conds.head(5))
display(lqts.head(5))

In [4]:
# QTc measurements.
# Pulls measurement rows (with concept names joined in) whose measurement_concept_id
# belongs to a standard, selectable cb_criteria entry whose `path` contains the criteria
# id of concept 3026258 or 46235174.
# Participants are limited to those with has_whole_genome_variant = 1 who have at least
# one standard event for one of those two concepts in cb_search_all_events.
# NOTE: one join keyword in the query is spelled `LEFT JOIn`; SQL keywords are
# case-insensitive, so the query is unaffected.
qtc_sql = """
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                    WHERE
                        concept_id IN (3026258, 46235174)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        person_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
                    WHERE
                        has_whole_genome_variant = 1 ) 
                    AND cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        WHERE
                            (concept_id IN (3026258, 46235174) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `""" + os.environ["WORKSPACE_CDR"] + """.visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
qtc_full = pd.read_gbq(
    qtc_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
qtc_full.head(5)

In [5]:
# Get demographic information for individuals with QTc measurements and srWGS
# This query represents dataset "Schwartz score cohorts" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
# Selects person_id, birth_datetime and the sex-at-birth concept name for participants
# with has_whole_genome_variant = 1 who have at least one standard event for concept
# 3026258 or 46235174 in cb_search_all_events.
# NOTE(review): the next cell assigns `demo_sql` and `demo` again (all srWGS
# participants, without the event filter), so when the notebook is run top to bottom the
# frame loaded here is overwritten before anything reads it. Confirm whether this cell
# is still needed, or give its result a distinct name.
demo_sql = """
    SELECT
        person.person_id,
        person.birth_datetime,
        p_sex_at_birth_concept.concept_name as sex_at_birth
    FROM
    
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person  
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id 
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) 
            AND cb_search_person.person_id IN (SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                WHERE
                    (concept_id IN (3026258, 46235174) 
                    AND is_standard = 1 )) criteria ) )"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
demo = pd.read_gbq(
    demo_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
demo.head(5)

In [6]:
# Get demographic information for individuals with srWGS
# This query represents dataset "Schwartz score cohorts" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
# Selects person_id, birth_datetime and the sex-at-birth concept name for every
# participant with has_whole_genome_variant = 1 (no QTc event filter).
# The resulting `demo` frame is what the scoring cells start from
# (`schwartz_scores = demo.copy()`), so it determines which participants get a row.
demo_sql = """
    SELECT
        person.person_id,
        person.birth_datetime,
        p_sex_at_birth_concept.concept_name as sex_at_birth
    FROM
    
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person  
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id 
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) 
        )"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
demo = pd.read_gbq(
    demo_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
demo.head(5)

In [17]:
# Build (1) the per-(person, timestamp) QTc table and (2) the per-person
# clinical-history points of the Schwartz score.

# Generic corrected-QT readings; exact duplicates of the same reading are collapsed.
corrected = (
    qtc_full.loc[
        qtc_full.standard_concept_name == 'Q-T interval corrected',
        ['person_id', 'measurement_datetime', 'value_as_number']
    ]
    .drop_duplicates()
    .rename(columns={'value_as_number': 'QTc'})
)

# Corrected-QT readings explicitly based on the Bazett formula.
corrected_bazett = (
    qtc_full.loc[
        qtc_full.standard_concept_name == 'Q-T interval corrected based on Bazett formula',
        ['person_id', 'measurement_datetime', 'value_as_number']
    ]
    .rename(columns={'value_as_number': 'QTc_Bazett'})
)

# Outer merge keeps timestamps that have only one of the two reading types; the
# missing reading is NaN on those rows.
qtc_measure = (
    corrected
    .merge(corrected_bazett, on=['person_id', 'measurement_datetime'], how='outer')
    .sort_values(['person_id', 'measurement_datetime'])
    .drop_duplicates()
)

# Person ids with each clinical-history condition.
bradycardia = schwartz_conds[schwartz_conds.standard_concept_name.isin(['Bradycardia'])].person_id
tdp = schwartz_conds[schwartz_conds.standard_concept_name.isin(['Torsades de pointes'])].person_id
syncope = schwartz_conds[
    schwartz_conds.standard_concept_name.isin(
        ['Syncope and collapse', 'Syncope', 'Cardiac syncope', 'Stokes-Adams syncope']
    )
].person_id

# One row per participant in `demo`; each condition score is 0 unless the participant
# has the condition. The scores are computed straight from the boolean membership mask
# instead of filling an integer column with 0 and then writing 0.5 into it via .loc:
# that pattern relies on silent int -> float upcasting, which pandas >= 2.1 deprecates.
schwartz_scores = demo.copy()
schwartz_scores['bradycardia_score'] = schwartz_scores.person_id.isin(bradycardia) * 0.5
schwartz_scores['tdp_score'] = schwartz_scores.person_id.isin(tdp) * 2
schwartz_scores['syncope_score'] = schwartz_scores.person_id.isin(syncope) * 1

In [18]:
# Summary statistics of the merged QTc table, as a sanity check on the value ranges.
qtc_measure.describe()

In [19]:
# Calculate the QTc component of the Schwartz score for every measurement row.
#
# Points: QTc >= 480 ms -> 3, 460-479 ms -> 2, 450-459 ms (males only) -> 1, else 0.

# Attach sex assigned at birth. Any `sex_at_birth` column left over from a previous run
# of this cell is dropped first, so re-running does not create sex_at_birth_x / _y.
qtc_measure = (
    qtc_measure
    .drop(columns=['sex_at_birth'], errors='ignore')
    .merge(demo[['person_id', 'sex_at_birth']], on='person_id')
)

# Score each row on the worse (longer) of its two readings. max() skips NaN, so a row
# with only one reading uses that reading; a row with neither stays NaN and falls
# through to 'No criteria met' because every comparison with NaN is False.
worst_qtc = qtc_measure[['QTc', 'QTc_Bazett']].max(axis='columns')
is_male = qtc_measure.sex_at_birth == 'Male'

# The bands are half-open ([450, 460), [460, 480), [480, inf)) so fractional values
# such as 479.5 cannot fall between two bands, and they are assigned from the lowest
# band upward so a row can never end up in a lower band than its worst reading
# warrants.
qtc_measure['qtc_category'] = 'No criteria met'
qtc_measure.loc[
    (worst_qtc >= 450) & (worst_qtc < 460) & is_male,
    'qtc_category'
] = '450-459 ms (males only)'
qtc_measure.loc[
    (worst_qtc >= 460) & (worst_qtc < 480),
    'qtc_category'
] = '460-479 ms'
qtc_measure.loc[
    worst_qtc >= 480,
    'qtc_category'
] = '>= 480 ms'

qtc_measure['qtc_score'] = qtc_measure.qtc_category.map({
    '>= 480 ms': 3,
    '460-479 ms': 2,
    '450-459 ms (males only)': 1,
    'No criteria met': 0
})

In [20]:
# Attach every scored QTc measurement to its participant WITHOUT summarizing to one
# score per person first: the result has one row per participant per measurement
# timestamp, which is needed later to find the earliest qualifying timestamp.
# Participants without any measurement keep a single row with NaN in the merged columns.
#
# The drop/drop_duplicates steps make the cell safe to re-run: they strip the columns
# added by a previous run of this cell and of the scoring cell that follows, which
# collapses the frame back to one row per participant before merging again. On a first
# run they are no-ops (assuming `demo` holds one row per person_id).
schwartz_scores = (
    schwartz_scores
    .drop(
        columns=['qtc_score', 'measurement_datetime', 'schwartz_score', 'schwartz_category'],
        errors='ignore'
    )
    .drop_duplicates()
    .merge(
        qtc_measure[['person_id', 'qtc_score', 'measurement_datetime']],
        on='person_id',
        how='left'
    )
)

In [21]:
# Show the rows whose qtc_score is not NaN, i.e. rows that matched a QTc measurement in the left merge above.
schwartz_scores[~schwartz_scores['qtc_score'].isna()]

In [26]:
# Rows without a QTc measurement have NaN after the left merge; count them as 0 points.
schwartz_scores['qtc_score'] = schwartz_scores['qtc_score'].fillna(0)

# Only compute a Schwartz score if the QTc score is nonzero: rows with qtc_score > 0 get
# the sum of all four components, every other row keeps its qtc_score (0).
# NOTE(review): the published Schwartz criteria appear to treat torsades de pointes and
# syncope as mutually exclusive, whereas both are added here -- confirm this is intended.
schwartz_scores['schwartz_score'] = (
    schwartz_scores['bradycardia_score']
    + schwartz_scores['tdp_score']
    + schwartz_scores['syncope_score']
    + schwartz_scores['qtc_score']
).where(
    schwartz_scores['qtc_score'] > 0,
    schwartz_scores['qtc_score']
)

# Probability bands: <= 1 low, (1, 3] intermediate, >= 3.5 high.
schwartz_scores['schwartz_category'] = None
schwartz_scores.loc[
    schwartz_scores['schwartz_score'].le(1),
    'schwartz_category'
] = 'low probability'
schwartz_scores.loc[
    schwartz_scores['schwartz_score'].gt(1) & schwartz_scores['schwartz_score'].le(3),
    'schwartz_category'
] = 'intermediate probability'
schwartz_scores.loc[
    schwartz_scores['schwartz_score'].ge(3.5),
    'schwartz_category'
] = 'high probability'


In [45]:
# Collapse the per-timestamp scores to one row per participant:
#  - control: every row of the participant is 'low probability'
#  - case: at least one row is 'high probability'; the earliest such measurement
#    timestamp is kept as the event (diagnosis) time
schwartz_per_person = (
    schwartz_scores
    .assign(
        hp=lambda rows: rows['schwartz_category'].eq('high probability'),
        lp=lambda rows: rows['schwartz_category'].eq('low probability'),
        # Timestamp only on high-probability rows, NaT elsewhere, so that the
        # per-person minimum below is the first high-probability measurement.
        hp_time=lambda rows: rows['measurement_datetime'].where(rows['hp']),
    )
    .groupby('person_id')
    .agg(
        control=('lp', 'all'),
        case=('hp', 'any'),
        schwartz_dx_time=('hp_time', 'min'),
        birth_datetime=('birth_datetime', 'first'),
    )
)

# Merge with pool definition based on LQT-related condition codes

In [47]:
# This query represents dataset "Long QT syndrome case pool" for domain "condition" and was generated for All of Us Controlled Tier Dataset v8
# Returns (person_id, condition_start_datetime) for condition_occurrence rows whose
# condition_concept_id belongs to a standard, selectable cb_criteria entry whose `path`
# contains the criteria id of concept 314664.
# Participants are limited to those with has_whole_genome_variant = 1 who have at least
# one standard event for that same concept set in cb_search_all_events.
dataset_98227857_condition_sql = """
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_start_datetime 
    FROM
        ( SELECT
            * 
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                    WHERE
                        concept_id IN (314664)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                c_occurrence.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        person_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
                    WHERE
                        has_whole_genome_variant = 1 ) 
                    AND cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                                WHERE
                                    concept_id IN (314664)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) c_occurrence"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
lqt_condition_cases_df = pd.read_gbq(
    dataset_98227857_condition_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
lqt_condition_cases_df.head(5)

In [49]:
# This query represents dataset "Abnormal QT interval control pool" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
# Returns the person_id of every participant with has_whole_genome_variant = 1 who has
# NO standard event in cb_search_all_events for any standard, selectable cb_criteria
# entry whose `path` contains the criteria id of concept 4064627, 314664 or 4008859.
dataset_22441734_person_sql = """
    SELECT
        person.person_id 
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person   
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) 
            AND cb_search_person.person_id NOT IN (SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                WHERE
                    (concept_id IN(SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id       
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                        WHERE
                            concept_id IN (4064627, 314664, 4008859)       
                            AND full_text LIKE '%_rank1]%'      ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) 
                    AND is_standard = 1 )) criteria ) )"""

# Run the query against the workspace CDR. The BigQuery Storage API is only used when
# BIGQUERY_STORAGE_API_ENABLED is present in the environment.
lqt_condition_controls_df = pd.read_gbq(
    dataset_22441734_person_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

# Preview the first rows of the result.
lqt_condition_controls_df.head(5)

In [53]:
# Every participant in either condition-based pool must already have a row in the
# per-person Schwartz frame; otherwise the annotation below would silently drop them.
assert all(
    pool.person_id.isin(schwartz_per_person.index).all()
    for pool in (lqt_condition_cases_df, lqt_condition_controls_df)
)

# Add the condition-based pool membership flags and the earliest condition start time.
# The per-person minimum is a Series indexed by person_id, so it aligns on the frame's
# index and is NaT for participants without a qualifying condition.
lqt_cohorts = schwartz_per_person.assign(
    condition_based_case=lambda people: people.index.isin(lqt_condition_cases_df.person_id),
    condition_based_control=lambda people: people.index.isin(lqt_condition_controls_df.person_id),
    condition_dx_time=lqt_condition_cases_df.groupby('person_id')['condition_start_datetime'].min(),
)

lqt_cohorts

In [69]:
# Combined case and control pools.
#  - case: flagged by the Schwartz score OR by a condition code
#  - control: flagged as control by the Schwartz score AND by the condition-based pool
# Diagnosis times are converted to ages in years (days / 365.25) before taking the
# earlier of the two, because a row-wise min over datetime columns was found to be slow.
lqt_cohorts = lqt_cohorts.assign(
    combined_case=lambda people: people['case'] | people['condition_based_case'],
    combined_control=lambda people: people['control'] & people['condition_based_control'],
    schwartz_dx_age=lambda people: (
        (people['schwartz_dx_time'] - people['birth_datetime']).dt.days / 365.25
    ),
    condition_dx_age=lambda people: (
        (people['condition_dx_time'] - people['birth_datetime']).dt.days / 365.25
    ),
    # Earliest age at which either definition is met; NaN only when both are missing.
    condition_onset_age=lambda people: (
        people[['schwartz_dx_age', 'condition_dx_age']].min(axis=1)
    ),
)

In [75]:
# Preview the combined case pool: one row per case (indexed by person_id) with the onset age.
lqt_cohorts.loc[lqt_cohorts['combined_case'], ['condition_onset_age']]

In [76]:
# Write both pools to the workspace bucket as tab-separated files.

# Case pool: person_id (the index) plus the age at onset.
lqt_cohorts[lqt_cohorts['combined_case']][['condition_onset_age']].to_csv(
    f'{os.environ["WORKSPACE_BUCKET"]}/data_v1/long_qt_case_pool.tsv.gz',
    sep='\t',
)

# Control pool: no data columns are selected, so only the person_id index is written.
lqt_cohorts[lqt_cohorts['combined_control']][[]].to_csv(
    f'{os.environ["WORKSPACE_BUCKET"]}/data_v1/long_qt_control_pool.tsv.gz',
    sep='\t',
)