# Caseness cohort breakdown

The purpose of this notebook is to provide a breakdown of the count of patient records that met our definition for the caseness of complex mental health difficulties.

## Imports

In [9]:
# Get helper functions.
%run 'UNSEEN_helper_functions.ipynb'
# Refresh stored variables, if they are present.
%store -r

## Load requisites

In [10]:
# Set folder location.
folder_loc = os.path.dirname(os.path.abspath("UNSEEN create caseness array.ipynb"))
folder = folder_loc + '/codelists/'

## Load codelist CSV files.
We used [opencodelists.org](https://www.opencodelists.org) to define codelists that define the set of SNOMED-CT codes used to identify patients based on various attributes.

In [11]:
# Clinical codes of interest.
codes_to_query_mentalIllHealth = pandas.read_csv(folder + "mental_ill_health_codelist.txt", sep = '\t')
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
# ## Exclude bipolar and schizophrenia from the study population.
codes_to_query_mentalIllHealth = pandas.DataFrame(
    list(
        set(codes_to_query_mentalIllHealth["Id"]).difference(
            set(codes_to_query_bipolar["code"]).union(
                set(codes_to_query_schizophrenia["code"])
            )
        )
    )
    ,columns = ["Id"]
)

## ## Create codelist for the cases.
#codes_to_query_caseness = pandas.read_csv(folder + "ciaranmci-unseen-snomed-codes-to-identify-cmhd-0e6bb986.csv")
#codes_to_query_devAcademicDisorder = pandas.read_csv(folder + "ciaranmci-developmental-academic-disorder-755c4650.csv")
## ## Exclude Developmental Academic Disorder from the cases.
#codes_to_query_caseness = pandas.DataFrame(
#   list(
#        set(codes_to_query_caseness["code"]).difference( set(codes_to_query_devAcademicDisorder["code"]) )
#        )
#    ,columns = ["code"]
#)
codes_to_query_borderline = pandas.read_csv(folder + "ciaranmci-borderline-personality-disorder-1ed4af38.csv")
codes_to_query_chronicDepression = pandas.read_csv(folder + "ciaranmci-chronic-depression-53a65598.csv")
codes_to_query_chronicPTSD = pandas.read_csv(folder + "ciaranmci-chronic-post-traumatic-stress-disorder-3a96e263.csv")
codes_to_query_complexPTSD = pandas.read_csv(folder + "ciaranmci-complex-post-traumatic-stress-disorder-21876f2e.csv")
codes_to_query_dysthymia = pandas.read_csv(folder + "ciaranmci-dysthymia-6f6888c3.csv")
codes_to_query_personalityDisorder = pandas.read_csv(folder + "ciaranmci-personality-disorder-5c4cd31b.csv")

# Medications of interest.
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN_medications_psychosisAndRelated.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN_medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN_medications_antidepressants.csv")
#medications_to_query_all = pandas.read_csv(folder + "UNSEEN_medications_list.csv")

The script below is an edited version of the main script in `UNSESSN_create_caseness_variables.ipynb`. The main edit is that the `tbl_persons_with_caseness_codes` SQL Common Table Expression (CTE) is replaced by similar CTEs for each of the component diagnoses. I also replace `tbl_persons_with_medications` with similar CTEs for each of the component medications.

The list of component diagnoses are:
1. Borderline personality disorder
2. Chronic depression
3. Chronic posttraumatic stress disorder
4. Complex posttraumatic stress disorder
5. Dysthymia
6. Personality disorder

The list of component medications are:
1. Antidepressants
2. Hypnotics and anxiolytics
3. Medications associated with psychosis and related disorders

## Additional subqueries.

In [12]:
sql_caseness_components_codelist_CTEs = \
"""
,tbl_codes_borderlinePD AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_borderline["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_codes_chronicDepression AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicDepression["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_codes_chronicPTSD AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_codes_complexPTSD AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_complexPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_codes_dysthymia AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dysthymia["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_codes_personalityDisorder AS (
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_personalityDisorder["code"].tolist())) + """'
                ]) AS my_snomedcode
)


,tbl_medications_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_medications_hypnoticsAndAnxiolytics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_medications_psychosisAndRelated AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
"""
sql_caseness_components_CTEs = \
"""
,tbl_persons_with_borderlinePD_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS borderlinePD
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_borderlinePD
        ON tbl_srcode.snomedcode = tbl_codes_borderlinePD.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)
,tbl_persons_with_chronicDepression_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS chronicDepression
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_chronicDepression
        ON tbl_srcode.snomedcode = tbl_codes_chronicDepression.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)
,tbl_persons_with_chronicPTSD_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS chronicPTSD
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_chronicPTSD
        ON tbl_srcode.snomedcode = tbl_codes_chronicPTSD.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)
,tbl_persons_with_complexPTSD_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS complexPTSD
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_complexPTSD
        ON tbl_srcode.snomedcode = tbl_codes_complexPTSD.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)
,tbl_persons_with_dysthymia_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS dysthymia
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_dysthymia
        ON tbl_srcode.snomedcode = tbl_codes_dysthymia.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)
,tbl_persons_with_personalityDisorder_codes AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS personalityDisorder
    FROM
        tbl_persons_firstFilters
    # This join gets the diagnostic SNOMED-CT codes, and filters for 
    # the patients for which we have diagnostic codes because it is an
    # INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_persons_firstFilters.person_id = tbl_srcode.person_id
    # This join is filtering for patients with the diagnostic SNOMED-CT codes
    # of interest by using an INNER JOIN, which acts like an intersection in
    # set operations.
    JOIN 
        tbl_codes_personalityDisorder
        ON tbl_srcode.snomedcode = tbl_codes_personalityDisorder.my_snomedcode
    WHERE
        # This filters for diagnoses prior to the index date.
        tbl_srcode.dateevent < myIndexDate
)


,tbl_persons_with_antidepressants_meds AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS antidepressants
    FROM
        tbl_persons_firstFilters
    # This join is adding the medication table so that I can query medications.
    # It also, effectively, removes any patients without a prescription because
    # it is an INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        ON tbl_persons_firstFilters.person_id = tbl_srprimarycaremedication.person_id
    # This cross join conveniently creates all possible combinations of values of the
    # previous join result and `tbl_medications`. This sets up my interim result to 
    # easily do a row-wise comparison of the medications of interest with the variously-
    # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_medications_antidepressants
    WHERE
        # This filters for the medications of interest.
        REGEXP_CONTAINS(nameofmedication, tbl_medications_antidepressants.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) BETWEEN 0 AND """ + str(Rx_window_caseness) + """
)
,tbl_persons_with_hypnoticsAndAnxiolytics_meds AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS hypnoticsAndAnxiolytics
    FROM
        tbl_persons_firstFilters
    # This join is adding the medication table so that I can query medications.
    # It also, effectively, removes any patients without a prescription because
    # it is an INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        ON tbl_persons_firstFilters.person_id = tbl_srprimarycaremedication.person_id
    # This cross join conveniently creates all possible combinations of values of the
    # previous join result and `tbl_medications`. This sets up my interim result to 
    # easily do a row-wise comparison of the medications of interest with the variously-
    # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_medications_hypnoticsAndAnxiolytics
    WHERE
        # This filters for the medications of interest.
        REGEXP_CONTAINS(nameofmedication, tbl_medications_hypnoticsAndAnxiolytics.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) BETWEEN 0 AND """ + str(Rx_window) + """
)
,tbl_persons_with_psychosisAndRelated_meds AS (
    SELECT
        DISTINCT tbl_persons_firstFilters.person_id
        ,1 AS psychosisAndRelated
    FROM
        tbl_persons_firstFilters
    # This join is adding the medication table so that I can query medications.
    # It also, effectively, removes any patients without a prescription because
    # it is an INNER JOIN.
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        ON tbl_persons_firstFilters.person_id = tbl_srprimarycaremedication.person_id
    # This cross join conveniently creates all possible combinations of values of the
    # previous join result and `tbl_medications`. This sets up my interim result to 
    # easily do a row-wise comparison of the medications of interest with the variously-
    # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_medications_psychosisAndRelated
    WHERE
        # This filters for the medications of interest.
        REGEXP_CONTAINS(nameofmedication, tbl_medications_psychosisAndRelated.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) BETWEEN 0 AND """ + str(Rx_window) + """
)

,tbl_studyPopulation_casenessBreakdown AS (
    SELECT
        DISTINCT tbl_studyPopulation_no_caseness.person_id
        ,CASE WHEN borderlinePD IS NULL THEN 0 ELSE 1 END AS borderlinePD
        ,CASE WHEN chronicDepression IS NULL THEN 0 ELSE 1 END AS chronicDepression
        ,CASE WHEN chronicPTSD IS NULL THEN 0 ELSE 1 END AS chronicPTSD
        ,CASE WHEN complexPTSD IS NULL THEN 0 ELSE 1 END AS complexPTSD
        ,CASE WHEN dysthymia IS NULL THEN 0 ELSE 1 END AS dysthymia
        ,CASE WHEN personalityDisorder IS NULL THEN 0 ELSE 1 END AS personalityDisorder
        
        ,CASE WHEN antidepressants IS NULL THEN 0 ELSE 1 END AS antidepressants
        ,CASE WHEN hypnoticsAndAnxiolytics IS NULL THEN 0 ELSE 1 END AS hypnoticsAndAnxiolytics
        ,CASE WHEN psychosisAndRelated IS NULL THEN 0 ELSE 1 END AS psychosisAndRelated
    FROM
        tbl_studyPopulation_no_caseness
    LEFT JOIN tbl_persons_with_borderlinePD_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_borderlinePD_codes.person_id
    LEFT JOIN tbl_persons_with_chronicDepression_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_chronicDepression_codes.person_id
    LEFT JOIN tbl_persons_with_chronicPTSD_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_chronicPTSD_codes.person_id
    LEFT JOIN tbl_persons_with_complexPTSD_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_complexPTSD_codes.person_id
    LEFT JOIN tbl_persons_with_dysthymia_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_dysthymia_codes.person_id
    LEFT JOIN tbl_persons_with_personalityDisorder_codes ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_personalityDisorder_codes.person_id
    
    LEFT JOIN tbl_persons_with_antidepressants_meds ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_antidepressants_meds.person_id
    LEFT JOIN tbl_persons_with_hypnoticsAndAnxiolytics_meds ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_hypnoticsAndAnxiolytics_meds.person_id
    LEFT JOIN tbl_persons_with_psychosisAndRelated_meds ON tbl_studyPopulation_no_caseness.person_id = tbl_persons_with_psychosisAndRelated_meds.person_id
)
"""

## Final select.

In [13]:
sql_final_select =\
"""
SELECT * FROM tbl_studyPopulation_casenessBreakdown ORDER BY person_id
"""
caseness_breakdown_array = pandas.read_gbq(sql_declarations + sql_studyPopulation + sql_caseness_components_codelist_CTEs + sql_caseness_components_CTEs + sql_final_select)

## Calculate prevalence of components.

In [14]:
# Calculate base counts then redact and round.
counts_components = round(caseness_breakdown_array.iloc[:, ~caseness_breakdown_array.columns.isin(['person_id', 'count_of_diagnoses', 'count_of_medications'])].sum() / target_round) * target_round
percentage = round((counts_components / count_studyPopulation) * 100, 2 )
prevalence_per_thousand = round((counts_components / count_studyPopulation) * 1000, 2 )

caseness_breakdown_array['count_of_diagnoses'] = \
    pandas.DataFrame( caseness_breakdown_array.loc[:,
                                                   ~caseness_breakdown_array.columns.isin(['person_id', 'antidepressants', 'hypnoticsAndAnxiolytics',
                                                                                           'psychosisAndRelated', 'count_of_diagnoses', 'count_of_medications'])
                                                  ].sum(axis = 1), columns = ['count_of_diagnoses'] )
caseness_breakdown_array['count_of_medications'] = \
    pandas.DataFrame( caseness_breakdown_array.loc[:,
                                                   ['antidepressants', 'hypnoticsAndAnxiolytics', 'psychosisAndRelated']
                                                  ].sum(axis = 1), columns = ['count_of_medications'] )
overall_percentage_satisfying_caseness = \
    round(
        (
            round(
                len(
                    caseness_breakdown_array.loc[
                        (caseness_breakdown_array.count_of_diagnoses > 0) &
                        (caseness_breakdown_array.count_of_medications > 0)
                        ,:]
                ) / target_round
            ) * target_round
        ) / count_studyPopulation * 100
    , 1)
print(f'\033[1mNOTE: The overall percentage of patient records meeting our definition for caseness is {overall_percentage_satisfying_caseness}%\033[0m')    
    
display( pandas.DataFrame(data = {'counts' : counts_components, 'percentage' : percentage, 'prevalence_per_thousand' : prevalence_per_thousand} ) )

display( pandas.DataFrame(data = {'counts' : caseness_breakdown_array.iloc[:, caseness_breakdown_array.columns.isin(['count_of_diagnoses', 'count_of_medications'])].sum(),
                                  'percentage' : round(caseness_breakdown_array.iloc[:, caseness_breakdown_array.columns.isin(['count_of_diagnoses', 'count_of_medications'])].sum() / len(caseness_breakdown_array), 2),
                                  'prevalence_per_thousand' : round(caseness_breakdown_array.iloc[:, caseness_breakdown_array.columns.isin(['count_of_diagnoses', 'count_of_medications'])].sum() / len(caseness_breakdown_array) * 10, 2)
                                 } ) )

[1mNOTE: The overall percentage of patient records meeting our definition for caseness is 2.0%[0m


Unnamed: 0,counts,percentage,prevalence_per_thousand
borderlinePD,490.0,0.24,2.36
chronicDepression,1180.0,0.57,5.67
chronicPTSD,120.0,0.06,0.58
complexPTSD,0.0,0.0,0.0
dysthymia,520.0,0.25,2.5
personalityDisorder,3650.0,1.75,17.55
antidepressants,130520.0,62.75,627.47
hypnoticsAndAnxiolytics,55700.0,26.78,267.78
psychosisAndRelated,9740.0,4.68,46.82


Unnamed: 0,counts,percentage,prevalence_per_thousand
count_of_diagnoses,5962.0,0.03,0.29
count_of_medications,195951.0,0.94,9.42


### Percentage with each count of diagnoses

In [15]:
percentage_with_at_least_one_diagnosis = \
    round(
        (
            round(
                len(
                    caseness_breakdown_array.loc[
                        (caseness_breakdown_array.count_of_diagnoses > 0)
                    ]
                ) / target_round
            ) * target_round
        ) / count_studyPopulation * 100
    , 1)
print(f'\033[1mNOTE: The percentage of patient records with at least one diagnosis is {percentage_with_at_least_one_diagnosis}%\033[0m')

display(
    pandas.DataFrame(
        data = {
            'count_with_each_count_of_diagnoses' : round(caseness_breakdown_array.count_of_diagnoses.value_counts() / target_round) * target_round,
            'percentage_with_each_count_of_diagnoses' : round(round(caseness_breakdown_array.count_of_diagnoses.value_counts() / target_round) * target_round / count_studyPopulation * 100,2)
        }
    )
)

[1mNOTE: The percentage of patient records with at least one diagnosis is 2.5%[0m


Unnamed: 0,count_with_each_count_of_diagnoses,percentage_with_each_count_of_diagnoses
0.0,202720.0,97.46
1.0,4660.0,2.24
2.0,590.0,0.28
3.0,40.0,0.02
4.0,0.0,0.0


### Percentage with each count of medication (note that 0 is not the largest)

In [16]:
percentage_with_at_least_one_medication = \
    round(
        (
            round(
                len(
                    caseness_breakdown_array.loc[
                        (caseness_breakdown_array.count_of_medications > 0)
                    ]
                ) / target_round
            ) * target_round
        ) / count_studyPopulation * 100
    , 1)
print(f'\033[1mNOTE: The percentage of patient records with at least one diagnosis is {percentage_with_at_least_one_medication}%\033[0m')

display(
    pandas.DataFrame(
        data = {
            'count_with_each_count_of_medications' : round(caseness_breakdown_array.count_of_medications.value_counts() / target_round) * target_round,
            'percentage_with_each_count_of_medications' : round(round(caseness_breakdown_array.count_of_medications.value_counts() / target_round) * target_round / count_studyPopulation * 100,2)
        }
    )
)

[1mNOTE: The percentage of patient records with at least one diagnosis is 73.1%[0m


Unnamed: 0,count_with_each_count_of_medications,percentage_with_each_count_of_medications
1.0,112350.0,54.01
0.0,56010.0,26.93
2.0,35340.0,16.99
3.0,4310.0,2.07
