# Caseness array

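The purpose of this notebook is to produce the caseness array. The caseness array is a table with one row per patient, containing the patient ID and indicator columns showing whether the patient is clinically coded for complex mental health difficulties under each of our caseness definitions ('Possible', 'Definite', 'Multinomial', 'Possible-vs-Definite', and control).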

### Imports

In [1]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

### Prerequisites

In [2]:
# Set parameters for disclosivity adjustments.
redaction_threshold = 7
target_round = 10

# Set the database attributes.
global server_id
server_id = 'yhcr-prd-phm-bia-core'
global database_id
database_id = 'CB_FDM_PrimaryCare_V7'

# Instantiate BigQuery client.
client = bigquery.Client()

# Set folder location.
folder_loc = os.path.dirname(os.path.abspath("UNSEEN create caseness array.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_borderline = pandas.read_csv(folder + "ciaranmci-borderline-personality-disorder-1ed4af38.csv")
codes_to_query_chronicDepression = pandas.read_csv(folder + "ciaranmci-chronic-depression-53a65598.csv")
codes_to_query_chronicPTSD = pandas.read_csv(folder + "ciaranmci-chronic-post-traumatic-stress-disorder-3a96e263.csv")
codes_to_query_complexPTSD = pandas.read_csv(folder + "ciaranmci-complex-post-traumatic-stress-disorder-21876f2e.csv")
codes_to_query_dysthymia = pandas.read_csv(folder + "ciaranmci-dysthymia-6f6888c3.csv")
codes_to_query_personalityDisorder = pandas.read_csv(folder + "ciaranmci-personality-disorder-243a2f24.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
codes_to_query_all = pandas.read_csv(folder + "ciaranmci-unseen-snomed-codes-to-identify-cmhd-0b2abbef.csv")

# Medications of interest.
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN medications_psychosisAndRelated.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN medications_antidepressants.csv")
medications_to_query_all = pandas.read_csv(folder + "UNSEEN medications list.csv")

%store server_id database_id

Stored 'server_id' (str)
Stored 'database_id' (str)


## Creating the array

In [5]:
sql = """
WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
#  ## Bipolar disorder
,tbl_bipolar AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_bipolar["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_bipolar_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_bipolar
    WHERE
        snomedcode IN (tbl_bipolar.my_snomedcode)
)
#  ## Borderline personality disorder
,tbl_borderline AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_borderline["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_borderline_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_borderline
    WHERE
        snomedcode IN (tbl_borderline.my_snomedcode)
)
#  ## Chronic PTSD
,tbl_chronicPTSD AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_chronicPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_chronicPTSD
    WHERE
        snomedcode IN (tbl_chronicPTSD.my_snomedcode)
)
#  ## Complex PTSD
,tbl_complexPTSD AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_complexPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_complexPTSD
    WHERE
        snomedcode IN (tbl_complexPTSD.my_snomedcode)
)
#  ## Chronic depression
,tbl_chronicDepression AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicDepression["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_chronicDepression_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_chronicDepression
    WHERE
        snomedcode IN (tbl_chronicDepression.my_snomedcode)
)
#  ## Dysthymia
,tbl_dysthymia AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dysthymia["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_dysthymia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_dysthymia
    WHERE
        snomedcode IN (tbl_dysthymia.my_snomedcode)
)
#  ## Personality disorder
,tbl_personalityDisorder AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_personalityDisorder["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_personalityDisorder_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_personalityDisorder
    WHERE
        snomedcode IN (tbl_personalityDisorder.my_snomedcode)
)
#  ## Schizophrenia
,tbl_schizophrenia AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_schizophrenia["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_schizophrenia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_schizophrenia
    WHERE
        snomedcode IN (tbl_schizophrenia.my_snomedcode)
)


# The following CTEs extract each medication list into a SQL table before querying the person_ID 
# associated with the medications (combined into medication type).
#
#  ## Drugs used in psychosis and related disorders.
,tbl_meds_psychosisAndRelated AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_psychosisAndRelated_persons

AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_psychosisAndRelated
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_psychosisAndRelated.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(datemedicationstart AS DATE), MONTH) < 4
)
#  ## Hypnotics and anxiolyitcs
,tbl_meds_hypnoticsAndAnxiolytics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_hypnoticsAndAnxiolytics_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_hypnoticsAndAnxiolytics
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_hypnoticsAndAnxiolytics.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(datemedicationstart AS DATE), MONTH) < 4
)
#  ## Antidepressants
,tbl_meds_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_antidepressants_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_antidepressants
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_antidepressants.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(datemedicationstart AS DATE), MONTH) < 4
)


# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# clinical code and medication group. The code and medication columns are populated by interger
# values with '1' indicating that the code or medication is present in patient record and '0' indicating
# otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,CASE WHEN tbl_bipolar_persons.person_id IS NULL THEN 0 ELSE 1 END AS Bipolar # Bipolar is an exclusion for caseness.
    ,CASE WHEN tbl_borderline_persons.person_id IS NULL THEN 0 ELSE 1 END AS Borderline
    ,CASE WHEN tbl_chronicPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicPTSD
    ,CASE WHEN tbl_complexPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ComplexPTSD
    ,CASE WHEN tbl_chronicDepression_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicDepression
    ,CASE WHEN tbl_dysthymia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Dysthymia
    ,CASE WHEN tbl_personalityDisorder_persons.person_id IS NULL THEN 0 ELSE 1 END AS PersonalityDisorder
    ,CASE WHEN tbl_schizophrenia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Schizophrenia # Schizophrenia is an exclusion for caseness.
    ,CASE WHEN tbl_meds_psychosisAndRelated_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_PsychosisAndRelated
    ,CASE WHEN tbl_meds_hypnoticsAndAnxiolytics_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_hypnoticsAndAnxiolytics
    ,CASE WHEN tbl_meds_antidepressants_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_antidepressants
FROM tbl_persons
LEFT OUTER JOIN tbl_bipolar_persons ON tbl_persons.person_id = tbl_bipolar_persons.person_id
LEFT OUTER JOIN tbl_borderline_persons ON tbl_persons.person_id = tbl_borderline_persons.person_id
LEFT OUTER JOIN tbl_chronicPTSD_persons ON tbl_persons.person_id = tbl_chronicPTSD_persons.person_id
LEFT OUTER JOIN tbl_complexPTSD_persons ON tbl_persons.person_id = tbl_complexPTSD_persons.person_id
LEFT OUTER JOIN tbl_chronicDepression_persons ON tbl_persons.person_id = tbl_chronicDepression_persons.person_id
LEFT OUTER JOIN tbl_dysthymia_persons ON tbl_persons.person_id = tbl_dysthymia_persons.person_id
LEFT OUTER JOIN tbl_personalityDisorder_persons ON tbl_persons.person_id = tbl_personalityDisorder_persons.person_id
LEFT OUTER JOIN tbl_schizophrenia_persons ON tbl_persons.person_id = tbl_schizophrenia_persons.person_id
LEFT OUTER JOIN tbl_meds_psychosisAndRelated_persons ON tbl_persons.person_id = tbl_meds_psychosisAndRelated_persons.person_id
LEFT OUTER JOIN tbl_meds_hypnoticsAndAnxiolytics_persons ON tbl_persons.person_id = tbl_meds_hypnoticsAndAnxiolytics_persons.person_id
LEFT OUTER JOIN tbl_meds_antidepressants_persons ON tbl_persons.person_id = tbl_meds_antidepressants_persons.person_id
ORDER BY tbl_persons.person_id
"""

bqTable = client.query(sql).to_dataframe()

# Store bqTable for use in other notebaooks
%store bqTable

Stored 'bqTable' (DataFrame)


In [6]:
# Define CMHD-by-diagnosis as any patient with any of the diagnoses.
CMHD_dx_only = bqTable.loc[:, ~bqTable.columns.isin(['person_id',
                                                     'Meds_PsychosisAndRelated',
                                                     'Meds_hypnoticsAndAnxiolytics',
                                                     'Meds_antidepressants'])].max(axis = 1)
# Exclude patients with diagnoses for bipolar or schizophrenia.
CMHD_dx_only.loc[(bqTable['Bipolar'] == 1) | (bqTable['Schizophrenia'] == 1)] = 0
                 
# Define CMHD-by-prescription as any patient with any of the prescriptions.              
CMHD_rx_only = bqTable[['Meds_PsychosisAndRelated', 
                        'Meds_hypnoticsAndAnxiolytics',
                        'Meds_antidepressants']].max(axis = 1)
CMHD = []
for i_iter in range(len(CMHD_rx_only)):
    if (CMHD_dx_only[i_iter] == 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has a diagnostic code AND an active prescription,
        # then they are a definite CMHD = 2.
        CMHD.append(2)
    elif (CMHD_dx_only[i_iter] != 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has an active prescription but not a diagnostic code,
        # then they are a possible CMHD = 1.
        CMHD.append(1)
    else:
        # If the patient neither has a diagnostic code or an active prescriptions,
        # then they are definitely not CMHD = 0.
        CMHD.append(0)
        
caseness_array = \
    pandas.DataFrame(data = \
                     {"person_id" : bqTable['person_id'],
                      "CMHD_rx_not_dx" : [1 if i_row == 1 else 0 for i_row in CMHD],
                      "CMHD_dx_and_rx" : [1 if i_row == 2 else 0 for i_row in CMHD],
                      "CMHD"           : CMHD,
                      "CMHD_possdef"   : [1 if i_row == 2 else 0 if i_row == 1 else pandas.NA for i_row in CMHD],
                      "CMHD_control"   : [1 if i_row == 0 else 0 for i_row in CMHD]
                     }
                    )

# Clean up.
del(CMHD_dx_only, CMHD_rx_only)
# Make caseness_array available across notebooks.
%store caseness_array

Stored 'caseness_array' (DataFrame)


In [14]:
# Display the prevalences of the casenesss variables, and their upper and lower bounds. 
#
# ## Set the multiplicative factor that defines the acceptable prevalence range, based on
# ## our attempt to minimise the count of candidate feature sets by considering normalised
# ##mutual information.
prev_range_LB = 0.7
prev_range_UB = 1.4

# Calculate prevalences. Redact and round.
counts = list(caseness_array.loc[:, ~caseness_array.columns.isin(['person_id', 'CMHD_possdef', 'CMHD_control'])].astype(bool).sum())
counts = numpy.append(counts, caseness_array.loc[:, 'CMHD_possdef'].sum() )
counts_LB = (counts * prev_range_LB).astype(int).tolist()
counts_UB = (counts * prev_range_UB).astype(int).tolist()
ls_counts = [counts.tolist(), counts_LB, counts_UB]
ls_counts = \
    [[numpy.nan if j <= redaction_threshold else round(j / target_round) * target_round for j in i] for i in ls_counts]
denominator_as_int = round( len(caseness_array) / target_round ) * target_round
possdef_denominator_as_int = round( len( caseness_array[~caseness_array['CMHD_possdef'].isna()] ) / target_round ) * target_round
ls_prevalences = copy.deepcopy(ls_counts)
for i in range(len(ls_prevalences)):
    for j in range(len(ls_prevalences[i])-1):
        ls_prevalences[i][j] = ls_prevalences[i][j] / denominator_as_int * 100
    ls_prevalences[i][-1] = ls_prevalences[i][-1] / possdef_denominator_as_int * 100
# Display table.
display(
    Markdown(
f"""
## Prevalence of caseness (per hundred)

The table below shows the counts and prevalence values for the 'Possible caseness', 'Definite caseness',
'Multinomial caseness', and 'Possible-vs-Definite caseness' variables, after applying rounding and redaction
rules (i.e. counts  $\le7$ are redacted before remaining values are rounded to the nearest $10$). The upper
and lower bounds are thresholds informed by our simulation study to ensure feature sets have at least $80%$
normalised mutual information with the caseness variables, in the best-case scenario. [Further details in
associated publication]


| Caseness cohort          |                             Count                              |                           Lower bound                          |                          Upper bound                           |
| ------------------------ | -------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
| Possible                 | {int(ls_counts[0][0]):,}  ({round(ls_prevalences[0][0],3):,}%) | {int(ls_counts[1][0]):,}  ({round(ls_prevalences[1][0],3):,}%) | {int(ls_counts[2][0]):,}  ({round(ls_prevalences[2][0],3):,}%) |
| Definite                 | {int(ls_counts[0][1]):,}  ({round(ls_prevalences[0][1],3):,}%) | {int(ls_counts[1][1]):,}  ({round(ls_prevalences[1][1],3):,}%) | {int(ls_counts[2][1]):,}  ({round(ls_prevalences[2][1],3):,}%) |
| Multinomial              | {int(ls_counts[0][2]):,}  ({round(ls_prevalences[0][2],3):,}%) | {int(ls_counts[1][2]):,}  ({round(ls_prevalences[1][2],3):,}%) | {int(ls_counts[2][2]):,}  ({round(ls_prevalences[2][2],3):,}%) |
| Possible-vs-Definite     | {int(ls_counts[0][3]):,}  ({round(ls_prevalences[0][3],3):,}%) | {int(ls_counts[1][3]):,}  ({round(ls_prevalences[1][3],3):,}%) | {int(ls_counts[2][3]):,}  ({round(ls_prevalences[2][3],3):,}%) |


These data mean that:
- for the 'Possible caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][0]):,}$
patients' records and no more than ${int(ls_counts[2][0]):,}$ patients' records.
- for the 'Definite caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][1]):,}$
patients' records and no more than ${int(ls_counts[2][1]):,}$ patients' records.
- for the 'Multinomial caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][2]):,}$
patients' records and no more than ${int(ls_counts[2][2]):,}$ patients' records.
- for the 'Possible-vs-Definite caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][3]):,}$
patients' records and no more than ${int(ls_counts[2][3]):,}$ patients' records.
"""
    )
)

# Define and store variables for future reference.
possibleCaseness_count = ls_counts[0][0]
possibleCaseness_count_LB = ls_counts[1][0]
possibleCaseness_count_UB = ls_counts[2][0]
definiteCaseness_count = ls_counts[0][1]
definiteCaseness_count_LB = ls_counts[1][1]
definiteCaseness_count_UB = ls_counts[2][1]
multinomialCaseness_count = ls_counts[0][2]
multinomialCaseness_count_LB = ls_counts[1][2]
multinomialCaseness_count_UB = ls_counts[2][2]
possdefCaseness_count = ls_counts[0][3]
possdefCaseness_count_LB = ls_counts[1][3]
possdefCaseness_count_UB = ls_counts[2][3]
possibleCaseness_prevalence = ls_prevalences[0][0]
possibleCaseness_prevalence_LB = ls_prevalences[1][0]
possibleCaseness_prevalence_UB = ls_prevalences[2][0]
definiteCaseness_prevalence = ls_prevalences[0][1]
definiteCaseness_prevalence_LB = ls_prevalences[1][1]
definiteCaseness_prevalence_UB = ls_prevalences[2][1]
multinomialCaseness_prevalence = ls_prevalences[0][2]
multinomialCaseness_prevalence_LB = ls_prevalences[1][2]
multinomialCaseness_prevalence_UB = ls_prevalences[2][2]
possdefCaseness_prevalence = ls_prevalences[0][3]
possdefCaseness_prevalence_LB = ls_prevalences[1][3]
possdefCaseness_prevalence_UB = ls_prevalences[2][3]

%store denominator_as_int possdef_denominator_as_int \
    possibleCaseness_count possibleCaseness_count_LB possibleCaseness_count_UB \
    definiteCaseness_count definiteCaseness_count_LB definiteCaseness_count_UB \
    multinomialCaseness_count multinomialCaseness_count_LB multinomialCaseness_count_UB \
    possdefCaseness_count possdefCaseness_count_LB possdefCaseness_count_UB\
    possibleCaseness_prevalence possibleCaseness_prevalence_LB possibleCaseness_prevalence_UB \
    definiteCaseness_prevalence definiteCaseness_prevalence_LB definiteCaseness_prevalence_UB \
    multinomialCaseness_prevalence multinomialCaseness_prevalence_LB multinomialCaseness_prevalence_UB \
    possdefCaseness_prevalence possdefCaseness_prevalence_LB possdefCaseness_prevalence_UB

# Clean up.
del(ls_counts, ls_prevalences)


## Prevalence of caseness (per hundred)

The table below shows the counts and prevalence values for the 'Possible caseness', 'Definite caseness',
'Multinomial caseness', and 'Possible-vs-Definite caseness' variables, after applying rounding and redaction
rules (i.e. counts  $\le7$ are redacted before remaining values are rounded to the nearest $10$). The upper
and lower bounds are thresholds informed by our simulation study to ensure feature sets have at least $80%$
normalised mutual information with the caseness variables, in the best-case scenario. [Further details in
associated publication]


| Caseness cohort          |                             Count                              |                           Lower bound                          |                          Upper bound                           |
| ------------------------ | -------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
| Possible                 | 27,070  (3.685%) | 18,950  (2.58%) | 37,890  (5.158%) |
| Definite                 | 1,100  (0.15%) | 770  (0.105%) | 1,530  (0.208%) |
| Multinomial              | 28,160  (3.834%) | 19,710  (2.683%) | 39,430  (5.368%) |
| Possible-vs-Definite     | 1,100  (3.906%) | 770  (2.734%) | 1,530  (5.433%) |


These data mean that:
- for the 'Possible caseness' variable, we will only consider feature sets that are present in at least $18,950$
patients' records and no more than $37,890$ patients' records.
- for the 'Definite caseness' variable, we will only consider feature sets that are present in at least $770$
patients' records and no more than $1,530$ patients' records.
- for the 'Multinomial caseness' variable, we will only consider feature sets that are present in at least $19,710$
patients' records and no more than $39,430$ patients' records.
- for the 'Possible-vs-Definite caseness' variable, we will only consider feature sets that are present in at least $770$
patients' records and no more than $1,530$ patients' records.


Stored 'denominator_as_int' (int)
Stored 'possdef_denominator_as_int' (int)
Stored 'possibleCaseness_count' (int)
Stored 'possibleCaseness_count_LB' (int)
Stored 'possibleCaseness_count_UB' (int)
Stored 'definiteCaseness_count' (int)
Stored 'definiteCaseness_count_LB' (int)
Stored 'definiteCaseness_count_UB' (int)
Stored 'multinomialCaseness_count' (int)
Stored 'multinomialCaseness_count_LB' (int)
Stored 'multinomialCaseness_count_UB' (int)
Stored 'possdefCaseness_count' (int)
Stored 'possdefCaseness_count_LB' (int)
Stored 'possdefCaseness_count_UB' (int)
Stored 'possibleCaseness_prevalence' (float)
Stored 'possibleCaseness_prevalence_LB' (float)
Stored 'possibleCaseness_prevalence_UB' (float)
Stored 'definiteCaseness_prevalence' (float)
Stored 'definiteCaseness_prevalence_LB' (float)
Stored 'definiteCaseness_prevalence_UB' (float)
Stored 'multinomialCaseness_prevalence' (float)
Stored 'multinomialCaseness_prevalence_LB' (float)
Stored 'multinomialCaseness_prevalence_UB' (float)
Stored

In [8]:
# Prepare header and note for presentation. The thresholds are taken from the
# disclosure parameters so the text cannot drift from what the code applies, and
# the backslash is doubled because this is not a raw string.
display(
    Markdown(
f"""
## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\\le {redaction_threshold}$ are redacted before remaining values are rounded to the nearest ${target_round}$.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to {date.today().strftime('%d-%b-%Y')}.
"""
       )
)

# Define columns to ignore: the ID, and the two diagnoses that are exclusions.
cols_to_ignore = ['person_id', 'Bipolar', 'Schizophrenia']

# Count the patients flagged for each remaining component.
numerator = bqTable.loc[:, ~bqTable.columns.isin(cols_to_ignore)].sum()

# Define the denominator: the same rounded total for every component. Its length
# is taken from the numerator so the two always line up.
denominator = \
    numpy.repeat(denominator_as_int, len(numerator), axis = 0)

# Define the prevalence dataframe for calculating the prevalence of caseness components.
df_prevalence = \
    pandas.DataFrame(data = {'numerator'   : numerator,
                             'denominator' : denominator})
# Redact low counts (values at or below the threshold become missing).
df_prevalence = \
    df_prevalence.mask(df_prevalence <= redaction_threshold)

# Round to nearest target_round value.
df_prevalence = \
    (df_prevalence / target_round).round() * target_round

# Calculate proportions.
df_prevalence['proportion (n/N)'] = \
    (df_prevalence['numerator'] / df_prevalence['denominator']).round(3)

# Calculate prevalence.
df_prevalence['prevalence (%)'] = \
    ((df_prevalence['numerator'] / df_prevalence['denominator']) * 100).round(3)

# Print prevalence table.
display(df_prevalence)


## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest $10$.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to 18-Apr-2023.


Unnamed: 0,numerator,denominator,proportion (n/N),prevalence (%)
Borderline,520.0,734530.0,0.001,0.071
ChronicPTSD,120.0,734530.0,0.0,0.016
ComplexPTSD,120.0,734530.0,0.0,0.016
ChronicDepression,1220.0,734530.0,0.002,0.166
Dysthymia,550.0,734530.0,0.001,0.075
PersonalityDisorder,3840.0,734530.0,0.005,0.523
Meds_PsychosisAndRelated,2710.0,734530.0,0.004,0.369
Meds_hypnoticsAndAnxiolytics,2010.0,734530.0,0.003,0.274
Meds_antidepressants,25920.0,734530.0,0.035,3.529


In [9]:
# Calculate the minimum and maximum counts and proportions of any criteria diagnoses
# and criterion medications, and use these to define the 'Definite caseness'
# and 'Possible caseness' counts and prevalence thresholds.
#
# The component names are looked up within their own group only. Counts are
# rounded, so a diagnosis and a medication can share a value; matching against
# the whole table would then put a diagnosis in a medication route (or vice versa).
medication_components = ['Meds_PsychosisAndRelated',
                         'Meds_hypnoticsAndAnxiolytics',
                         'Meds_antidepressants']
is_medication = df_prevalence.index.isin(medication_components)
medication_counts = df_prevalence.loc[is_medication, 'numerator']
diagnosis_counts = df_prevalence.loc[~is_medication, 'numerator']


def _count_dx_and_rx(diagnosis_names, medication_names):
    """Count patients flagged for any of `diagnosis_names` and any of `medication_names`.

    NOTE(review): this is a raw count from bqTable, not redacted or rounded -
    confirm that the percentages derived from it meet the disclosure rules.
    """
    any_dx = bqTable.loc[:, bqTable.columns.isin(diagnosis_names)].max(axis = 1)
    any_rx = bqTable.loc[:, bqTable.columns.isin(medication_names)].max(axis = 1)
    return int(((any_dx + any_rx) == 2).sum())


def _as_percent(proportion):
    """Return the proportion per hundred to 3 d.p., or '<0.001' when it is too small."""
    return round(proportion * 100, 3) if proportion > 0.00001 else '<0.001'


# ## Medication prescriptions.
least_likely_criterion_count_possibleCaseness = medication_counts.min()
name_of_least_likely_criterion_count_possibleCaseness = \
    list(medication_counts[medication_counts == least_likely_criterion_count_possibleCaseness].index)
least_likely_criterion_prop_possibleCaseness = \
    least_likely_criterion_count_possibleCaseness / denominator_as_int
most_likely_criterion_count_possibleCaseness = medication_counts.max()
name_of_most_likely_criterion_count_possibleCaseness = \
    list(medication_counts[medication_counts == most_likely_criterion_count_possibleCaseness].index)
most_likely_criterion_prop_possibleCaseness = \
    most_likely_criterion_count_possibleCaseness / denominator_as_int

# ## Diagnoses.
least_likely_criterion_count_diag = diagnosis_counts.min()
name_of_least_likely_criterion_count_diag = \
    list(diagnosis_counts[diagnosis_counts == least_likely_criterion_count_diag].index)
least_likely_criterion_count_definiteCaseness = \
    _count_dx_and_rx(name_of_least_likely_criterion_count_diag,
                     name_of_least_likely_criterion_count_possibleCaseness)
least_likely_criterion_prop_definiteCaseness = \
    least_likely_criterion_count_definiteCaseness / denominator_as_int

most_likely_criterion_count_diag = diagnosis_counts.max()
name_of_most_likely_criterion_count_diag = \
    list(diagnosis_counts[diagnosis_counts == most_likely_criterion_count_diag].index)
most_likely_criterion_count_definiteCaseness = \
    _count_dx_and_rx(name_of_most_likely_criterion_count_diag,
                     name_of_most_likely_criterion_count_possibleCaseness)
most_likely_criterion_prop_definiteCaseness = \
    most_likely_criterion_count_definiteCaseness / denominator_as_int

# Display a summary of the least-likely and most-likely routes to caseness.
# Backslashes are doubled because this is not a raw string.
display(
    Markdown(
f"""
## Count and prevalence bounds if components are prioritised

In the section entitled `Prevalence of caseness (per hundred)`, we considered caseness to be an irreducible concept. On the contrary, patients
can satisfy our definition of caseness of complex mental health difficulties by having 1) a record of any component diagnoses, and 2) a record of any
component medications recently prescribed. Therefore, patients can satisfy our caseness criteria by a range of routes:

- The probability of satisfying 'Possible caseness' by the least-likely component route is ${_as_percent(least_likely_criterion_prop_possibleCaseness)}\\%$.
This least-likely route is via __{name_of_least_likely_criterion_count_possibleCaseness}__.
- The probability of satisfying 'Possible caseness' by the most-likely component route is ${_as_percent(most_likely_criterion_prop_possibleCaseness)}\\%$.
This most-likely route is via __{name_of_most_likely_criterion_count_possibleCaseness}__.

- The probability of satisfying 'Definite caseness' by the least-likely component route is ${_as_percent(least_likely_criterion_prop_definiteCaseness)}\\%$.
This least-likely route is via __{name_of_least_likely_criterion_count_diag}__ + __{name_of_least_likely_criterion_count_possibleCaseness}__.
- The probability of satisfying 'Definite caseness' by the most-likely component route is ${_as_percent(most_likely_criterion_prop_definiteCaseness)}\\%$.
This most-likely route is via __{name_of_most_likely_criterion_count_diag}__ + __{name_of_most_likely_criterion_count_possibleCaseness}__.\n


So, _What is the probability of satisfying our definition of the caseness?_ If we use data from `Prevalence of caseness (per hundred)`, then our answer is
${round(possibleCaseness_prevalence, 3)}\\%$ for 'Possible caseness', ${round(definiteCaseness_prevalence, 3)}\\%$ for 'Definite caseness', and
${round(multinomialCaseness_prevalence, 3)}\\%$ for 'Multinomial caseness'. The bullet points above show that we get different answers if we use data about the components.
"""
    )
)


## Count and prevalence bounds if components are prioritised

In the section entitled `Prevalence of caseness (per hundred)`, we considered caseness to be an irreducible concept. On the contrary, patients
can satisfy our definition of caseness of complex mental health difficulties by having 1) a record of any component diagnoses, and 2) a record of any
component medications recently prescribed. Therefore, patients can satisfy our caseness criteria by a range of routes:

- The probability of satisfying 'Possible caseness' by the least-likely component route is $0.274\%$.
This least-likely route is via __['Meds_hypnoticsAndAnxiolytics']__.
- The probability of satisfying 'Possible caseness' by the most-likely component route is $3.529\%$.
This most-likely route is via __['Meds_antidepressants']__.

- The probability of satisfying 'Definite caseness' by the least-likely component route is $<0.001\%$.
This least-likely route is via __['ChronicPTSD', 'ComplexPTSD']__ + __['Meds_hypnoticsAndAnxiolytics']__.
- The probability of satisfying 'Definite caseness' by the most-likely component route is $0.099\%$.
This most-likely route is via __['PersonalityDisorder']__ + __['Meds_antidepressants']__.



So, _What is the probability of satisfying our definition of the caseness?_ If we use data from `Prevalence of caseness (per hundred)`, then our answer is
$3.685\%$ for 'Possible caseness', $0.15\%$ for 'Definite caseness', and
$3.834\%$ for 'Multinomial caseness'. The bullet points above show that we get different answers if we use data about the components.


## Calculating the entropy of the caseness

In [10]:
print("\n \'Possible caseness\' variable...")
entropy_caseness_scaled_possible = entropy_output(caseness_array['CMHD_rx_not_dx'])[1]
print("\n \'Definite caseness\' variable...")
entropy_caseness_scaled_definite = entropy_output(caseness_array['CMHD_dx_and_rx'])[1]
print("\n \'Multinomial caseness\' variable...")
entropy_caseness_scaled_multi = entropy_output(caseness_array['CMHD'])[1]
print("\n \'Possible-vs-Definite caseness\' variable...")
entropy_caseness_scaled_possdef = entropy_output(caseness_array['CMHD_possdef'])[1]
print("\n \'No caseness\' variable...")
entropy_caseness_scaled_control = entropy_output(caseness_array['CMHD_control'])[1]

# Make variables available across notebooks.
%store entropy_caseness_scaled_multi entropy_caseness_scaled_definite \
        entropy_caseness_scaled_possible entropy_caseness_scaled_possdef entropy_caseness_scaled_control


 'Possible caseness' variable...
	 Caseness variable entropy = 0.158 nats
	 Caseness variable scaled entropy = 22.766 %

 'Definite caseness' variable...
	 Caseness variable entropy = 0.011 nats
	 Caseness variable scaled entropy = 1.616 %

 'Multinomial caseness' variable...
	 Caseness variable entropy = 0.169 nats
	 Caseness variable scaled entropy = 24.373 %

 'Possible-vs-Definite caseness' variable...
	 Caseness variable entropy = 0.164 nats
	 Caseness variable scaled entropy = 23.73 %

 'No caseness' variable...
	 Caseness variable entropy = 0.163 nats
	 Caseness variable scaled entropy = 23.464 %
Stored 'entropy_caseness_scaled_multi' (float64)
Stored 'entropy_caseness_scaled_definite' (float64)
Stored 'entropy_caseness_scaled_possible' (float64)
Stored 'entropy_caseness_scaled_possdef' (float64)
Stored 'entropy_caseness_scaled_control' (float64)


## Calculating hit rates

In [11]:
def _hit_rates_with_banner(banner, column_name):
    """
    Print `banner`, then run `hitrate_output()` on one column of `caseness_array`.

    Returns the pair that `hitrate_output()` yields, in the same order. This notebook
    names the first element the "none" hit rate and the second the "all" hit rate.
    """
    print(banner)
    hit_rate_none, hit_rate_all = hitrate_output(caseness_array[column_name])
    return hit_rate_none, hit_rate_all


# One call per caseness variable, in the order the results should be reported.
hitRate_none_possible, hitRate_all_possible = _hit_rates_with_banner(
    "\n \'Possible caseness\' variable...", 'CMHD_rx_not_dx')
hitRate_none_definite, hitRate_all_definite = _hit_rates_with_banner(
    "\n \'Definite caseness\' variable...", 'CMHD_dx_and_rx')
hitRate_none_multi, hitRate_all_multi = _hit_rates_with_banner(
    "\n \'Multinomial caseness\' variable...", 'CMHD')
hitRate_none_possdef, hitRate_all_possdef = _hit_rates_with_banner(
    "\n \'Possible-vs-Definite caseness\' variable...", 'CMHD_possdef')


 'Possible caseness' variable...
	 Hit rate (all) = 3.685 %
	 Hit rate (none) = 96.315 %
	 Odds (No CMHD : CMHD) = 26 -times less likely to have CMHD than to have it.

 'Definite caseness' variable...
	 Hit rate (all) = 0.149 %
	 Hit rate (none) = 99.851 %
	 Odds (No CMHD : CMHD) = 669 -times less likely to have CMHD than to have it.

 'Multinomial caseness' variable...
	 Hit rate (all) = 3.834 %
	 Hit rate (none) = 96.166 %
	 Odds (No CMHD : CMHD) = 25 -times less likely to have CMHD than to have it.

 'Possible-vs-Definite caseness' variable...
	 Hit rate (all) = 3.892 %
	 Hit rate (none) = 96.108 %
	 Odds (No CMHD : CMHD) = 24 -times less likely to have CMHD than to have it.


In [12]:
# Headline figures quoted in the narrative below. Each is computed once so that every mention
# of the same figure is guaranteed to agree.
# Only the 'Multinomial', 'Definite' and 'Possible' caseness variables contribute to these bounds.
#
# Largest scaled entropy (percent), rounded to 3 decimal places.
entropy_caseness_scaled_max = round(
    max(entropy_caseness_scaled_multi, entropy_caseness_scaled_definite, entropy_caseness_scaled_possible), 3)
# Smallest "assume no one has caseness" hit rate (percent), rounded to 2 decimal places.
hitRate_none_min = round(
    min(hitRate_none_multi, hitRate_none_definite, hitRate_none_possible), 2)

# The LaTeX backslashes are doubled (`\\le`, `\\ge`, `\\%`) because `\l`, `\g` and `\%` are not valid
# Python escape sequences; leaving them single triggers an invalid-escape-sequence warning.
# The rendered Markdown is unchanged.
display(
    Markdown(
f"""    
We now know that:
1. based on the scaled entropies, our variables for indicating caseness of complex mental health difficulties are $\\le{entropy_caseness_scaled_max}\\%$ as uncertain/surprising/unforeseeable
as they could possibly be; _and_
2. we would correctly classify $\\ge{hitRate_none_min}\\%$ of patients in this sample if we simply assumed that no one has complex mental health difficulties (depending on which caseness variable is considered).

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. The second point defines a benchmark for the
indicative performance of any feature set that we evaluate in our study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has complex mental
health difficulties must correctly identify $\\ge{hitRate_none_min}\\%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't). This is such a high benchmark that we will be very unlikely to find such a
feature set.

We must remember that we are not trying to out-predict an identification rule based on caseness prevalence. Rather, we are trying to find feature sets that correlate with this
caseness prevalence. Large correlations would be difficult to find using variance-based methods like Pearson's product moment correlation or regression methods because the variance of the
caseness variable is so low. Our approach based on mutual-information is better suited to this situation because its fundamental concept is coincidence rather than covariance.
"""
        )
)

    
We now know that:
1. based on the scaled entropies, our variables for indicating caseness of complex mental health difficulties are $\le24.373\%$ as uncertain/surprising/unforeseeable
as they could possibly be; _and_
2. we would correctly classify $\ge96.17\%$ of patients in this sample if we simply assumed that no one has complex mental health difficulties (depending on which caseness variable is considered).

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. The second point defines a benchmark for the
indicative performance of any feature set that we evaluate in our study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has complex mental
health difficulties must correctly identify $\ge96.17\%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't). This is such a high benchmark that we will be very unlikely to find such a
feature set.

We must remember that we are not trying to out-predict an identification rule based on caseness prevalence. Rather, we are trying to find feature sets that correlate with this
caseness prevalence. Large correlations would be difficult to find using variance-based methods like Pearson's product moment correlation or regression methods because the variance of the
caseness variable is so low. Our approach based on mutual-information is better suited to this situation because its fundamental concept is coincidence rather than covariance.


In [13]:
# Below, I compute the cells of the contingency table for a rule that says no one has caseness of complex mental health difficulties.
#
# True positives. Zero because the rule never labels anyone as having caseness, so no "positives" of any kind exist.
tp = 0
# False positives. Zero for the same reason: the rule makes no positive calls at all.
fp = 0


def _no_caseness_rule_stats(hit_rate_none, hit_rate_all, denominator, tp, fp):
    """
    Evaluate the "no one has caseness" rule for one caseness variable.

    Parameters
    ----------
    hit_rate_none : percentage (0-100) of patients the rule classifies correctly,
        i.e. the "Hit rate (none)" reported for this caseness variable.
    hit_rate_all : percentage (0-100) of patients the rule misses,
        i.e. the "Hit rate (all)" reported for this caseness variable.
    denominator : number that the percentages are scaled by to obtain counts.
        NOTE(review): presumably the number of patients the hit rates were computed over - confirm.
    tp, fp : true-positive and false-positive counts for the rule.

    Returns
    -------
    (tn, fn, cba, odds_ratio, npv) where
        tn, fn     : true-negative and false-negative counts;
        cba        : class balance accuracy, rounded to 2 decimal places;
        odds_ratio : odds ratio rounded to 2 decimal places, or an explanatory
                     string when either cross-product is zero;
        npv        : negative predictive value rounded to 2 decimal places
                     (0 when there are no negative calls).
    """
    # True negatives: patients the rule correctly says do not have caseness.
    tn = hit_rate_none / 100 * denominator
    # False negatives: patients the rule wrongly says do not have caseness.
    fn = hit_rate_all / 100 * denominator

    # Class balance accuracy: for each class, the correct count divided by the larger of the
    # "actual" and "predicted" totals for that class, averaged over the two classes.
    cba = round( 0.5 * ( (tp / max( (tp + fn), (tp + fp) ) ) + (tn / max( (tn + fp), (tn + fn) ) ) ), 2)

    # Odds ratio. Not reported as a number when either cross-product is zero, which also
    # avoids dividing by zero.
    if min( (tp * tn) , (fp * fn) ) == 0:
        odds_ratio = 'Not a number because one of the odds is zero.'
    else:
        odds_ratio = round( (tp * tn) / (fp * fn), 2)

    # Negative predictive value.
    npv = 0 if (tn + fn) == 0 else round( tn / (tn + fn), 2)

    return tn, fn, cba, odds_ratio, npv


# Below, I compute the evaluation statistics for each caseness variable.
# The 'Possible-vs-Definite' variable is scaled by its own denominator.
tn_possible, fn_possible, cba_possible, OR_possible, npv_possible = _no_caseness_rule_stats(
    hitRate_none_possible, hitRate_all_possible, denominator_as_int, tp, fp)
tn_definite, fn_definite, cba_definite, OR_definite, npv_definite = _no_caseness_rule_stats(
    hitRate_none_definite, hitRate_all_definite, denominator_as_int, tp, fp)
tn_multi, fn_multi, cba_multi, OR_multi, npv_multi = _no_caseness_rule_stats(
    hitRate_none_multi, hitRate_all_multi, denominator_as_int, tp, fp)
tn_possdef, fn_possdef, cba_possdef, OR_possdef, npv_possdef = _no_caseness_rule_stats(
    hitRate_none_possdef, hitRate_all_possdef, possdef_denominator_as_int, tp, fp)

# Positive predictive value. Shared by every caseness variable because it depends only on tp and fp;
# reported as 0 when the rule makes no positive calls.
ppv = 0 if (tp + fp) == 0 else round( tp / (tp + fp), 2)

display(
    Markdown(
f"""    
Assuming a rule that says no one demonstrates caseness of complex mental health difficulties, we get the following approximate values for our evaluation statistics:

| Caseness             | Normalised mutual information  | Class balance accuracy |    Odds ratio    | Positive predictive value | Negative predictive value |
| -------------------- | ------------------------------ | ---------------------- | ---------------- | ------------------------- | ------------------------- |
| Possible             | x \u2192 0                     | {cba_possible}         | {OR_possible}    | {ppv}                     | {npv_possible}            |
| Definite             | x \u2192 0                     | {cba_definite}         | {OR_definite}    | {ppv}                     | {npv_definite}            |
| Multinomial          | x \u2192 0                     | {cba_multi}            | {OR_multi}       | {ppv}                     | {npv_multi}               |
| Possible-vs-Definite | x \u2192 0                     | {cba_possdef}          | {OR_possdef}     | {ppv}                     | {npv_possdef}             |

"""
    )
)

    
Assuming a rule that says no one demonstrates caseness of complex mental health difficulties, we get the following approximate values for our evaluation statistics:

| Caseness             | Normalised mutual information  | Class balance accuracy |    Odds ratio    | Positive predictive value | Negative predictive value |
| -------------------- | ------------------------------ | ---------------------- | ---------------- | ------------------------- | ------------------------- |
| Possible             | x → 0                     | 0.48         | Not a number because one of the odds is zero.    | 0                     | 0.96            |
| Definite             | x → 0                     | 0.5         | Not a number because one of the odds is zero.    | 0                     | 1.0            |
| Multinomial          | x → 0                     | 0.48            | Not a number because one of the odds is zero.       | 0                     | 0.96               |
| Possible-vs-Definite | x → 0                     | 0.48          | Not a number because one of the odds is zero.     | 0                     | 0.96             |

