# Caseness array

The purpose of this notebook is to produce the caseness array. The caseness array is an n-by-2 array containing patient ID and a binary vector indicating whether the patient is clinically coded for complex mental health difficulties.

### Imports

In [87]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

### Prerequisites

In [80]:
# Set index date. Usually CURRENT_DATE() but Dec 2021 will be used until cB fixed the missing prescriptions.
myIndexDate =  '2021-12-31'

# Set the capture window that indicates an active prescription of the medications of interest.
meds_catch_window = 4

# Set parameters for disclosivity adjustments.
redaction_threshold = 7
target_round = 10

# Set the database attributes.
global server_id
server_id = 'yhcr-prd-phm-bia-core'
global database_id
database_id = 'CB_FDM_PrimaryCare_V7'

# Instantiate BigQuery client.
client = bigquery.Client()

# Set folder location.
folder_loc = os.path.dirname(os.path.abspath("UNSEEN create caseness array.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_borderline = pandas.read_csv(folder + "ciaranmci-borderline-personality-disorder-1ed4af38.csv")
codes_to_query_chronicDepression = pandas.read_csv(folder + "ciaranmci-chronic-depression-53a65598.csv")
codes_to_query_chronicPTSD = pandas.read_csv(folder + "ciaranmci-chronic-post-traumatic-stress-disorder-3a96e263.csv")
codes_to_query_complexPTSD = pandas.read_csv(folder + "ciaranmci-complex-post-traumatic-stress-disorder-21876f2e.csv")
codes_to_query_dysthymia = pandas.read_csv(folder + "ciaranmci-dysthymia-6f6888c3.csv")
codes_to_query_personalityDisorder = pandas.read_csv(folder + "ciaranmci-personality-disorder-243a2f24.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
codes_to_query_all = pandas.read_csv(folder + "ciaranmci-unseen-snomed-codes-to-identify-cmhd-0b2abbef.csv")

# Medications of interest.
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN medications_psychosisAndRelated.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN medications_antidepressants.csv")
medications_to_query_all = pandas.read_csv(folder + "UNSEEN medications list.csv")

%store server_id database_id

Stored 'server_id' (str)
Stored 'database_id' (str)


## Creating the array

In [95]:
sql = """
DECLARE myIndexDate DATE DEFAULT '""" + myIndexDate + """';

WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
#  ## Bipolar disorder
,tbl_bipolar AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_bipolar["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_bipolar_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_bipolar
    WHERE
        snomedcode IN (tbl_bipolar.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Borderline personality disorder
,tbl_borderline AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_borderline["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_borderline_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_borderline
    WHERE
        snomedcode IN (tbl_borderline.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Chronic PTSD
,tbl_chronicPTSD AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_chronicPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_chronicPTSD
    WHERE
        snomedcode IN (tbl_chronicPTSD.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Complex PTSD
,tbl_complexPTSD AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_complexPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_complexPTSD
    WHERE
        snomedcode IN (tbl_complexPTSD.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Chronic depression
,tbl_chronicDepression AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicDepression["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_chronicDepression_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_chronicDepression
    WHERE
        snomedcode IN (tbl_chronicDepression.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Dysthymia
,tbl_dysthymia AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dysthymia["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_dysthymia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_dysthymia
    WHERE
        snomedcode IN (tbl_dysthymia.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Personality disorder
,tbl_personalityDisorder AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_personalityDisorder["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_personalityDisorder_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_personalityDisorder
    WHERE
        snomedcode IN (tbl_personalityDisorder.my_snomedcode)
        AND dateevent < myIndexDate
)
#  ## Schizophrenia
,tbl_schizophrenia AS ( 
    SELECT
        my_snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_schizophrenia["code"].tolist())) + """'
                ]) AS my_snomedcode
)
,tbl_schizophrenia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode, tbl_schizophrenia
    WHERE
        snomedcode IN (tbl_schizophrenia.my_snomedcode)
        AND dateevent < myIndexDate
)


# The following CTEs extract each medication list into a SQL table before querying the person_ID 
# associated with the medications (combined into medication type).
#
#  ## Drugs used in psychosis and related disorders.
,tbl_meds_psychosisAndRelated AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_psychosisAndRelated_persons

AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_psychosisAndRelated
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_psychosisAndRelated.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(myIndexDate, CAST(datemedicationstart AS DATE), MONTH) BETWEEN 0 AND """ + str(meds_catch_window) + """
)
#  ## Hypnotics and anxiolyitcs
,tbl_meds_hypnoticsAndAnxiolytics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_hypnoticsAndAnxiolytics_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_hypnoticsAndAnxiolytics
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_hypnoticsAndAnxiolytics.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(myIndexDate, CAST(datemedicationstart AS DATE), MONTH) BETWEEN 0 AND """ + str(meds_catch_window) + """
)
#  ## Antidepressants
,tbl_meds_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_meds_antidepressants_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_antidepressants
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_antidepressants.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(myIndexDate, CAST(datemedicationstart AS DATE), MONTH) BETWEEN 0 AND """ + str(meds_catch_window) + """
)


# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# clinical code and medication group. The code and medication columns are populated by interger
# values with '1' indicating that the code or medication is present in patient record and '0' indicating
# otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,CASE WHEN tbl_bipolar_persons.person_id IS NULL THEN 0 ELSE 1 END AS Bipolar # Bipolar is an exclusion for caseness.
    ,CASE WHEN tbl_borderline_persons.person_id IS NULL THEN 0 ELSE 1 END AS Borderline
    ,CASE WHEN tbl_chronicPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicPTSD
    ,CASE WHEN tbl_complexPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ComplexPTSD
    ,CASE WHEN tbl_chronicDepression_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicDepression
    ,CASE WHEN tbl_dysthymia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Dysthymia
    ,CASE WHEN tbl_personalityDisorder_persons.person_id IS NULL THEN 0 ELSE 1 END AS PersonalityDisorder
    ,CASE WHEN tbl_schizophrenia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Schizophrenia # Schizophrenia is an exclusion for caseness.
    ,CASE WHEN tbl_meds_psychosisAndRelated_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_PsychosisAndRelated
    ,CASE WHEN tbl_meds_hypnoticsAndAnxiolytics_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_hypnoticsAndAnxiolytics
    ,CASE WHEN tbl_meds_antidepressants_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_antidepressants
FROM tbl_persons
LEFT OUTER JOIN tbl_bipolar_persons ON tbl_persons.person_id = tbl_bipolar_persons.person_id
LEFT OUTER JOIN tbl_borderline_persons ON tbl_persons.person_id = tbl_borderline_persons.person_id
LEFT OUTER JOIN tbl_chronicPTSD_persons ON tbl_persons.person_id = tbl_chronicPTSD_persons.person_id
LEFT OUTER JOIN tbl_complexPTSD_persons ON tbl_persons.person_id = tbl_complexPTSD_persons.person_id
LEFT OUTER JOIN tbl_chronicDepression_persons ON tbl_persons.person_id = tbl_chronicDepression_persons.person_id
LEFT OUTER JOIN tbl_dysthymia_persons ON tbl_persons.person_id = tbl_dysthymia_persons.person_id
LEFT OUTER JOIN tbl_personalityDisorder_persons ON tbl_persons.person_id = tbl_personalityDisorder_persons.person_id
LEFT OUTER JOIN tbl_schizophrenia_persons ON tbl_persons.person_id = tbl_schizophrenia_persons.person_id
LEFT OUTER JOIN tbl_meds_psychosisAndRelated_persons ON tbl_persons.person_id = tbl_meds_psychosisAndRelated_persons.person_id
LEFT OUTER JOIN tbl_meds_hypnoticsAndAnxiolytics_persons ON tbl_persons.person_id = tbl_meds_hypnoticsAndAnxiolytics_persons.person_id
LEFT OUTER JOIN tbl_meds_antidepressants_persons ON tbl_persons.person_id = tbl_meds_antidepressants_persons.person_id
ORDER BY tbl_persons.person_id
"""

bqTable = client.query(sql).to_dataframe()

# Store bqTable for use in other notebaooks
%store bqTable

Stored 'bqTable' (DataFrame)


In [96]:
# Define diagnosis-based CMHD as any patient with any of the diagnoses.
CMHD_dx_only = bqTable.loc[:, ~bqTable.columns.isin(['person_id',
                                                     'Meds_PsychosisAndRelated',
                                                     'Meds_hypnoticsAndAnxiolytics',
                                                     'Meds_antidepressants'])].max(axis = 1)
# ...but exclude patients with diagnoses for bipolar or schizophrenia.
CMHD_dx_only.loc[(bqTable['Bipolar'] == 1) | (bqTable['Schizophrenia'] == 1)] = 0
                 
# Define prescription-based CMHD as any patient with any of the prescriptions.              
CMHD_rx_only = bqTable[['Meds_PsychosisAndRelated', 
                        'Meds_hypnoticsAndAnxiolytics',
                        'Meds_antidepressants']].max(axis = 1)
CMHD = []
for i_iter in range(len(CMHD_rx_only)):
    if (CMHD_dx_only[i_iter] == 1 and CMHD_rx_only[i_iter] == 1):
        # If the patient has a diagnostic code AND an active prescription,
        # then they are a 'Definite caseness', CMHD = 3.
        CMHD.append(3)
    elif (CMHD_dx_only[i_iter] == 1 and CMHD_rx_only[i_iter] != 1):
        # If the patient DOES HAVE a diagnostic code BUT DOES NOT have an,
        # active prescription, then they are a 'Diagnosis-based caseness', CMHD = 2.
        CMHD.append(2)
    elif (CMHD_dx_only[i_iter] != 1 and CMHD_rx_only[i_iter] == 1):
        # If the patient DOES NOT have a diagnostic code BUT DOES HAVE an
        #  active prescription, then they are a 'Prescription-based caseness', CMHD = 1.
        CMHD.append(1)
    elif (CMHD_dx_only[i_iter] != 1 and CMHD_rx_only[i_iter] != 1):
        # If the patient neither has a diagnostic code nor an active prescription,
        # then they are a 'Control', CMHD = 0.
        CMHD.append(0)
        
caseness_array = \
    pandas.DataFrame(data = \
                     {#"person_id" : bqTable['person_id'],
                      "CMHD_dx_and_rx" : [1 if i_row == 3 else 0 for i_row in CMHD],
                      "CMHD_dx_not_rx" : [1 if i_row == 2 else 0 for i_row in CMHD],
                      "CMHD_rx_not_dx" : [1 if i_row == 1 else 0 for i_row in CMHD],
                      "CMHD_multi"     : CMHD,
                      "CMHD_prescriptionVsDefinite"   : [1 if i_row == 3 else 0 if i_row == 1 else pandas.NA for i_row in CMHD],
                      "CMHD_control"   : [1 if i_row == 0 else 0 for i_row in CMHD]
                     }
                    )

# Clean up.
del(CMHD_dx_only, CMHD_rx_only, CMHD)
# Make caseness_array available across notebooks.
%store caseness_array

Stored 'caseness_array' (DataFrame)


In [97]:
# Display the prevalences of the casenesss variables, and their upper and lower bounds. 
#
# ## Set the multiplicative factor that defines the acceptable prevalence range, based on
# ## our attempt to minimise the count of candidate feature sets by considering normalised
# ##mutual information.
prev_range_LB = 0.9
prev_range_UB = 2.0

# Calculate prevalences. Redact and round.
counts = list(caseness_array.loc[:, ~caseness_array.columns.isin(['person_id', 'CMHD_prescriptionVsDefinite', 'CMHD_control'])].astype(bool).sum())
counts = numpy.append(counts, caseness_array.loc[:, 'CMHD_prescriptionVsDefinite'].sum() )
counts_LB = (counts * prev_range_LB).astype(int).tolist()
counts_UB = (counts * prev_range_UB).astype(int).tolist()
ls_counts = [counts.tolist(), counts_LB, counts_UB]
ls_counts = \
    [[numpy.nan if j <= redaction_threshold else round(j / target_round) * target_round for j in i] for i in ls_counts]
denominator_as_int = round( len(caseness_array) / target_round ) * target_round
prescriptionVsDefinite_denominator_as_int = round( len( caseness_array[~caseness_array['CMHD_prescriptionVsDefinite'].isna()] ) / target_round ) * target_round
ls_prevalences = copy.deepcopy(ls_counts)
for i in range(len(ls_prevalences)):
    for j in range(len(ls_prevalences[i])-1):
        ls_prevalences[i][j] = ls_prevalences[i][j] / denominator_as_int * 100
    ls_prevalences[i][-1] = ls_prevalences[i][-1] / prescriptionVsDefinite_denominator_as_int * 100
# Display table.
display(
    Markdown(
f"""
## Prevalence of caseness (per hundred)

The table below shows the counts and prevalence values for the 'Definite caseness' (Dx and Rx), 
'Diagnosis-based caseness' (Dx not Rx), 'Prescription-based caseness' (Rx not Dx),
'Multinomial caseness', and 'Prescription-based -vs- Definite caseness' variables,
after applying rounding and redaction rules (i.e. counts  $\le7$ are redacted before remaining
values are rounded to the nearest $10$). The upper and lower bounds are thresholds informed by our
simulation study to ensure feature sets have at least $80\%$ normalised mutual information with the
caseness variables, in the best-case scenario. [Further details in associated publication]


| Caseness cohort                |                             Count                              |                           Lower bound                          |                          Upper bound                           |
| ------------------------------ | -------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
| Dx and Rx                      | {int(ls_counts[0][0]):,}  ({round(ls_prevalences[0][0],3):,}%) | {int(ls_counts[1][0]):,}  ({round(ls_prevalences[1][0],3):,}%) | {int(ls_counts[2][0]):,}  ({round(ls_prevalences[2][0],3):,}%) |
| Dx not Rx                      | {int(ls_counts[0][1]):,}  ({round(ls_prevalences[0][1],3):,}%) | {int(ls_counts[1][1]):,}  ({round(ls_prevalences[1][1],3):,}%) | {int(ls_counts[2][1]):,}  ({round(ls_prevalences[2][1],3):,}%) |
| Rx not Dx                      | {int(ls_counts[0][2]):,}  ({round(ls_prevalences[0][2],3):,}%) | {int(ls_counts[1][2]):,}  ({round(ls_prevalences[1][2],3):,}%) | {int(ls_counts[2][2]):,}  ({round(ls_prevalences[2][2],3):,}%) |
| Multinomial                    | {int(ls_counts[0][3]):,}  ({round(ls_prevalences[0][3],3):,}%) | {int(ls_counts[1][3]):,}  ({round(ls_prevalences[1][3],3):,}%) | {int(ls_counts[2][3]):,}  ({round(ls_prevalences[2][3],3):,}%) |
| Rx-not-Dx vs Dx-and-Rx         | {int(ls_counts[0][4]):,}  ({round(ls_prevalences[0][4],3):,}%) | {int(ls_counts[1][4]):,}  ({round(ls_prevalences[1][4],3):,}%) | {int(ls_counts[2][4]):,}  ({round(ls_prevalences[2][4],3):,}%) |


If we are applying the filters based on normalised mutual information, then these data mean that:
- for the 'Definite caseness' (Dx and Rx) variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][0]):,}$
patients' records and no more than ${int(ls_counts[2][0]):,}$ patients' records.
- for the 'Diagnosis-based caseness' (Dx not Rx) variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][1]):,}$
patients' records and no more than ${int(ls_counts[2][1]):,}$ patients' records.
- for the 'Prescription-based caseness' (Rx not Dx) variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][2]):,}$
patients' records and no more than ${int(ls_counts[2][2]):,}$ patients' records.
- for the 'Multinomial caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][3]):,}$
patients' records and no more than ${int(ls_counts[2][3]):,}$ patients' records.
- for the 'Prescription-based -vs- Definite caseness' variable, we will only consider feature sets that are present in at least ${int(ls_counts[1][4]):,}$
patients' records and no more than ${int(ls_counts[2][4]):,}$ patients' records.
"""
    )
)

# Define and store variables for future reference.
DxAndRxCaseness_count = ls_counts[0][0]
DxAndRxCaseness_count_LB = ls_counts[1][0]
DxAndRxCaseness_count_UB = ls_counts[2][0]
DxNotRxCaseness_count = ls_counts[0][1]
DxNotRxCaseness_count_LB = ls_counts[1][1]
DxNotRxCaseness_count_UB = ls_counts[2][1]
RxNotDxCaseness_count = ls_counts[0][2]
RxNotDxCaseness_count_LB = ls_counts[1][2]
RxNotDxCaseness_count_UB = ls_counts[2][2]
multinomialCaseness_count = ls_counts[0][3]
multinomialCaseness_count_LB = ls_counts[1][3]
multinomialCaseness_count_UB = ls_counts[2][3]
prescriptionVsDefiniteCaseness_count = ls_counts[0][4]
prescriptionVsDefiniteCaseness_count_LB = ls_counts[1][4]
prescriptionVsDefiniteCaseness_count_UB = ls_counts[2][4]

DxAndRxCaseness_prevalence = ls_prevalences[0][0]
DxAndRxCaseness_prevalence_LB = ls_prevalences[1][0]
DxAndRxCaseness_prevalence_UB = ls_prevalences[2][0]
DxNotRxCaseness_prevalence = ls_prevalences[0][1]
DxNotRxCaseness_prevalence_LB = ls_prevalences[1][1]
DxNotRxCaseness_prevalence_UB = ls_prevalences[2][1]
RxNotDxCaseness_prevalence = ls_prevalences[0][2]
RxNotDxCaseness_prevalence_LB = ls_prevalences[1][2]
RxNotDxCaseness_prevalence_UB = ls_prevalences[2][2]
multinomialCaseness_prevalence = ls_prevalences[0][3]
multinomialCaseness_prevalence_LB = ls_prevalences[1][3]
multinomialCaseness_prevalence_UB = ls_prevalences[2][3]
prescriptionVsDefiniteCaseness_prevalence = ls_prevalences[0][3]
prescriptionVsDefiniteCaseness_prevalence_LB = ls_prevalences[1][3]
prescriptionVsDefiniteCaseness_prevalence_UB = ls_prevalences[2][3]

%store denominator_as_int prescriptionVsDefinite_denominator_as_int \
    DxAndRxCaseness_count DxNotRxCaseness_count RxNotDxCaseness_count \
    multinomialCaseness_count prescriptionVsDefiniteCaseness_count \
    DxAndRxCaseness_count_LB DxNotRxCaseness_count_LB RxNotDxCaseness_count_LB \
    multinomialCaseness_count_LB prescriptionVsDefiniteCaseness_count_LB \
    DxAndRxCaseness_count_UB DxNotRxCaseness_count_UB RxNotDxCaseness_count_UB \
    multinomialCaseness_count_UB prescriptionVsDefiniteCaseness_count_UB \
    DxAndRxCaseness_prevalence DxNotRxCaseness_prevalence RxNotDxCaseness_prevalence \
    multinomialCaseness_prevalence prescriptionVsDefiniteCaseness_prevalence \
    DxAndRxCaseness_prevalence_LB DxNotRxCaseness_prevalence_LB RxNotDxCaseness_prevalence_LB \
    multinomialCaseness_prevalence_LB prescriptionVsDefiniteCaseness_prevalence_LB \
    DxAndRxCaseness_prevalence_UB DxNotRxCaseness_prevalence_UB RxNotDxCaseness_prevalence_UB \
    multinomialCaseness_prevalence_UB prescriptionVsDefiniteCaseness_prevalence_UB

# Clean up.
del(ls_counts, ls_prevalences)


## Prevalence of caseness (per hundred)

The table below shows the counts and prevalence values for the 'Definite caseness' (Dx and Rx), 
'Diagnosis-based caseness' (Dx not Rx), 'Prescription-based caseness' (Rx not Dx),
'Multinomial caseness', and 'Prescription-based -vs- Definite caseness' variables,
after applying rounding and redaction rules (i.e. counts  $\le7$ are redacted before remaining
values are rounded to the nearest $10$). The upper and lower bounds are thresholds informed by our
simulation study to ensure feature sets have at least $80\%$ normalised mutual information with the
caseness variables, in the best-case scenario. [Further details in associated publication]


| Caseness cohort                |                             Count                              |                           Lower bound                          |                          Upper bound                           |
| ------------------------------ | -------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
| Dx and Rx                      | 1,670  (0.227%) | 1,510  (0.205%) | 3,350  (0.455%) |
| Dx not Rx                      | 2,930  (0.398%) | 2,640  (0.358%) | 5,860  (0.795%) |
| Rx not Dx                      | 48,420  (6.572%) | 43,580  (5.915%) | 96,850  (13.146%) |
| Multinomial                    | 53,030  (7.198%) | 47,720  (6.477%) | 106,050  (14.394%) |
| Rx-not-Dx vs Dx-and-Rx         | 1,670  (3.333%) | 1,510  (3.014%) | 3,350  (6.687%) |


If we are applying the filters based on normalised mutual information, then these data mean that:
- for the 'Definite caseness' (Dx and Rx) variable, we will only consider feature sets that are present in at least $1,510$
patients' records and no more than $3,350$ patients' records.
- for the 'Diagnosis-based caseness' (Dx not Rx) variable, we will only consider feature sets that are present in at least $2,640$
patients' records and no more than $5,860$ patients' records.
- for the 'Prescription-based caseness' (Rx not Dx) variable, we will only consider feature sets that are present in at least $43,580$
patients' records and no more than $96,850$ patients' records.
- for the 'Multinomial caseness' variable, we will only consider feature sets that are present in at least $47,720$
patients' records and no more than $106,050$ patients' records.
- for the 'Prescription-based -vs- Definite caseness' variable, we will only consider feature sets that are present in at least $1,510$
patients' records and no more than $3,350$ patients' records.


Stored 'denominator_as_int' (int)
Stored 'prescriptionVsDefinite_denominator_as_int' (int)
Stored 'DxAndRxCaseness_count' (int)
Stored 'DxNotRxCaseness_count' (int)
Stored 'RxNotDxCaseness_count' (int)
Stored 'multinomialCaseness_count' (int)
Stored 'prescriptionVsDefiniteCaseness_count' (int)
Stored 'DxAndRxCaseness_count_LB' (int)
Stored 'DxNotRxCaseness_count_LB' (int)
Stored 'RxNotDxCaseness_count_LB' (int)
Stored 'multinomialCaseness_count_LB' (int)
Stored 'prescriptionVsDefiniteCaseness_count_LB' (int)
Stored 'DxAndRxCaseness_count_UB' (int)
Stored 'DxNotRxCaseness_count_UB' (int)
Stored 'RxNotDxCaseness_count_UB' (int)
Stored 'multinomialCaseness_count_UB' (int)
Stored 'prescriptionVsDefiniteCaseness_count_UB' (int)
Stored 'DxAndRxCaseness_prevalence' (float)
Stored 'DxNotRxCaseness_prevalence' (float)
Stored 'RxNotDxCaseness_prevalence' (float)
Stored 'multinomialCaseness_prevalence' (float)
Stored 'prescriptionVsDefiniteCaseness_prevalence' (float)
Stored 'DxAndRxCaseness_prev

In [98]:
# Prepare header and note for presentation.
display(
    Markdown(
f"""
## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest $10$.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to {myIndexDate}.
"""
       )
)

# Define columns to ignore
cols_to_ignore = ['person_id', 'Bipolar', 'Schizophrenia']

# Define the denominator.
denominator = \
    numpy.repeat(denominator_as_int,
                 bqTable.shape[1]-len(cols_to_ignore), axis = 0)

# Define the prevlance dataframe for calculating the prevalence of caseness components.
df_prevalence = \
    pandas.DataFrame(data = {'numerator'   : bqTable.loc[:,~bqTable.columns.isin(cols_to_ignore)].sum(),
                             'denominator' : denominator})
# Redact low counts.
df_prevalence = \
    df_prevalence.applymap(lambda x: numpy.nan if x <= redaction_threshold else x)

# Round to nearest target_round value.
df_prevalence = \
    (round(df_prevalence / target_round) * target_round)

# Calculate porportions.
df_prevalence['proportion (n/N)'] = \
    round((df_prevalence['numerator'] / df_prevalence['denominator']), 3)

# Calculate prevalence.
df_prevalence['prevalence (%)'] = \
    round((df_prevalence['numerator'] / df_prevalence['denominator']) * 100, 3)

# Print prevalence table.
display(df_prevalence)


## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest $10$.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to 2021-12-31.


Unnamed: 0,numerator,denominator,proportion (n/N),prevalence (%)
Borderline,500.0,736750.0,0.001,0.068
ChronicPTSD,120.0,736750.0,0.0,0.016
ComplexPTSD,120.0,736750.0,0.0,0.016
ChronicDepression,1190.0,736750.0,0.002,0.162
Dysthymia,540.0,736750.0,0.001,0.073
PersonalityDisorder,3690.0,736750.0,0.005,0.501
Meds_PsychosisAndRelated,3740.0,736750.0,0.005,0.508
Meds_hypnoticsAndAnxiolytics,4350.0,736750.0,0.006,0.59
Meds_antidepressants,46900.0,736750.0,0.064,6.366


## Calculating the entropy of the caseness

In [99]:
print("\n \'Definite caseness\' (Dx and Rx) variable...")
entropy_caseness_scaled_DxAndRx = entropy_output(caseness_array['CMHD_dx_and_rx'])[1]

print("\n \'Diagnosis-based caseness\' (Dx not Rx) variable...")
entropy_caseness_scaled_DxNotRx = entropy_output(caseness_array['CMHD_dx_not_rx'])[1]

print("\n \'Prescription-based caseness\' (Rx not Dx) variable...")
entropy_caseness_scaled_RxNotDx = entropy_output(caseness_array['CMHD_rx_not_dx'])[1]

print("\n \'Multinomial caseness\' variable...")
entropy_caseness_scaled_multinomial = entropy_output(caseness_array['CMHD_multi'])[1]

print("\n \'Prescription-vs-Definite caseness\' variable...")
entropy_caseness_scaled_prescriptionVsDefinite = entropy_output(caseness_array['CMHD_prescriptionVsDefinite'])[1]

print("\n \'No caseness\' variable...")
entropy_caseness_scaled_control = entropy_output(caseness_array['CMHD_control'])[1]


 'Definite caseness' (Dx and Rx) variable...
	 Caseness variable entropy = 0.016 nats
	 The caseness variable's entropy is 2.3 % of its theoretical maximum


 'Diagnosis-based caseness' (Dx not Rx) variable...
	 Caseness variable entropy = 0.026 nats
	 The caseness variable's entropy is 3.7 % of its theoretical maximum


 'Prescription-based caseness' (Rx not Dx) variable...
	 Caseness variable entropy = 0.242 nats
	 The caseness variable's entropy is 35.0 % of its theoretical maximum


 'Multinomial caseness' variable...
	 Caseness variable entropy = 0.284 nats
	 The caseness variable's entropy is 20.5 % of its theoretical maximum


 'Prescription-vs-Definite caseness' variable...
	 Caseness variable entropy = 0.146 nats
	 The caseness variable's entropy is 21.1 % of its theoretical maximum


 'No caseness' variable...
	 Caseness variable entropy = 0.259 nats
	 The caseness variable's entropy is 37.3 % of its theoretical maximum



## Calculating hit rates

In [100]:
print("\n \'Definite caseness\' (Dx and Rx) variable...")
hitRate_none_DxAndRx, hitRate_all_DxAndRx = hitrate_output(caseness_array['CMHD_dx_and_rx'])

print("\n \'Diagnosis-based caseness\' (Dx not Rx) variable...")
hitRate_none_DxNotRx, hitRate_all_DxNotRx = hitrate_output(caseness_array['CMHD_dx_not_rx'])

print("\n \'Prescription-based caseness\' (Rx not Dx) variable...")
hitRate_none_RxNotDx, hitRate_all_RxNotDx = hitrate_output(caseness_array['CMHD_rx_not_dx'])

print("\n \'Multinomial caseness\' variable...")
hitRate_none_multinomial, hitRate_all_multinomial = hitrate_output(caseness_array['CMHD_multi'])

print("\n \'Possible-vs-Definite caseness\' variable...")
hitRate_none_prescriptionVsDefinite, hitRate_all_prescriptionVsDefinite = hitrate_output(caseness_array['CMHD_prescriptionVsDefinite'])


 'Definite caseness' (Dx and Rx) variable...
	 Hit rate (all) = 0.227 %
	 Hit rate (none) = 99.773 %
	 Odds (No : Yes) = 439-times less likely to have definite caseness than to have it.

 'Diagnosis-based caseness' (Dx not Rx) variable...
	 Hit rate (all) = 0.398 %
	 Hit rate (none) = 99.602 %
	 Odds (No : Yes) = 250-times less likely to have diagnosis-based caseness than to have it.

 'Prescription-based caseness' (Rx not Dx) variable...
	 Hit rate (all) = 6.573 %
	 Hit rate (none) = 93.427 %
	 Odds (No : Yes) = 14-times less likely to have prescription-based caseness than to have it.

 'Multinomial caseness' variable...
	 Hit rate (all) = 7.197 %
	 Hit rate (none) = 92.803 %
	 Odds (No : Yes) = 12-times less likely to not have any caseness than to have one.

 'Possible-vs-Definite caseness' variable...
	 Hit rate (all) = 3.342 %
	 Hit rate (none) = 96.658 %
	 Odds (No : Yes) = 28-times less likely to have prescription-based caseness than definite caseness.


In [101]:
display(
    Markdown(
f"""    
We now know that:
1. based on the scaled entropies, our variables for indicating caseness of complex mental health difficulties are $\le{round(entropy_caseness_scaled_DxAndRx, 1)}\%$ as uncertain/surprising/unforeseeable
as they could possibly be; _and_
2. we would correctly classify $\ge{round(hitRate_none_DxAndRx, 1)}\%$ of patients in this sample if we simply assumed that no one has complex mental health difficulties.

The first point tells us that definite caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This leaves very little room for improvement via feature sets.

The second point defines a benchmark for the indicative performance of any feature set that we evaluate in our study. Specifically, any feature set that we suggest to improve our certainty of knowing
that someone has complex mental health difficulties must correctly identify $\ge{round(hitRate_none_DxAndRx, 1)}\%$ of patients in our sample. Otherwise, the added feature set is a needless
complication to our attempt to know whether or not someone has complex mental health difficulties (which we can almost always safely assume they don't). This is such a high benchmark that we will be very
unlikely to find such a feature set.

We must remember that we are not trying to out-predict an identification rule based on caseness prevalence. Rather, we are trying to find feature sets that correlate with this
caseness prevalence. Large correlations would be difficult to find using variance-based methods like Pearson's product moment correlation or regression methods because the variance of the
caseness variable is so low. Our approach based on mutual-information is better suited to this situation because its fundamental concept is coincidence rather than covariance.
"""
        )
)

    
We now know that:
1. based on the scaled entropies, our variables for indicating caseness of complex mental health difficulties are $\le2.3\%$ as uncertain/surprising/unforeseeable
as they could possibly be; _and_
2. we would correctly classify $\ge99.8\%$ of patients in this sample if we simply assumed that no one has complex mental health difficulties.

The first point tells us that definite caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This leaves very little room for improvement via feature sets.

The second point defines a benchmark for the indicative performance of any feature set that we evaluate in our study. Specifically, any feature set that we suggest to improve our certainty of knowing
that someone has complex mental health difficulties must correctly identify $\ge99.8\%$ of patients in our sample. Otherwise, the added feature set is a needless
complication to our attempt to know whether or not someone has complex mental health difficulties (which we can almost always safely assume they don't). This is such a high benchmark that we will be very
unlikely to find such a feature set.

We must remember that we are not trying to out-predict an identification rule based on caseness prevalence. Rather, we are trying to find feature sets that correlate with this
caseness prevalence. Large correlations would be difficult to find using variance-based methods like Pearson's product moment correlation or regression methods because the variance of the
caseness variable is so low. Our approach based on mutual-information is better suited to this situation because its fundamental concept is coincidence rather than covariance.


In [102]:
# Below, I compute the cells of the contingency table for a rule that says no one has caseness of complex mental health difficulties.
#
# True positives. Zero because the rule says no one demonstrates 'Definite caseness' so no "positives" of any kind exist.
tp = 0
# False positives. Zero because the rule says no one demonstrates 'Definite caseness' so no "positives" of any kind exist.
fp = 0
# True negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
tn_DxAndRx = hitRate_none_DxAndRx / 100 * denominator_as_int
tn_DxNotRx = hitRate_none_DxNotRx / 100 * denominator_as_int
tn_RxNotDx = hitRate_none_RxNotDx / 100 * denominator_as_int
tn_multinomial = hitRate_none_multinomial / 100 * denominator_as_int
tn_prescriptionVsDefinite = hitRate_none_prescriptionVsDefinite / 100 * prescriptionVsDefinite_denominator_as_int
# False negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
fn_DxAndRx = hitRate_all_DxAndRx / 100 * denominator_as_int
fn_DxNotRx = hitRate_all_DxNotRx / 100 * denominator_as_int
fn_RxNotDx = hitRate_all_RxNotDx / 100 * denominator_as_int
fn_multinomial = hitRate_all_multinomial / 100 * denominator_as_int
fn_prescriptionVsDefinite = hitRate_all_prescriptionVsDefinite / 100 * prescriptionVsDefinite_denominator_as_int

# Below, I compute the evaluation statistics.
#
# Class balance accuracy.
cba_DxAndRx = round( 0.5 * ( (tp / max( (tp + fn_DxAndRx), (tp + fp) ) ) + (tn_DxAndRx / max( (tn_DxAndRx + fp), (tn_DxAndRx + fn_DxAndRx) ) ) ), 2)
cba_DxNotRx = round( 0.5 * ( (tp / max( (tp + fn_DxNotRx), (tp + fp) ) ) + (tn_DxNotRx / max( (tn_DxNotRx + fp), (tn_DxNotRx + fn_DxNotRx) ) ) ), 2)
cba_RxNotDx = round( 0.5 * ( (tp / max( (tp + fn_RxNotDx), (tp + fp) ) ) + (tn_RxNotDx / max( (tn_RxNotDx + fp), (tn_RxNotDx + fn_RxNotDx) ) ) ), 2)
cba_multinomial = round( 0.5 * ( (tp / max( (tp + fn_multinomial), (tp + fp) ) ) + (tn_multinomial / max( (tn_multinomial + fp), (tn_multinomial + fn_multinomial) ) ) ), 2)
cba_prescriptionVsDefinite = round( 0.5 * ( (tp / max( (tp + fn_prescriptionVsDefinite), (tp + fp) ) ) \
                                           + (tn_prescriptionVsDefinite / max( (tn_prescriptionVsDefinite + fp), (tn_prescriptionVsDefinite + fn_prescriptionVsDefinite) ) ) ), 2)
# Odds ratio.
OR_DxAndRx = 'Not a number because one of the odds is zero.' if min( (tp * tn_DxAndRx) , (fp * fn_DxAndRx) ) == 0 else round( (tp * tn_DxAndRx) / (fp * fn_DxAndRx), 2)
OR_DxNotRx = 'Not a number because one of the odds is zero.' if min( (tp * tn_DxNotRx) , (fp * fn_DxNotRx) ) == 0 else round( (tp * tn_DxNotRx) / (fp * fn_DxNotRx), 2)
OR_RxNotDx = 'Not a number because one of the odds is zero.' if min( (tp * tn_RxNotDx) , (fp * fn_RxNotDx) ) == 0 else round( (tp * tn_RxNotDx) / (fp * fn_RxNotDx), 2)
OR_multinomial = 'Not a number because one of the odds is zero.' if min( (tp * tn_multinomial) , (fp * fn_multinomial) ) == 0 else round( (tp * tn_multinomial) / (fp * fn_multinomial), 2)
OR_prescriptionVsDefinite = 'Not a number because one of the odds is zero.' \
                                if min( (tp * tn_prescriptionVsDefinite) , (fp * fn_prescriptionVsDefinite) ) == 0 \
                                else round( (tp * tn_prescriptionVsDefinite) / (fp * fn_prescriptionVsDefinite), 2)
# Positive predictive value.
ppv = 0 if (tp + fp) == 0 else round( tp / (tp + fp), 2)
# Negative predictive value.
npv_DxAndRx = 0 if (tn_DxAndRx + fn_DxAndRx) == 0 else round( tn_DxAndRx / (tn_DxAndRx + fn_DxAndRx), 2)
npv_DxNotRx = 0 if (tn_DxNotRx + fn_DxNotRx) == 0 else round( tn_DxNotRx / (tn_DxNotRx + fn_DxNotRx), 2)
npv_RxNotDx = 0 if (tn_RxNotDx + fn_RxNotDx) == 0 else round( tn_RxNotDx / (tn_RxNotDx + fn_RxNotDx), 2)
npv_multinomial = 0 if (tn_multinomial + fn_multinomial) == 0 else round( tn_multinomial / (tn_multinomial + fn_multinomial), 2)
npv_prescriptionVsDefinite = 0 if (tn_prescriptionVsDefinite + fn_prescriptionVsDefinite) == 0 else round( tn_prescriptionVsDefinite / (tn_prescriptionVsDefinite + fn_prescriptionVsDefinite), 2)

display(
    Markdown(
f"""    
If we assume a rule that says no one demonstrates caseness of complex mental health difficulties, then we get the following approximate values for our evaluation statistics:

| Caseness               | Normalised mutual information  | Class balance accuracy      |    Odds ratio              | Positive predictive value | Negative predictive value    |
| ---------------------- | ------------------------------ | --------------------------- | -------------------------- | ------------------------- | ---------------------------- |
| Dx and Rx              | x \u2192 0                     | {cba_DxAndRx}               | {OR_DxAndRx}               | {ppv}                     | {npv_DxAndRx}                |
| Dx not Rx              | x \u2192 0                     | {cba_DxNotRx}               | {OR_DxNotRx}               | {ppv}                     | {npv_DxNotRx}                |
| Rx not Dx              | x \u2192 0                     | {cba_RxNotDx}               | {OR_RxNotDx}               | {ppv}                     | {npv_RxNotDx}                |
| Multinomial            | x \u2192 0                     | {cba_multinomial}           | {OR_multinomial}           | {ppv}                     | {npv_multinomial}            |
| Rx-not-Dx vs Dx-and-Rx | x \u2192 0                     | {cba_prescriptionVsDefinite}| {OR_prescriptionVsDefinite}| {ppv}                     | {npv_prescriptionVsDefinite}|

"""
    )
)

    
If we assume a rule that says no one demonstrates caseness of complex mental health difficulties, then we get the following approximate values for our evaluation statistics:

| Caseness               | Normalised mutual information  | Class balance accuracy      |    Odds ratio              | Positive predictive value | Negative predictive value    |
| ---------------------- | ------------------------------ | --------------------------- | -------------------------- | ------------------------- | ---------------------------- |
| Dx and Rx              | x → 0                     | 0.5               | Not a number because one of the odds is zero.               | 0                     | 1.0                |
| Dx not Rx              | x → 0                     | 0.5               | Not a number because one of the odds is zero.               | 0                     | 1.0                |
| Rx not Dx              | x → 0                     | 0.47               | Not a number because one of the odds is zero.               | 0                     | 0.93                |
| Multinomial            | x → 0                     | 0.46           | Not a number because one of the odds is zero.           | 0                     | 0.93            |
| Rx-not-Dx vs Dx-and-Rx | x → 0                     | 0.48| Not a number because one of the odds is zero.| 0                     | 0.97|

