# Caseness array

The purpose of this notebook is to produce the caseness array. The caseness array is an n-by-2 array containing patient ID and a binary vector indicating whether the patient is clinically coded for complex mental health difficulties.

### Imports

In [4]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

### Prerequisites

In [5]:
client = bigquery.Client()

folder_loc = os.path.dirname(os.path.abspath("UNSEEN create caseness array.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_borderline = pandas.read_csv(folder + "ciaranmci-borderline-personality-disorder-1ed4af38.csv")
codes_to_query_chronicDepression = pandas.read_csv(folder + "ciaranmci-chronic-depression-53a65598.csv")
codes_to_query_chronicPTSD = pandas.read_csv(folder + "ciaranmci-chronic-post-traumatic-stress-disorder-3a96e263.csv")
codes_to_query_complexPTSD = pandas.read_csv(folder + "ciaranmci-complex-post-traumatic-stress-disorder-21876f2e.csv")
codes_to_query_dysthymia = pandas.read_csv(folder + "ciaranmci-dysthymia-6f6888c3.csv")
codes_to_query_personalityDisorder = pandas.read_csv(folder + "ciaranmci-personality-disorder-243a2f24.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
codes_to_query_all = pandas.read_csv(folder + "ciaranmci-unseen-snomed-codes-to-identify-cmhd-0b2abbef.csv")

# Medications of interest.
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN medications_psychosisAndRelated.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN medications_antidepressants.csv")
medications_to_query_all = pandas.read_csv(folder + "UNSEEN medications list.csv")

## Creating the array

In [6]:
sql = """
WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
#  ## Bipolar disorder
,tbl_bipolar AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_bipolar["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_bipolar_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_bipolar
    WHERE
        src_snomedcode IN (tbl_bipolar.snomedcode)
)
#  ## Borderline personality disorder
,tbl_borderline AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_borderline["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_borderline_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_borderline
    WHERE
        src_snomedcode IN (tbl_borderline.snomedcode)
)
#  ## Chronic PTSD
,tbl_chronicPTSD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_chronicPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_chronicPTSD
    WHERE
        src_snomedcode IN (tbl_chronicPTSD.snomedcode)
)
#  ## Complex PTSD
,tbl_complexPTSD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_complexPTSD_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_complexPTSD
    WHERE
        src_snomedcode IN (tbl_complexPTSD.snomedcode)
)
#  ## Chronic depression
,tbl_chronicDepression AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicDepression["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_chronicDepression_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_chronicDepression
    WHERE
        src_snomedcode IN (tbl_chronicDepression.snomedcode)
)
#  ## Dysthymia
,tbl_dysthymia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dysthymia["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_dysthymia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_dysthymia
    WHERE
        src_snomedcode IN (tbl_dysthymia.snomedcode)
)
#  ## Personality disorder
,tbl_personalityDisorder AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_personalityDisorder["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_personalityDisorder_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_personalityDisorder
    WHERE
        src_snomedcode IN (tbl_personalityDisorder.snomedcode)
)
#  ## Schizophrenia
,tbl_schizophrenia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_schizophrenia["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_schizophrenia_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_schizophrenia
    WHERE
        src_snomedcode IN (tbl_schizophrenia.snomedcode)
)


# The following CTEs extract each medication list into a SQL table before querying the person_ID 
# associated with the medications (combined into medication type).
#
#  ## Drugs used in psychosis and related disorders.
,tbl_meds_psychosisAndRelated AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS Medication
)
,tbl_meds_psychosisAndRelated_persons

AS (
    SELECT
      DISTINCT person_id
    FROM
      yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRPrimaryCareMedication, tbl_meds_psychosisAndRelated
    WHERE
        src_nameofmedication LIKE CAST(CONCAT('%',tbl_meds_psychosisAndRelated.Medication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(src_datemedicationstart AS DATE), MONTH) < 4
)
#  ## Hypnotics and anxiolyitcs
,tbl_meds_hypnoticsAndAnxiolytics AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS Medication
)
,tbl_meds_hypnoticsAndAnxiolytics_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRPrimaryCareMedication, tbl_meds_hypnoticsAndAnxiolytics
    WHERE
        src_nameofmedication LIKE CAST(CONCAT('%',tbl_meds_hypnoticsAndAnxiolytics.Medication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(src_datemedicationstart AS DATE), MONTH) < 4
)
#  ## Antidepressants
,tbl_meds_antidepressants AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS Medication
)
,tbl_meds_antidepressants_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRPrimaryCareMedication, tbl_meds_antidepressants
    WHERE
        src_nameofmedication LIKE CAST(CONCAT('%',tbl_meds_antidepressants.Medication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(src_datemedicationstart AS DATE), MONTH) < 4
)


# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# clinical code and medication group. The code and medication columns are populated by interger
# values with '1' indicating that the code or medication is present in patient record and '0' indicating
# otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,CASE WHEN tbl_bipolar_persons.person_id IS NULL THEN 0 ELSE 1 END AS Bipolar
    ,CASE WHEN tbl_borderline_persons.person_id IS NULL THEN 0 ELSE 1 END AS Borderline
    ,CASE WHEN tbl_chronicPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicPTSD
    ,CASE WHEN tbl_complexPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ComplexPTSD
    ,CASE WHEN tbl_chronicDepression_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicDepression
    ,CASE WHEN tbl_dysthymia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Dysthymia
    ,CASE WHEN tbl_personalityDisorder_persons.person_id IS NULL THEN 0 ELSE 1 END AS PersonalityDisorder
    ,CASE WHEN tbl_schizophrenia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Schizophrenia
    ,CASE WHEN tbl_meds_psychosisAndRelated_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_PsychosisAndRelated
    ,CASE WHEN tbl_meds_hypnoticsAndAnxiolytics_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_hypnoticsAndAnxiolytics
    ,CASE WHEN tbl_meds_antidepressants_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_antidepressants
FROM tbl_persons
LEFT OUTER JOIN tbl_bipolar_persons ON tbl_persons.person_id = tbl_bipolar_persons.person_id
LEFT OUTER JOIN tbl_borderline_persons ON tbl_persons.person_id = tbl_borderline_persons.person_id
LEFT OUTER JOIN tbl_chronicPTSD_persons ON tbl_persons.person_id = tbl_chronicPTSD_persons.person_id
LEFT OUTER JOIN tbl_complexPTSD_persons ON tbl_persons.person_id = tbl_complexPTSD_persons.person_id
LEFT OUTER JOIN tbl_chronicDepression_persons ON tbl_persons.person_id = tbl_chronicDepression_persons.person_id
LEFT OUTER JOIN tbl_dysthymia_persons ON tbl_persons.person_id = tbl_dysthymia_persons.person_id
LEFT OUTER JOIN tbl_personalityDisorder_persons ON tbl_persons.person_id = tbl_personalityDisorder_persons.person_id
LEFT OUTER JOIN tbl_schizophrenia_persons ON tbl_persons.person_id = tbl_schizophrenia_persons.person_id
LEFT OUTER JOIN tbl_meds_psychosisAndRelated_persons ON tbl_persons.person_id = tbl_meds_psychosisAndRelated_persons.person_id
LEFT OUTER JOIN tbl_meds_hypnoticsAndAnxiolytics_persons ON tbl_persons.person_id = tbl_meds_hypnoticsAndAnxiolytics_persons.person_id
LEFT OUTER JOIN tbl_meds_antidepressants_persons ON tbl_persons.person_id = tbl_meds_antidepressants_persons.person_id
ORDER BY tbl_persons.person_id
"""

bqTable = client.query(sql).to_dataframe()

# Remove patients with codes for the exclusions: {bipolar, schizophrenia}.
bqTable.drop(bqTable.loc[(bqTable.Bipolar == 1) | (bqTable.Schizophrenia == 1)].index, inplace=True)
bqTable.drop(['Bipolar', 'Schizophrenia'], axis=1, inplace=True)
bqTable.reset_index(drop = True, inplace = True)

# Store bqTable for use in other notebaooks
%store bqTable

Stored 'bqTable' (DataFrame)


In [7]:
CMHD_dx_only = bqTable.loc[:, ~bqTable.columns.isin(['person_id',
                                                     'Meds_PsychosisAndRelated',
                                                     'Meds_hypnoticsAndAnxiolytics',
                                                     'Meds_antidepressants'])].max(axis = 1)
CMHD_rx_only = bqTable[['Meds_PsychosisAndRelated', 
                        'Meds_hypnoticsAndAnxiolytics',
                        'Meds_antidepressants']].max(axis = 1)
CMHD = []
for i_iter in range(len(CMHD_rx_only)):
    if (CMHD_dx_only[i_iter] == 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has a diagnostic code AND an active prescription,
        # then they are a definite CMHD = 2.
        CMHD.append(2)
    elif (CMHD_dx_only[i_iter] != 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has an active prescription but not a diagnostic code,
        # then they are a possible CMHD = 1.
        CMHD.append(1)
    else:
        # If the patient neither has a diagnostic code or an active prescriptions,
        # then they are definitely not CMHD = 0.
        CMHD.append(0)
        
caseness_array = \
    pandas.DataFrame(data = \
                     {"person_id" : bqTable['person_id'],
                      "CMHD" : CMHD,
                      "CMHD_dx_and_rx" : [1 if i_row == 2 else 0 for i_row in CMHD],
                      "CMHD_rx_not_dx" : [1 if i_row == 1 else 0 for i_row in CMHD],
                      "CMHD_control"   : [1 if i_row == 0 else 0 for i_row in CMHD],
                     }
                    )

# Clean up.
del(CMHD_dx_only, CMHD_rx_only)
# Make caseness_array available across notebooks.
%store caseness_array

Stored 'caseness_array' (DataFrame)


In [8]:
# Set parameters for disclosivity adjustments.
redaction_threshold = 7
target_round = 10

# Prepare header and note for presentation.
now = date.today()
#then = (now - relativedelta(years = 1)).strftime('%d-%b-%Y')
now = now.strftime('%d-%b-%Y')
display(
    Markdown("""
## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest 10.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to %s.
       """
       %(now)
       )
)

# Define the denominator.
denominator_as_int = len(caseness_array)
denominator_as_int = round(denominator_as_int / target_round) * target_round
denominator = \
    numpy.repeat(denominator_as_int,
                 bqTable.shape[1]-1, axis = 0)

# Define the prevlance dataframe for calculating the prevalence of caseness components.
df_prevalence = \
    pandas.DataFrame(data = {'numerator'   : bqTable.loc[:,bqTable.columns !=  'person_id'].sum(),
                             'denominator' : denominator})
# Redact low counts.
df_prevalence = \
    df_prevalence.applymap(lambda x: numpy.nan if x <= redaction_threshold else x)

# Round to nearest target_round value.
df_prevalence = \
    (round(df_prevalence / target_round) * target_round)

# Compute porportions.
df_prevalence['proportion (n/N)'] = \
    round((df_prevalence['numerator'] / df_prevalence['denominator']), 3)

# Compute prevalence.
df_prevalence['prevalence (%)'] = \
    round((df_prevalence['numerator'] / df_prevalence['denominator']) * 100, 3)

# Print prevalence table.
display(df_prevalence)

# Calculate the minimum counts and proportions of any criteria diagnoses
# and criterion medications.
min_criterion_count_diag = \
    df_prevalence.loc[~df_prevalence.index.isin(['Meds_PsychosisAndRelated',
                                                 'Meds_hypnoticsAndAnxiolytics',
                                                 'Meds_antidepressants']),
                      'numerator'].min()
min_criterion_count_meds = \
    df_prevalence.loc[df_prevalence.index.isin(['Meds_PsychosisAndRelated',
                                                'Meds_hypnoticsAndAnxiolytics',
                                                'Meds_antidepressants']),
                      'numerator'].min()
min_criterion_prop_diag = min_criterion_count_diag / denominator_as_int
min_criterion_prop_meds = min_criterion_count_meds / denominator_as_int
min_criterion_prop = min_criterion_prop_diag * min_criterion_prop_meds
max_criterion_prop = 0.5

# Define count equivalents of criteria prevalences.
min_criterion_count = int(min_criterion_prop * denominator_as_int)
max_criterion_count = int(max_criterion_prop * denominator_as_int)

# Display message.
display(
    Markdown(
"""
Minimum prevalence of any criterion diagnosis is __%s%%__ of %s patients.

Minimum prevalence of any criterion medication is __%s%%__ of %s patients.
"""
        %(round(min_criterion_prop_diag * 100, 3),
          f'{denominator_as_int:,}',
          round(min_criterion_prop_meds * 100, 3),
          f'{denominator_as_int:,}'
          )
    )
)
if min_criterion_count > target_round:
    display(
        Markdown(
    """
    Therefore, the minimum criterion prevalence is __%s%%__ (i.e. the product of the minimum diagnostic and medication prevalence values).
    This equates to a minimum criterion count of __%s__.
    """
            %(round(min_criterion_prop * 100, 3)
              ,min_criterion_count)
        )
    )
else:
    # Redefine the minimum criterion prevalence and count to mitigate disclosivity.
    min_criterion_count = 10
    min_criterion_prop = round(min_criterion_count / denominator_as_int, 3)
    display(
        Markdown(
    """
    Therefore, the minimum criterion count - i.e. the count equivalent of the product of the minimum diagnostic
    and medication prevalence values - is vanishingly small and equivalent to a patient count below our redaction and rounding thresholds.\n
    Given this finding, the minimum criterion count will be set to __%s__.
    """
            %(min_criterion_count)
        )
    )

# Make variables available across notebooks.
%store denominator_as_int min_criterion_prop_diag min_criterion_prop_meds min_criterion_prop max_criterion_prop \
min_criterion_count_diag min_criterion_count_meds min_criterion_count max_criterion_count


## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest 10.
Only then are proportions and subsequent prevalences-per-hundered calculated.

The prevalence values refer to the period up to 21-Mar-2023.
       

Unnamed: 0,numerator,denominator,proportion (n/N),prevalence (%)
Borderline,370.0,698620.0,0.001,0.053
ChronicPTSD,110.0,698620.0,0.0,0.016
ComplexPTSD,110.0,698620.0,0.0,0.016
ChronicDepression,1110.0,698620.0,0.002,0.159
Dysthymia,480.0,698620.0,0.001,0.069
PersonalityDisorder,2960.0,698620.0,0.004,0.424
Meds_PsychosisAndRelated,10.0,698620.0,0.0,0.001
Meds_hypnoticsAndAnxiolytics,,698620.0,,
Meds_antidepressants,100.0,698620.0,0.0,0.014



Minimum prevalence of any criterion diagnosis is __0.016%__ of 698,620 patients.

Minimum prevalence of any criterion medication is __0.001%__ of 698,620 patients.



    Therefore, the minimum criterion count - i.e. the count equivalent of the product of the minimum diagnostic
    and medication prevalence values - is vanishingly small and equivalent to a patient count below our redaction and rounding thresholds.

    Given this finding, the minimum criterion count will be set to __10__.
    

Stored 'denominator_as_int' (int)
Stored 'min_criterion_prop_diag' (float64)
Stored 'min_criterion_prop_meds' (float64)
Stored 'min_criterion_prop' (float)
Stored 'max_criterion_prop' (float)
Stored 'min_criterion_count_diag' (float64)
Stored 'min_criterion_count_meds' (float64)
Stored 'min_criterion_count' (int)
Stored 'max_criterion_count' (int)


## Calculating the entropy of the caseness.

In [9]:
print("\n \'Multinomial caseness\' variable...")
entropy_caseness_scaled_multi = entropy_output(caseness_array['CMHD'])[0]
print("\n \'Definite caseness\' variable...")
entropy_caseness_scaled_definite = entropy_output(caseness_array['CMHD_dx_and_rx'])[0]
print("\n \'Possible caseness\' variable...")
entropy_caseness_scaled_possibly = entropy_output(caseness_array['CMHD_rx_not_dx'])[0]
print("\n \'No caseness\' variable...")
entropy_caseness_scaled_control = entropy_output(caseness_array['CMHD_control'])[0]

# Make variables available across notebooks.
%store entropy_caseness_scaled_multi entropy_caseness_scaled_definite \
entropy_caseness_scaled_possibly entropy_caseness_scaled_control


 'Multinomial caseness' variable...
	 Caseness variable entropy = 0.002 nats
	 Caseness variable scaled entropy = 0.223 %

 'Definite caseness' variable...
	 Caseness variable entropy < 0.001 nats
	 Caseness variable scaled entropy < 0.001 %

 'Possible caseness' variable...
	 Caseness variable entropy = 0.001 nats
	 Caseness variable scaled entropy = 0.214 %

 'No caseness' variable...
	 Caseness variable entropy = 0.002 nats
	 Caseness variable scaled entropy = 0.22 %
Stored 'entropy_caseness_scaled_multi' (float64)
Stored 'entropy_caseness_scaled_definite' (float64)
Stored 'entropy_caseness_scaled_possibly' (float64)
Stored 'entropy_caseness_scaled_control' (float64)


## Calculating hit rates.

In [10]:
print("\n \'Definite caseness\' variable...")
hitRate_none, hitRate_all = hitrate_output(caseness_array['CMHD_dx_and_rx'])
print("\n \'Possible caseness\' variable...")
hitRate_none, hitRate_all = hitrate_output(caseness_array['CMHD_rx_not_dx'])


 'Definite caseness' variable...
	 Hit rate (all) < 0.001 %
	 Hit rate (none) ≈ 100 %
	 Odds (No CMHD : CMHD) ≈ infitely-times less likely to have CMHD than to have it.

 'Possible caseness' variable...
	 Hit rate (all) = 0.015 %
	 Hit rate (none) = 99.985 %
	 Odds (No CMHD : CMHD) = 6,589 -times less likely to have CMHD than to have it.


In [11]:
display(
    Markdown(
"""    
We now know that:
1. based on the scaled entropy, our variable for indicating active complex mental health difficulties is $%s\%%$ as uncertain/surprising/unforeseeable
as it could possibly be; _and_
2. we would correctly classify $%s\%%$ of patients in this sample if we simply assumed that no one has active complex mental health difficulties.

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This 
encourages us to find feature sets that are proxies for this variable.
Given this encourgement to continue, the second point defines a benchmark for the indicative performance of any feature set that we evaluate in our
study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has complex mental health difficulties
must correctly identify $\ge %s \%%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't).
"""
        %(round(entropy_caseness_scaled_definite, 4)
          ,round(hitRate_none, 2)
          ,round(hitRate_none, 2)
         )
        )
)

    
We now know that:
1. based on the scaled entropy, our variable for indicating active complex mental health difficulties is $0.0001\%$ as uncertain/surprising/unforeseeable
as it could possibly be; _and_
2. we would correctly classify $99.98\%$ of patients in this sample if we simply assumed that no one has active complex mental health difficulties.

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This 
encourages us to find feature sets that are proxies for this variable.
Given this encourgement to continue, the second point defines a benchmark for the indicative performance of any feature set that we evaluate in our
study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has complex mental health difficulties
must correctly identify $\ge 99.98 \%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't).


In [12]:
# Below, I compute the cells of the contingency table for a rule that says no one has 'Definite caseness' of complex mental health difficulties.
#
# True positives. Zero because the rule says no one demonstrates 'Definite caseness' so no "positives" of any kind exist.
tp = 0
# False positives. Zero because the rule says no one demonstrates 'Definite caseness' so no "positives" of any kind exist.
fp = 0
# True negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
tn = hitRate_none / 100 * denominator_as_int
# False negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
fn = hitRate_all / 100 * denominator_as_int

# Below, I compute the evaluation statistics.
#
# Class balance accuracy.
cba = round( 0.5 * ( (tp / max( (tp + fn), (tp + fp) ) ) + (tn / max( (tn + fp), (tn +fn) ) ) ), 2)
# Odds ratio.
oddsRatio = 'Not\ a\ number\ because\ one\ of\ the\ odds\ is\ zero.' if min( (tp * tn) , (fp * fn) ) == 0 else round( (tp * tn) / (fp * fn), 2)
# Positive predictive value.
ppv = 0 if (tp + fp) == 0 else round( tp / (tp + fp), 2)
# Negative predictive value.
npv = 0 if (tn + fn) == 0 else round( tn / (tn + fn), 2)

display(
    Markdown(
"""    
Assuming a rule that says no one demonstrates active caseness of complex mental health difficulties, we get the following values for our evaluation statistics:

- Class Balance Accuracy \u2248 $%s$

- Odds ratio \u2248 $%s$

- Positive predicitve value \u2248 $%s$

- Negative predicitve value \u2248 $%s$
"""
        %(cba, oddsRatio, ppv, npv)
            )
    )

    
Assuming a rule that says no one demonstrates active caseness of complex mental health difficulties, we get the following values for our evaluation statistics:

- Class Balance Accuracy ≈ $0.5$

- Odds ratio ≈ $Not\ a\ number\ because\ one\ of\ the\ odds\ is\ zero.$

- Positive predicitve value ≈ $0$

- Negative predicitve value ≈ $1.0$
